LICENSE
README.md
pyproject.toml
src/knowledge_fidelity/__init__.py
src/knowledge_fidelity/__main__.py
src/knowledge_fidelity/audit.py
src/knowledge_fidelity/behavioral.py
src/knowledge_fidelity/calibration.py
src/knowledge_fidelity/core.py
src/knowledge_fidelity/denoise.py
src/knowledge_fidelity/utils.py
src/knowledge_fidelity/behaviors/__init__.py
src/knowledge_fidelity/behaviors/base.py
src/knowledge_fidelity/behaviors/bias.py
src/knowledge_fidelity/behaviors/factual.py
src/knowledge_fidelity/behaviors/metrics.py
src/knowledge_fidelity/behaviors/reasoning.py
src/knowledge_fidelity/behaviors/sycophancy.py
src/knowledge_fidelity/behaviors/toxicity.py
src/knowledge_fidelity/cartography/__init__.py
src/knowledge_fidelity/cartography/engine.py
src/knowledge_fidelity/cartography/schema.py
src/knowledge_fidelity/cli/__init__.py
src/knowledge_fidelity/cli/rho_audit.py
src/knowledge_fidelity/output/__init__.py
src/knowledge_fidelity/output/comparator.py
src/knowledge_fidelity/output/exporters.py
src/knowledge_fidelity/output/schema.py
src/knowledge_fidelity/probes/__init__.py
src/knowledge_fidelity/probes/registry.py
src/knowledge_fidelity/probes/data/bias/bbq_300.json
src/knowledge_fidelity/probes/data/factual/commonsense.json
src/knowledge_fidelity/probes/data/factual/default.json
src/knowledge_fidelity/probes/data/factual/mandela.json
src/knowledge_fidelity/probes/data/factual/medical.json
src/knowledge_fidelity/probes/data/factual/truthfulqa.json
src/knowledge_fidelity/probes/data/reasoning/gsm8k_100.json
src/knowledge_fidelity/probes/data/sycophancy/anthropic_150.json
src/knowledge_fidelity/probes/data/toxicity/toxigen_200.json
src/knowledge_fidelity/svd/__init__.py
src/knowledge_fidelity/svd/compress.py
src/knowledge_fidelity/svd/freeze.py
src/knowledge_fidelity/svd/importance.py
src/rho_eval/__init__.py
src/rho_eval/__main__.py
src/rho_eval/_compat.py
src/rho_eval/audit.py
src/rho_eval/behavioral.py
src/rho_eval/calibration.py
src/rho_eval/core.py
src/rho_eval/denoise.py
src/rho_eval/surgical_planner.py
src/rho_eval/utils.py
src/rho_eval.egg-info/PKG-INFO
src/rho_eval.egg-info/SOURCES.txt
src/rho_eval.egg-info/dependency_links.txt
src/rho_eval.egg-info/entry_points.txt
src/rho_eval.egg-info/requires.txt
src/rho_eval.egg-info/top_level.txt
src/rho_eval/alignment/__init__.py
src/rho_eval/alignment/dataset.py
src/rho_eval/alignment/losses.py
src/rho_eval/alignment/mlx_losses.py
src/rho_eval/alignment/mlx_trainer.py
src/rho_eval/alignment/trainer.py
src/rho_eval/behaviors/__init__.py
src/rho_eval/behaviors/base.py
src/rho_eval/behaviors/bias.py
src/rho_eval/behaviors/deception.py
src/rho_eval/behaviors/factual.py
src/rho_eval/behaviors/metrics.py
src/rho_eval/behaviors/overrefusal.py
src/rho_eval/behaviors/reasoning.py
src/rho_eval/behaviors/refusal.py
src/rho_eval/behaviors/sycophancy.py
src/rho_eval/behaviors/toxicity.py
src/rho_eval/benchmarking/__init__.py
src/rho_eval/benchmarking/adversarial.py
src/rho_eval/benchmarking/leaderboard.py
src/rho_eval/benchmarking/loader.py
src/rho_eval/benchmarking/probe_registry.py
src/rho_eval/benchmarking/reports.py
src/rho_eval/benchmarking/schema.py
src/rho_eval/benchmarking/scorers.py
src/rho_eval/benchmarking/truthfulqa.py
src/rho_eval/cartography/__init__.py
src/rho_eval/cartography/engine.py
src/rho_eval/cartography/schema.py
src/rho_eval/cli/__init__.py
src/rho_eval/cli/rho_align.py
src/rho_eval/cli/rho_audit.py
src/rho_eval/cli/rho_bench.py
src/rho_eval/cli/rho_benchmark.py
src/rho_eval/cli/rho_hybrid.py
src/rho_eval/cli/rho_interpret.py
src/rho_eval/cli/rho_leaderboard.py
src/rho_eval/cli/rho_steer.py
src/rho_eval/cli/rho_surgery.py
src/rho_eval/cli/rho_unlock.py
src/rho_eval/cli/snap_on.py
src/rho_eval/hybrid/__init__.py
src/rho_eval/hybrid/pipeline.py
src/rho_eval/hybrid/schema.py
src/rho_eval/interpretability/__init__.py
src/rho_eval/interpretability/activation.py
src/rho_eval/interpretability/heads.py
src/rho_eval/interpretability/overlap.py
src/rho_eval/interpretability/schema.py
src/rho_eval/interpretability/subspaces.py
src/rho_eval/interpretability/surgical.py
src/rho_eval/interpretability/visualize.py
src/rho_eval/output/__init__.py
src/rho_eval/output/comparator.py
src/rho_eval/output/exporters.py
src/rho_eval/output/schema.py
src/rho_eval/probes/__init__.py
src/rho_eval/probes/registry.py
src/rho_eval/probes/data/bench/clinical.json
src/rho_eval/probes/data/bench/logic.json
src/rho_eval/probes/data/bench/social.json
src/rho_eval/probes/data/bias/bbq_300.json
src/rho_eval/probes/data/bias/bridge_native.json
src/rho_eval/probes/data/bias/bridge_pairs.json
src/rho_eval/probes/data/bias/bridge_scaleup.json
src/rho_eval/probes/data/bias/gender_biology.json
src/rho_eval/probes/data/bias/sexual_orientation_biology.json
src/rho_eval/probes/data/deception/bridge_native.json
src/rho_eval/probes/data/deception/bridge_pairs.json
src/rho_eval/probes/data/deception/bridge_scaleup.json
src/rho_eval/probes/data/deception/hh_rlhf_100.json
src/rho_eval/probes/data/factual/commonsense.json
src/rho_eval/probes/data/factual/default.json
src/rho_eval/probes/data/factual/expanded_150.json
src/rho_eval/probes/data/factual/mandela.json
src/rho_eval/probes/data/factual/medical.json
src/rho_eval/probes/data/factual/truthfulqa.json
src/rho_eval/probes/data/overrefusal/benign_edgy_80.json
src/rho_eval/probes/data/overrefusal/expanded_70.json
src/rho_eval/probes/data/reasoning/bridge_native.json
src/rho_eval/probes/data/reasoning/bridge_pairs.json
src/rho_eval/probes/data/reasoning/bridge_scaleup.json
src/rho_eval/probes/data/reasoning/bridge_scaleup_v2.json
src/rho_eval/probes/data/reasoning/bridge_shortform.json
src/rho_eval/probes/data/reasoning/gsm8k_100.json
src/rho_eval/probes/data/refusal/expanded_100.json
src/rho_eval/probes/data/refusal/harmful_benign_50.json
src/rho_eval/probes/data/sycophancy/anthropic_150.json
src/rho_eval/probes/data/sycophancy/hard_100.json
src/rho_eval/probes/data/sycophancy/harder_pairs_16.json
src/rho_eval/probes/data/toxicity/bridge_native.json
src/rho_eval/probes/data/toxicity/bridge_pairs.json
src/rho_eval/probes/data/toxicity/bridge_scaleup.json
src/rho_eval/probes/data/toxicity/bridge_scaleup_v2.json
src/rho_eval/probes/data/toxicity/bridge_shortform.json
src/rho_eval/probes/data/toxicity/toxigen_200.json
src/rho_eval/snap_on/__init__.py
src/rho_eval/snap_on/inference.py
src/rho_eval/snap_on/module.py
src/rho_eval/snap_on/training.py
src/rho_eval/steering/__init__.py
src/rho_eval/steering/analyze.py
src/rho_eval/steering/collect.py
src/rho_eval/steering/sae.py
src/rho_eval/steering/schema.py
src/rho_eval/steering/steer.py
src/rho_eval/steering/train.py
src/rho_eval/svd/__init__.py
src/rho_eval/svd/compress.py
src/rho_eval/svd/freeze.py
src/rho_eval/svd/importance.py
src/rho_eval/unlock/__init__.py
src/rho_eval/unlock/contrastive.py
src/rho_eval/unlock/diagnosis.py
src/rho_eval/unlock/expression_gap.py
tests/test_alignment.py
tests/test_benchmarking.py
tests/test_cli.py
tests/test_interpretability.py
tests/test_metrics.py
tests/test_output.py
tests/test_probes.py
tests/test_registry.py
tests/test_steering.py
tests/test_surgery.py