import pathlib
from typing import Any
import pandas as pd
from bibcat import config
from bibcat.llm.io import read_output
from bibcat.utils.logger_config import setup_logger
from bibcat.utils.utils import save_json_file
# Module-level logger; the log level is read from the bibcat configuration (config.logging.level).
logger = setup_logger(__name__, level=config.logging.level)
[docs]
def inconsistent_classifications(input_path: str | pathlib.Path, output_path: str | pathlib.Path):
    """Save inconsistently classified bibcodes to a JSON file for investigation.

    Each bibcode entry of the evaluation output is inspected. Entries whose LLM
    classification differs from the human classification, or whose
    human-labelled mission was ignored by the LLM, are collected together with
    a summary of counts and written to ``output_path``.

    Bibcodes that have LLM classifications but no human classification at all
    are flagged as ``"llm_only_classified"`` and are additionally written, all
    together, to ``llm_only_classified_list_for_audit.json`` under the
    configured output directory.

    Parameters
    ----------
    input_path: str | pathlib.Path
        Input eval_output file name/path for statistics
    output_path: str | pathlib.Path
        File name/path to save the JSON file

    Returns
    -------
    None
    """
    data = read_output(bibcode=None, filename=input_path)
    logger.debug(f"Loaded data: {data}")

    results = {}
    # bibcode -> {"llm": ...} for every paper classified by the LLM only
    llm_only_classified = {}
    n_matched_classifications = 0

    for bibcode, item in data.items():
        human_item = item.get("human", {})
        llm_item = item.get("llm", [])
        err = item.get("error", "")

        if "No paper source found" in err:
            continue  # should skip when paper source is not found
        if not human_item and "No mission output found" in err:
            continue  # should skip when both human and llm don't have labels

        # LLM classified some missions but there are no human classifications
        # at all: flag the bibcode and remember it for the audit list.
        if not human_item and llm_item:
            results[bibcode] = {
                "failures": {"flag": "llm_only_classified"},
                "human": {},
                "llm": llm_item,
                "missions_not_in_text": item.get("hallucinated_missions", []),
            }
            llm_only_classified[bibcode] = {"llm": llm_item}
            continue

        failures, n_matched = analyze_missions(human_item, llm_item)
        n_matched_classifications += n_matched
        if failures:
            results[bibcode] = {
                "failures": failures,
                "human": human_item,
                "llm": llm_item,
                "missions_not_in_text": item.get("hallucinated_missions", []),
            }

    # Write the audit list once, after the loop, so that every LLM-only bibcode
    # ends up in the file (writing inside the loop re-saved a one-entry dict to
    # the same path on every iteration).
    if llm_only_classified:
        logger.debug(
            "Saving the bibcode items with llm only classifications but no human classfications at all.\n"
            + "Human might have completely missed classification or completely LLM hallunication! Investigate the list!"
        )
        save_json_file(
            pathlib.Path(config.paths.output)
            / f"llms/openai_{config.llms.openai.model}/llm_only_classified_list_for_audit.json",
            llm_only_classified,
            indent=2,
        )

    # summarized counts of inconsistent classifications
    summary_counts = {
        "n_total_bibcodes": len(data),
        "n_llm_only_classified_bibcodes": len(llm_only_classified),
        "n_matched_classifications": n_matched_classifications,
        **audit_summary(results),
    }

    # Add the summary to the top of the bibcode+mission breakdown results
    results_with_summary = {
        "summary_counts": summary_counts,
        "bibcodes": results,
    }
    save_json_file(output_path, results_with_summary, indent=2)
[docs]
def analyze_missions(human_item: dict[str, str], llm_item: list[dict[str, Any]]) -> tuple[dict[str, str], int]:
    """Compare the LLM classifications of one paper against the human ones.

    Only missions labelled by the human are examined. For each of them the
    outcome is one of:

    - the LLM has no entry for the mission: ``"false_negative_because_ignored"``
      when the human label is ``"SCIENCE"``, otherwise ``"ignored"``;
    - some LLM entry carries the same label as the human: counted as a match;
    - labels differ and the human label is ``"SCIENCE"``: ``"false_negative"``;
    - labels differ and some LLM entry says ``"SCIENCE"``: ``"false_positive"``;
    - any other disagreement is neither counted as a match nor as a failure.

    Parameters
    ----------
    human_item: dict[str, str]
        human classification of mission and papertype, e.g., {"HST": "MENTION", "JWST": "SUPERMENTION"}
    llm_item: list[dict[str, Any]]
        list of llm classifications; each element is looked up by mission name

    Returns
    -------
    failures: dict
        mapping of mission name to the kind of failure
    n_matched_classifications: int
        number of matched classifications
    """
    failures: dict[str, str] = {}
    n_matched_classifications = 0

    for mission, human_label in human_item.items():
        # every label the LLM assigned to this mission, across all its entries
        llm_labels = [entry.get(mission) for entry in llm_item if mission in entry]
        human_says_science = human_label == "SCIENCE"

        if not llm_labels:
            failures[mission] = "false_negative_because_ignored" if human_says_science else "ignored"
            continue

        if human_label in llm_labels:
            n_matched_classifications += 1
            continue

        if human_says_science:
            failures[mission] = "false_negative"
        elif "SCIENCE" in llm_labels:
            failures[mission] = "false_positive"

    return failures, n_matched_classifications
[docs]
def audit_summary(audit_results: dict) -> dict[str, int]:
    """Summarize the inconsistent classifications as counts.

    Parameters
    ----------
    audit_results: dict
        the per-bibcode breakdown of inconsistent llm classifications, e.g.,
        {"2018A&A...610A..11I": {"failures": {"GALEX": "false_positive"}}}

    Returns
    -------
    summary_counts: dict[str, int]
        number of bibcodes with at least one failure, total number of
        failures, and one tally per known failure kind
    """
    summary_counts = {
        "n_mismatched_bibcodes": 0,
        "n_mismatched_classifications": 0,
        "false_positive": 0,
        "false_negative": 0,
        "false_negative_because_ignored": 0,
        "ignored": 0,
    }

    for entry in audit_results.values():
        failures = entry.get("failures", {})
        n_failures = len(failures)
        if not n_failures:
            continue

        summary_counts["n_mismatched_bibcodes"] += 1
        summary_counts["n_mismatched_classifications"] += n_failures

        # Only failure kinds that already have a slot in the summary are tallied.
        for kind in failures.values():
            if kind in summary_counts:
                summary_counts[kind] += 1

    return summary_counts
[docs]
def save_evaluation_stats(
    input_path: str | pathlib.Path,
    output_path: str | pathlib.Path,
    threshold_acceptance: float,
    threshold_inspection: float,
):
    """Generate acceptance and inspection statistics from evaluation summary data.

    The input is read, every item found under an entry's ``"df"`` key is
    flattened into one table row, the rows are grouped by mission and paper
    type, and the grouped statistics are written out. The written statistics
    contain, for each combination of mission and paper type:

    - **Accepted LLM Classifications**: number of papers whose classification is accepted based on ``threshold_acceptance``.
    - **Human Inspection Requirements**: number of papers requiring human inspection.
    - **Accepted Bibcodes**: bibcodes corresponding to the accepted classifications.
    - **Inspection-Required Bibcodes**: bibcodes that need human inspection due to ambiguous confidence values.

    Parameters
    ----------
    input_path: str | pathlib.Path
        Input paper_output file name/path for statistics
    output_path: str | pathlib.Path
        File name/path to save the JSON file
    threshold_acceptance: float
        Threshold value to accept LLM papertype
    threshold_inspection: float
        Threshold value to filter papers required for human inspection

    Returns
    -------
    None

    Raises
    ------
    Exception
        Any exception raised while building the DataFrame is logged and re-raised.
    """
    column_names = [
        "mission",
        "papertype",
        "mean_llm_confidences",
        "bibcode",
        "in_human_class",
        "mission_in_text",
        "consistency",
    ]

    data = read_output(bibcode=None, filename=input_path)
    logger.debug(f"Loaded data: {data}")

    # Build DataFrame: one row per item in each entry's "df" list
    try:
        rows = []
        for bibcode, eval_item in data.items():
            if "df" not in eval_item:
                continue  # entries without a "df" table contribute no rows
            for item in eval_item["df"]:
                rows.append(
                    (
                        item["llm_mission"].lower(),  # mission
                        item["llm_papertype"].lower(),  # papertype
                        item["mean_llm_confidences"],
                        bibcode,
                        item["in_human_class"],
                        item["mission_in_text"],
                        item["consistency"],
                    )
                )
        df = pd.DataFrame(rows, columns=column_names)
    except Exception as e:
        logger.error(f"Error during operation DataFrame creation: {e}")
        raise

    df = df.sort_values(["mission", "papertype"]).reset_index(drop=True)

    # group the rows and aggregate the other properties
    grouped_df = group_by_agg("mean_llm_confidences", threshold_acceptance, threshold_inspection, df)

    # Write the statistics summary
    write_stats(output_path, threshold_acceptance, threshold_inspection, grouped_df)
[docs]
def save_operation_stats(
    input_path: str | pathlib.Path,
    output_path: str | pathlib.Path,
    threshold_acceptance: float,
    threshold_inspection: float,
):
    """Generate acceptance and inspection statistics from operational classifications.

    The input is read, its structure is validated, unusable assessment items
    are dropped, the remaining mission classifications are flattened into one
    table row each, grouped by mission and paper type, and the grouped
    statistics are written out. The written statistics contain, for each
    combination of mission and paper type:

    - **Accepted LLM Classifications**: number of papers whose classification is accepted based on ``threshold_acceptance``.
    - **Human Inspection Requirements**: number of papers requiring human inspection.
    - **Accepted Bibcodes**: bibcodes corresponding to the accepted classifications.
    - **Inspection-Required Bibcodes**: bibcodes that need human inspection due to ambiguous confidence values.

    Parameters
    ----------
    input_path: str | pathlib.Path
        Input paper_output filename/path for statistics
    output_path: str | pathlib.Path
        File name/path to save the JSON file
    threshold_acceptance: float
        Threshold value to accept LLM papertype
    threshold_inspection: float
        Threshold value to filter papers required for human inspection

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If an assessment is not a list or one of its items is not a dict.
    Exception
        Any exception raised while building the DataFrame is logged and re-raised.
    """
    data = read_output(bibcode=None, filename=input_path)
    logger.debug(f"The number of the loaded data: {len(data)}")

    # Validate the data structure first: the filtering below iterates each
    # assessment and looks up keys in every item, so it relies on these shapes.
    for bibcode, assessment in data.items():
        assert isinstance(assessment, list), f"Assessment for {bibcode} should be a list."
        for mission_item in assessment:
            assert isinstance(mission_item, dict), (
                f"Each mission_item should be a dict, got {type(mission_item)} for bibcode {bibcode}."
            )

    # Filter out bad data item by item: an item is usable when it carries no
    # "error" key and has a non-empty "missions" list. A bibcode is dropped
    # when none of its items is usable. Filtering per item keeps error items
    # (which may lack "missions") away from the DataFrame construction.
    n_data = len(data)
    filtered = {}
    for bibcode, assessment in data.items():
        usable_items = [
            mission_item
            for mission_item in assessment
            if "error" not in mission_item and mission_item.get("missions")
        ]
        if usable_items:
            filtered[bibcode] = usable_items
    data = filtered
    logger.debug(f"Filtered {n_data - len(data)} bad data from {n_data} total entries.")

    # Build Pandas DataFrame: one row per classified mission
    try:
        df = pd.DataFrame(
            [
                [item["mission"].lower(), item["papertype"].lower(), item["confidence"], bibcode]
                for bibcode, assessment in data.items()
                for mission_item in assessment
                for item in mission_item["missions"]
            ],
            columns=["mission", "papertype", "llm_confidences", "bibcode"],
        )
    except Exception as e:
        logger.error(f"Error during operation DataFrame creation: {e}")
        raise

    df = df.sort_values(["mission", "papertype"]).reset_index(drop=True)

    # group the rows and aggregate the other properties
    grouped_df = group_by_agg("llm_confidences", threshold_acceptance, threshold_inspection, df)

    # Write the statistics summary
    write_stats(output_path, threshold_acceptance, threshold_inspection, grouped_df)
[docs]
def group_by_agg(
    confidence_name: str, threshold_acceptance: float, threshold_inspection: float, df: pd.DataFrame
) -> pd.DataFrame:
    """Group DataFrame by mission and papertype and aggregate other properties.

    Missing values are replaced by 0 before grouping. For every
    (mission, papertype) group the largest confidence value of each row is
    compared with the thresholds:

    - accepted: ``max(confidence) >= threshold_acceptance``
    - inspection: ``threshold_inspection <= max(confidence) < threshold_acceptance``

    Counts are per row; the bibcode lists are de-duplicated and keep the order
    in which the bibcodes first appear within the group.

    Parameters
    ----------
    confidence_name: str
        The Key name for LLM confidences, e.g, `"llm_confidences"` for `paper_output.json` or `"mean_llm_confidences"` for `summary_output.json`
    threshold_acceptance: float
        Threshold value to accept LLM papertype
    threshold_inspection: float
        Threshold value to filter papers required for human inspection
    df: pd.DataFrame
        Dataframe with at least the columns ``mission``, ``papertype``,
        ``bibcode`` and ``confidence_name``

    Returns
    -------
    pd.DataFrame
        One row per (mission, papertype) with the columns ``mission``,
        ``papertype``, ``total_count``, ``accepted_count``,
        ``accepted_bibcodes``, ``inspection_count`` and ``inspection_bibcodes``
    """
    output_columns = [
        "mission",
        "papertype",
        "total_count",
        "accepted_count",
        "accepted_bibcodes",
        "inspection_count",
        "inspection_bibcodes",
    ]

    def peak_confidence(confidence):
        # A confidence is normally a list of floats; a missing one has been
        # replaced by the scalar 0 via fillna and is used as-is.
        return max(confidence) if hasattr(confidence, "__iter__") else confidence

    filled = df.fillna(0)

    records = []
    for (mission, papertype), group in filled.groupby(["mission", "papertype"]):
        accepted: list = []
        to_inspect: list = []
        # Pair each bibcode with the confidence of the *same row*, so the
        # bibcode lists always describe the rows of this group.
        for bibcode, confidence in zip(group["bibcode"], group[confidence_name]):
            peak = peak_confidence(confidence)
            if peak >= threshold_acceptance:
                accepted.append(bibcode)
            elif peak >= threshold_inspection:
                to_inspect.append(bibcode)

        records.append(
            {
                "mission": mission,
                "papertype": papertype,
                "total_count": len(group),
                "accepted_count": len(accepted),
                # dict.fromkeys de-duplicates while keeping a stable order
                "accepted_bibcodes": list(dict.fromkeys(accepted)),
                "inspection_count": len(to_inspect),
                "inspection_bibcodes": list(dict.fromkeys(to_inspect)),
            }
        )

    return pd.DataFrame(records, columns=output_columns)
[docs]
def write_stats(
    output_path: str | pathlib.Path,
    threshold_acceptance: float,
    threshold_inspection: float,
    grouped_df: pd.DataFrame,
) -> None:
    """Write the statistics into a text table and a JSON file.

    The count columns of ``grouped_df`` are logged and written as an aligned
    text table to a file under the configured output directory. Writing the
    text table is best-effort: an ``OSError`` is logged and does not stop the
    JSON output. The full grouped table, preceded by a record holding both
    thresholds, is then saved as JSON to ``output_path``.

    Parameters
    ----------
    output_path: str | pathlib.Path
        Filename path to save the stats results.
    threshold_acceptance: float
        Threshold value to accept LLM papertype.
    threshold_inspection: float
        Threshold value to filter papers required for human inspection.
    grouped_df: pd.DataFrame
        Grouped DataFrame

    Returns
    -------
    None
    """
    tsv_df = grouped_df[["mission", "papertype", "total_count", "accepted_count", "inspection_count"]]
    table_text = tsv_df.to_string(index=False)
    logger.info("Production counts by LLM Mission and Paper Type:\n" + table_text)

    # Write to an ascii file
    summary_file = (
        pathlib.Path(config.paths.output)
        / f"llms/openai_{config.llms.openai.model}/{config.llms.eval_stats_file}_t{config.llms.performance.threshold}.txt"
    )
    try:
        # Save the aligned table; explicit encoding keeps the output
        # independent of the platform's default locale.
        with open(summary_file, "w", encoding="utf-8") as f:
            f.write(table_text)
    except OSError as e:
        # Report through the module logger like the rest of the file; the
        # JSON output below is still produced.
        logger.error(f"Error writing to file: {e}")
    else:
        logger.info(f"Data successfully written to {summary_file}")

    # writing the stats table JSON, with the thresholds as the first record
    list_of_dicts = grouped_df.to_dict(orient="records")
    list_of_dicts.insert(
        0, {"threshold_acceptance": threshold_acceptance, "threshold_inspection": threshold_inspection}
    )
    save_json_file(
        output_path,
        list_of_dicts,
    )
    logger.info(f"bibcode lists for both acceptance and inspection were generated in {output_path}")