Source code for bibcat.llm.stats

import pathlib
from typing import Any

import pandas as pd

from bibcat import config
from bibcat.llm.io import read_output
from bibcat.utils.logger_config import setup_logger
from bibcat.utils.utils import save_json_file

logger = setup_logger(__name__, level=config.logging.level)


def inconsistent_classifications(input_path: str | pathlib.Path, output_path: str | pathlib.Path):
    """Save inconsistently classified bibcodes to a JSON file for investigation.

    Each bibcode in the evaluation output is compared mission by mission
    (see ``analyze_missions``).  Bibcodes whose LLM classification disagrees
    with the human one, or where the LLM ignored a human-labelled mission,
    are written to ``output_path`` together with summary counts.

    Bibcodes that the LLM classified but that carry no human classification
    at all are flagged ``llm_only_classified`` and are additionally collected
    into ``llm_only_classified_list_for_audit.json`` under the configured
    output directory.

    Parameters
    ----------
    input_path: str | pathlib.Path
        Input eval_output file name/path for statistics
    output_path: str | pathlib.Path
        File name/path to save the JSON file

    Returns
    -------
    None
    """
    data = read_output(bibcode=None, filename=input_path)
    logger.debug(f"Loaded data: {data}")

    results = {}
    llm_only_classified = {}
    n_matched_classifications = 0

    for bibcode, item in data.items():
        human_item = item.get("human", {})
        llm_item = item.get("llm", [])
        # An explicit ``"error": None`` must behave like a missing error message.
        err = item.get("error") or ""

        if "No paper source found" in err:
            continue  # should skip when paper source is not found
        if not human_item and "No mission output found" in err:
            continue  # should skip when both human and llm don't have labels

        # LLM classified some missions but there is no human classification at all:
        # either the human missed the paper or the LLM hallucinated. Keep for audit.
        if not human_item and llm_item:
            results[bibcode] = {
                "failures": {"flag": "llm_only_classified"},
                "human": {},
                "llm": llm_item,
                "missions_not_in_text": item.get("hallucinated_missions", []),
            }
            llm_only_classified[bibcode] = {"llm": llm_item}
            continue

        failures, n_matched = analyze_missions(human_item, llm_item)
        n_matched_classifications += n_matched
        if failures:
            results[bibcode] = {
                "failures": failures,
                "human": human_item,
                "llm": llm_item,
                "missions_not_in_text": item.get("hallucinated_missions", []),
            }

    # Write the audit list once, after the loop, so that it holds every LLM-only
    # bibcode.  Saving inside the loop left only the last bibcode in the file
    # (assuming save_json_file replaces the file contents -- TODO confirm).
    if llm_only_classified:
        logger.debug(
            "Saving the bibcode items with llm only classifications but no human classfications at all.\n"
            + "Human might have completely missed classification or completely LLM hallunication! Investigate the list!"
        )
        save_json_file(
            pathlib.Path(config.paths.output)
            / f"llms/openai_{config.llms.openai.model}/llm_only_classified_list_for_audit.json",
            llm_only_classified,
            indent=2,
        )

    # summarized counts of inconsistent classifications
    summary_counts = {
        "n_total_bibcodes": len(data),
        "n_llm_only_classified_bibcodes": len(llm_only_classified),
        "n_matched_classifications": n_matched_classifications,
        **audit_summary(results),
    }

    # Add the summary to the top of the bibcode+mission breakdown results
    results_with_summary = {
        "summary_counts": summary_counts,
        "bibcodes": results,
    }
    save_json_file(output_path, results_with_summary, indent=2)
def analyze_missions(human_item: dict[str, str], llm_item: list[dict[str, Any]]) -> tuple[dict[str, str], int]:
    """Compare the LLM classifications of one paper against the human ones.

    Parameters
    ----------
    human_item: dict[str, str]
        human classification of mission and papertype,
        e.g., {"HST": "MENTION", "JWST": "SUPERMENTION"}
    llm_item: list[dict[str, Any]]
        list of llm classifications; an entry is consulted for a mission
        only when the mission is one of its keys

    Returns
    -------
    failures: dict[str, str]
        failure kind per mission: ``"false_negative_because_ignored"`` or
        ``"ignored"`` when no LLM entry contains the mission, otherwise
        ``"false_negative"`` or ``"false_positive"`` when no label matches.
        A mismatch in which neither side says ``"SCIENCE"`` is not recorded.
    n_matched_classifications: int
        number of missions for which some LLM label equals the human label
    """
    mismatches: dict[str, str] = {}
    agreed = 0

    for mission, expected in human_item.items():
        # Every label the LLM attached to this mission, across all of its entries.
        assigned = [entry.get(mission) for entry in llm_item if mission in entry]

        if not assigned:
            # The LLM never mentioned the mission at all.
            if expected == "SCIENCE":
                mismatches[mission] = "false_negative_because_ignored"
            else:
                mismatches[mission] = "ignored"
            continue

        if any(label == expected for label in assigned):
            agreed += 1
            continue

        if expected == "SCIENCE":
            mismatches[mission] = "false_negative"
        elif any(label == "SCIENCE" for label in assigned):
            mismatches[mission] = "false_positive"

    return mismatches, agreed
def audit_summary(audit_results: dict) -> dict[str, int]:
    """Summarize the inconsistent classifications as counts.

    Parameters
    ----------
    audit_results: dict
        the per-bibcode breakdown of inconsistent llm classifications, e.g.,
        {"2018A&A...610A..11I": {"failures": {"GALEX": "false_positive"}}}

    Returns
    -------
    summary_counts: dict[str, int]
        number of bibcodes with at least one failure, total number of
        failures, and one counter per known failure kind.  Failure values
        that are not among the counter keys add to the totals only.
    """
    tallies = dict.fromkeys(
        (
            "n_mismatched_bibcodes",
            "n_mismatched_classifications",
            "false_positive",
            "false_negative",
            "false_negative_because_ignored",
            "ignored",
        ),
        0,
    )

    for record in audit_results.values():
        failure_map = record.get("failures", {})
        if not failure_map:
            continue

        tallies["n_mismatched_bibcodes"] += 1
        tallies["n_mismatched_classifications"] += len(failure_map)

        for kind in failure_map.values():
            if kind in tallies:
                tallies[kind] += 1

    return tallies
def save_evaluation_stats(
    input_path: str | pathlib.Path,
    output_path: str | pathlib.Path,
    threshold_acceptance: float,
    threshold_inspection: float,
):
    """Write acceptance and inspection statistics for evaluation summary data.

    The evaluation output is flattened into one row per entry of each
    bibcode's ``"df"`` list, grouped by mission and paper type with
    ``group_by_agg`` on the ``"mean_llm_confidences"`` column, and handed to
    ``write_stats``.  Bibcodes without a ``"df"`` key are skipped.

    The resulting statistics contain, per mission and paper type:

    - **Accepted LLM Classifications**: number of papers whose confidence
      reaches the acceptance threshold.
    - **Human Inspection Requirements**: number of papers whose confidence
      lies between the inspection and acceptance thresholds.
    - **Accepted Bibcodes** / **Inspection-Required Bibcodes**: the
      corresponding bibcode lists.

    Parameters
    ----------
    input_path: str | pathlib.Path
        Input paper_output file name/path for statistics
    output_path: str | pathlib.Path
        File name/path to save the JSON file
    threshold_acceptance: float
        Threshold value to accept LLM papertype
    threshold_inspection: float
        Threshold value to filter papers required for human inspection

    Returns
    -------
    None

    Raises
    ------
    Exception
        Any exception raised while building the DataFrame is logged and
        re-raised unchanged.
    """
    data = read_output(bibcode=None, filename=input_path)
    logger.debug(f"Loaded data: {data}")

    column_names = [
        "mission",
        "papertype",
        "mean_llm_confidences",
        "bibcode",
        "in_human_class",
        "mission_in_text",
        "consistency",
    ]

    # Build DataFrame
    try:
        rows = []
        for bibcode, eval_item in data.items():
            if "df" not in eval_item:
                continue
            for item in eval_item["df"]:
                rows.append(
                    (
                        item["llm_mission"].lower(),  # mission
                        item["llm_papertype"].lower(),  # papertype
                        item["mean_llm_confidences"],
                        bibcode,
                        item["in_human_class"],
                        item["mission_in_text"],
                        item["consistency"],
                    )
                )
        df = pd.DataFrame(rows, columns=column_names)
    except Exception as e:
        logger.error(f"Error during operation DataFrame creation: {e}")
        raise

    ordered = df.sort_values(["mission", "papertype"])
    df = ordered.reset_index(drop=True)

    # Group the rows and aggregate the counts and bibcode lists
    grouped_df = group_by_agg("mean_llm_confidences", threshold_acceptance, threshold_inspection, df)

    # Write the statistics summary
    write_stats(output_path, threshold_acceptance, threshold_inspection, grouped_df)
def save_operation_stats(
    input_path: str | pathlib.Path,
    output_path: str | pathlib.Path,
    threshold_acceptance: float,
    threshold_inspection: float,
):
    """Write acceptance and inspection statistics for operational classifications.

    The operational output maps each bibcode to a list of assessment items.
    Items that carry an ``"error"`` key or have no (or an empty)
    ``"missions"`` list are dropped; bibcodes left without any usable item
    are dropped entirely.  The remaining mission entries are flattened into a
    DataFrame, grouped by mission and paper type with ``group_by_agg`` on the
    ``"llm_confidences"`` column, and handed to ``write_stats``.

    The resulting statistics contain, per mission and paper type:

    - **Accepted LLM Classifications**: number of papers whose confidence
      reaches the acceptance threshold.
    - **Human Inspection Requirements**: number of papers whose confidence
      lies between the inspection and acceptance thresholds.
    - **Accepted Bibcodes** / **Inspection-Required Bibcodes**: the
      corresponding bibcode lists.

    Parameters
    ----------
    input_path: str | pathlib.Path
        Input paper_output filename/path for statistics
    output_path: str | pathlib.Path
        File name/path to save the JSON file
    threshold_acceptance: float
        Threshold value to accept LLM papertype
    threshold_inspection: float
        Threshold value to filter papers required for human inspection

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If an assessment is not a list or one of its items is not a dict.
    Exception
        Any exception raised while building the DataFrame is logged and
        re-raised unchanged.
    """
    data = read_output(bibcode=None, filename=input_path)
    logger.debug(f"The number of the loaded data: {len(data)}")

    # Validate the structure and filter out bad data in a single pass.
    # Validation has to come before filtering because the filter itself relies
    # on the list-of-dicts shape, and filtering is done per item so that an
    # errored item cannot reach the DataFrame build through a good sibling.
    n_data = len(data)
    usable = {}
    for bibcode, assessment in data.items():
        if not isinstance(assessment, list):
            raise TypeError(f"Assessment for {bibcode} should be a list.")

        kept_items = []
        for mission_item in assessment:
            if not isinstance(mission_item, dict):
                raise TypeError(
                    f"Each mission_item should be a dict, got {type(mission_item)} for bibcode {bibcode}."
                )
            if "error" not in mission_item and mission_item.get("missions"):
                kept_items.append(mission_item)

        if kept_items:
            usable[bibcode] = kept_items
    logger.debug(f"Filtered {n_data - len(usable)} bad data from {n_data} total entries.")

    # Build Pandas DataFrame
    try:
        df = pd.DataFrame(
            [
                [item["mission"].lower(), item["papertype"].lower(), item["confidence"], bibcode]
                for bibcode, assessment in usable.items()
                for mission_item in assessment
                for item in mission_item["missions"]
            ],
            columns=["mission", "papertype", "llm_confidences", "bibcode"],
        )
    except Exception as e:
        logger.error(f"Error during operation DataFrame creation: {e}")
        raise

    df = df.sort_values(["mission", "papertype"]).reset_index(drop=True)

    # Group the rows and aggregate the counts and bibcode lists
    grouped_df = group_by_agg("llm_confidences", threshold_acceptance, threshold_inspection, df)

    # Write the statistics summary
    write_stats(output_path, threshold_acceptance, threshold_inspection, grouped_df)
def group_by_agg(
    confidence_name: str, threshold_acceptance: float, threshold_inspection: float, df: pd.DataFrame
) -> pd.DataFrame:
    """Group DataFrame by mission and papertype and aggregate other properties.

    A row is *accepted* when the largest value of its confidence entry is at
    least ``threshold_acceptance``; it *requires inspection* when that value
    is at least ``threshold_inspection`` but below ``threshold_acceptance``.

    Parameters
    ----------
    confidence_name: str
        The Key name for LLM confidences, e.g, `"llm_confidences"` for `paper_output.json`
        or `"mean_llm_confidences"` for `summary_output.json`
    threshold_acceptance: float
        Threshold value to accept LLM papertype
    threshold_inspection: float
        Threshold value to filter papers required for human inspection
    df: pd.DataFrame
        Dataframe with at least the columns ``"mission"``, ``"papertype"``,
        ``"bibcode"`` and ``confidence_name``

    Returns
    -------
    pd.DataFrame
        One row per (mission, papertype) with the columns ``total_count``,
        ``accepted_count``, ``accepted_bibcodes``, ``inspection_count`` and
        ``inspection_bibcodes``.  The bibcode lists hold each bibcode once,
        in order of first appearance within the group.
    """

    def peak_confidence(confidence):
        # fillna(0) turns a missing confidence into the scalar 0, which max()
        # cannot iterate; a bare number is therefore its own peak.
        if pd.api.types.is_number(confidence):
            return confidence
        return max(confidence)

    # A fresh positional index guarantees that the per-row masks below can be
    # looked up unambiguously from the index of each group.
    frame = df.fillna(0).reset_index(drop=True)

    peaks = frame[confidence_name].map(peak_confidence)
    accepted = (peaks >= threshold_acceptance).astype(bool)
    needs_inspection = ((peaks >= threshold_inspection) & (peaks < threshold_acceptance)).astype(bool)

    def count_flagged(mask: pd.Series, column: pd.Series) -> int:
        return int(mask.loc[column.index].sum())

    def bibcodes_flagged(mask: pd.Series, bibcodes: pd.Series) -> list:
        # Select the bibcodes of *this group's* flagged rows.  The previous code
        # read the bibcode at position i of the whole frame, so every group but
        # the first reported bibcodes belonging to other groups.
        selected = bibcodes[mask.loc[bibcodes.index].to_numpy()]
        return list(dict.fromkeys(selected))

    grouped_df = (
        frame.groupby(["mission", "papertype"])
        .agg(
            total_count=("mission", "size"),
            accepted_count=(confidence_name, lambda x: count_flagged(accepted, x)),
            accepted_bibcodes=("bibcode", lambda x: bibcodes_flagged(accepted, x)),
            inspection_count=(confidence_name, lambda x: count_flagged(needs_inspection, x)),
            inspection_bibcodes=("bibcode", lambda x: bibcodes_flagged(needs_inspection, x)),
        )
        .reset_index()
    )

    return grouped_df
def write_stats(output_path, threshold_acceptance, threshold_inspection, grouped_df):
    """Write the statistics into a text table and a JSON file.

    The count columns of ``grouped_df`` are logged and written as an aligned
    text table under the configured output directory; a failure to write
    that table is reported on stdout and does not stop the JSON export.
    The full grouped records, preceded by a record holding the two
    thresholds, are then saved to ``output_path``.

    Parameters
    ----------
    output_path: pathlib.Path
        Filename path to save the stats results.
    threshold_acceptance: float
        Threshold value to accept LLM papertype.
    threshold_inspection: float
        Threshold value to filter papers required for human inspection.
    grouped_df: pd.DataFrame
        Grouped DataFrame

    Returns
    -------
    None
    """
    count_columns = ["mission", "papertype", "total_count", "accepted_count", "inspection_count"]
    table_text = grouped_df[count_columns].to_string(index=False)
    logger.info("Production counts by LLM Mission and Paper Type:\n" + table_text)

    # Write the aligned table to an ascii file
    output_root = pathlib.Path(config.paths.output)
    summary_file = (
        output_root
        / f"llms/openai_{config.llms.openai.model}/{config.llms.eval_stats_file}_t{config.llms.performance.threshold}.txt"
    )
    try:
        summary_file.write_text(table_text)
        print(f"Data successfully written to {summary_file}")
    except IOError as e:
        print(f"Error writing to file: {e}")

    # Write the stats table as JSON, with the thresholds as the first record
    thresholds = {"threshold_acceptance": threshold_acceptance, "threshold_inspection": threshold_inspection}
    records = [thresholds, *grouped_df.to_dict(orient="records")]
    save_json_file(
        output_path,
        records,
    )
    logger.info(f"bibcode lists for both acceptance and inspection were generated in {output_path}")