Source code for bibcat.llm.stats

import pathlib
from typing import Any

import pandas as pd

from bibcat import config
from bibcat.llm.io import read_output
from bibcat.utils.logger_config import setup_logger
from bibcat.utils.utils import save_json_file

logger = setup_logger(__name__, level=config.logging.level)



[docs]
def inconsistent_classifications(input_path: str | pathlib.Path, output_path: str | pathlib.Path):
    """Collect bibcodes whose LLM and human classifications disagree and save them as JSON.

    Every entry loaded from ``input_path`` is compared mission by mission with
    ``analyze_missions``. Entries with at least one failure are stored, along
    with a block of summary counts, in ``output_path``. Entries that carry LLM
    classifications but no human classification at all are flagged as
    ``llm_only_classified`` and are also written to a separate audit file
    below ``config.paths.output``.

    Parameters
    ----------
    input_path: str | pathlib.Path
        Input eval_output file name/path for statistics
    output_path: str | pathlib.Path
        File name/path to save the JSON file

    Returns
    -------
    None
    """
    data = read_output(bibcode=None, filename=input_path)
    logger.debug(f"Loaded data: {data}")

    flagged = {}
    matched_total = 0
    llm_only_total = 0

    for bibcode, item in data.items():
        human_labels = item.get("human", {})
        llm_labels = item.get("llm", [])
        error_message = item.get("error", "")

        # Nothing can be compared when the paper source itself was not found.
        if "No paper source found" in error_message:
            continue

        if not human_labels:
            # Neither the human nor the LLM produced any label for this paper.
            if "No mission output found" in error_message:
                continue

            # The LLM classified some missions but there is no human
            # classification at all: keep the entry for further inspection.
            if llm_labels:
                llm_only_total += 1
                flagged[bibcode] = {
                    "failures": {"flag": "llm_only_classified"},
                    "human": {},
                    "llm": llm_labels,
                    "missions_not_in_text": item.get("hallucinated_missions", []),
                }

                audit_entry = {bibcode: {"llm": llm_labels}}
                logger.debug(
                    "Saving the bibcode items with llm only classifications but no human classfications at all.\n"
                    "Human might have completely missed classification or completely LLM hallunication! Investigate the list!"
                )
                # NOTE(review): this runs once per LLM-only bibcode with a
                # single-entry dict; unless save_json_file appends to an
                # existing file, only the last such bibcode would remain in
                # the audit file -- confirm this is intended.
                save_json_file(
                    pathlib.Path(config.paths.output)
                    / f"llms/openai_{config.llms.openai.model}/llm_only_classified_list_for_audit.json",
                    audit_entry,
                    indent=2,
                )
                continue

        mission_failures, matched = analyze_missions(human_labels, llm_labels)
        matched_total += matched

        if mission_failures:
            flagged[bibcode] = {
                "failures": mission_failures,
                "human": human_labels,
                "llm": llm_labels,
                "missions_not_in_text": item.get("hallucinated_missions", []),
            }

    # Overall counters first, followed by the per-failure-type counts.
    summary_counts = {
        "n_total_bibcodes": len(data),
        "n_llm_only_classified_bibcodes": llm_only_total,
        "n_matched_classifications": matched_total,
        **audit_summary(flagged),
    }

    # The summary goes on top of the bibcode+mission breakdown.
    save_json_file(
        output_path,
        {"summary_counts": summary_counts, "bibcodes": flagged},
        indent=2,
    )




[docs]
def analyze_missions(human_item: dict[str, str], llm_item: list[dict[str, Any]]) -> tuple[dict[str, str], int]:
    """Compare the LLM classifications with the human classification, mission by mission.

    For every mission labelled by the human, the labels the LLM assigned to
    that mission are collected and the outcome is decided as follows:

    - the LLM has no entry for the mission: ``"false_negative_because_ignored"``
      when the human label is ``"SCIENCE"``, otherwise ``"ignored"``;
    - some LLM label equals the human label: counted as a match;
    - no LLM label matches and the human label is ``"SCIENCE"``: ``"false_negative"``;
    - no LLM label matches and the LLM assigned ``"SCIENCE"``: ``"false_positive"``;
    - any other disagreement is neither recorded as a failure nor counted as a match.

    Parameters
    ----------
    human_item: dict[str, str]
        human classification of mission and papertype, e.g., {"HST": "MENTION", "JWST": "SUPERMENTION"}
    llm_item: list[dict[str, Any]]
        list of llm classifications; each element is looked up by mission name

    Returns
    -------
    failures: dict
        failure flag per mission
    n_matched_classifications: int
        number of missions for which the LLM label matches the human label
    """
    outcome: dict[str, str] = {}
    matches = 0

    for mission, human_label in human_item.items():
        # Every label the LLM attached to this mission, over all its entries.
        llm_labels = [entry.get(mission) for entry in llm_item if mission in entry]

        if not llm_labels:
            outcome[mission] = "false_negative_because_ignored" if human_label == "SCIENCE" else "ignored"
            continue

        if any(label == human_label for label in llm_labels):
            matches += 1
        elif human_label == "SCIENCE":
            outcome[mission] = "false_negative"
        elif any(label == "SCIENCE" for label in llm_labels):
            outcome[mission] = "false_positive"

    return outcome, matches




[docs]
def audit_summary(audit_results: dict) -> dict[str, int]:
    """Summarize the inconsistent classifications into counters.

    Every entry with a non-empty ``"failures"`` mapping counts as one
    mismatched bibcode, and each of its failures counts as one mismatched
    classification. Failure values that name one of the counters below are
    also tallied under that name; other values only contribute to the two
    ``n_mismatched_*`` totals.

    Parameters
    ----------
    audit_results: dict
        the breakdown bibcode list of inconsistent llm classifications
        e.g.,
        {"2018A&A...610A..11I": {"failures": {"GALEX": "false_positive"}}}

    Returns
    -------
    summary_counts: dict[str, int]
        various count summary
    """
    tally = {
        "n_mismatched_bibcodes": 0,
        "n_mismatched_classifications": 0,
        "false_positive": 0,
        "false_negative": 0,
        "false_negative_because_ignored": 0,
        "ignored": 0,
    }

    for entry in audit_results.values():
        failures = entry.get("failures", {})
        n_failures = len(failures)
        if n_failures == 0:
            continue

        tally["n_mismatched_bibcodes"] += 1
        tally["n_mismatched_classifications"] += n_failures

        for failure_type in failures.values():
            if failure_type in tally:
                tally[failure_type] += 1

    return tally




[docs]
def save_evaluation_stats(
    input_path: str | pathlib.Path,
    output_path: str | pathlib.Path,
    threshold_acceptance: float,
    threshold_inspection: float,
):
    """Generate acceptance and inspection statistics from evaluation summary data

    This function performs the following actions:
         - **Creates a statistics file** containing:
            - **Accepted LLM Classifications**: Number of papers with classifications accepted by the LLM based on a specified threshold value for each combination of mission and paper type.
            - **Human Inspection Requirements**: Number of papers requiring human inspection
            - **Accepted Bibcodes**: Bibcodes corresponding to the accepted classifications.
            - **Inspection-Required Bibcodes**: Bibcodes that need human inspection due to ambiguous confidence values.

    Only entries of the input that contain a ``"df"`` key are used; each
    element of ``"df"`` contributes one row (mission, papertype, mean
    confidences, bibcode and the evaluation flags).

    Parameters
    ----------
    input_path: str | pathlib.Path
        Input evaluation summary file name/path for statistics
    output_path: str | pathlib.Path
        File name/path to save the JSON file
    threshold_acceptance: float
        Threshold value to accept LLM papertype
    threshold_inspection: float
        Threshold value to filter papers required for human inspection


    Returns
    -------
    None

    Raises
    ------
    Exception
        Any exception raised while building the DataFrame is logged and re-raised.

    """

    data = read_output(bibcode=None, filename=input_path)
    logger.debug(f"Loaded data: {data}")

    columns = [
        "mission",
        "papertype",
        "mean_llm_confidences",
        "bibcode",
        "in_human_class",
        "mission_in_text",
        "consistency",
    ]

    # Build DataFrame: one row per element of each entry's "df" list
    try:
        rows = [
            (
                item["llm_mission"].lower(),  # mission
                item["llm_papertype"].lower(),  # papertype
                item["mean_llm_confidences"],
                bibcode,
                item["in_human_class"],
                item["mission_in_text"],
                item["consistency"],
            )
            for bibcode, eval_item in data.items()
            if "df" in eval_item
            for item in eval_item["df"]
        ]
        df = pd.DataFrame(rows, columns=columns)
    except Exception as e:
        # Name the evaluation path here so the log is not confused with the
        # operational statistics, which log a near-identical message.
        logger.error(f"Error during evaluation DataFrame creation: {e}")
        raise
    df = df.sort_values(["mission", "papertype"]).reset_index(drop=True)

    # grouping DF and aggregate other properties
    grouped_df = group_by_agg("mean_llm_confidences", threshold_acceptance, threshold_inspection, df)

    # Write the statistics summary
    write_stats(output_path, threshold_acceptance, threshold_inspection, grouped_df)




[docs]
def save_operation_stats(
    input_path: str | pathlib.Path,
    output_path: str | pathlib.Path,
    threshold_acceptance: float,
    threshold_inspection: float,
):
    """Generate acceptance and inspection statistics from operational classifications

    This function performs the following actions:
         - **Creates a statistics file** containing:
            - **Accepted LLM Classifications**: Number of papers with classifications accepted by the LLM based on a specified threshold value for each combination of mission and paper type.
            - **Human Inspection Requirements**: Number of papers requiring human inspection
            - **Accepted Bibcodes**: Bibcodes corresponding to the accepted classifications.
            - **Inspection-Required Bibcodes**: Bibcodes that need human inspection due to ambiguous confidence values.

    Mission items that carry an ``"error"`` key or have no (or an empty)
    ``"missions"`` list are dropped individually; a bibcode is dropped
    entirely only when none of its mission items remain.

    Parameters
    ----------
    input_path: str | pathlib.Path
        Input paper_output filename/path for statistics
    output_path: str | pathlib.Path
        File name/path to save the JSON file
    threshold_acceptance: float
        Threshold value to accept LLM papertype
    threshold_inspection: float
        Threshold value to filter papers required for human inspection

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If an assessment is not a list or one of its mission items is not a dict.
    Exception
        Any exception raised while building the DataFrame is logged and re-raised.

    """

    data = read_output(bibcode=None, filename=input_path)
    logger.debug(f"The number of the loaded data: {len(data)}")

    # Validate the data structure first: the filtering below relies on it.
    for bibcode, assessment in data.items():
        if not isinstance(assessment, list):
            raise TypeError(f"Assessment for {bibcode} should be a list.")
        for mission_item in assessment:
            if not isinstance(mission_item, dict):
                raise TypeError(
                    f"Each mission_item should be a dict, got {type(mission_item)} for bibcode {bibcode}."
                )

    # Filter out bad data per mission item. Keeping a whole assessment because
    # one of its items is good would let error items (which have no "missions"
    # key) reach the DataFrame construction and raise KeyError there.
    n_data = len(data)
    filtered = {}
    for bibcode, assessment in data.items():
        usable_items = [
            mission_item
            for mission_item in assessment
            if "error" not in mission_item and mission_item.get("missions")
        ]
        if usable_items:
            filtered[bibcode] = usable_items
    data = filtered
    logger.debug(f"Filtered {n_data - len(data)} bad data from {n_data} total entries.")

    # Build Pandas DataFrame: one row per classified mission of every kept item
    try:
        df = pd.DataFrame(
            [
                [item["mission"].lower(), item["papertype"].lower(), item["confidence"], bibcode]
                for bibcode, assessment in data.items()
                for mission_item in assessment
                for item in mission_item["missions"]
            ],
            columns=["mission", "papertype", "llm_confidences", "bibcode"],
        )
    except Exception as e:
        logger.error(f"Error during operation DataFrame creation: {e}")
        raise

    df = df.sort_values(["mission", "papertype"]).reset_index(drop=True)

    # grouping DF and aggregate other properties
    grouped_df = group_by_agg("llm_confidences", threshold_acceptance, threshold_inspection, df)

    # Write the statistics summary
    write_stats(output_path, threshold_acceptance, threshold_inspection, grouped_df)




[docs]
def group_by_agg(
    confidence_name: str, threshold_acceptance: float, threshold_inspection: float, df: pd.DataFrame
) -> pd.DataFrame:
    """Group DataFrame by mission and papertype and aggregate other properties.

    A row is *accepted* when the largest value of its confidence list is at
    least ``threshold_acceptance``; it *requires inspection* when that value
    is at least ``threshold_inspection`` but below ``threshold_acceptance``.

    Parameters
    ----------
    confidence_name: str
        The Key name for LLM confidences, e.g, `"llm_confidences"` for `paper_output.json` or `"mean_llm_confidences"` for `summary_output.json`
    threshold_acceptance: float
        Threshold value to accept LLM papertype
    threshold_inspection: float
        Threshold value to filter papers required for human inspection
    df: pd.DataFrame
        Dataframe with at least the columns ``"mission"``, ``"papertype"``,
        ``"bibcode"`` and ``confidence_name`` (each cell of the latter being a
        sequence of confidence values)

    Returns
    -------
    pd.DataFrame
        One row per (mission, papertype) with the columns ``total_count``,
        ``accepted_count``, ``accepted_bibcodes``, ``inspection_count`` and
        ``inspection_bibcodes``; the bibcode lists are de-duplicated and sorted.
    """

    def acceptance_condition(confidence: list[float]) -> bool:
        return max(confidence) >= threshold_acceptance

    def inspection_condition(confidence: list[float]) -> bool:
        return threshold_inspection <= max(confidence) < threshold_acceptance

    # Work on a copy with a fresh unique index so the label look-ups below do
    # not depend on whatever index the caller's frame carries.
    filled = df.fillna(0).reset_index(drop=True)
    confidences = filled[confidence_name]

    def count_where(condition):
        """Aggregator counting the confidence cells that satisfy ``condition``."""
        return lambda values: sum(1 for value in values if condition(value))

    def bibcodes_where(condition):
        """Aggregator listing the group's bibcodes whose confidence satisfies ``condition``."""

        def collect(bibcodes: pd.Series) -> list:
            # Pair each bibcode with the confidence of the SAME row. Reading
            # the bibcode by position within the group while reading the
            # confidence by row label returns bibcodes of unrelated rows.
            group_confidences = confidences.loc[bibcodes.index]
            return sorted({bib for bib, conf in zip(bibcodes, group_confidences) if condition(conf)})

        return collect

    grouped_df = (
        filled.groupby(["mission", "papertype"])
        .agg(
            total_count=("mission", "size"),
            accepted_count=(confidence_name, count_where(acceptance_condition)),
            accepted_bibcodes=("bibcode", bibcodes_where(acceptance_condition)),
            inspection_count=(confidence_name, count_where(inspection_condition)),
            inspection_bibcodes=("bibcode", bibcodes_where(inspection_condition)),
        )
        .reset_index()
    )

    return grouped_df




[docs]
def write_stats(output_path, threshold_acceptance, threshold_inspection, grouped_df):
    """Write the statistics to a text summary and a JSON file.

    The count columns of ``grouped_df`` are logged and written as an aligned
    text table to a file below ``config.paths.output``; a failure to write
    that text file is reported on stdout and does not stop the function. The
    full grouped table, preceded by a record holding the two thresholds, is
    then saved as JSON to ``output_path``.

    Parameters
    ----------
    output_path: pathlib.Path
        Filename path to save the stats results.
    threshold_acceptance: float
        Threshold value to accept LLM papertype.
    threshold_inspection: float
        Threshold value to filter papers required for human inspection.
    grouped_df: pd.DataFrame
        Grouped DataFrame

    Returns
    -------
    None
    """
    count_columns = ["mission", "papertype", "total_count", "accepted_count", "inspection_count"]
    # Render the aligned table once; it is used for both the log and the file.
    table_text = grouped_df[count_columns].to_string(index=False)

    logger.info("Production counts by LLM Mission and Paper Type:\n" + table_text)

    # Plain-text summary stored next to the other outputs of the model
    summary_file = (
        pathlib.Path(config.paths.output)
        / f"llms/openai_{config.llms.openai.model}/{config.llms.eval_stats_file}_t{config.llms.performance.threshold}.txt"
    )
    try:
        summary_file.write_text(table_text)
        print(f"Data successfully written to {summary_file}")
    except OSError as e:
        print(f"Error writing to file: {e}")

    # JSON stats table: thresholds first, then one record per group
    thresholds = {"threshold_acceptance": threshold_acceptance, "threshold_inspection": threshold_inspection}
    payload = [thresholds, *grouped_df.to_dict(orient="records")]

    save_json_file(
        output_path,
        payload,
    )

    logger.info(f"bibcode lists for both acceptance and inspection were generated in {output_path}")