# Source code for bibcat.llm.metrics
from pathlib import Path
from typing import Any
import numpy as np
from numpy.typing import NDArray
from sklearn.metrics import auc, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from bibcat import config
from bibcat.utils.logger_config import setup_logger
from bibcat.utils.utils import save_json_file
# Module-level logger created through the project's `setup_logger` helper;
# its level is taken from `config.logging.level`.
logger = setup_logger(__name__)
logger.setLevel(config.logging.level)
[docs]
def extract_eval_data(data: dict, missions: list[str]) -> dict[str, Any]:
    """Extract confusion-matrix labels and mission call-out statistics, then save them.

    Walks the evaluation summary (`config.llms.eval_output_file`, i.e. `*summary_output.json`),
    collects the mapped human/llm labels for every requested mission, keeps the raw (unmapped)
    labels alongside them, counts call-outs, and finally hands everything to
    `compute_and_save_metrics`, which writes the `.txt` and `.json` summaries.

    Parameters
    ----------
    data : dict
        The evaluation data keyed by bibcode.
    missions : list[str]
        Mission names for which classification labels are extracted.

    Returns
    -------
    metrics_data : dict[str, Any]
        Dictionary with the following keys:

        threshold : float
            `config.llms.performance.threshold`
        n_bibcodes : int
            The number of bibcodes (papers)
        n_human_callouts : int
            The number of call-outs by human classification in the whole dataset
        n_llm_callouts : int
            The number of call-outs by llm classification in the whole dataset,
            excluding non-MAST missions
        n_missing_output_bibcodes : int
            The number of bibcodes whose error contains "No mission output found"
        n_non_mast_callouts : int
            The number of non-MAST mission call-outs by llm in the whole dataset
        non_mast_missions : list[str], sorted
            The non-MAST missions called out by llm in the whole dataset
        human_llm_missions : list[str], sorted
            The missions called out by both human and llm among the given missions
        n_human_llm_mission_callouts : int
            The number of call-outs made by both human and llm among the given missions
        n_human_llm_hallucination : int
            The number of call-outs made by both human and llm where "mission_in_text" is false
        human_labels : list[str]
            True (human) labels after papertype mapping
        llm_labels : list[str]
            Predicted (llm) labels after papertype mapping
        label_raws : list[dict]
            Raw labels before mapping; each dict has the keys `bibcode`, `mission`,
            `human_raw` and `llm_raw`, in the same order as `human_labels`/`llm_labels`
    """
    n_bibcodes = len(data)
    threshold = config.llms.performance.threshold
    logger.info(f"The {n_bibcodes} bibcodes are evaluated in the summary_ouput_t{threshold}.json")
    logger.info(f"{len(missions)} mission(s): {', '.join(missions)} is/are evaluated!\nLooping through papers! ")

    # Raw labels (before papertype mapping), one entry per (bibcode, mission) sample,
    # kept in the same order as human_labels/llm_labels so they can be zipped later.
    label_raws: list[dict] = []
    # Mapped labels used to build the confusion matrix.
    human_labels: list[str] = []
    llm_labels: list[str] = []

    # Bookkeeping of mission call-outs.
    human_llm_mission_callouts: list[str] = []  # missions with both human and llm papertypes
    non_mast_callouts: list[str] = []  # llm call-outs outside the config.missions list
    n_human_callouts = 0
    n_llm_all_callouts = 0  # llm call-outs including non-MAST missions
    n_human_llm_hallucination = 0  # "mission_in_text" false while both human and llm call out
    n_missing_output_bibcodes = 0

    # Papertype assigned when the human or the llm ignored the paper for a mission.
    ignored_papertype = config.llms.map_papertypes.ignore.upper()
    # Upper-cased MAST mission names, built once instead of once per llm call-out.
    mast_missions = {s.upper() for s in config.missions}

    for bibcode, item in data.items():
        logger.info(f"\nbibcode: {bibcode}")
        err = item.get("error", "")

        if not err:
            human_data = item.get("human") or {}
            n_human_callouts += len(human_data)
            # Only llm classifications accepted by the threshold value; a missing or
            # null entry is treated as "no call-outs" rather than crashing on len(None).
            llm_data = item.get("llm") or []
            n_llm_all_callouts += len(llm_data)
            llm_missions = [next(iter(i)) for i in llm_data]
            logger.info(f"llm classification accepted ={llm_missions}")

            # All llm mission call-outs, regardless of the threshold.
            llm_df_missions = [i["llm_mission"] for i in item.get("df") or []]
            logger.info(f"llm_df_missions = {llm_df_missions}")

            non_mast_callouts.extend(m for m in llm_missions if m not in mast_missions)

            # Record the raw labels (one entry per mission); "IGNORED" marks an absent label.
            for mission in missions:
                human_raw = human_data.get(mission, "IGNORED")
                llm_raw = next((v for i in llm_data for k, v in i.items() if k == mission), "IGNORED")
                label_raws.append({"bibcode": bibcode, "mission": mission, "human_raw": human_raw, "llm_raw": llm_raw})

            # Mapped labels for every mission of this bibcode.
            human_labels, llm_labels, n_human_llm_hallucination = extract_labels(
                missions,
                human_labels,
                llm_labels,
                human_llm_mission_callouts,
                ignored_papertype,
                item,
                n_human_llm_hallucination,
            )
        elif "No paper source found" in err:
            # Not counted as missing llm output when the paper source is not found.
            continue
        elif "No mission output found" in err:
            n_missing_output_bibcodes += 1
            # No llm output at all: every mission gets the ignored papertype.
            llm_labels.extend([ignored_papertype] * len(missions))
            human_data = item.get("human") or {}
            for mission in missions:
                label_raws.append(
                    {
                        "bibcode": bibcode,
                        "mission": mission,
                        "human_raw": human_data.get(mission, "IGNORED"),
                        "llm_raw": "IGNORED",
                    }
                )
            n_human_callouts += len(human_data)
            # Human labels still count when human classifications exist.
            human_labels = human_labels_when_no_llm_output(missions, human_data, human_labels, ignored_papertype)

    non_mast_missions = sorted(set(non_mast_callouts))
    logger.info(f"Non MAST missions: {non_mast_missions} called out; \n")
    logger.debug(f"Non MAST mission call outs: \n {non_mast_callouts}")
    logger.debug(f"human_labels = {human_labels}")
    logger.debug(f"llm_labels = {llm_labels}")
    logger.info(f" Set of human_labels = {set(human_labels)} and set of llm_labels = {set(llm_labels)}")

    n_llm_callouts = n_llm_all_callouts - len(non_mast_callouts)
    logger.info(
        f"The total numbers of mission callouts by human and llm are {n_human_callouts} and {n_llm_callouts} respectively. \n\n"
        f"Among these callouts, only {len(human_llm_mission_callouts)} cases are called out by both llm and human and valid for further evaluations!\n\n"
        f"{len(non_mast_callouts)} non-MAST missions are called out!\n\n"
    )

    metrics_data = {
        "threshold": threshold,
        "n_bibcodes": n_bibcodes,
        "n_human_callouts": n_human_callouts,
        "n_llm_callouts": n_llm_callouts,
        "n_missing_output_bibcodes": n_missing_output_bibcodes,
        "n_non_mast_callouts": len(non_mast_callouts),
        "non_mast_missions": non_mast_missions,
        "human_llm_missions": sorted(set(human_llm_mission_callouts)),
        "n_human_llm_mission_callouts": len(human_llm_mission_callouts),
        "n_human_llm_hallucination": n_human_llm_hallucination,
        "human_labels": human_labels,
        "llm_labels": llm_labels,
        "label_raws": label_raws,
    }
    for k, v in metrics_data.items():
        logger.info(f"{k} : {v}")

    # Evaluation summary: call-outs, confusion-matrix report, llm performance scores, etc.
    output_filename = (
        Path(config.paths.output)
        / f"llms/openai_{config.llms.openai.model}/{config.llms.metrics_file}_t{metrics_data['threshold']}"
    )
    compute_and_save_metrics(metrics_data, str(output_filename) + ".txt", str(output_filename) + ".json")
    return metrics_data
[docs]
def extract_labels(
    missions: list[str],
    human_labels: list[str],
    llm_labels: list[str],
    human_llm_mission_callouts: list[str],
    ignored_papertype: str,
    item: dict[str, dict[str, Any]],
    n_human_llm_hallucination: int,
) -> tuple[list[str], list[str], int]:
    """Extract mapped human and llm papertype labels for one bibcode without an "error" entry.

    Exactly one human label and one llm label are appended per mission, so both lists stay
    aligned for the confusion matrix. The summary output only lists missions that were
    actually called out, hence four cases are distinguished for each mission:

    1. Human and llm both call out the mission: both papertypes are mapped and appended.
    2. Only the human calls it out: the human papertype is mapped, the llm label is
       `ignored_papertype`.
    3. Only the llm calls it out: the llm papertype is mapped, the human label is
       `ignored_papertype`.
    4. Neither calls it out: both labels are `ignored_papertype`.

    Parameters
    ----------
    missions : list[str]
        MAST missions of interest
    human_labels : list[str]
        human papertypes collected before this bibcode (extended in place)
    llm_labels : list[str]
        llm papertypes collected before this bibcode (extended in place)
    human_llm_mission_callouts : list[str]
        missions called out by both human and llm (extended in place)
    ignored_papertype : str, uppercase
        `config.llms.map_papertypes.ignore.upper()`, for instance, `NONSCIENCE`
    item : dict[str, dict[str, Any]]
        the summary entry of one bibcode; the keys "human", "llm" and "df" are read,
        and a missing or null value is treated as empty
    n_human_llm_hallucination : int
        the number of hallucinations counted before the current bibcode

    Returns
    -------
    human_labels : list[str]
        human papertype labels updated with the current bibcode
    llm_labels : list[str]
        llm papertype labels updated with the current bibcode
    n_human_llm_hallucination : int
        the number of hallucinations updated with the current bibcode
    """
    human_data = item.get("human") or {}  # e.g., `{"JWST": "SCIENCE"}`
    llm_data = item.get("llm") or []
    # A missing/null "df" used to raise TypeError when iterated; treat it as no rows.
    df_rows = item.get("df") or []
    llm_missions = [next(iter(i)) for i in llm_data]  # llm call-outs accepted in `item["llm"]`
    llm_df_missions = [i["llm_mission"] for i in df_rows]  # every llm call-out in `item["df"]`

    for mission in missions:
        logger.info(f"Checking {mission} summary output")
        llm_mission_in_text = next((i["mission_in_text"] for i in df_rows if i["llm_mission"] == mission), False)
        called_by_human = mission in human_data
        called_by_llm = mission in llm_missions

        if called_by_human and called_by_llm:
            # Case 1: map both the human and the llm papertype.
            logger.info(f"{mission}:both human_label and llm_label are available!")
            human_llm_mission_callouts.append(mission)
            human_labels = append_human_labels_with_mapped_papertype(human_data, mission, human_labels)
            llm_labels = append_llm_labels_with_mapped_papertype(llm_data, mission, llm_labels)
            if not llm_mission_in_text:
                logger.warning(
                    f"It appears that both human and LLM are hallucinating {mission}! Check out if the keyword search is failing"
                )
                n_human_llm_hallucination += 1
        elif called_by_human:
            # Case 2: the llm label is missing or was rejected by the threshold.
            if mission in llm_df_missions:
                logger.info(
                    f"{mission}: Human_label is available and LLM called out {mission} but the confidence value is below the threshold."
                )
            else:
                logger.warning(
                    f"{mission}: Human_label is available but no llm_label is available! LLM is missing call-out! Check why LLM fails to call out mission!"
                )
            human_labels = append_human_labels_with_mapped_papertype(human_data, mission, human_labels)
            llm_labels.append(ignored_papertype)
        elif called_by_llm:
            # Case 3: only the llm calls out the mission.
            if llm_mission_in_text:
                logger.warning(f"{mission}: check if human misses {mission} call-out! or the keyword search is failing")
            else:
                logger.warning(
                    f"{mission}: check if LLM is hallucinating {mission} call-out or the keyword search is failing!"
                )
            human_labels.append(ignored_papertype)
            llm_labels = append_llm_labels_with_mapped_papertype(llm_data, mission, llm_labels)
        else:
            # Case 4: neither label is present at the main level ("llm:[]").
            if mission in llm_df_missions:
                logger.warning(
                    f"Human misses calling out and LLM called out {mission} but the confidence value is below the threshold. Also, check out if the keyword search is failing!"
                )
            else:
                logger.info(f"Both human and LLM ignored {mission}!")
            human_labels.append(ignored_papertype)
            llm_labels.append(ignored_papertype)

    return human_labels, llm_labels, n_human_llm_hallucination
[docs]
def map_papertype(papertype: str) -> str | None:
    """Map a classified papertype to an allowed papertype using `config.llms.map_papertypes`.

    The lower-cased `papertype` is looked up in `config.llms.map_papertypes`; the mapped value
    is returned upper-cased when it is listed in `config.llms.papertypes`. Invalid input is
    logged as an error (with traceback) instead of being raised to the caller.

    Parameters
    ----------
    papertype : str, uppercase
        human or llm classified papertype, e.g., "SCIENCE", "DATA_INFLUENCED"

    Returns
    -------
    mapped_papertype : str | None
        The mapped papertype in upper case, or `None` when `papertype` is not a string,
        is not a key of `config.llms.map_papertypes`, or maps to a value that is not in
        `config.llms.papertypes`.
    """
    logger.debug(f"map_papertype(): input classified papertype to map = '{papertype}'")
    try:
        # A missing label (None) or other non-string input used to escape as an uncaught
        # AttributeError on `.lower()`; it is reported like any other invalid papertype.
        if not isinstance(papertype, str) or papertype.lower() not in config.llms.map_papertypes:
            raise KeyError(f"The input papertype '{papertype}' is an invalid papertype.")

        mapped_value = config.llms.map_papertypes.get(papertype.lower())
        if not isinstance(mapped_value, str) or mapped_value.upper() not in config.llms.papertypes:
            raise ValueError(
                f"The mapped papertype '{mapped_value}' for the input papertype '{papertype}' is not a valid classification."
            )

        mapped_papertype = mapped_value.upper()
        logger.debug(f"map_papertype(): mapped papertype is '{mapped_papertype}'")
        return mapped_papertype
    except KeyError as ke:
        logger.error(f"KeyError encountered: {ke}", exc_info=True)
    except ValueError as ve:
        logger.error(f"ValeError encountered: {ve}", exc_info=True)
    return None
[docs]
def append_human_labels_with_mapped_papertype(
    human_data: dict[str, str], mission: str, human_labels: list[str]
) -> list[str]:
    """Append the mapped human papertype of `mission` to `human_labels`.

    Parameters
    ----------
    human_data : dict[str, str]
        human classification data per bibcode in summary_output,
        e.g., "human": {"GALEX": "SCIENCE", "HST": "DATA-INFLUENCED"}
    mission : str
        mission name, e.g., ROMAN
    human_labels : list[str]
        list of human papertype labels for the confusion matrix,
        e.g., ["SCIENCE","NONSCIENCE","SCIENCE"]; modified in place

    Returns
    -------
    human_labels : list[str]
        The same list object that was passed in, with one element appended. The appended
        element is whatever `map_papertype` returns, which is `None` when the papertype
        cannot be mapped.
    """
    raw_papertype = human_data.get(mission)
    logger.debug(f"initial human papertype = '{raw_papertype}'")
    mapped_human_papertype = map_papertype(raw_papertype)
    logger.debug(f"mapped papertype = '{mapped_human_papertype}'")
    human_labels.append(mapped_human_papertype)
    # Callers rebind their list to this return value, so it must be returned.
    return human_labels
[docs]
def human_labels_when_no_llm_output(missions, human_data, human_labels, ignored_papertype):
    """Fill in the human labels of a bibcode that has no llm output.

    One label is added per entry of `missions`: the mapped human papertype when the
    mission appears in `human_data`, otherwise `ignored_papertype`.

    Parameters
    ----------
    missions: list[str]
        list of missions
    human_data: dict[str, str]
        dictionary values of item["human"], e.g., {"JWST": "SCIENCE"}; may be empty
    human_labels: list[str]
        True labels by human collected so far, for example, ["SCIENCE", "MENTION"]
    ignored_papertype: str, uppercase
        config.llms.map_papertypes.ignore.upper(), for instance, "NONSCIENCE"

    Returns
    -------
    human_labels: list[str]
        the human labels extended with one label per mission
    """
    if human_data:
        # At least one human classification exists: decide mission by mission.
        for name in missions:
            if name not in human_data:
                # e.g., "HST": "SCIENCE" is present but the current mission is not "HST"
                human_labels.append(ignored_papertype)
                continue
            human_labels = append_human_labels_with_mapped_papertype(human_data, name, human_labels)
    else:
        # No human classification at all for this bibcode.
        human_labels += [ignored_papertype] * len(missions)
    return human_labels
[docs]
def append_llm_labels_with_mapped_papertype(llm_data: list[dict], mission: str, llm_labels: list[str]) -> list[str]:
    """Append the mapped llm papertype of `mission` to `llm_labels`.

    Parameters
    ----------
    llm_data : list[dict]
        llm classification data per bibcode in summary_output,
        e.g., "llm": [{"JWST": "SCIENCE"}, {"ROMAN": "SUPERMENTION"}, {"HST": "SCIENCE"}]
    mission : str
        mission name, e.g., ROMAN
    llm_labels : list[str]
        list of llm papertype labels for the confusion matrix,
        e.g., ["SCIENCE","NONSCIENCE","SCIENCE"]; modified in place

    Returns
    -------
    llm_labels : list[str]
        The same list object that was passed in, with one element appended. The appended
        element is whatever `map_papertype` returns, which is `None` when the papertype
        cannot be mapped.
    """
    # First papertype recorded for this mission; None when the mission is absent from llm_data.
    label = next((v for i in llm_data for k, v in i.items() if k == mission), None)
    logger.debug(f"initial llm papertype = {label}")
    mapped_llm_papertype = map_papertype(label)
    logger.debug(f"mapped papertype for llm = '{mapped_llm_papertype}'")
    llm_labels.append(mapped_llm_papertype)
    # Callers rebind their list to this return value, so it must be returned.
    return llm_labels
[docs]
def compute_and_save_metrics(
    metrics_data: dict[str, Any],
    output_ascii_path: str | Path = "metrics_summary.txt",
    output_json_path: str | Path = "metrics_summary.json",
) -> None:
    """Compute llm performance metrics and save them to an ascii file and a json file.

    The confusion matrix (counts and row-normalized rates) and the classification report
    (accuracy, f1, precision, recall) are computed from `human_labels` and `llm_labels`,
    written together with the call-out statistics to `output_ascii_path`, and saved with the
    per-cell bibcode entries to `output_json_path`.

    Parameters
    ----------
    metrics_data : dict[str, Any]
        Dictionary with the following keys:

        threshold : float
            threshold
        n_bibcodes : int
            The number of bibcodes (papers)
        n_human_callouts : int
            The number of callouts by human classification in the whole dataset
        n_llm_callouts : int
            The number of callouts by llm classification in the whole dataset
        n_non_mast_callouts : int
            The number of non-MAST missions by llm in the whole dataset
        n_missing_output_bibcodes : int
            The number of bibcodes missing output in the whole dataset
        non_mast_missions : list[str], sorted
            Non MAST missions called out by llm in the whole dataset
        n_human_llm_mission_callouts : int
            The number of mission callouts by both human and llm in the given missions
        n_human_llm_hallucination : int
            The number of apparent hallucination by both human and llm in the given missions
            when "mission_in_text" = false
        human_llm_missions : list[str]
            The missions called out by both human and llm in the given missions
        human_labels : list[str]
            True labels, human classified labels like ["SCIENCE", "MENTION"]
        llm_labels : list[str]
            Predicted labels by llm
        label_raws : list[dict], optional
            list of raw labels (before mapping): dict with keys `bibcode`, `mission`,
            `human_raw` and `llm_raw`
    output_ascii_path : str | Path
        output file path to save the metrics summary in .txt
    output_json_path : str | Path
        output file path to save the metrics summary in .json

    Raises
    ------
    ValueError
        If `config.llms.papertypes` does not define exactly two classes (the tn/fp/fn/tp
        summary is binary only), or if a label is not one of the configured papertypes.
    """
    # Encode string labels into numeric values using LabelEncoder
    label_encoder = LabelEncoder()
    label_encoder.fit(config.llms.papertypes)
    human_labels_encoded = label_encoder.transform(metrics_data["human_labels"])
    llm_labels_encoded = label_encoder.transform(metrics_data["llm_labels"])
    papertypes = label_encoder.classes_
    n_classes = len(papertypes)
    if n_classes != 2:
        raise ValueError(
            f"The confusion matrix summary supports binary classification only, but {n_classes} papertypes are configured."
        )
    class_ids = list(range(n_classes))

    # t: true, f: false, p: positive, n: negative
    # `labels` pins the matrix to 2x2 even when one class never occurs in the data;
    # without it the 4-way unpacking fails on a 1x1 matrix.
    tn, fp, fn, tp = confusion_matrix(human_labels_encoded, llm_labels_encoded, labels=class_ids).ravel()
    # normalize confusion matrix over the true (rows)
    tnr, fpr, fnr, tpr = confusion_matrix(
        human_labels_encoded, llm_labels_encoded, labels=class_ids, normalize="true"
    ).ravel()
    confusion_matrix_metrics = {
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "tp": tp,
        "tnr": tnr,
        "fpr": fpr,
        "fnr": fnr,
        "tpr": tpr,
    }

    # Classification report as a dict (for the json file) and as text (for the ascii file);
    # `labels` keeps `target_names` consistent when a class is absent from the data.
    classification_performance_report = classification_report(
        human_labels_encoded,
        llm_labels_encoded,
        labels=class_ids,
        target_names=papertypes,
        digits=4,
        output_dict=True,
    )
    logger.info(f"classification report\n {classification_performance_report}")
    classification_report_text = classification_report(
        human_labels_encoded, llm_labels_encoded, labels=class_ids, target_names=papertypes, digits=4
    )

    # Collect bibcodes for TN/FP/FN/TP along with raw labels (computed once, used for the json file).
    label_raws = metrics_data.get("label_raws", [])
    entries = collect_confusion_matrix_cell_entries(human_labels_encoded, llm_labels_encoded, label_raws, n_classes)

    # Write results to an ASCII file
    with open(output_ascii_path, "w", encoding="utf-8") as f:
        f.write(f"The total number of bibcodes (papers) for evaluation metrics: {metrics_data['n_bibcodes']}\n")
        f.write(
            f"The number of bibcodes missing output, i.e., ignored papers by flagship and mast: {metrics_data['n_missing_output_bibcodes']}\n"
        )
        f.write(f"The number of callouts by human: {metrics_data['n_human_callouts']}\n")
        f.write(
            f"The number of callouts by llm with the threshold value, {metrics_data['threshold']}: {metrics_data['n_llm_callouts']}\n\n"
        )
        f.write(
            f"The number of callouts by both human and llm with given missions: {metrics_data['n_human_llm_mission_callouts']}\n"
        )
        f.write(f"Mission(s) called out by both human and llm: {', '.join(metrics_data['human_llm_missions'])}\n\n")
        f.write(f"The number of non-MAST mission callouts by llm: {metrics_data['n_non_mast_callouts']}\n")
        f.write(
            f"The number of hallunications by both human and llm: {metrics_data['n_human_llm_hallucination']}\n Check out if the keyword search is failing\n\n"
        )
        f.write(f"Non-MAST missions called out by llm: {', '.join(metrics_data['non_mast_missions'])}\n\n")
        f.write(f"{n_classes} papertypes: {', '.join(papertypes)} are labeled\n")
        f.write(f"True Negative = {tn}, False Positive = {fp}, False Negative = {fn}, True Positive = {tp}\n\n")
        f.write(
            f"True Negative Rate / Specificity = {tnr.round(4)}, False Positive Rate = {fpr.round(4)}, False Negative Rate = {fnr.round(4)}, True Positive Rate / Recall = {tpr.round(4)}\n\n"
        )
        f.write(f"classification report\n {classification_report_text}\n")
    logger.info(f"Metrics saved to {output_ascii_path}")

    # Save metrics_data and the classification report to a json file; the per-sample label
    # lists are left out and replaced by the per-cell bibcode entries.
    filtered_metrics_data = {
        k: v for k, v in metrics_data.items() if k not in {"human_labels", "llm_labels", "label_raws"}
    }
    save_json_file(
        output_json_path,
        {
            **filtered_metrics_data,
            **confusion_matrix_metrics,
            **classification_performance_report,
            "fp_bibcodes": entries.get("fp", []),
            "fn_bibcodes": entries.get("fn", []),
            "tp_bibcodes": entries.get("tp", []),
            "tn_bibcodes": entries.get("tn", []),
        },
    )
[docs]
def collect_confusion_matrix_cell_entries(
    human_labels_encoded: NDArray[np.int64],
    llm_labels_encoded: NDArray[np.int64],
    label_raws: list[dict],
    n_classes: int,
) -> dict:
    """Group the raw-label records by the confusion-matrix cell they fall into.

    Parameters
    ----------
    human_labels_encoded: NDArray[np.int64]
        encoded human (true) labels
    llm_labels_encoded: NDArray[np.int64]
        encoded llm (predicted) labels
    label_raws: list[dict]
        raw labels before mapping, one dict per sample with the keys
        `bibcode`, `mission`, `human_raw` and `llm_raw`
    n_classes: int
        number of classes

    Returns
    -------
    entries: dict
        a dict with the keys 'tn', 'fp', 'fn' and 'tp', each holding the list of records
        from `label_raws` (the same dict objects, not copies) that belong to that cell.
        All four lists are empty unless `n_classes` is 2 and `label_raws` is non-empty
        and as long as `human_labels_encoded`.
    """
    cells = ("tn", "fp", "fn", "tp")
    entries = {cell: [] for cell in cells}

    # Only binary classification is supported; anything else yields the empty structure.
    if n_classes != 2 or not label_raws or len(label_raws) != len(human_labels_encoded):
        return entries

    for truth, predicted, record in zip(human_labels_encoded, llm_labels_encoded, label_raws):
        # Samples whose encoded labels are not 0/1 belong to no cell.
        if truth not in (0, 1) or predicted not in (0, 1):
            continue
        # Cells are laid out as (true, predicted): 00 -> tn, 01 -> fp, 10 -> fn, 11 -> tp.
        entries[cells[2 * int(truth) + int(predicted)]].append(record)
    return entries
[docs]
def extract_roc_data(
    data: dict[str, dict[str, Any]], missions: list[str]
) -> tuple[list[str], list[list[float]], list[str]]:
    """Extract the human classification labels and llm confidences used for a ROC curve.

    Reads the evaluation data of `config.llms.eval_output_file (summary_output.json)` for a
    single mission or a list of missions. For every evaluated bibcode exactly one human label
    and one confidence row are produced per mission, so both outputs always have the same
    length.

    Parameters
    ----------
    data : dict[str, dict[str, Any]]
        the dict of the evaluation data of `config.llms.eval_output_file (summary_output.json)`
    missions : list[str]
        list of the mission names to extract the classification labels.

    Returns
    -------
    human_labels : list[str]
        True labels by human after papertype mapping, e.g., ["SCIENCE", "MENTION"];
        see the allowed classifications in `config.llms.papertypes`
    llm_confidences : list[list[float]]
        One confidence row per human label, taken from "prob_papertype" of the matching
        `item["mission_conf"]` entry, e.g., [[0.9, 0.1], [0.4, 0.6]]. The row [0.0, 1.0] is
        used when the llm did not call out the mission or produced no output.
    human_llm_missions : list[str], sorted
        The missions that have both human- and llm-classified papertypes.
    """
    n_bibcodes = len(data)
    logger.info(f"The number of evaluation summary data, e.g., the number of bibcodes = {n_bibcodes}")
    logger.info(f"{len(missions)} mission(s): {', '.join(missions)} is/are evaluated! ")

    human_labels: list[str] = []
    llm_confidences: list[list[float]] = []  # for ROC
    human_llm_mission_callouts: list[str] = []  # missions with both human and llm papertypes

    # set the papertype for llm or human ignored the paper
    ignored_papertype = config.llms.map_papertypes.ignore.upper()

    def mission_confidence(mission_conf: list[dict], mission: str) -> list[float]:
        """Return the single "prob_papertype" row of `mission` (the first one found)."""
        row = next((i["prob_papertype"] for i in mission_conf if i["llm_mission"] == mission), None)
        if row is None:
            # Without a row the labels and confidences would go out of step.
            logger.warning(f"{mission}: no 'prob_papertype' found in 'mission_conf'; using [0.0, 1.0]")
            return [0.0, 1.0]
        return row

    for bibcode, item in data.items():
        logger.debug(f"bibcode: {bibcode}")
        err = item.get("error")

        if not err:
            # Missing or null entries are treated as empty, as in `extract_labels`.
            human_data = item.get("human") or {}
            llm_data = item.get("llm") or []
            # Confidence values come from `mission_conf`; which missions count as called out
            # is decided by the accepted missions in item["llm"].
            llm_mission_conf = item.get("mission_conf") or []
            llm_missions = [next(iter(i)) for i in llm_data]
            logger.info(f"llm classification accepted ={llm_missions}")

            for mission in missions:
                called_by_human = mission in human_data
                called_by_llm = mission in llm_missions

                if called_by_human and called_by_llm:
                    logger.info(f"Checking {mission} summary output")
                    human_llm_mission_callouts.append(mission)
                    append_human_labels_with_mapped_papertype(human_data, mission, human_labels)
                    # "prob_papertype" covers the full range of confidence values, unlike the
                    # scores of the accepted papertypes, which are always above the threshold.
                    llm_confidences.append(mission_confidence(llm_mission_conf, mission))
                elif called_by_human:
                    # llm missing call-out: mapped human label, "NONSCIENCE" confidence row
                    append_human_labels_with_mapped_papertype(human_data, mission, human_labels)
                    llm_confidences.append([0.0, 1.0])
                elif called_by_llm:
                    # no human call-out: ignored papertype for the human, llm row from "prob_papertype"
                    human_labels.append(ignored_papertype)
                    llm_confidences.append(mission_confidence(llm_mission_conf, mission))
                else:
                    # neither label found at the main level ("llm:[]")
                    human_labels.append(ignored_papertype)
                    llm_confidences.append([0.0, 1.0])
        elif "No paper source found" in err:
            # should not count as missing llm output when paper source is not found
            continue
        elif "No mission output found" in err:
            # no llm output: one independent [0.0, 1.0] row per mission
            llm_confidences.extend([0.0, 1.0] for _ in missions)
            human_data = item.get("human") or {}
            # assign human labels when human classifications exist.
            human_labels = human_labels_when_no_llm_output(missions, human_data, human_labels, ignored_papertype)

    logger.info(f"The number of the mission callouts by both human and llm is {len(human_llm_mission_callouts)}")
    return human_labels, llm_confidences, sorted(set(human_llm_mission_callouts))
[docs]
def prepare_roc_inputs(human_labels: list[str], llm_confidences: list[list[float]]):
    """Turn human labels and llm confidences into the arrays needed for ROC and AUC.

    Parameters
    ----------
    human_labels: list[str]
        True labels by human, a list of papertypes, e.g., "SCIENCE" or "MENTION";
        see the allowed classifications in `config.llms.papertypes`
    llm_confidences: list[list[float]]
        Confidence scores by llm, one list per verdict, e.g., [[0.9, 0.1], [0.4, 0.6]]

    Returns
    -------
    tuple
        `(binarized_human_labels, llm_confidences, n_papertype, n_verdicts)`
    binarized_human_labels: NDArray[np.int64]
        `human_labels` passed through `LabelBinarizer.fit_transform`,
        e.g., [[0][1]..] or [[0 1 0 0][0 1 0 0]...]
    llm_confidences: NDArray[np.float64]
        `llm_confidences` converted with `np.array`, e.g., [[0.9 0.1] [0.4 0.6]]
    n_papertype: int
        the number of distinct values in `human_labels`
    n_verdicts: int
        the number of entries in `human_labels`
    """
    logger.debug(f"human_labels before binarization:{human_labels}")
    logger.debug(f"llm_confidences before binarization:{llm_confidences}")

    # Binarize the true labels for the roc plot.
    binarizer = LabelBinarizer()
    binarized_human_labels = binarizer.fit_transform(human_labels)
    logger.info(f"Classes: {binarizer.classes_}")  # Classes are sorted alphabetically
    logger.debug(f"binarized_human_labels={binarized_human_labels}")

    confidence_array = np.array(llm_confidences)
    logger.debug(f"llm_confidences ={confidence_array}")

    distinct_papertypes = set(human_labels)
    n_papertype = len(distinct_papertypes)
    n_verdicts = len(human_labels)
    logger.info(f"The number of verdicts for ROC = {n_verdicts}")

    return binarized_human_labels, confidence_array, n_papertype, n_verdicts
[docs]
def get_roc_metrics(
    llm_confidences: NDArray[np.float64], binarized_human_labels: NDArray[np.int64], n_papertype: int
) -> tuple | None:
    """Compute the ROC curve and the ROC AUC (area under curve).

    Parameters
    ----------
    llm_confidences : NDArray[np.float64]
        llm confidences, indexed as `llm_confidences[:, i]` per class
    binarized_human_labels : NDArray[np.int64]
        binarized human labels, e.g., [[0] [1] [1] [0] [0]] if binary
    n_papertype : int
        the number of papertypes; must be at least 2

    Returns
    -------
    tuple or None
        - `n_papertype == 2`: `(fpr, tpr, thresholds, roc_auc)` as returned by `roc_curve`
          (scored with `llm_confidences[:, 0]`) and `auc`.
        - `n_papertype > 2`: `(fpr, tpr, thresholds, roc_auc, macro_roc_auc_ovr,
          micro_roc_auc_ovr)`, where `fpr`, `tpr` and `roc_auc` are dicts keyed by class
          index, `thresholds` belongs to the last class processed, and the last two values
          are the macro- and micro-averaged One-vs-Rest ROC AUC scores.
        - `None` when a `ValueError` occurs (including `n_papertype < 2`); the error is
          logged with its traceback.
    """
    # compute ROC curve and ROC AUC (area under curve) for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    # Bound up front so the summary logging below can never hit an unbound local.
    thresholds = None
    result = None
    logger.info("Creating ROC curve ouptput")
    try:
        if n_papertype > 2:
            for i in range(n_papertype):
                logger.debug(f"human_labels = {binarized_human_labels}, confidences = {llm_confidences[:, i]}")
                fpr[i], tpr[i], thresholds = roc_curve(binarized_human_labels[:, i], llm_confidences[:, i])
                roc_auc[i] = auc(fpr[i], tpr[i])
            macro_roc_auc_ovr = roc_auc_score(
                binarized_human_labels, llm_confidences, multi_class="ovr", average="macro"
            )
            micro_roc_auc_ovr = roc_auc_score(
                binarized_human_labels, llm_confidences, multi_class="ovr", average="micro"
            )
            logger.info(f"Macro-averaged One-vs-Rest ROC AUC score:\n{macro_roc_auc_ovr:.2f}")
            logger.info(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
            result = (fpr, tpr, thresholds, roc_auc, macro_roc_auc_ovr, micro_roc_auc_ovr)
        elif n_papertype == 2:
            fpr, tpr, thresholds = roc_curve(binarized_human_labels, llm_confidences[:, 0])
            roc_auc = auc(fpr, tpr)
            result = (fpr, tpr, thresholds, roc_auc)
        else:
            raise ValueError(
                f"'n_papertype' ={n_papertype} is invalid. The number of papertypes should be larger than or equal to 2."
            )
    except ValueError as ve:
        logger.error(f"ValeError encountered: {ve}", exc_info=True)

    # Logged on success as well as on failure; previously these lines were only reachable
    # after an error, where `thresholds` could still be unbound.
    logger.info(f"fpr={fpr}")
    logger.info(f"tpr={tpr}")
    logger.info(f"thresholds ={thresholds}")
    logger.info(f"auc ={roc_auc}")
    return result