Source code for bibcat.llm.io

import json
import os
import pathlib
import tempfile

from bibcat import config
from bibcat.data.build_dataset import load_source_dataset
from bibcat.utils.logger_config import setup_logger
from bibcat.utils.utils import NumpyEncoder

# Module-level logger shared by every function in this file; its level is
# taken from the bibcat configuration (config.logging.level).
logger = setup_logger(__name__)
logger.setLevel(config.logging.level)


def get_source(
    bibcode: str | None = None, index: int | None = None, body_only: bool = False
) -> dict | str | None:
    """Get the source dataset entry for a given bibcode or index.

    Retrieve the entry from the combined source dataset for a given
    bibcode or list index.  If both are given, the bibcode takes precedence.

    Parameters
    ----------
    bibcode : str, optional
        the paper bibcode to retrieve, by default None
    index : int, optional
        the list item index to retrieve, by default None.  Negative indices
        count from the end of the dataset, as with normal list indexing.
    body_only : bool, optional
        Flag to only return the text body, by default False

    Returns
    -------
    dict | str | None
        a row from the source dataset, or only its ``"body"`` entry when
        ``body_only`` is set.  None when nothing was requested or when the
        requested bibcode / index does not exist (a warning is logged).
    """
    # load the source dataset
    source_dataset = load_source_dataset()
    n_sources = len(source_dataset)

    entry = None
    if bibcode:
        # get the first source whose bibcode matches
        entry = next((row for row in source_dataset if row["bibcode"] == bibcode), None)
        if entry is None:
            logger.warning("Requested bibcode not found in source datasets.")
    elif index is not None:
        index = int(index)
        # Accept exactly the indices a list lookup accepts; index == n_sources
        # is already out of range and must be reported as such.
        if -n_sources <= index < n_sources:
            entry = source_dataset[index]
        else:
            logger.warning("Requested index is out of range of the number of source datasets.")

    return entry["body"] if entry and body_only else entry
[docs] def get_file(filepath: str = None, bibcode: str = None, index: int = None) -> str: """Get a file path for paper data Get a file path of a paper to upload to an LLM. If a file path is provided, e.g. a local pdf file, it is returned. If a bibcode or index is provided, retrieves the source dataset and writes it out to a temporary json file. The name of the temporary file is `temp_****_[bibcode].json`, prefixed with `temp_` and suffixed with the bibcode of the paper. Parameters ---------- filepath : str, optional a local filepath to a paper, by default None bibcode : str, optional the bibcode of a source paper, by default None index : int, optional the list index of a source paper, by default None Returns ------- str the file path to the paper data """ # if a real file, just return it if filepath and os.path.isfile(filepath): return filepath # if source dataset file, extract and create temporary file if bibcode or index is not None: source = get_source(bibcode=bibcode, index=index) bc = source["bibcode"] # create temporary file with tempfile.NamedTemporaryFile(mode="w", delete=False, prefix="temp_", suffix=f"_{bc}.json") as fp: fp.write(json.dumps(source, indent=2)) fp.close() return fp.name
def get_llm_prompt(prompt_type: str) -> str:
    """Get an LLM prompt

    Retrieve a user or agent prompt for an LLM from a file or the config.
    A user prompt is the text to be used as the input to the LLM, while
    the agent, or system, prompt is the text that defines the instructions
    or behavior of the LLM Agent to follow.  The agent prompt is only used
    when creating a new agent for the first time.

    You can define a custom user or agent prompt as a text file, located at
    $BIBCAT_DATA_DIR/llm_[prompt_type]_prompt.txt.  For example, place your
    custom user prompt at $BIBCAT_DATA_DIR/llm_user_prompt.txt.  This file
    takes precedence.  If no custom prompt file is found, the default user
    prompt will come from the config file field: ``llms.user_prompt``.

    To set an agent prompt, create a file at
    $BIBCAT_DATA_DIR/llm_agent_prompt.txt, and add your instructions for the
    agent.  If no custom agent prompt is found, a default agent prompt will be
    used.  The default agent prompt will either come from the config file
    field: ``llms.agent_prompt`` or from the default file at
    etc/default_agent_prompt.txt.

    Parameters
    ----------
    prompt_type : str
        The type of prompt to retrieve, either 'user' or 'agent'

    Returns
    -------
    str
        the text prompt

    Raises
    ------
    ValueError
        when an invalid prompt type is provided
    """
    if prompt_type not in {"user", "agent"}:
        raise ValueError('Prompt type must be either "user" or "agent".')

    # if a prompt file exists, use it.  is_file() rather than exists(): when the
    # configured filename is empty the joined path is the base directory itself,
    # which exists but cannot be read as a prompt.
    custom_prompt = pathlib.Path(config.inputs[f"llm_{prompt_type}_base"]) / config.llms[f"llm_{prompt_type}_prompt"]
    if custom_prompt.is_file():
        return custom_prompt.read_text(encoding="utf-8")

    # otherwise, use the prompt defined in the config
    prompt = config.llms[f"{prompt_type}_prompt"]

    # otherwise, fall back to the default prompt file shipped with the package
    if not prompt:
        default_prompt = pathlib.Path(__file__).parent.parent / f"etc/default_{prompt_type}_prompt.txt"
        prompt = default_prompt.read_text(encoding="utf-8")

    return prompt
def write_output(paper_key: str, response: dict):
    """Write the output response to a file

    Stores the JSON response from the LLM in the prompt output file at
    $BIBCAT_OUTPUT/output/llms/openai_[config.llms.openai.model]/[config.llms.prompt_output_file]

    The output JSON file maps each paper key (the filename or bibcode of the
    input file) to a list of responses.  A response for a key already in the
    file is appended to that key's list, unless ``config.llms.ops`` is set,
    in which case the key's list is replaced by the single new response.

    Parameters
    ----------
    paper_key : str
        the JSON key to append the response to, e.g. the bibcode or filename
    response : dict
        the response from the llm agent
    """
    # resolve the output file and make sure its directory exists
    out = pathlib.Path(config.paths.output) / f"llms/openai_{config.llms.openai.model}/{config.llms.prompt_output_file}"
    out.parent.mkdir(parents=True, exist_ok=True)

    # start from the existing content, or from scratch for a new file
    if os.path.exists(out):
        with open(out, "r") as f:
            data = json.load(f)
    else:
        data = {}

    # keep earlier runs for a known paper key, except in OPS mode
    keep_history = paper_key in data and not config.llms.ops
    if keep_history:
        data[paper_key].append(response)
    else:
        data[paper_key] = [response]

    # write the (new or updated) file
    with open(out, "w") as f:
        json.dump(data, f, indent=2, sort_keys=False)
def read_output(bibcode: str | None = None, filename: str | pathlib.Path | None = None) -> list | dict | None:
    """Read in the output for a given bibcode

    Returns the content from the output JSON file, either for a single
    bibcode or for the whole file.

    Parameters
    ----------
    bibcode : str, optional
        The paper bibcode, by default None.  When omitted, the full content
        of the output file is returned.
    filename : str | Path, optional
        The prompt output file path.  Defaults to the configured file at
        [config.paths.output]/llms/openai_[config.llms.openai.model]/[config.llms.prompt_output_file]

    Returns
    -------
    list | dict | None
        The list of LLM responses stored for ``bibcode`` (None when the
        bibcode is not in the file), or the entire output mapping when no
        bibcode is given
    """
    # Fall back to the configured output file whenever no filename is given,
    # including when no bibcode is given; otherwise open() would receive None.
    if not filename:
        filename = (
            pathlib.Path(config.paths.output)
            / f"llms/openai_{config.llms.openai.model}/{config.llms.prompt_output_file}"
        )

    logger.info(f"reading {filename}")
    with open(filename, "r") as f:
        data = json.load(f)

    return data.get(bibcode) if bibcode else data
def write_summary(output: dict, output_path: str | pathlib.Path | None = None):
    """Write the evaluation summary output to a file

    Write the output summary statistics and info from evaluation into a
    JSON file named
    ``[config.llms.eval_output_file]_t[config.llms.performance.threshold].json``.
    If the file already exists, its content is updated with ``output``
    (existing keys are overwritten, other keys are kept).

    Parameters
    ----------
    output : dict
        the output summary data
    output_path : str | Path, optional
        optional output directory path.  Defaults to
        [config.paths.output]/llms/openai_[config.llms.openai.model].
        The directory is created if it does not exist.

    Returns
    -------
    None
    """
    output_path = (
        pathlib.Path(output_path)
        if output_path
        else pathlib.Path(config.paths.output) / f"llms/openai_{config.llms.openai.model}"
    )
    # make sure the target directory exists, as write_output does for its file
    output_path.mkdir(parents=True, exist_ok=True)

    filename = output_path / f"{config.llms.eval_output_file}_t{config.llms.performance.threshold}.json"
    logger.info(f"Writing output to {filename}")

    if os.path.exists(filename):
        # merge into the existing summary: update an existing bibcode, or add a new one
        with open(filename, "r") as f:
            data = json.load(f)
        data.update(output)
    else:
        # new file: write the summary as given
        data = output

    # NumpyEncoder is the project encoder imported from bibcat.utils.utils
    with open(filename, "w") as f:
        json.dump(data, f, indent=2, sort_keys=False, cls=NumpyEncoder)
def adjust_model(batch_file: pathlib.Path, orig: str, model: str) -> pathlib.Path:
    """Adjust the model in the jsonl batch file.

    This function replaces the original model with the new model in the
    specified batch file and writes the result to a new file.  The new file
    path is the original path with every occurrence of ``orig`` replaced by
    ``model``; if ``orig`` does not occur in the path, the batch file is
    rewritten in place.

    Parameters
    ----------
    batch_file : Path
        The path to the batch file to modify.
    orig : str
        The original model name to replace.
    model : str
        The new model name to use.

    Returns
    -------
    Path
        The path to the modified batch file.
    """
    new = pathlib.Path(str(batch_file).replace(orig, model))
    new.parent.mkdir(parents=True, exist_ok=True)

    # read the batch file content
    with open(batch_file, "r", encoding="utf-8") as f:
        content = f.read()

    # Replace the model field of every request.  The original model name comes
    # from ``orig`` rather than a hard-coded name, so batch files created for
    # any model can be adjusted.
    content = content.replace(f'"model": "{orig}"', f'"model": "{model}"')

    # write out the new jsonl batch file
    with open(new, "w", encoding="utf-8") as f:
        f.write(content)

    return new