Source code for bibcat.llm.io

import json
import os
import pathlib
import tempfile

from bibcat import config
from bibcat.data.build_dataset import load_source_dataset
from bibcat.utils.logger_config import setup_logger
from bibcat.utils.utils import NumpyEncoder

logger = setup_logger(__name__)
logger.setLevel(config.logging.level)



[docs]
def get_source(bibcode: str | None = None, index: int | None = None, body_only: bool = False) -> dict | str:
    """Get the source dataset for a given bibcode or index.

    Retrieve the entry from the combined source dataset for a given bibcode or list index.

    Parameters
    ----------
    bibcode : str, optional
        the paper bibcode to retrieve, by default None
    index : int, optional
        the list item index to retrieve, by default None
    body_only : bool, optional
        Flag to only return the text body, by default False

    Returns
    -------
    dict | str
        a row from the source dataset
    """
    # load the source dataset
    source_dataset = load_source_dataset()
    n_sources = len(source_dataset)

    text = None
    if bibcode:
        # get the source by bibcode
        res = [i for i in source_dataset if i["bibcode"] == bibcode]
        text = res[0] if res else None
        if not res:
            logger.warning("Requested bibcode not found in source datasets.")
    elif index is not None:
        index = int(index)
        # get the source by index
        text = source_dataset[index] if index < n_sources else None
        if index > n_sources:
            logger.warning("Requested index is out of range of the number of source datasets.")

    return text["body"] if text and body_only else text




[docs]
def get_file(filepath: str = None, bibcode: str = None, index: int = None) -> str:
    """Get a file path for paper data

    Get a file path of a paper to upload to an LLM.  If a file path is provided, e.g.
    a local pdf file, it is returned.  If a bibcode or index is provided, retrieves the
    source dataset and writes it out to a temporary json file.  The name of the temporary file
    is `temp_****_[bibcode].json`, prefixed with `temp_` and suffixed with the bibcode of the paper.

    Parameters
    ----------
    filepath : str, optional
        a local filepath to a paper, by default None
    bibcode : str, optional
        the bibcode of a source paper, by default None
    index : int, optional
        the list index of a source paper, by default None

    Returns
    -------
    str
        the file path to the paper data
    """
    # if a real file, just return it
    if filepath and os.path.isfile(filepath):
        return filepath

    # if source dataset file, extract and create temporary file
    if bibcode or index is not None:
        source = get_source(bibcode=bibcode, index=index)
        bc = source["bibcode"]

        # create temporary file
        with tempfile.NamedTemporaryFile(mode="w", delete=False, prefix="temp_", suffix=f"_{bc}.json") as fp:
            fp.write(json.dumps(source, indent=2))
            fp.close()

        return fp.name




[docs]
def get_llm_prompt(prompt_type: str) -> str:
    """Get an LLM prompt

    Retrieve a user or agent prompt for an LLM from a file or the config. A user prompt
    is the text to be used as the input to the LLM, while the agent, or system, prompt is
    the text that defines the instructions or behavior of the LLM Agent to follow.  The agent
    prompt is only used when creating a new agent for the first time.

    You can define a custom user or agent prompt as a text file, located at
    $BIBCAT_DATA_DIR/llm_[prompt_type]_prompt.txt.  For example, place your custom user prompt
    at $BIBCAT_DATA_DIR/llm_user_prompt.txt. This file takes precendence. If no custom prompt file
    is found, the default user prompt will come from the config file field: ``llms.user_prompt``.

    To set an agent prompt, create a file at $BIBCAT_DATA_DIR/llm_agent_prompt.txt, and add your
    instructions for the agent.  If no custom agent prompt is found, a default agent prompt will
    be used. The default agent prompt will either come from the config file field: ``llms.agent_prompt``
    or from the default file at etc/default_agent_prompt.txt.

    Parameters
    ----------
    prompt_type : str
        The type of prompt to retrieve, either 'user' or 'agent'

    Returns
    -------
    str
        the text prompt

    Raises
    ------
    ValueError
        when an invalid prompt type is provided
    """

    if prompt_type not in {"user", "agent"}:
        raise ValueError('Prompt type must be either "user" or "agent".')

    # if a prompt file exists, use it
    path = pathlib.Path(config.inputs[f"llm_{prompt_type}_base"]) / config.llms[f"llm_{prompt_type}_prompt"]
    if path.exists():
        with open(path, "r") as f:
            prompt = f.read()
            return prompt

    # otherwise, use the config user prompt and default agent prompt
    prompt = config.llms[f"{prompt_type}_prompt"]

    # otherise, use the defaults
    if not prompt:
        default_prompt = pathlib.Path(__file__).parent.parent / f"etc/default_{prompt_type}_prompt.txt"
        with open(default_prompt, "r") as f:
            prompt = f.read()

    return prompt




[docs]
def write_output(paper_key: str, response: dict):
    """Write the output response to a file

    Writes the output json response to a file, located at
    $BIBCAT_OUTPUT/output/llms/openai_[config.llms.openai.model]/[config.llms.prompt_output_file]

    The output JSON file is organized by the filename or bibcode of the input file,
    with each prompt response appended in the relevant section.

    Parameters
    ----------
    paper_key : str
        the JSON key to append the response to, e.g. the bibcode or filename
    response : dict
        the response from the llm agent
    """

    # setup the output file
    out = pathlib.Path(config.paths.output) / f"llms/openai_{config.llms.openai.model}/{config.llms.prompt_output_file}"
    out.parent.mkdir(parents=True, exist_ok=True)

    # write the content
    if not os.path.exists(out):
        # create a new file
        data = {paper_key: [response]}
        with open(out, "w+") as f:
            json.dump(data, f, indent=2, sort_keys=False)
    else:
        # append to an existing file
        with open(out, "r") as f:
            data = json.load(f)

        # append response to an existing file entry, or add a new one with a new paper_key or in the OPS mode
        if paper_key in data and not config.llms.ops:
            # logger.info(f"Appending the new run to {paper_key}")
            data[paper_key].append(response)
        else:
            data[paper_key] = [response]

        # write the updated file
        with open(out, "w") as f:
            json.dump(data, f, indent=2, sort_keys=False)




[docs]
def read_output(bibcode: str | None = None, filename: str | pathlib.Path | None = None) -> list:
    """Read in the output for a given bibcode

    Returns the content from the output JSON file
    for the given bibcode.

    Parameters
    ----------
    bibcode : str, optional
        The paper bibcode, by default None
    filename: Path, optional
        The prompt output file path

    Returns
    -------
    list
        The output data from the LLM response
    """
    # set filename if not present
    if bibcode and not filename:
        filename = (
            pathlib.Path(config.paths.output)
            / f"llms/openai_{config.llms.openai.model}/{config.llms.prompt_output_file}"
        )

    logger.info(f"reading {filename}")

    with open(filename, "r") as f:
        data = json.load(f)
        return data.get(bibcode) if bibcode else data




[docs]
def write_summary(output: dict, output_path: str = None):
    """Write the evaluation summary output to a file

    Write the output summary statistics and info from evaluation into a JSON file.

    Parameters
    ----------
    output : dict
        the output summary data
    output_path: str, optional
        optional output directory path

    Returns
    -------
    None

    """

    output_path = (
        pathlib.Path(output_path)
        if output_path
        else pathlib.Path(config.paths.output) / f"llms/openai_{config.llms.openai.model}"
    )
    filename = output_path / f"{config.llms.eval_output_file}_t{config.llms.performance.threshold}.json"

    logger.info(f"Writing output to {filename}")

    # write the content
    if not os.path.exists(filename):
        # create a new file
        with open(filename, "w+") as f:
            json.dump(output, f, indent=2, sort_keys=False, cls=NumpyEncoder)
    else:
        # append to an existing file
        with open(filename, "r") as f:
            data = json.load(f)

        # update response to an existing bibcode, or add a new one
        data.update(output)

        # write the updated file
        with open(filename, "w") as f:
            json.dump(data, f, indent=2, sort_keys=False, cls=NumpyEncoder)




[docs]
def adjust_model(batch_file: pathlib.Path, orig: str, model: str):
    """Adjust the model in the jsonl batch file.

    This function replaces the original model with the new model in the specified batch file.

    Parameters
    ----------
    batch_file : Path
        The path to the batch file to modify.
    orig : str
        The original model name to replace.
    model : str
        The new model name to use.

    Returns
    -------
    Path
        The path to the modified batch file.
    """
    new = pathlib.Path(str(batch_file).replace(orig, model))
    new.parent.mkdir(parents=True, exist_ok=True)
    # extract and replace the model in the batch file
    with open(batch_file, "r", encoding="utf-8") as f:
        content = f.read()
        content = content.replace('"model": "gpt-4.1-mini"', f'"model": "{model}"')

    # write out the new jsonl batch file
    with open(new, "w", encoding="utf-8") as f:
        f.write(content)

    return new