Source code for bibcat.data.build_dataset

"""
:title: build_dataset.py

This module will produce the input corpus data in JSON format by
combining the MAST papertrack JSON file and the ADS full text JSON file.

Run example: bibcat train

"""

import json
import os
from functools import lru_cache
from pathlib import Path

import numpy as np

from bibcat import config
from bibcat.utils.logger_config import setup_logger
from bibcat.utils.utils import load_json_file, save_json_file

# set up logger
logger = setup_logger(__name__, level=config.logging.level)



[docs]
def file_exists(filelist: list) -> bool:
    "Check if any file exists among the list of files"
    return any([os.path.isfile(item) for item in filelist])




[docs]
def save_text_file(path_filename: Path, bibcodes: list[str]) -> None:
    try:
        with open(path_filename, mode="w") as file:
            file.write("\n".join(bibcodes))
    except IOError as e:
        print(f"An error occurred while saving the file: {e}")




[docs]
def load_datasets(path_papertext: Path, path_papertrack: Path) -> tuple[list[dict], list[dict]]:
    """Load the papertrack and papertext JSON datasets

    Loads the papertrack and papertext datasets and returns a tuple of the lists of dictionaries.

    Parameters
    ----------
    path_papertext: Path
        the path to the papertext data file
    path_papertrack: Path
        the path to the papertrack data file

    Returns
    -------
    tuple[list[dict], list[dict]]
        the tuple of the lists of the papertext and papertrack datasets
    """
    # Load paper texts and papertrack classes
    logger.info("Loading papertext and papertrack datasets!")
    papertext_dataset = load_json_file(path_papertext)
    papertrack_dataset = load_json_file(path_papertrack)
    logger.info("Loaded papertext and papertrack datasets!")
    return papertext_dataset, papertrack_dataset




[docs]
def extract_papertext_info(dataset: list[dict]) -> tuple[list[str], list[str]]:
    """Extract the papertext bibcodes and publish dates

    Extracts and returns the papertext ``bibcodes`` and ``pubdates``.

    Parameters
    ----------
    dataset: list[dict]
        the papertext dataset

    Returns
    -------
    tuple[list[str], list[str]]
        the tuple of a list of the ``bibbodes`` dict and the ``pubdates`` dict
    """

    bibcodes = [entry["bibcode"] for entry in dataset]
    pubdates = [entry["pubdate"] for entry in dataset]
    logger.info(f"The earliest date of papers within text database: {min(pubdates)}.")
    logger.info(f"The latest date of papers within text database: {max(pubdates)}.")

    return bibcodes, pubdates




[docs]
def extract_papertrack_info(dataset: list[dict]) -> tuple[list[None | dict], list[None | str], list[None | dict]]:
    """Extract papertrack info

    Extracts and returns the papertrack values: searches, bibcodes, and missions and papertypes.

    Parameters
    ----------
    dataset: list[dict]
        the papertrack dataset

    Returns
    -------
    tuple[list[None | dict], list[None | str], list[None | dict]]
        the tuple of a list of the ``searches`` dict, the ``bibcode`` dict, and the ``missions_and_papertypes`` dict

    Raises
    ------
    ValueError
        when the set of the bibcodes is different from the number of all the bibcodes
    """

    searches = [entry["searches"] for entry in dataset]
    bibcodes = [entry["bibcode"] for entry in dataset]
    missions_and_papertypes = [entry["class_missions"] for entry in dataset]

    # Throw an error if there are duplicate bibcodes in the papertrack classification dataset
    if len(set(bibcodes)) != len(bibcodes):
        raise ValueError("Err: Duplicate bibcodes in database of paper classifications!")

    return searches, bibcodes, missions_and_papertypes




[docs]
def missing_bibcodes_in_papertext(
    bibcodes_papertext: list[str],
    bibcodes_papertrack: list[str],
) -> list[str] | None:
    """Return the papertrack bibcodes are not in the papertext

    Returns the list of the papertrack bibcodes not in the papertext.

    Parameters
    ----------
    bibcodes_papertext: list[str]
        the bibcodes in papertext
    bibcodes_papertrack: list[str]
        the bibcodes in papertrack

    Returns
    -------
    list[str] | None
        the list of the bibcodes are not in the papertext
    """

    # Verify that all papers within papertrack are within the papertext database
    bibcodes_notin_papertext = [val for val in np.unique(bibcodes_papertrack) if (val not in bibcodes_papertext)]
    if len(bibcodes_notin_papertext) > 0:
        logger.warning(
            "Note! Papers in papertrack not in text database!"
            + f"\n{bibcodes_notin_papertext}\n{len(bibcodes_notin_papertext)} of {len(bibcodes_papertrack)} papertrack entries in all.\n"
        )
    return bibcodes_notin_papertext




[docs]
def trim_dict(dataset: list[dict], keys: list) -> list[dict]:
    """Trim the papertext data with the only required keys

    Trims the papertext data so that the dataset only has the values of
    [abstract, author, bibcode, body, keyword, keyword_norm, pubdate, title].

    Parameters
    ----------
    dataset: list[dict]
        the papertext data
    keys: list
        the list of the necessary keys

    Returns
    -------
    list[dict]
        the list of the only dictionary required for the source dataset

    """

    logger.info(f"trimming the papertext dict with {keys}")
    logger.debug(f"the first entry of the loaded_papertext = {dataset[0]}")

    trimmed_dict = [{key: value for key, value in thisdict.items() if (key in keys)}.copy() for thisdict in dataset]
    logger.debug(f"Show the first entry of the trimmed data: \n {trimmed_dict[0]}")
    logger.debug("Dict trimming is complete.")

    return trimmed_dict




[docs]
def combine_datasets(trimmed_papertext_data: list[dict], papertrack_data: list[dict]):
    """Combine the papertrack and papertext data

    Combines two datasets into a source dataset to be used for llm models or transformer training models.

    Parameters
    ----------
    trimmed_papertext_data: list[dict]
        the trimmed papertext data with only necessary keys
    papertrack_data: list[dict]
        the papertrack data

    Returns
    -------
    tuple
        a tuple of the list of the dictionary of the combined data,
        the list of the papertrack bibcodes not in the papertext data,
        the list of the papertext bibcodes not in the papertrack data,
        the list of the dictionary of the papertext not in the papertrack data
    """

    logger.info("Start combining the two datasets.")
    # Extract information from the papertrack classification dataset
    papertrack_searches, bibcodes_papertrack, missions_and_papertypes = extract_papertrack_info(papertrack_data)

    # Extract information from the paper text dataset
    bibcodes_papertext, _ = extract_papertext_info(trimmed_papertext_data)

    # Verify that all papers within papertrack are within the papertext database and return the bibcodes
    bibcodes_notin_papertext = missing_bibcodes_in_papertext(bibcodes_papertext, bibcodes_papertrack)

    bibcodes_notin_papertrack = []
    papertext_index_notin_papertrack = []
    combined_dataset = []

    for curr_index, curr_dict in enumerate(trimmed_papertext_data):
        new_dict = {}
        # Extract information for current paper within text database
        curr_bibcode = curr_dict["bibcode"]

        if curr_bibcode in bibcodes_papertrack:
            index = bibcodes_papertrack.index(curr_bibcode)
            new_dict["class_missions"] = {
                mission["mission"]: {"bibcode": curr_bibcode, "papertype": mission["paper_type"]}
                for mission in missions_and_papertypes[index]
            }

            for search in papertrack_searches[index]:
                new_dict[f"is_ignored_{search['search_key']}"] = search["ignored"]

            combined_dataset.append({**curr_dict, **new_dict})
        else:
            logger.warning(
                f"current papertext index = {curr_index}, Bibcode ({curr_dict['bibcode']}) not in papertrack database. Continuing..."
            )
            bibcodes_notin_papertrack.append(curr_bibcode)
            papertext_index_notin_papertrack.append(curr_index)

    logger.info(f"NOTE: {len(bibcodes_notin_papertrack)} papers in text data that were not in papertrack.")
    logger.info("Done generating dictionaries of combined papertrack+text data.")
    logger.info(f"papertext index not found in papertrack = {papertext_index_notin_papertrack}")

    return (
        combined_dataset,
        bibcodes_notin_papertext,
        bibcodes_notin_papertrack,
        papertext_index_notin_papertrack,
    )




[docs]
def save_text_files(missing_papertext_bibcodes: list, missing_papertrack_bibcodes: list) -> None:
    """Save the text files of the missing bibcodes

    Save the missing bibcodes in the papertext and papertrack files as text files.

    Parameters:
    -----------
    missing_papertext_bibcodes: list
        the list of the missing papertext bibcodes
    missing_papertrack_bibcodes: list
        the list of the missing papertrack bibcodes

    Returns:
    None
    """
    # Also save the bibcodes of the paper-texts not found in papertrack and papertext
    save_text_file(config.output.path_not_in_papertext, missing_papertext_bibcodes)
    save_text_file(config.output.path_not_in_papertrack, missing_papertrack_bibcodes)

    logger.info(f"Bibcodes not in papertext saved to:\n{config.output.path_not_in_papertext}\n")
    logger.info(f"Bibcodes not in papertrack saved to:\n{config.output.path_not_in_papertrack}\n")




[docs]
def build_dataset() -> None:
    """Building the source dataset

    This data is used for transformer models or llm models by combining the papertrack data and the ADS full papertext data.

    """

    logger.info("The script is building the dataset for bibcat!")

    # Load paper texts and papertrack classes
    dataset_papertext_orig, dataset_papertrack_orig = load_datasets(
        config.inputs.path_papertext, config.inputs.path_papertrack
    )
    # First, store trimmed papertext dictionary down to only columns to include
    trimmed_papertext_dataset = trim_dict(dataset_papertext_orig, config.inputs.keys_papertext)
    # combine the papertrack and papertext into one dataset
    combined_dataset, bibcodes_notin_papertext, bibcodes_notin_papertrack, papertext_index_notin_papertrack = (
        combine_datasets(
            trimmed_papertext_dataset,
            dataset_papertrack_orig,
        )
    )
    # Save the combined dataset and other files
    save_json_file(config.inputs.path_source_data, combined_dataset)
    logger.info("The combined dataset is saved!")

    # Save papertext data missing in the papertrack dataset; this may be used for ChatGPT use cases
    save_json_file(
        config.output.path_papertext_not_in_papertrack,
        [dataset_papertext_orig[index] for index in papertext_index_notin_papertrack],
    )
    logger.info("Saved the papertext data not in papertrack!")
    # Save missing bibcodes from the datasets.
    save_text_files(bibcodes_notin_papertext, bibcodes_notin_papertrack)

    logger.info("Saved bibcodes_notin_papertext and bibcodes_notin_papertrack!")




[docs]
@lru_cache
def load_source_dataset():
    """
    Load the original source dataset that is a combined set of papertrack classification and ADS full text. Return a dictionary of the JSON content.
    """
    with open(config.inputs.path_source_data, "r") as openfile:
        logger.info(f"Loading source dataset: {config.inputs.path_source_data}")
        source_dataset = json.load(openfile)
        logger.debug(f"{len(source_dataset)} papers have been loaded")

    return source_dataset