# Source code for spinneret.main

"""The main module"""

import glob
import json
import os
from pathlib import Path
from typing import Union
from requests import get, codes
from rdflib import Graph
import daiquiri
from soso.main import convert
from soso.strategies.eml import EML, get_encoding_format
from soso.utilities import delete_null_values, generate_citation_from_doi
from spinneret import workbook
from spinneret.annotator import (
    annotate_workbook,
    annotate_eml,
    get_geoenv_response_data,
)
from spinneret.utilities import load_configuration
from spinneret.graph import create_graph
from spinneret.shadow import create_shadow_eml


logger = daiquiri.getLogger(__name__)


def create_workbooks(eml_dir: str, workbook_dir: str) -> None:
    """Create a workbook for each EML file in a directory

    :param eml_dir: Directory of EML files
    :param workbook_dir: Directory to save workbooks
    :return: None
    :notes: Workbooks will not be created if they already exist. A workbook
        is considered to exist when a file in `workbook_dir` has a name whose
        text before the first underscore equals the EML file name without
        its extension.
    """

    # A workbook is created for each EML file; only XML files are considered
    eml_files = [f for f in os.listdir(eml_dir) if f.endswith(".xml")]

    # Identifiers of the workbooks already present, as a set so that the
    # membership test in the loop does not rescan a list for every EML file
    existing_pids = {wb.split("_")[0] for wb in os.listdir(workbook_dir)}

    # Iterate over EML files and create workbooks for each
    for eml_file in eml_files:

        # Continue if workbook already exists
        if Path(eml_file).stem in existing_pids:
            continue

        # Create workbook. The return value of workbook.create is not needed
        # here, so it is not bound to a name.
        logger.info(f"Creating workbook for {eml_file}")
        workbook.create(
            eml_file=os.path.join(eml_dir, eml_file),
            elements=["dataset", "attribute"],
            path_out=workbook_dir,
        )


# pylint: disable=too-many-positional-arguments
def annotate_workbooks(
    workbook_dir: str,
    eml_dir: str,
    output_dir: str,
    config_path: str,
    local_model: Union[str, None] = None,
    temperature: Union[float, None] = None,
    return_ungrounded: bool = False,
    sample_size: int = 1,
) -> None:
    """Create an annotated workbook for each workbook in a directory

    :param workbook_dir: Directory of unannotated workbooks
    :param eml_dir: Directory of EML files corresponding to workbooks
    :param output_dir: Directory to save annotated workbooks
    :param config_path: Path to configuration file
    :param local_model: See `get_ontogpt_annotation` documentation for details.
    :param temperature: The temperature parameter for the model. If `None`, the
        OntoGPT default will be used.
    :param return_ungrounded: See `get_ontogpt_annotation` documentation for
        details.
    :param sample_size: Executes multiple replicates of the annotation request
        to reduce variability of outputs. Variability is inherent in OntoGPT.
    :return: None
    :notes: Annotated workbooks will not be created if they already exist.
        Workbooks without a matching EML file in `eml_dir` are skipped with a
        warning.
    """

    # Load BioPortal API key
    load_configuration(config_path)

    # An annotated workbook is created for each unannotated workbook file.
    # Only TSV files are considered, in both the input and output directories.
    tsv_suffix = ".tsv"
    workbook_files = [f for f in os.listdir(workbook_dir) if f.endswith(tsv_suffix)]
    output_files = {f for f in os.listdir(output_dir) if f.endswith(tsv_suffix)}

    # Iterate over workbook files and create an annotated workbook for each
    for workbook_file in workbook_files:

        # Continue if annotated workbook already exists. Only the trailing
        # extension is rewritten, so a ".tsv" occurring elsewhere in the file
        # name is left untouched.
        workbook_file_annotated = (
            workbook_file[: -len(tsv_suffix)] + "_annotated" + tsv_suffix
        )
        if workbook_file_annotated in output_files:
            continue

        # Match EML file to workbook file. The text before the first
        # underscore of the workbook name is used as the EML file stem.
        eml_file = workbook_file.split("_")[0] + ".xml"
        eml_path = os.path.join(eml_dir, eml_file)
        if not os.path.exists(eml_path):
            logger.warning(f"Could not find EML file for {workbook_file}")
            continue

        # Create annotated workbook
        logger.info(f"Creating annotated workbook for {workbook_file}")
        annotate_workbook(
            workbook_path=os.path.join(workbook_dir, workbook_file),
            eml_path=eml_path,
            output_path=os.path.join(output_dir, workbook_file_annotated),
            local_model=local_model,
            temperature=temperature,
            return_ungrounded=return_ungrounded,
            sample_size=sample_size,
        )


def annotate_eml_files(workbook_dir: str, eml_dir: str, output_dir: str) -> None:
    """Create an annotated EML file for each annotated workbook in a directory

    :param workbook_dir: Directory of annotated workbooks
    :param eml_dir: Directory of unannotated EML files
    :param output_dir: Directory to save annotated EML files
    :return: None
    :notes: Annotated EML files will not be created if they already exist.
        Workbooks without a matching EML file in `eml_dir` are skipped.
    """

    # Iterate over workbook files and create annotated EML for each
    for workbook_file in os.listdir(workbook_dir):

        # The text before the first underscore of the workbook name is the
        # stem shared by the unannotated and the annotated EML file
        eml_file = workbook_file.split("_")[0] + ".xml"

        # Continue if the EML file does not exist or is already annotated
        eml_path = os.path.join(eml_dir, eml_file)
        if not os.path.exists(eml_path):
            continue
        eml_path_annotated = os.path.join(output_dir, eml_file)
        if os.path.exists(eml_path_annotated):
            continue

        # Create annotated EML file
        logger.info(f"Creating annotated EML file for {eml_path}")
        annotate_eml(
            eml=eml_path,
            workbook=os.path.join(workbook_dir, workbook_file),
            output_path=eml_path_annotated,
        )


# pylint: disable=too-many-locals
def _get_subject_of_with_content_url(self):
    """Replacement for `EML.get_subject_of` that adds the missing contentUrl

    The contentUrl is built from the first three dot-separated parts of the
    name of the file being converted (`self.file`), which is therefore
    expected to look like `scope.identifier.revision.xml`.

    :return: A `DataDownload` dictionary with null values removed, or `None`
        if the encoding format or the modification date is not available.
    """
    encoding_format = get_encoding_format(self.metadata)
    date_modified = self.get_date_modified()
    if not (encoding_format and date_modified):
        return None
    name_parts = os.path.basename(self.file).split(".")
    subject_of = {
        "@type": "DataDownload",
        "name": "EML metadata for dataset",
        "description": "EML metadata describing the dataset",
        "encodingFormat": encoding_format,
        "contentUrl": (
            "https://pasta.lternet.edu/package/metadata/eml/"
            + name_parts[0]
            + "/"
            + name_parts[1]
            + "/"
            + name_parts[2]
        ),
        "dateModified": date_modified,
    }
    return delete_null_values(subject_of)


def _fetch_doi_url(scope: str, identifier: str, revision: str) -> Union[str, None]:
    """Look up the DOI of a data package and return it in URL format

    :param scope: Scope part of the package identifier
    :param identifier: Identifier part of the package identifier
    :param revision: Revision part of the package identifier
    :return: The DOI as a `https://doi.org/` URL, or `None` if the request
        did not return an OK status.
    """
    doi_uri = (
        f"https://pasta.lternet.edu/package/doi/eml/{scope}/{identifier}/{revision}"
    )
    response = get(doi_uri, timeout=10)
    if response.status_code != codes.ok:  # pylint: disable=no-member
        return None
    # The text after the first colon of the response body is the DOI value
    return "https://doi.org/" + response.text.split(":")[1]  # URL format


def create_soso_files(eml_dir: str, output_dir: str) -> None:
    """Create SOSO files for each EML file in a directory

    :param eml_dir: Directory of annotated EML files
    :param output_dir: Directory to save SOSO files
    :return: None
    :notes: SOSO files will not be created if they already exist. EML files
        whose name is not of the form `scope.identifier.revision.xml` are
        skipped with a warning, because the package identifier parts are
        needed to build the added properties. `EML.get_subject_of` is
        overridden as a side effect, and stays overridden after this
        function returns.
    """

    # A SOSO file is created for each EML file; only XML files are considered
    eml_files = [f for f in os.listdir(eml_dir) if f.endswith(".xml")]
    soso_files = set(os.listdir(output_dir))

    # Iterate over EML files and create SOSO files for each
    for eml_file in eml_files:

        # Continue if SOSO file already exists
        eml_pid = Path(eml_file).stem
        soso_file = eml_pid + ".json"
        if soso_file in soso_files:
            continue

        # The package identifier parts are required below; a stray XML file
        # should not abort the whole batch
        pid_parts = eml_pid.split(".")
        if len(pid_parts) != 3:
            logger.warning(
                f"Skipping {eml_file}: name is not scope.identifier.revision.xml"
            )
            continue
        scope, identifier, revision = pid_parts
        logger.info(f"Creating SOSO file for {eml_file}")

        # Add properties that can't be derived from the EML record
        # url (also used as the @id of the dataset)
        url = (
            "https://portal.edirepository.org/nis/mapbrowse?scope="
            + scope
            + "&identifier="
            + identifier
            + "&revision="
            + revision
        )
        # doi
        doi = _fetch_doi_url(scope, identifier, revision)
        # identifier and citation both depend on the DOI
        if doi is not None:
            identifier_property = {  # DOI is more informative than the packageId
                "@id": doi,
                "@type": "PropertyValue",
                "propertyID": "https://registry.identifiers.org/registry/doi",
                "value": doi[len("https://doi.org/") :],
                "url": doi,
            }
            citation = generate_citation_from_doi(doi, style="apa", locale="en-US")
        else:
            identifier_property = None
            citation = None

        # Override the method so that the subjectOf property gets a contentUrl
        EML.get_subject_of = _get_subject_of_with_content_url

        # Call the convert function with the additional properties
        additional_properties = {
            "url": url,
            "version": revision,
            "isAccessibleForFree": True,
            "citation": citation,
            "provider": {"@id": "https://edirepository.org"},
            "publisher": {"@id": "https://edirepository.org"},
            "identifier": identifier_property,
            "@id": url,
        }
        json_ld = convert(
            file=os.path.join(eml_dir, eml_file),
            strategy="EML",
            **additional_properties,
        )

        # Write the JSON-LD returned by convert to file, unchanged
        with open(os.path.join(output_dir, soso_file), "w", encoding="utf-8") as fp:
            fp.write(json_ld)

def create_shadow_eml_files(eml_dir: str, output_dir: str) -> None:
    """Create shadow EML files for each EML file in a directory

    :param eml_dir: Directory of EML files
    :param output_dir: Directory to save shadow EML files
    :return: None
    :notes: Shadow EML files will not be created if they already exist. A
        shadow file has the same name as the EML file it is created from.
    """

    # Only XML files in the input directory get a shadow file
    xml_names = [name for name in os.listdir(eml_dir) if name.endswith(".xml")]
    present_names = os.listdir(output_dir)

    for xml_name in xml_names:
        shadow_name = Path(xml_name).stem + ".xml"

        # Create the shadow file only when it is not in the output directory
        if shadow_name not in present_names:
            logger.info(f"Creating shadow EML file for {xml_name}")
            create_shadow_eml(
                eml_path=eml_dir + "/" + xml_name,
                output_path=output_dir + "/" + shadow_name,
            )


def create_kgraph(soso_dir: str, vocabulary_dir: str) -> Graph:
    """Create a Knowledge Graph from SOSO files and vocabularies

    :param soso_dir: Directory of SOSO files
    :param vocabulary_dir: Directory of vocabulary files
    :return: Knowledge Graph
    """

    # SOSO files are the JSON files of the SOSO directory
    metadata_paths = [
        soso_dir + "/" + name
        for name in os.listdir(soso_dir)
        if name.endswith(".json")
    ]

    # Vocabulary files are the TTL and OWL files of the vocabulary directory
    vocabulary_paths = [
        vocabulary_dir + "/" + name
        for name in os.listdir(vocabulary_dir)
        if name.endswith((".ttl", ".owl"))
    ]

    # Load knowledge graph
    return create_graph(
        metadata_files=metadata_paths, vocabulary_files=vocabulary_paths
    )


def create_geoenv_data_files(
    eml_dir: str, output_dir: str, data_sources: list, overwrite: bool = False
) -> None:
    """
    Create GeoEnv data files for each EML file in a directory

    :param eml_dir: Path to directory containing EML files
    :param output_dir: Path to directory to save GeoEnv data files
    :param data_sources: Data sources passed to `get_geoenv_response_data`
        for each EML file
    :param overwrite: Overwrite existing files, default is False
    :return: None
    :notes: Each output file is named after its EML file, with a `.json`
        extension, and holds the response under a top level `data` key.
    """
    # glob returns files in arbitrary order; sorting makes the processing
    # order, and therefore the log, reproducible between runs
    files = sorted(glob.glob(os.path.join(eml_dir, "*.xml")))

    # Iterate over EML files
    for file in files:
        file_name = os.path.splitext(os.path.basename(file))[0]
        output_file_path = os.path.join(output_dir, file_name + ".json")

        # Don't overwrite existing json files unless specified
        if os.path.isfile(output_file_path) and not overwrite:
            continue
        logger.info(file)

        # Get the GeoEnv response data
        response = get_geoenv_response_data(file, data_sources=data_sources)
        result = {"data": response}

        # Write the data to a file
        with open(output_file_path, "w", encoding="utf-8") as f:
            json.dump(result, f)


# Developer script: runs the GeoEnv stage of the pipeline against directories
# on a local machine. The paths below are hard coded and must be edited
# before running this elsewhere.
if __name__ == "__main__":
    import logging
    from geoenvo.data_sources import (
        WorldTerrestrialEcosystems,
        EcologicalMarineUnits,
        EcologicalCoastalUnits,
    )

    # Used only to name the log file of this run
    SCOPE = "remainder"

    # Log at INFO level to both a file and stdout
    daiquiri.setup(
        level=logging.INFO,
        outputs=(
            daiquiri.output.RotatingFile(
                f"/Users/csmith/Data/testing_geoenvo/full_batch/{SCOPE}_spinneret.log",
                max_size_bytes=100 * 10**6,  # 100 MB
                # NOTE(review): if this maps to the backupCount of the
                # standard library RotatingFileHandler, 0 disables rollover
                # (one ever-growing file) rather than keeping unlimited
                # backups - confirm against the daiquiri documentation
                backup_count=0,
            ),
            "stdout",
        ),
    )

    # The coastal units source is configured with a buffer of 1 before use
    # (units are not visible here - TODO confirm)
    ecu = EcologicalCoastalUnits()
    ecu.buffer = 1
    sources = [
        WorldTerrestrialEcosystems(),
        ecu,
        EcologicalMarineUnits(),
    ]

    # Existing response files are kept (overwrite=False), so an interrupted
    # run can be restarted without repeating finished files
    create_geoenv_data_files(
        eml_dir="/Users/csmith/Data/testing_geoenvo/full_batch/eml",
        output_dir="/Users/csmith/Data/testing_geoenvo/full_batch/responses",
        data_sources=sources,
        overwrite=False,
    )

    # The calls below are disabled. They invoke the other functions of this
    # module; uncomment the ones to run.

    # create_workbooks(
    #     eml_dir="/Users/csmith/Data/kgraph/eml/raw",
    #     workbook_dir="/Users/csmith/Data/kgraph/workbook/raw",
    # )

    # annotate_workbooks(
    #     workbook_dir="/Users/csmith/Data/kgraph/workbook/raw",
    #     eml_dir="/Users/csmith/Data/kgraph/eml/raw",
    #     output_dir="/Users/csmith/Data/kgraph/workbook/annotated",
    #     config_path="/Users/csmith/Code/spinneret_EDIorg/spinneret/config.json",
    # )

    # annotate_eml_files(
    #     workbook_dir="/Users/csmith/Data/kgraph/workbook/annotated",
    #     eml_dir="/Users/csmith/Data/kgraph/eml/raw",
    #     output_dir="/Users/csmith/Data/kgraph/eml/annotated",
    # )

    # create_shadow_eml_files(
    #     eml_dir="/Users/csmith/Data/kgraph/eml/annotated",
    #     output_dir="/Users/csmith/Data/kgraph/eml/shadow",
    # )

    # create_soso_files(
    #     eml_dir="/Users/csmith/Data/kgraph/eml/shadow",
    #     output_dir="/Users/csmith/Data/kgraph/soso/raw",
    # )

    # g = create_kgraph(
    #     soso_dir="/Users/csmith/Data/kgraph/soso/annotated",
    #     vocabulary_dir="/Users/csmith/Data/kgraph/vocab",
    # )
    # # Serialize to file
    # g.serialize(
    #     destination="/Users/csmith/Data/kgraph/kgraph/edi_kgraph_top_20.ttl",
    #     format="turtle"
    # )
# Source code for spinneret.main
#
# Useful Links
#
# Related Topics