Source code for spinneret.main

"""The main module"""

import glob
import json
import os
from pathlib import Path
from typing import Union
from requests import get, codes
from rdflib import Graph
import daiquiri
from soso.main import convert
from soso.strategies.eml import EML, get_encoding_format
from soso.utilities import delete_null_values, generate_citation_from_doi
from spinneret import workbook
from spinneret.annotator import (
    annotate_workbook,
    annotate_eml,
    get_geoenv_response_data,
)
from spinneret.utilities import load_configuration
from spinneret.graph import create_graph
from spinneret.shadow import create_shadow_eml


# Module-level logger, named after this module, shared by every function below
logger = daiquiri.getLogger(__name__)


def create_workbooks(eml_dir: str, workbook_dir: str) -> None:
    """Create a workbook for each EML file in a directory

    :param eml_dir: Directory of EML files
    :param workbook_dir: Directory to save workbooks
    :return: None
    :notes: Workbooks will not be created if they already exist. A workbook
        is matched to an EML file by comparing the text before the first
        underscore of the workbook file name with the EML file name minus
        its extension.
    """
    # Only XML files are treated as EML
    eml_files = [f for f in os.listdir(eml_dir) if f.endswith(".xml")]

    # Identifiers that already have a workbook. A set keeps the per-file
    # membership test below constant time.
    workbook_pids = {wb.split("_")[0] for wb in os.listdir(workbook_dir)}

    for eml_file in eml_files:
        eml_pid = Path(eml_file).stem
        if eml_pid in workbook_pids:
            continue  # Workbook already exists

        logger.info(f"Creating workbook for {eml_file}")
        # The return value of workbook.create is not needed here
        workbook.create(
            eml_file=os.path.join(eml_dir, eml_file),
            elements=["dataset", "attribute"],
            path_out=workbook_dir,
        )
# pylint: disable=too-many-positional-arguments
def annotate_workbooks(
    workbook_dir: str,
    eml_dir: str,
    output_dir: str,
    config_path: str,
    local_model: Union[str, None] = None,
    temperature: Union[float, None] = None,
    return_ungrounded: bool = False,
    sample_size: int = 1,
) -> None:
    """Create an annotated workbook for each unannotated workbook in a
    directory

    :param workbook_dir: Directory of unannotated workbooks
    :param eml_dir: Directory of EML files corresponding to workbooks
    :param output_dir: Directory to save annotated workbooks
    :param config_path: Path to configuration file
    :param local_model: See `get_ontogpt_annotation` documentation for
        details.
    :param temperature: The temperature parameter for the model. If `None`,
        the OntoGPT default will be used.
    :param return_ungrounded: See `get_ontogpt_annotation` documentation for
        details.
    :param sample_size: Executes multiple replicates of the annotation
        request to reduce variability of outputs. Variability is inherent in
        OntoGPT.
    :return: None
    :notes: Annotated workbooks will not be created if they already exist.
        A workbook whose EML file cannot be found in `eml_dir` is skipped
        with a warning.
    """
    # Load BioPortal API key
    load_configuration(config_path)

    # Only TSV files are treated as workbooks
    workbook_files = [f for f in os.listdir(workbook_dir) if f.endswith(".tsv")]
    output_files = {f for f in os.listdir(output_dir) if f.endswith(".tsv")}

    # Iterate over workbooks and create an annotated workbook for each
    for workbook_file in workbook_files:
        # Swap only the trailing extension; str.replace would also rewrite
        # any ".tsv" occurring earlier in the file name
        workbook_file_annotated = workbook_file[: -len(".tsv")] + "_annotated.tsv"
        if workbook_file_annotated in output_files:
            continue  # Annotated workbook already exists

        # Match EML file to workbook file via the text before the first "_"
        eml_file = workbook_file.split("_")[0] + ".xml"
        eml_path = os.path.join(eml_dir, eml_file)
        if not os.path.exists(eml_path):
            logger.warning(f"Could not find EML file for {workbook_file}")
            continue

        logger.info(f"Creating annotated workbook for {workbook_file}")
        annotate_workbook(
            workbook_path=os.path.join(workbook_dir, workbook_file),
            eml_path=eml_path,
            output_path=os.path.join(output_dir, workbook_file_annotated),
            local_model=local_model,
            temperature=temperature,
            return_ungrounded=return_ungrounded,
            sample_size=sample_size,
        )
def annotate_eml_files(workbook_dir: str, eml_dir: str, output_dir: str) -> None:
    """Create an annotated EML file for each annotated workbook in a
    directory

    :param workbook_dir: Directory of annotated workbooks
    :param eml_dir: Directory of unannotated EML files
    :param output_dir: Directory to save annotated EML files
    :return: None
    :notes: Annotated EML files will not be created if they already exist.
        Workbooks with no matching EML file in `eml_dir` are skipped. The
        match uses the text before the first underscore of the workbook
        file name.
    """
    workbook_files = os.listdir(workbook_dir)

    # Names of the available EML files (XML only). Listing the directory up
    # front also makes a missing eml_dir fail immediately.
    eml_files = {f for f in os.listdir(eml_dir) if f.endswith(".xml")}

    # Iterate over workbook files and create annotated EML for each
    for workbook_file in workbook_files:
        eml_file = workbook_file.split("_")[0] + ".xml"

        # Continue if the EML file does not exist or is already annotated
        if eml_file not in eml_files:
            continue
        eml_path_annotated = os.path.join(output_dir, eml_file)
        if os.path.exists(eml_path_annotated):
            continue

        eml_path = os.path.join(eml_dir, eml_file)
        logger.info(f"Creating annotated EML file for {eml_path}")
        annotate_eml(
            eml=eml_path,
            workbook=os.path.join(workbook_dir, workbook_file),
            output_path=eml_path_annotated,
        )
# pylint: disable=too-many-locals
def _fetch_doi_url(scope: str, identifier: str, revision: str) -> Union[str, None]:
    """Return the DOI of a data package in URL form, or None if unavailable

    :param scope: Scope part of the package identifier
    :param identifier: Identifier part of the package identifier
    :param revision: Revision part of the package identifier
    :return: The DOI as a ``https://doi.org/`` URL, or None when the request
        does not return an OK status or the body holds no DOI.
    """
    doi_uri = (
        f"https://pasta.lternet.edu/package/doi/eml/{scope}/{identifier}/{revision}"
    )
    response = get(doi_uri, timeout=10)
    if response.status_code != codes.ok:  # pylint: disable=no-member
        return None
    # Keep what follows the first colon of the body (presumably a "doi:"
    # prefix -- confirm against the PASTA API). A body without a colon
    # yields no DOI instead of raising IndexError.
    _, _, doi = response.text.strip().partition(":")
    if not doi:
        return None
    return "https://doi.org/" + doi  # URL format


def _get_subject_of_with_content_url(self):
    """Replacement for ``EML.get_subject_of`` that adds the missing contentUrl

    :return: A ``DataDownload`` dictionary with null values removed, or None
        when the encoding format or modification date is unavailable.
    """
    encoding_format = get_encoding_format(self.metadata)
    date_modified = self.get_date_modified()
    if not (encoding_format and date_modified):
        return None
    # The file name is expected to be "<scope>.<identifier>.<revision>.xml"
    parts = self.file.split("/")[-1].split(".")
    subject_of = {
        "@type": "DataDownload",
        "name": "EML metadata for dataset",
        "description": "EML metadata describing the dataset",
        "encodingFormat": encoding_format,
        "contentUrl": (
            "https://pasta.lternet.edu/package/metadata/eml/"
            + parts[0]
            + "/"
            + parts[1]
            + "/"
            + parts[2]
        ),
        "dateModified": date_modified,
    }
    return delete_null_values(subject_of)


def create_soso_files(eml_dir: str, output_dir: str) -> None:
    """Create SOSO files for each EML file in a directory

    :param eml_dir: Directory of annotated EML files
    :param output_dir: Directory to save SOSO files
    :return: None
    :notes: SOSO files will not be created if they already exist. EML files
        whose name is not of the form ``scope.identifier.revision.xml`` are
        skipped with a warning. When at least one file is converted,
        ``EML.get_subject_of`` is replaced for the rest of the process.
    """
    # A SOSO file is created for each EML file that does not have one yet
    eml_files = [f for f in os.listdir(eml_dir) if f.endswith(".xml")]
    soso_files = set(os.listdir(output_dir))
    pending = [f for f in eml_files if Path(f).stem + ".json" not in soso_files]
    if not pending:
        return

    # Override the method once, rather than on every loop iteration
    EML.get_subject_of = _get_subject_of_with_content_url

    for eml_file in pending:
        eml_pid = Path(eml_file).stem
        soso_file = eml_pid + ".json"

        # The properties below are derived from the package identifier, so a
        # file that does not follow the naming scheme cannot be converted
        pid_parts = eml_pid.split(".")
        if len(pid_parts) != 3:
            logger.warning(
                f"Skipping {eml_file}: file name is not of the form "
                "scope.identifier.revision.xml"
            )
            continue
        scope, identifier, revision = pid_parts

        logger.info(f"Creating SOSO file for {eml_file}")

        # Add properties that can't be derived from the EML record
        url = (
            "https://portal.edirepository.org/nis/mapbrowse?scope="
            + scope
            + "&identifier="
            + identifier
            + "&revision="
            + revision
        )
        doi = _fetch_doi_url(scope, identifier, revision)
        if doi is not None:
            identifier_property = {  # DOI is more informative than the packageId
                "@id": doi,
                "@type": "PropertyValue",
                "propertyID": "https://registry.identifiers.org/registry/doi",
                "value": doi[len("https://doi.org/") :],
                "url": doi,
            }
            citation = generate_citation_from_doi(doi, style="apa", locale="en-US")
        else:
            identifier_property = None
            citation = None

        additional_properties = {
            "url": url,
            "version": revision,
            "isAccessibleForFree": True,
            "citation": citation,
            "provider": {"@id": "https://edirepository.org"},
            "publisher": {"@id": "https://edirepository.org"},
            "identifier": identifier_property,
            "@id": url,  # The landing page URL doubles as the dataset @id
        }

        # The path is joined with "/" because the get_subject_of override
        # splits self.file on "/" to recover the file name (presumably
        # self.file is this path -- confirm against soso)
        json_ld = convert(
            file=eml_dir + "/" + eml_file, strategy="EML", **additional_properties
        )

        # Write the JSON-LD exactly as returned by convert
        with open(output_dir + "/" + soso_file, "w", encoding="utf-8") as fp:
            fp.write(json_ld)
def create_shadow_eml_files(eml_dir: str, output_dir: str) -> None:
    """Create shadow EML files for each EML file in a directory

    :param eml_dir: Directory of EML files
    :param output_dir: Directory to save shadow EML files
    :return: None
    :notes: Shadow EML files will not be created if they already exist.
    """
    # Candidate inputs are the XML files of the EML directory
    xml_names = [name for name in os.listdir(eml_dir) if name.endswith(".xml")]
    already_written = os.listdir(output_dir)

    for eml_file in xml_names:
        # A shadow file carries the same name as the EML file it is built from
        shadow_name = f"{Path(eml_file).stem}.xml"
        if shadow_name not in already_written:
            logger.info(f"Creating shadow EML file for {eml_file}")
            create_shadow_eml(
                eml_path=f"{eml_dir}/{eml_file}",
                output_path=f"{output_dir}/{shadow_name}",
            )
def create_kgraph(soso_dir: str, vocabulary_dir: str) -> Graph:
    """Create a Knowledge Graph from SOSO files and vocabularies

    :param soso_dir: Directory of SOSO files
    :param vocabulary_dir: Directory of vocabulary files
    :return: Knowledge Graph
    """
    # SOSO metadata: JSON files only
    metadata_paths = [
        f"{soso_dir}/{name}"
        for name in os.listdir(soso_dir)
        if name.endswith(".json")
    ]

    # Vocabularies: TTL and OWL files only
    vocabulary_paths = [
        f"{vocabulary_dir}/{name}"
        for name in os.listdir(vocabulary_dir)
        if name.endswith((".ttl", ".owl"))
    ]

    # Load knowledge graph
    return create_graph(
        metadata_files=metadata_paths, vocabulary_files=vocabulary_paths
    )
def create_geoenv_data_files(
    eml_dir: str, output_dir: str, data_sources: list, overwrite: bool = False
) -> None:
    """
    Create GeoEnv data files for each EML file in a directory

    :param eml_dir: Path to directory containing EML files
    :param output_dir: Path to directory to save GeoEnv data files
    :param data_sources: Data sources passed through to
        `get_geoenv_response_data`
    :param overwrite: Overwrite existing files, default is False
    :return: None
    :notes: Each output file is named after its EML file, with a ``.json``
        extension, and holds the response under a top-level ``data`` key.
    """
    files = glob.glob(os.path.join(eml_dir, "*.xml"))

    # Iterate over EML files
    for file in files:
        file_name = os.path.splitext(os.path.basename(file))[0]
        output_file_path = os.path.join(output_dir, file_name + ".json")

        # Don't overwrite existing json files unless specified
        if os.path.isfile(output_file_path) and not overwrite:
            continue
        logger.info(file)

        # Get the GeoEnv response data
        response = get_geoenv_response_data(file, data_sources=data_sources)

        # Serialize before opening the file: if the response cannot be
        # serialized, no truncated file is left behind to be mistaken for a
        # finished result on the next run
        payload = json.dumps({"data": response})

        # Write the data to a file
        with open(output_file_path, "w", encoding="utf-8") as f:
            f.write(payload)
if __name__ == "__main__":
    import logging

    from geoenvo.data_sources import (
        WorldTerrestrialEcosystems,
        EcologicalMarineUnits,
        EcologicalCoastalUnits,
    )

    SCOPE = "remainder"

    # Log to a size-capped file and to stdout
    log_file = daiquiri.output.RotatingFile(
        f"/Users/csmith/Data/testing_geoenvo/full_batch/{SCOPE}_spinneret.log",
        max_size_bytes=100 * 10**6,  # 100 MB
        # NOTE(review): intended as "unlimited backup files"; confirm how
        # daiquiri treats a backup count of 0
        backup_count=0,
    )
    daiquiri.setup(level=logging.INFO, outputs=(log_file, "stdout"))

    # The coastal units source is used with a buffer of 1
    coastal_units = EcologicalCoastalUnits()
    coastal_units.buffer = 1

    create_geoenv_data_files(
        eml_dir="/Users/csmith/Data/testing_geoenvo/full_batch/eml",
        output_dir="/Users/csmith/Data/testing_geoenvo/full_batch/responses",
        data_sources=[
            WorldTerrestrialEcosystems(),
            coastal_units,
            EcologicalMarineUnits(),
        ],
        overwrite=False,
    )

    # Remaining pipeline steps, currently disabled:

    # create_workbooks(
    #     eml_dir="/Users/csmith/Data/kgraph/eml/raw",
    #     workbook_dir="/Users/csmith/Data/kgraph/workbook/raw",
    # )

    # annotate_workbooks(
    #     workbook_dir="/Users/csmith/Data/kgraph/workbook/raw",
    #     eml_dir="/Users/csmith/Data/kgraph/eml/raw",
    #     output_dir="/Users/csmith/Data/kgraph/workbook/annotated",
    #     config_path="/Users/csmith/Code/spinneret_EDIorg/spinneret/config.json",
    # )

    # annotate_eml_files(
    #     workbook_dir="/Users/csmith/Data/kgraph/workbook/annotated",
    #     eml_dir="/Users/csmith/Data/kgraph/eml/raw",
    #     output_dir="/Users/csmith/Data/kgraph/eml/annotated",
    # )

    # create_shadow_eml_files(
    #     eml_dir="/Users/csmith/Data/kgraph/eml/annotated",
    #     output_dir="/Users/csmith/Data/kgraph/eml/shadow",
    # )

    # create_soso_files(
    #     eml_dir="/Users/csmith/Data/kgraph/eml/shadow",
    #     output_dir="/Users/csmith/Data/kgraph/soso/raw",
    # )

    # g = create_kgraph(
    #     soso_dir="/Users/csmith/Data/kgraph/soso/annotated",
    #     vocabulary_dir="/Users/csmith/Data/kgraph/vocab",
    # )
    # # Serialize to file
    # g.serialize(
    #     destination="/Users/csmith/Data/kgraph/kgraph/edi_kgraph_top_20.ttl",
    #     format="turtle"
    # )