Source code for spinneret.workbook

"""The workbook module"""

from lxml import etree
import pandas as pd
from spinneret.utilities import (
    delete_empty_tags,
    load_eml,
    write_workbook,
)


[docs]def create(
    eml_file: str, elements: list, env: str = "production", path_out: str = False
) -> pd.core.frame.DataFrame:
    """Create an annotation workbook from an EML file

    :param eml_file: Path to a single EML file
    :param elements:    List of EML elements to include in the workbook. Can be
                        one or more of: 'dataset', 'dataTable', 'otherEntity',
                        'spatialVector', 'spatialRaster', 'storedProcedure',
                        'view', 'attribute'.
    :param env: The environment to use for the base URL. Options are:
        'production', 'staging', 'development'.
    :param path_out:    Path to a directory where the workbook will be written.
                        Will result in a file
                        '[packageId]_annotation_workbook.tsv'.
    :returns:   DataFrame of the annotation workbook with columns:

                * `package_id`: Data package identifier listed in the EML at
                  the xpath attribute `packageId`
                * `url`: Link to the data package landing page corresponding to
                  'package_id'
                * `element`: Element to be annotated
                * `element_id`: UUID assigned at the time of workbook creation
                * `element_xpath`: xpath of element to be annotated
                * `context`: The broader context in which the subject is found.
                  When the subject is a dataset the context is the packageId.
                  When the subject is a data object/entity, the context is
                  `dataset`. When the subject is an attribute, the context is
                  the data object/entity the attribute is apart of.
                * `subject`: The subject of annotation
                * `predicate`: The label of the predicate relating the subject
                  to the object
                * `predicate_id`: The identifier of the predicate. Typically,
                  this is a URI/IRI.
                * `object`: The label of the object
                * `object_id`: The identifier of the object. Typically, this
                  is a URI/IRI.
                * `author`: Identifier of the data curator authoring the
                  annotation. Typically, this value is an ORCiD.
                * `date`: Date of the annotation. Can be helpful when
                  revisiting annotations.
                * `comment`: Comments related to the annotation. Can be useful
                  when revisiting an annotation at a later date.
    """
    eml = load_eml(eml_file)
    eml = delete_empty_tags(eml)
    wb = pd.DataFrame(columns=list_workbook_columns())  # initialize workbook
    for element in elements:
        for e in eml.xpath(".//" + element):
            row = initialize_workbook_row()
            row["package_id"] = get_package_id(eml)
            row["url"] = get_package_url(eml, env)
            row["element"] = element
            if "id" in e.attrib:
                row["element_id"] = e.attrib["id"]
            else:
                row["element_id"] = pd.NA
            row["element_xpath"] = eml.getpath(e)
            row["context"] = get_subject_and_context(e)["context"]
            row["description"] = get_description(e)
            row["subject"] = get_subject_and_context(e)["subject"]
            wb.loc[len(wb)] = row  # append row to workbook
    if path_out:
        path_out = path_out + "/" + get_package_id(eml) + "_annotation_workbook.tsv"
        write_workbook(wb, path_out)
    return wb


[docs]def get_subject_and_context(element: etree._Element) -> dict:
    """Get subject and context values for a given element

    This function is called by 'workbook.create' to get the subject and
    context values. See 'workbook.create' for explanation of parameters.

    :param element: The EML element to be annotated.
    :returns:   Dictionary with keys 'subject' and 'context' and values as the
                subject and context of the element.
    :notes: Values for the 'subject' and 'context' of each annotatable element
            is defined on a case-by-case basis. This approach is taken because
            a generalizable pattern to derive recognizable and meaningful
            values for these fields is difficult since annotatable elements
            (specified by the EML schema) aren't constrained to leaf nodes
            with text values.
    """
    entities = [
        "dataTable",
        "otherEntity",
        "spatialVector",
        "spatialRaster",
        "storedProcedure",
        "view",
    ]
    if element.tag in "dataset":
        subject = "dataset"
        p = element.getparent()
        context = p.xpath("./@packageId")[0]
    elif element.tag in entities:
        subject = element.findtext(".//objectName")
        context = "dataset"
    elif element.tag in "attribute":
        subject = element.findtext(".//attributeName")
        entity = list(element.iterancestors(entities))
        context = entity[0].findtext(".//objectName")
    else:
        subject = None
        context = None
    res = {"subject": subject, "context": context}
    return res


[docs]def get_description(element: etree._Element) -> str:
    """Get the description of an element

    :param element: The EML element to be annotated.
    :returns:   The description of the element.
    """
    entities = [
        "dataTable",
        "otherEntity",
        "spatialVector",
        "spatialRaster",
        "storedProcedure",
        "view",
    ]
    if element.tag in "dataset":
        # Add abstract and keywords, they are descriptive of the entire dataset
        abstract = element.xpath("./abstract")
        if len(abstract) != 0:  # abstract is optional
            abstract = etree.tostring(abstract[0], encoding="utf-8", method="text")
            abstract = abstract.decode("utf-8").strip()
        else:
            abstract = ""
        keywords = element.xpath(".//keyword")
        if len(keywords) != 0:  # keywords are optional
            keywords = [k.text for k in keywords]
        else:
            keywords = ""
        description = abstract + " ".join(keywords)
    elif element.tag in entities:
        description = element.findtext(".//entityName")
    elif element.tag in "attribute":
        description = element.findtext(".//attributeDefinition")
    elif element.tag in "methods":
        methods = etree.tostring(element, encoding="utf-8", method="text")
        description = methods.decode("utf-8").strip()
    else:
        description = None
    return description


[docs]def list_workbook_columns() -> list:
    """
    :returns: A list of the columns in the workbook.
    """
    res = [
        "package_id",
        "url",
        "element",
        "element_id",
        "element_xpath",
        "context",
        "description",
        "subject",
        "predicate",
        "predicate_id",
        "object",
        "object_id",
        "author",
        "date",
        "comment",
    ]
    return res


[docs]def initialize_workbook_row() -> pd.core.series.Series:
    """Initialize a row for the annotation workbook
    :returns:   A pandas Series with the initialized row
    """
    row = dict.fromkeys(list_workbook_columns(), pd.NA)
    return pd.Series(row)


[docs]def get_package_id(eml: etree._ElementTree) -> str:
    """
    :param eml: The EML file as an lxml etree object
    :returns: The packageId of the EML file
    """
    package_id = eml.xpath("./@packageId")[0]
    return package_id


[docs]def get_package_url(eml: etree._ElementTree, env: str = "production") -> str:
    """
    :param eml: The EML file as an lxml etree object
    :param env: The environment to use for the base URL. Options are:
        'production', 'staging', 'development'.
    :returns: The URL to the data package landing page
    """
    if env == "staging":
        base_url = "https://portal-s.edirepository.org/nis/metadataviewer?packageid="
    elif env == "development":
        base_url = "https://portal-d.edirepository.org/nis/metadataviewer?packageid="
    else:
        base_url = "https://portal.edirepository.org/nis/metadataviewer?packageid="
    package_id = get_package_id(eml)
    url = base_url + package_id
    return url


[docs]def delete_duplicate_annotations(
    workbook: pd.core.frame.DataFrame,
) -> pd.core.frame.DataFrame:
    """
    :param workbook: The annotation workbook
    :returns: The workbook with duplicate annotations removed
    :notes: The function removes duplicate annotations based on the
        following columns: `element_xpath`, `predicate`, `predicate_id`,
        `object`, and `object_id`. The most recent annotation, based on `date`,
        is preferred to allow improvements to other fields set by the
        annotator.
    """
    wb = workbook.sort_values("date", ascending=False)
    wb = wb.drop_duplicates(
        subset=[
            "element_xpath",
            "predicate",
            "predicate_id",
            "object",
            "object_id",
        ],
        keep="first",
    )
    return wb


[docs]def delete_annotations(
    workbook: pd.core.frame.DataFrame, criteria: dict
) -> pd.core.frame.DataFrame:
    """
    :param workbook: The workbook to delete rows of annotations from.
    :param criteria: A dictionary of key-value pairs to define rows to delete.
        Each key corresponds to a column in the workbook and each value is a
        string to match in the column.
    :returns: The workbook with annotations deleted corresponding to the
        criteria.
    :notes: A matching row must contain all key-value pairs in the criteria
        dictionary to be deleted. Matching is case-sensitive. Partial
        matches are supported.
    """
    wb = workbook.copy()
    filtered_wb = wb[
        ~wb[list(criteria)]
        .apply(lambda x: x.str.contains(criteria[x.name]))
        .all(axis=1)
    ]
    return filtered_wb


[docs]def delete_unannotated_rows(
    workbook: pd.core.frame.DataFrame,
) -> pd.core.frame.DataFrame:
    """
    :param workbook: The workbook to remove unannotated rows from.
    :returns: The workbook with rows that do not have an annotation deleted.
    :notes: This function may remove potential human annotation opportunities,
        i.e. rows that have not been annotated by an automated annotator but
        may be annotated by a human annotator. It is recommended that this
        function is not applied within existing workbook annotators or the
        `annotate_workbook` wrapper due to this limitation.
    """
    wb = workbook
    unannotated_rows = [is_unannotated_row(row) for index, row in wb.iterrows()]
    wb = wb[~pd.Series(unannotated_rows, index=wb.index)]
    return wb


[docs]def is_unannotated_row(row: pd.core.series.Series) -> bool:
    """
    :param row: A row from the workbook
    :returns: True if the row is unannotated, i.e. one or more of `predicate`,
        `predicate_id`, `object`, `object_id` are missing. Otherwise, False.
    """
    unannotated_row = (
        (row["predicate"] is pd.NA)
        | (row["predicate_id"] is pd.NA)
        | (row["object"] is pd.NA)
        | (row["object_id"] is pd.NA)
    )
    return unannotated_row
Source code for spinneret.workbook

Useful Links

Related Topics