Object fetcher methods

globalbiodata.utils_fetch

fetch_accession

fetch_accession(
    query,
    expanded=False,
    conn=None,
    engine=None,
    debug=False,
)

Fetch Accession(es) from the database matching the provided query.

Parameters:
  • query (dict) –

    Dictionary of column names and values to match for selection.

  • expanded (bool, default: False ) –

    If True, fetch associated resource, version, and publications.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • Optional[list]

    list of Accession objects if found, else None.

Source code in globalbiodata/utils_fetch.py
def fetch_accession(query: dict, expanded: bool = False, conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> Optional[list]:
    """Fetch Accession(es) from the database matching the provided query.

    Rows are read from the `accession` table joined with `accession_publication`,
    grouped by accession value so that each accession collects all of its
    publication IDs, and then converted into Accession objects.

    Args:
        query (dict): Dictionary of column names and values to match for selection.
            Entries whose value is `None` are dropped. The `publication_id` key is
            prefixed with `accession_publication_`, every other key with `accession_`.
        expanded (bool, optional): If `True`, fetch associated resource, version, and publications.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        list of Accession objects (sorted by accession value) if found, else `None`.
    """
    from .accession import Accession

    db_args = {'conn': conn, 'engine': engine, 'debug': debug}

    # join accession and accession_publication tables to get publication IDs
    order_by = ["accession_resource_id", "accession_accession"]
    formatted_query = {
        (f"accession_publication_{k}" if k == 'publication_id' else f"accession_{k}"): v
        for k, v in query.items()
        if v is not None
    }
    accession_raw = select_from_table('accession', formatted_query, join_table='accession_publication', order_by=order_by, **db_args)

    # format column names to remove table prefixes added by sqlalchemy join
    accession_results = []
    for row in accession_raw:
        stripped = {re.sub('^accession_publication_', '', k): v for k, v in row.items()}
        stripped = {re.sub('^accession_', '', k): v for k, v in stripped.items()}
        accession_results.append(stripped)

    if not accession_results:
        return None

    # group by accession to combine multiple publications
    # NOTE(review): 'url' and 'additional_metadata' are collected here but are never
    # passed on to Accession below - confirm whether that is intended.
    grouped_accessions = {}
    for row in accession_results:
        if row['accession'] not in grouped_accessions:
            grouped_accessions[row['accession']] = {
                'version_id': row['version_id'],
                'resource_id': row['resource_id'],
                'publications': set(),
                'url': row['url'],
                'additional_metadata': row['prediction_metadata']
            }
        grouped_accessions[row['accession']]['publications'].add(row['publication_id'])

    # build component objects; sort for consistent order (important for testing)
    accessions = []
    for accession in sorted(grouped_accessions):
        group = grouped_accessions[accession]
        a_obj = {'accession': accession, 'publications': []}
        a_obj['resource'] = fetch_resource({'id': group['resource_id']}, expanded=expanded, **db_args)
        a_obj['version'] = fetch_version({'id': group['version_id']}, **db_args)

        # A NULL publication ID cannot match any publication, so do not query for it.
        pub_ids = [pub_id for pub_id in group['publications'] if pub_id is not None]
        publications = fetch_publication({'id': pub_ids}, expanded=expanded, **db_args) if pub_ids else None

        # fetch_publication returns None (no match), a bare object (one match) or a
        # list; always hand Accession a list and never wrap None into [None].
        if publications is None:
            publications = []
        elif not isinstance(publications, list):
            publications = [publications]
        a_obj['publications'] = publications

        accessions.append(Accession(a_obj))

    return accessions

fetch_all_connection_statuses

fetch_all_connection_statuses(
    order_by="url_id", conn=None, engine=None, debug=False
)

Fetch all ConnectionStatuses from the database.

Parameters:
  • order_by (str, default: 'url_id' ) –

    Column name(s) to order the results by.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • list

    List of ConnectionStatus objects.

Source code in globalbiodata/utils_fetch.py
def fetch_all_connection_statuses(order_by: str = 'url_id', conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> list:
    """Fetch all ConnectionStatuses from the database.

    Args:
        order_by (str, optional): Column name(s) to order the results by.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        List of ConnectionStatus objects (empty if the table has no rows).
    """
    statuses = fetch_connection_status({}, order_by=order_by, conn=conn, engine=engine, debug=debug)

    # fetch_connection_status returns None for no rows and a bare object for a
    # single row; normalise both so this function always returns a list.
    if statuses is None:
        return []
    return statuses if isinstance(statuses, list) else [statuses]

fetch_all_grant_agencies

fetch_all_grant_agencies(
    order_by="id", conn=None, engine=None, debug=False
)

Fetch all GrantAgencies from the database.

Parameters:
  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • list

    List of GrantAgency objects.

Source code in globalbiodata/utils_fetch.py
def fetch_all_grant_agencies(order_by: str = 'id', conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> list:
    """Fetch all GrantAgencies from the database.

    Args:
        order_by (str, optional): Column name(s) to order the results by.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        List of GrantAgency objects (empty if the table has no rows).
    """
    agencies = fetch_grant_agency({}, order_by=order_by, conn=conn, engine=engine, debug=debug)

    # fetch_grant_agency returns None for no rows and a bare object for a single
    # row; normalise both so this function always returns a list.
    if agencies is None:
        return []
    return agencies if isinstance(agencies, list) else [agencies]

fetch_all_grants

fetch_all_grants(
    order_by="id", conn=None, engine=None, debug=False
)

Fetch all Grants from the database.

Parameters:
  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • list

    List of Grant objects.

Source code in globalbiodata/utils_fetch.py
def fetch_all_grants(order_by: str = 'id', conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> list:
    """Fetch all Grants from the database.

    Args:
        order_by (str, optional): Column name(s) to order the results by.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        List of Grant objects (empty if the table has no rows).
    """
    grants = fetch_grant({}, order_by=order_by, conn=conn, engine=engine, debug=debug)

    # fetch_grant returns None for no rows and a bare object for a single row;
    # normalise both so this function always returns a list.
    if grants is None:
        return []
    return grants if isinstance(grants, list) else [grants]

fetch_all_online_resources

fetch_all_online_resources(
    order_by="id",
    expanded=True,
    conn=None,
    engine=None,
    debug=False,
)

Fetch all Resources from the database where online status is true.

Parameters:
  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • expanded (bool, default: True ) –

    If True, fetch associated publications and grants.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • list

    List of online Resource objects.

Source code in globalbiodata/utils_fetch.py
def fetch_all_online_resources(order_by: str = 'id', expanded: bool = True, conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> list:
    """Fetch all Resources from the database where online status is true.

    All resources are fetched first and then filtered in Python with
    `Resource.is_online()`.

    Args:
        order_by (str, optional): Column name(s) to order the results by.
        expanded (bool, optional): If `True`, fetch associated publications and grants.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        List of online Resource objects (empty if there are none).
    """
    full_list = fetch_all_resources(order_by=order_by, expanded=expanded, conn=conn, engine=engine, debug=debug)

    # The underlying fetch can yield None (no rows) or a bare Resource (one row);
    # iterating either directly would raise TypeError, so normalise to a list first.
    if full_list is None:
        return []
    if not isinstance(full_list, list):
        full_list = [full_list]

    return [r for r in full_list if r.is_online()]

fetch_all_publications

fetch_all_publications(
    order_by="id",
    expanded=True,
    conn=None,
    engine=None,
    debug=False,
)

Fetch all Publications from the database.

Parameters:
  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • expanded (bool, default: True ) –

    If True, fetch associated grants.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • list

    List of Publication objects.

Source code in globalbiodata/utils_fetch.py
def fetch_all_publications(order_by: str = 'id', expanded: bool = True, conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> list:
    """Fetch all Publications from the database.

    Args:
        order_by (str, optional): Column name(s) to order the results by.
        expanded (bool, optional): If `True`, fetch associated grants.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        List of Publication objects (empty if the table has no rows).
    """
    publications = fetch_publication({}, order_by=order_by, expanded=expanded, conn=conn, engine=engine, debug=debug)

    # fetch_publication returns None for no rows and a bare object for a single
    # row; normalise both so this function always returns a list.
    if publications is None:
        return []
    return publications if isinstance(publications, list) else [publications]

fetch_all_resources

fetch_all_resources(
    order_by="id",
    expanded=True,
    conn=None,
    engine=None,
    debug=False,
)

Fetch all Resources from the database.

Parameters:
  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • expanded (bool, default: True ) –

    If True, fetch associated publications and grants.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • list

    List of Resource objects.

Source code in globalbiodata/utils_fetch.py
def fetch_all_resources(order_by: str = 'id', expanded: bool = True, conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> list:
    """Fetch all Resources from the database.

    Args:
        order_by (str, optional): Column name(s) to order the results by.
        expanded (bool, optional): If `True`, fetch associated publications and grants.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        List of Resource objects (empty if the table has no rows).
    """
    resources = fetch_resource({}, order_by=order_by, expanded=expanded, conn=conn, engine=engine, debug=debug)

    # fetch_resource returns None for no rows and a bare object for a single row;
    # normalise both so this function always returns a list.
    if resources is None:
        return []
    return resources if isinstance(resources, list) else [resources]

fetch_all_urls

fetch_all_urls(
    order_by="id",
    expanded=True,
    conn=None,
    engine=None,
    debug=False,
)

Fetch all URLs from the database.

Parameters:
  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • expanded (bool, default: True ) –

    If True, fetch associated connection status.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • list

    List of URL objects.

Source code in globalbiodata/utils_fetch.py
def fetch_all_urls(order_by: str = 'id', expanded: bool = True, conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> list:
    """Fetch all URLs from the database.

    Args:
        order_by (str, optional): Column name(s) to order the results by.
        expanded (bool, optional): If `True`, fetch associated connection status.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        List of URL objects (empty if the table has no rows).
    """
    urls = fetch_url({}, order_by=order_by, expanded=expanded, conn=conn, engine=engine, debug=debug)

    # fetch_url returns None for no rows and a bare object for a single row;
    # normalise both so this function always returns a list.
    if urls is None:
        return []
    return urls if isinstance(urls, list) else [urls]

fetch_all_versions

fetch_all_versions(
    order_by="id", conn=None, engine=None, debug=False
)

Fetch all Versions from the database.

Parameters:
  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • list

    List of Version objects.

Source code in globalbiodata/utils_fetch.py
def fetch_all_versions(order_by: str = 'id', conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> list:
    """Fetch all Versions from the database.

    Args:
        order_by (str, optional): Column name(s) to order the results by.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        List of Version objects (empty if the table has no rows).
    """
    versions = fetch_version({}, order_by=order_by, conn=conn, engine=engine, debug=debug)

    # NOTE(review): fetch_version presumably follows the sibling fetchers and
    # returns None for no rows / a bare object for one row - confirm. Either way
    # the result is normalised so this function always returns a list.
    if versions is None:
        return []
    return versions if isinstance(versions, list) else [versions]

fetch_connection_status

fetch_connection_status(
    query,
    order_by="url_id",
    conn=None,
    engine=None,
    debug=False,
)

Fetch ConnectionStatus(es) from the database matching the provided query.

Parameters:
  • query (dict) –

    Dictionary of column names and values to match for selection.

  • order_by (str, default: 'url_id' ) –

    Column name(s) to order the results by.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • Optional[ConnectionStatus]

    A single ConnectionStatus object if exactly one result is found, a list of ConnectionStatus objects if several are found, else None.

Source code in globalbiodata/utils_fetch.py
def fetch_connection_status(query: dict, order_by: str = 'url_id', conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> Optional[ConnectionStatus]:
    """Fetch ConnectionStatus(es) from the database matching the provided query.

    Args:
        query (dict): Dictionary of column names and values to match for selection.
        order_by (str, optional): Column name(s) to order the results by.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        A single ConnectionStatus object when exactly one row matches, a list of
        ConnectionStatus objects when several rows match, else `None`.
    """
    from .url import ConnectionStatus

    rows = select_from_table('connection_status', query, order_by=order_by, conn=conn, engine=engine, debug=debug)
    if len(rows) == 0:
        return None

    statuses = []
    for row in rows:
        statuses.append(ConnectionStatus(row))

    # A lone match is unwrapped and returned as a bare object rather than a list.
    if len(statuses) > 1:
        return statuses
    return statuses[0]

fetch_grant

fetch_grant(
    query,
    order_by="id",
    conn=None,
    engine=None,
    debug=False,
)

Fetch Grant(s) from the database matching the provided query.

Parameters:
  • query (dict) –

    Dictionary of column names and values to match for selection.

  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • Optional[Grant]

    A single Grant object if exactly one result is found, a list of Grant objects if several are found, else None.

Source code in globalbiodata/utils_fetch.py
def fetch_grant(query: dict, order_by: str = 'id', conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> Optional[Grant]:
    """Fetch Grant(s) from the database matching the provided query.

    Each matching row has its grant agency fetched via `fetch_grant_agency` and
    stored under the `grant_agency` key before the Grant object is built.

    Args:
        query (dict): Dictionary of column names and values to match for selection.
        order_by (str, optional): Column name(s) to order the results by.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        A single Grant object when exactly one row matches, a list of Grant
        objects when several rows match, else `None`.
    """
    from .grant import Grant

    db_args = {'conn': conn, 'engine': engine, 'debug': debug}

    rows = select_from_table('grant', query, order_by=order_by, **db_args)
    if len(rows) == 0:
        return None

    found = []
    for row in rows:
        agency_lookup = {'id': row['grant_agency_id']}
        row['grant_agency'] = fetch_grant_agency(agency_lookup, **db_args)
        found.append(Grant(row))

    # A lone match is unwrapped and returned as a bare object rather than a list.
    if len(found) > 1:
        return found
    return found[0]

fetch_grant_agency

fetch_grant_agency(
    query,
    order_by="id",
    conn=None,
    engine=None,
    debug=False,
)

Fetch GrantAgency(s) from the database matching the provided query.

Parameters:
  • query (dict) –

    Dictionary of column names and values to match for selection.

  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • Optional[GrantAgency]

    A single GrantAgency object if exactly one result is found, a list of GrantAgency objects if several are found, else None.

Source code in globalbiodata/utils_fetch.py
def fetch_grant_agency(query: dict, order_by: str = 'id', conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> Optional[GrantAgency]:
    """Fetch GrantAgency(s) from the database matching the provided query.

    Args:
        query (dict): Dictionary of column names and values to match for selection.
        order_by (str, optional): Column name(s) to order the results by.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        A single GrantAgency object when exactly one row matches, a list of
        GrantAgency objects when several rows match, else `None`.
    """
    from .grant import GrantAgency

    rows = select_from_table('grant_agency', query, order_by=order_by, conn=conn, engine=engine, debug=debug)
    if len(rows) == 0:
        return None

    agencies = []
    for row in rows:
        agencies.append(GrantAgency(row))

    # A lone match is unwrapped and returned as a bare object rather than a list.
    if len(agencies) > 1:
        return agencies
    return agencies[0]

fetch_publication

fetch_publication(
    query,
    order_by="id",
    expanded=True,
    conn=None,
    engine=None,
    debug=False,
)

Fetch Publication(s) from the database matching the provided query.

Parameters:
  • query (dict) –

    Dictionary of column names and values to match for selection.

  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • expanded (bool, default: True ) –

    If True, fetch associated grants.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • Optional[Publication]

    A single Publication object if exactly one result is found, a list of Publication objects if several are found, else None.

Source code in globalbiodata/utils_fetch.py
def fetch_publication(query: dict, order_by: str = 'id', expanded: bool = True, conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> Optional[Publication]:
    """Fetch Publication(s) from the database matching the provided query.

    Args:
        query (dict): Dictionary of column names and values to match for selection.
        order_by (str, optional): Column name(s) to order the results by.
        expanded (bool, optional): If `True`, fetch associated grants.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        A single Publication object when exactly one row matches, a list of
        Publication objects when several rows match, else `None`.
    """
    from .publication import Publication

    db_args = {'conn': conn, 'engine': engine, 'debug': debug}

    rows = select_from_table('publication', query, order_by=order_by, **db_args)
    if len(rows) == 0:
        return None

    found = []
    for row in rows:
        grants = None
        if expanded:
            # look up the linked grant IDs, then load the grants themselves
            links = select_from_table('publication_grant', {'publication_id': row['id']}, **db_args)
            grants = fetch_grant({'id': [link['grant_id'] for link in links]}, **db_args)
            # a lone grant comes back unwrapped; None (no grants) is kept as None
            if grants is not None and not isinstance(grants, list):
                grants = [grants]
        row['grants'] = grants

        # hand the database handles to the Publication along with the row data
        row['__conn__'] = conn
        row['__engine__'] = engine

        found.append(Publication(row))

    # A lone match is unwrapped and returned as a bare object rather than a list.
    if len(found) > 1:
        return found
    return found[0]

fetch_resource

fetch_resource(
    query,
    order_by="id",
    expanded=True,
    conn=None,
    engine=None,
    debug=False,
)

Fetch Resource(s) from the database matching the provided query.

Parameters:
  • query (dict) –

    Dictionary of column names and values to match for selection.

  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • expanded (bool, default: True ) –

    If True, fetch associated publications and grants.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • Optional[Resource]

    A single Resource object if exactly one result is found, a list of Resource objects if several are found, else None.

Source code in globalbiodata/utils_fetch.py
def fetch_resource(query: dict, order_by: str = 'id', expanded: bool = True, conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> Optional[Resource]:
    """Fetch Resource(s) from the database matching the provided query.

    The URL and version of every matching resource are always fetched; linked
    publications and grants are only fetched when `expanded` is true.

    Args:
        query (dict): Dictionary of column names and values to match for selection.
        order_by (str, optional): Column name(s) to order the results by.
        expanded (bool, optional): If `True`, fetch associated publications and grants.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        single Resource object (when exactly one result is found), or list of Resource objects if several are found, else `None`.
    """
    from .resource import Resource

    db_args = {'conn': conn, 'engine': engine, 'debug': debug}

    resource_raw = select_from_table('resource', query, order_by=order_by, **db_args)
    if len(resource_raw) == 0:
        return None

    resources = []
    for r in resource_raw:
        r['url'] = fetch_url({'id': r['url_id']}, **db_args)
        r['version'] = fetch_version({'id': r['version_id']}, **db_args)

        if expanded:
            pub_ids = select_from_table('resource_publication', {'resource_id': r['id']}, **db_args)
            publications = fetch_publication({'id': [p['publication_id'] for p in pub_ids]}, **db_args)
            # Wrap a lone publication in a list, but keep None (nothing found) as
            # None instead of [None], matching how grants are handled below.
            if publications is not None and not isinstance(publications, list):
                publications = [publications]
            r['publications'] = publications

            grant_ids = select_from_table('resource_grant', {'resource_id': r['id']}, **db_args)
            grants = fetch_grant({'id': [g['grant_id'] for g in grant_ids]}, **db_args)
            if grants is not None and not isinstance(grants, list):
                grants = [grants]
            r['grants'] = grants

        # hand the database handles to the Resource along with the row data
        r['__conn__'] = conn
        r['__engine__'] = engine

        resources.append(Resource(r))

    return resources if len(resources) > 1 else resources[0]

fetch_resource_mention

fetch_resource_mention(
    query,
    expanded=True,
    conn=None,
    engine=None,
    debug=False,
)

Fetch ResourceMention(s) from the database matching the provided query.

Parameters:
  • query (dict) –

    Dictionary of column names and values to match for selection.

  • expanded (bool, default: True ) –

    If True, fetch associated resource, version, and publication.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • Optional[list]

    list of ResourceMention objects if found, else None.

Source code in globalbiodata/utils_fetch.py
def fetch_resource_mention(query: dict, expanded: bool = True, conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> Optional[list]:
    """Fetch ResourceMention(s) from the database matching the provided query.

    Rows of `resource_mention` are grouped by (publication_id, resource_id,
    version_id). Each row becomes one MatchedAlias of its group; the group's
    `match_count` is the sum of its aliases' counts and its `mean_confidence`
    is the unweighted mean of its aliases' mean confidences.

    Args:
        query (dict): Dictionary of column names and values to match for selection.
        expanded (bool, optional): If `True`, fetch associated resource, version, and publication.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        list of ResourceMention objects, in the order their groups first appear
        in the query result, if found, else `None`.
    """
    from .resource_mention import ResourceMention, MatchedAlias

    db_args = {'conn': conn, 'engine': engine, 'debug': debug}

    order_by = ['publication_id', 'resource_id', 'match_count']
    mention_raw = select_from_table('resource_mention', query, order_by=order_by, **db_args)
    if len(mention_raw) == 0:
        return None

    # group by publication_id, resource_id, version_id to aggregate matched_aliases.
    # The dict keeps insertion order, so iterating it later preserves the order
    # in which groups first appear in the ordered query result (a set would not).
    mentions_grouped = {}
    for m in mention_raw:
        key = (m['publication_id'], m['resource_id'], m['version_id'])
        if key not in mentions_grouped:
            mentions_grouped[key] = {
                'publication_id': m['publication_id'],
                'resource_id': m['resource_id'],
                'version_id': m['version_id'],
                'matched_aliases': [],
            }
        mentions_grouped[key]['matched_aliases'].append(MatchedAlias({
            'matched_alias': m['matched_alias'],
            'match_count': int(m['match_count']),
            'mean_confidence': float(m['mean_confidence'])
        }))

    # aggregate per-group totals from the aliases collected above
    for group in mentions_grouped.values():
        aliases = group['matched_aliases']
        group['match_count'] = sum(ma.match_count for ma in aliases)
        conf_sum = sum((ma.mean_confidence for ma in aliases), 0.0)
        group['mean_confidence'] = (conf_sum / len(aliases)) if aliases else 0.0

    # build component objects, fetching each publication/resource/version only once
    publications, resources, versions = {}, {}, {}
    mentions = []
    for group in mentions_grouped.values():
        pub_id, res_id, ver_id = group['publication_id'], group['resource_id'], group['version_id']

        if pub_id not in publications:
            publications[pub_id] = fetch_publication({'id': pub_id}, expanded=expanded, **db_args)
        if res_id not in resources:
            resources[res_id] = fetch_resource({'id': res_id}, expanded=expanded, **db_args)
        if ver_id not in versions:
            versions[ver_id] = fetch_version({'id': ver_id}, **db_args)

        mentions.append(ResourceMention({
            'publication': publications[pub_id],
            'resource': resources[res_id],
            'version': versions[ver_id],
            # rows arrive ordered by ascending match_count; reverse to have highest count first
            'matched_aliases': group['matched_aliases'][::-1],
            'match_count': group['match_count'],
            'mean_confidence': group['mean_confidence'],
        }))

    return mentions

fetch_url

fetch_url(
    query,
    order_by="id",
    expanded=True,
    conn=None,
    engine=None,
    debug=False,
)

Fetch URL(s) from the database matching the provided query.

Parameters:
  • query (dict) –

    Dictionary of column names and values to match for selection.

  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • expanded (bool, default: True ) –

    If True, fetch associated connection status.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • Optional[URL]

    A single URL object if exactly one result is found, a list of URL objects if several are found, else None.

Source code in globalbiodata/utils_fetch.py
def fetch_url(query: dict, order_by: str = 'id', expanded: bool = True, conn: Optional[Connection] = None, engine: Optional[Engine] = None, debug: bool = False) -> Optional[URL]:
    """Fetch URL(s) from the database matching the provided query.

    Args:
        query (dict): Dictionary of column names and values to match for selection.
        order_by (str, optional): Column name(s) to order the results by.
        expanded (bool, optional): If `True`, fetch associated connection status.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        A single URL object when exactly one row matches, a list of URL objects
        when several rows match, else `None`.
    """
    from .url import URL

    url_raw = select_from_table('url', query, order_by=order_by, conn=conn, engine=engine, debug=debug)
    if not url_raw:
        return None

    urls = []
    for u in url_raw:
        if expanded:
            status = fetch_connection_status({'url_id': u['id']}, order_by=['is_latest', 'date'], conn=conn, engine=engine, debug=debug)
            # A URL may have no recorded connection status; leave it as None
            # instead of trying to slice None (which raised TypeError before).
            if status is not None:
                if not isinstance(status, list):
                    status = [status]  # normalise a single result to a list
                status = status[::-1]  # reverse order to have latest first
            u['status'] = status

        urls.append(URL(u))

    return urls if len(urls) > 1 else urls[0]

fetch_version

fetch_version(
    query,
    order_by="id",
    conn=None,
    engine=None,
    debug=False,
)

Fetch Version(s) from the database matching the provided query.

Parameters:
  • query (dict) –

    Dictionary of column names and values to match for selection.

  • order_by (str, default: 'id' ) –

    Column name(s) to order the results by.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • Optional[Version]

    A single Version object when exactly one row matches, a list of Version objects when several rows match, else None.

Source code in globalbiodata/utils_fetch.py
def fetch_version(
    query: dict,
    order_by: str = 'id',
    conn: Optional[Connection] = None,
    engine: Optional[Engine] = None,
    debug: bool = False,
) -> Optional[Version]:
    """Look up Version rows that satisfy ``query`` and wrap them in Version objects.

    Args:
        query (dict): Column name -> value pairs used to filter the `version` table.
        order_by (str, optional): Column name(s) used to sort the rows.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        `None` when nothing matches, a single Version object when exactly one
        row matches, otherwise a list of Version objects.
    """
    from .version import Version

    rows = select_from_table('version', query, order_by=order_by, conn=conn, engine=engine, debug=debug)
    if not rows:
        return None

    found = []
    for row in rows:
        found.append(Version(row))

    # Unwrap the list when there is exactly one hit.
    if len(found) == 1:
        return found[0]
    return found

Generic database interactors

globalbiodata.utils_db

delete_from_table

delete_from_table(
    table_name, data, conn=None, engine=None, debug=False
)

Delete rows from a table matching the provided data.

Parameters:
  • table_name (str) –

    Name of the table to delete from.

  • data (dict) –

    Dictionary of column names and values to match for deletion.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • int

    Number of rows deleted.

Source code in globalbiodata/utils_db.py
def delete_from_table(
    table_name: str,
    data: dict,
    conn: Optional[Connection] = None,
    engine: Optional[Engine] = None,
    debug: bool = False
) -> int:
    """Delete rows from a table matching the provided data.

    Args:
        table_name (str): Name of the table to delete from.
        data (dict): Dictionary of column names and values to match for deletion.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        Number of rows deleted.

    Raises:
        ValueError: If neither `conn` nor `engine` is provided.
        Exception: Any error raised while reflecting or deleting is re-raised
            (after rolling back the transaction started here, if any).
    """
    metadata_obj = db.MetaData()

    conn_created = False
    if conn is None:
        if engine is None:
            raise ValueError("delete_from_table requires either an engine or an open connection")
        conn = engine.connect()
        conn_created = True

    try:
        # Reflect the table using the active connection. This is inside the
        # try/finally so a connection opened here is closed even if reflection fails.
        table = db.Table(table_name, metadata_obj, autoload_with=conn)
        data = _stringify_data(data)

        if debug:
            print(f"\n--> Deleting from table: {table_name} WHERE:")
            print(' AND '.join([f"{k} == {data[k]}" for k in data.keys()]))

        # Begin a transaction only if we're not already in one; otherwise trans is None.
        trans = conn.begin() if not conn.in_transaction() else None
        try:
            wheres = [table.columns.get(c) == data[c] for c in data.keys()]
            del_result = conn.execute(db.delete(table).where(db.and_(*wheres)))
            deleted = del_result.rowcount  # read before the connection may be closed
            if debug:
                print(f"Deleted {deleted} rows.")

            if trans is not None:
                trans.commit()  # Commit the transaction we started
            elif conn_created and conn.in_transaction():
                # We own the connection but a transaction was already active
                # (e.g. opened implicitly by the reflection queries): commit it
                # ourselves, since nobody else will.
                conn.commit()
            # Otherwise the caller's connection is mid-transaction: the caller commits.
        except Exception as e:
            if trans is not None:
                trans.rollback()  # Rollback the transaction if an error occurs
                sys.stderr.write(f"Transaction rolled back due to: {e}\n")
            raise
    finally:
        if conn_created:
            conn.close()  # Close the connection

    return deleted

insert_into_table

insert_into_table(
    table_name,
    data,
    conn=None,
    engine=None,
    debug=False,
    filter_long_data=False,
    retry_on_deadlock=True,
    max_retries=5,
    base_delay=0.2,
)

Insert-or-update a row and return its id.

Parameters:
  • table_name (str) –

    Name of the table to insert into.

  • data (dict) –

    Dictionary of column names and values to insert.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

  • filter_long_data (bool, default: False ) –

    If True, move overlength strings to long_text table.

  • retry_on_deadlock (bool, default: True ) –

    If True, retry on deadlock errors.

  • max_retries (int, default: 5 ) –

    Maximum number of retries on deadlock.

  • base_delay (float, default: 0.2 ) –

    Base delay in seconds for exponential backoff.

Returns:
  • int

    The ID/PK of the inserted or updated row.

Source code in globalbiodata/utils_db.py
def insert_into_table(
    table_name: str,
    data: dict,
    conn: Optional[Connection] = None,
    engine: Optional[Engine] = None,
    debug: bool = False,
    filter_long_data: bool = False,
    retry_on_deadlock: bool = True,
    max_retries: int = 5,
    base_delay: float = 0.2,
) -> int:
    """Insert-or-update a row and return its id.

    The statement is built as an upsert (`ON DUPLICATE KEY UPDATE`) when there
    are non-key columns to update, or as `INSERT IGNORE` when the row consists
    only of key columns. When this function opened the connection itself it
    also manages the transaction and may retry on deadlock / lock-wait-timeout
    errors; when the caller supplied `conn`, committing is left to the caller
    and errors are re-raised without retrying.

    Args:
        table_name (str): Name of the table to insert into.
        data (dict): Dictionary of column names and values to insert.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.
        filter_long_data (bool, optional): If `True`, move overlength strings to long_text table.
        retry_on_deadlock (bool, optional): If `True`, retry on deadlock errors.
        max_retries (int, optional): Maximum number of retries on deadlock.
        base_delay (float, optional): Base delay in seconds for exponential backoff.

    Returns:
        The ID/PK of the inserted or updated row.

    Raises:
        ValueError: If neither `conn` nor `engine` is provided.
        OperationalError: Re-raised when the error is not retryable, retries are
            disabled, the connection is caller-owned, or retries are exhausted.
        Exception: Any other error is re-raised after attempting a rollback.
    """
    metadata_obj = db.MetaData()

    conn_created = False
    if conn is None:
        if engine is None:
            raise ValueError("insert_into_table requires either an engine or an open connection")
        conn = engine.connect()
        conn_created = True

    # NOTE(review): reflection runs on the working connection (`autoload_with=conn`),
    # not on a separate engine connection, so it may leave an implicit transaction
    # open on `conn`. The commit logic below handles that case explicitly.
    table = db.Table(table_name, metadata_obj, autoload_with=conn)
    pk_cols = _get_primary_keys(table, conn)
    data = _stringify_data(data)

    if debug:
        print(f"\n--> Inserting into table: {table_name}")
        print(data)
        print(f"Columns: {', '.join(table.columns.keys())}")
        print(f"Primary keys: {', '.join(pk_cols)}")

    # Optionally move overlength strings to long_text and replace with reference token
    if filter_long_data:
        # Iterate over a snapshot of the items because data[k] is reassigned in the loop.
        for k, v in list(data.items()):
            this_max_len = _get_max_len(k, table_name, conn=conn, engine=engine)
            if v and this_max_len and isinstance(v, str) and len(v) > this_max_len:
                # The nested insert does not pass filter_long_data, so it uses the default (False).
                longtext_id = insert_into_table('long_text', {'text': v}, conn=conn, engine=engine, debug=debug)
                data[k] = f"long_text({longtext_id})"

    # We "own" the transaction if we opened this connection in this function.
    # Reflection above may have triggered an implicit txn elsewhere; don't let that
    # disable retries. We manage our own explicit txn when we created the conn.
    own_txn = conn_created
    # print(f"conn_created={conn_created}, own_txn={own_txn}, conn.in_transaction()={conn.in_transaction()}")

    # Retry loop only if we control our own transaction
    attempts = 0
    while True:
        attempts += 1
        # If we own the connection, start an explicit transaction only if one isn't already active
        implicit_txn = conn.in_transaction()
        trans = conn.begin() if (own_txn and not implicit_txn) else None
        try:
            insert_stmt = insert(table).values(data)
            data_no_pks = _remove_key_fields(table, conn, data)

            if data_no_pks:  # typical path: upsert (ON DUPLICATE KEY UPDATE)
                if pk_cols and len(pk_cols) == 1:
                    # single primary key: use LAST_INSERT_ID() hack to get the id of existing row if no insert occurred
                    pk_name = pk_cols[0]
                    data_no_pks[pk_name] = db.func.last_insert_id(table.c[pk_name])

                do_update_stmt = insert_stmt.on_duplicate_key_update(**data_no_pks)
                if debug:
                    print(f"Updating {table_name} with data: {data_no_pks}")
                result = conn.execute(do_update_stmt)
            else:  # tables that are pure key rows
                result = conn.execute(insert_stmt.prefix_with('IGNORE'))

            # Commit only what this function is responsible for; a caller-supplied
            # connection (own_txn False) is left uncommitted for the caller.
            if trans is not None:
                trans.commit()
            elif own_txn and implicit_txn and conn.in_transaction():
                # We own the connection and were already in an implicit txn (likely started by prior metadata queries)
                conn.commit()

            inserted_pk = result.inserted_primary_key[0] if result.inserted_primary_key else None
            affected_rows = result.rowcount

            # Determine the resulting id to return
            if (not inserted_pk) or (inserted_pk and affected_rows == 0):
                # entity existed and was not updated
                # Single PK: read the id via LAST_INSERT_ID(); otherwise look the row up by its unique keys.
                existing_id = conn.execute(db.text("SELECT LAST_INSERT_ID()")).scalar() if len(pk_cols) == 1 else _fetch_id_from_unique_keys(table, data, conn, debug=debug)
                if debug:
                    print(f"Entity already exists. Fetched id: {existing_id}")
                this_id = existing_id
            elif (inserted_pk and affected_rows > 1):
                # entity existed and was updated (MySQL reports >1 affected rows on upsert-update)
                this_id = inserted_pk
                if debug:
                    print(f"Entity already exists. Updated id: {this_id}")
            else:
                this_id = inserted_pk
                if debug:
                    print(f"New entity added. Inserted id: {this_id}")

            # Success; break retry loop
            break

        except OperationalError as e:
            # MySQL deadlock / lock wait timeout codes
            code = None
            try:
                # presumably the driver's error number is the first arg of the wrapped exception — verify per driver
                code = e.orig.args[0]
            except Exception:
                pass

            # Best-effort rollback: failures here are deliberately ignored so the
            # original error (or the retry) is not masked.
            if trans is not None:
                try:
                    trans.rollback()
                except Exception:
                    pass
            elif own_txn and conn.in_transaction():
                try:
                    conn.rollback()
                except Exception:
                    pass

            # If we don't own the transaction or retries disabled, re-raise immediately
            if not own_txn or not retry_on_deadlock or code not in (1213, 1205):
                raise

            # Backoff and retry (we own the txn)
            # attempts counts the try that just failed, so up to max_retries
            # retries follow the first attempt before giving up.
            if attempts > max_retries:
                raise
            # Exponential backoff with jitter
            delay = (base_delay * (2 ** (attempts - 1))) * (1 + 0.25 * random.random())
            if debug:
                sys.stderr.write(f"[retry] {table_name}: OperationalError {code}; attempt {attempts}/{max_retries}; sleeping {delay:.2f}s\n")
            time.sleep(delay)
            continue

        except Exception as e:
            # Non-operational errors are never retried: roll back (best effort), log, re-raise.
            if trans is not None:
                try:
                    trans.rollback()
                except Exception:
                    pass
            elif own_txn and conn.in_transaction():
                try:
                    conn.rollback()
                except Exception:
                    pass
            sys.stderr.write(f"Transaction rolled back due to: {e}\n")
            raise
        finally:
            # Intentionally a no-op: the connection must stay open across retries.
            # NOTE(review): a connection opened here is closed only after the loop
            # succeeds, so it is not closed on the re-raise paths above.
            if conn_created and own_txn and not conn.closed:
                # keep connection open for caller if they provided it; otherwise close at end
                pass

    # Close the connection only if this function opened it.
    if conn_created:
        conn.close()

    return this_id

select_from_table

select_from_table(
    table_name,
    data={},
    join_table=None,
    order_by=None,
    conn=None,
    engine=None,
    debug=False,
)

Select rows from a table matching the provided data.

Parameters:
  • table_name (str) –

    Name of the table to select from.

  • data (dict, default: {} ) –

    Dictionary of column names and values to match for selection.

  • join_table (str, default: None ) –

    Name of a table to join with.

  • order_by (list, default: None ) –

    Column name(s) to order the results by.

  • conn (Optional[Connection], default: None ) –

    SQLAlchemy Connection object.

  • engine (Optional[Engine], default: None ) –

    SQLAlchemy Engine object.

  • debug (bool, default: False ) –

    If True, print debug information.

Returns:
  • list

    List of dictionaries representing the selected rows.

Source code in globalbiodata/utils_db.py
def select_from_table(
    table_name: str,
    data: dict = {},
    join_table: str = None,
    order_by: list = None,
    conn: Optional[Connection] = None,
    engine: Optional[Engine] = None,
    debug: bool = False
) -> list:
    """Select rows from a table matching the provided data.

    Args:
        table_name (str): Name of the table to select from.
        data (dict, optional): Dictionary of column names and values to match for
            selection. A list value is matched with `IN`, any other value with `==`.
            The default dict is only read, never mutated.
        join_table (str, optional): Name of a table to join with.
        order_by (list, optional): Column name(s) to order the results by; a single
            column name is also accepted. Names that are not columns of the
            (joined) table are ignored.
        conn (Optional[Connection], optional): SQLAlchemy Connection object.
        engine (Optional[Engine], optional): SQLAlchemy Engine object.
        debug (bool, optional): If `True`, print debug information.

    Returns:
        List of dictionaries representing the selected rows.

    Raises:
        ValueError: If neither `conn` nor `engine` is provided.
    """
    metadata_obj = db.MetaData()

    # Ensure we have a live connection before reflecting
    conn_created = False
    if conn is None:
        if engine is None:
            raise ValueError("select_from_table requires either an engine or an open connection")
        conn = engine.connect()
        conn_created = True

    # Everything that touches the database runs inside try/finally so that a
    # connection opened here is released even when reflection or the query fails.
    try:
        # Reflect using the active connection (works in SA 1.4/2.0)
        table = db.Table(table_name, metadata_obj, autoload_with=conn)
        if join_table:
            join_tbl = db.Table(join_table, metadata_obj, autoload_with=conn)
            table = table.join(join_tbl)

        if debug:
            print(f"\n--> Selecting from table: {table_name} WHERE:")
            print(' AND '.join([f"{k} == '{data[k]}'" for k in data.keys()]))

        wheres = [
            table.columns.get(c).in_(data[c]) if isinstance(data[c], list)
            else table.columns.get(c) == data[c]
            for c in data
        ]

        # construct select statement with correct options
        stmt = db.select(table)
        if wheres:
            stmt = stmt.where(db.and_(*wheres))
        if order_by:
            if isinstance(order_by, list):
                order_cols = [table.columns.get(col) for col in order_by if table.columns.get(col) is not None]
                if order_cols:
                    stmt = stmt.order_by(*order_cols)
            else:
                order_col = table.columns.get(order_by)
                if order_col is not None:
                    stmt = stmt.order_by(order_col)
        result = conn.execute(stmt).fetchall()

        # convert result to list of dicts
        return [dict(zip(table.columns.keys(), list(r))) for r in result]
    finally:
        if conn_created:
            conn.close()

Other utility methods

globalbiodata.utils

extract_fields_by_type

extract_fields_by_type(data, type_prefix)

Extract fields from a dictionary that start with a given prefix.

Parameters:
  • data (dict) –

    Input dictionary.

  • type_prefix (str) –

    Prefix to filter keys.

Returns:
  • dict

    Dictionary with extracted fields.

Source code in globalbiodata/utils.py
def extract_fields_by_type(data: dict, type_prefix: str) -> dict:
    """Extract fields from a dictionary that start with a given prefix.

    Keys of the form ``"<type_prefix>_<rest>"`` are returned under the key
    ``"<rest>"``. The prefix is treated as literal text, so prefixes containing
    regular-expression metacharacters (e.g. ``"a+"``) are handled correctly.

    Args:
        data (dict): Input dictionary.
        type_prefix (str): Prefix to filter keys.

    Returns:
        Dictionary with extracted fields.
    """
    marker = f"{type_prefix}_"
    # Slice the literal prefix off instead of building a regex from it.
    return {k[len(marker):]: v for k, v in data.items() if k.startswith(marker)}

new_publication_from_EuropePMC_result

new_publication_from_EuropePMC_result(
    epmc_result, google_maps_api_key=None
)

Create a new Publication object from an EuropePMC search result, including additional geographic metadata enrichment.

Parameters:
  • epmc_result (dict) –

    EuropePMC search result metadata.

  • google_maps_api_key (str, default: None ) –

    Google Maps API key for advanced geolocation.

Returns:
  • Publication

    New Publication object.

Source code in globalbiodata/utils.py
def new_publication_from_EuropePMC_result(epmc_result: dict, google_maps_api_key: str = None) -> Publication:
    """Build a Publication from a single EuropePMC search result.

    Progress messages are printed while author affiliations and countries are
    looked up; grants and keywords are pulled from the same result.

    Args:
        epmc_result (dict): EuropePMC search result metadata.
        google_maps_api_key (str, optional): Google Maps API key, forwarded to the affiliation lookup.

    Returns:
        New Publication object.
    """
    print(f"Creating new Publication from EuropePMC result: {epmc_result.get('title', '')} (PMID:{epmc_result.get('pmid', '')})")
    print("Searching for author affiliations and countries...")
    affiliations, countries = _extract_affiliations(epmc_result, google_maps_api_key=google_maps_api_key)
    print(f"  Found countries: {', '.join(countries) if countries else 'None found'}")

    fields = {
        'publication_title': epmc_result.get('title', ''),
        'pubmed_id': epmc_result.get('pmid', None),
        'pmc_id': epmc_result.get('pmcid', ''),
        # Prefer the journal's print date; fall back to the first publication date.
        'publication_date': (
            epmc_result.get('journalInfo', {}).get('printPublicationDate')
            or epmc_result.get('firstPublicationDate')
        ),
        'grants': _extract_grants(epmc_result),
        'keywords': '; '.join(_extract_keywords(epmc_result)),
        'citation_count': epmc_result.get('citedByCount', 0),
        'authors': epmc_result.get('authorString', ''),
        'affiliation': affiliations,
        'affiliation_countries': countries,
    }
    return Publication(fields)