Fetch the full text body of a publication from Europe PMC by PMCID.
If a local path is provided, it will first check for a local XML file.
If not found, it will download the XML from Europe PMC.
**Parameters:**

- `pmcid` (`str`) – The PMCID of the publication (e.g., 'PMC123456').
- `path` (`Optional[str]`, default: `None`) – Local directory path to search for XML files.
- `dest` (`str`, default: `'/tmp'`) – Destination directory for downloaded/extracted XML files.
**Returns:**

- `tuple` – (text_blocks, table_blocks) where text_blocks is a list of strings representing the main text sections, and table_blocks is a list of strings representing the tables extracted from the XML.
Source code in gbcutils/europepmc.py
def get_fulltext_body(pmcid: str, path: Optional[str] = None, dest: str = '/tmp') -> tuple:
    """
    Fetch the full text body of a publication from Europe PMC by PMCID.

    The XML is located by trying, in order: a local directory, the
    Europe PMC FTP mirror, and finally the Europe PMC REST API.

    Args:
        pmcid (str): The PMCID of the publication (e.g., 'PMC123456').
        path (Optional[str]): Local directory path to search for XML files.
            Defaults to ``dest`` when not given.
        dest (str): Destination directory for downloaded/extracted XML files.

    Returns:
        tuple: (text_blocks, table_blocks) where text_blocks is a list of
        strings representing the main text sections, and table_blocks is a
        list of strings representing the tables extracted from the XML.
        Returns (None, None) when no XML could be obtained.
    """
    xml = None
    path = path or dest  # check for files downloaded from FTP or local XMLs
    if path:
        # find the matching record in the filesystem
        if VERBOSE: print(f"[local] Searching {path} for full text XML for {pmcid}")
        xml = _find_local_fulltext(pmcid, path, dest=dest)
    if not xml:
        # if not found locally, try the EuropePMC FTP
        if VERBOSE: print(f"[ftp] Searching EuropePMC FTP for full text XML for {pmcid}")
        xml = _find_europepmc_ftp_fulltext(pmcid, dest=dest)
    if not xml:
        # 1. Download the XML from the REST API.
        # A timeout is required: requests has no default and would otherwise
        # hang indefinitely on an unresponsive server.
        if VERBOSE: print(f"[api] Querying EuropePMC's API for full text XML for {pmcid}")
        url = f"{epmc_base_url}/{pmcid}/fullTextXML"
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            return (None, None)
        xml = response.text
    if not xml:
        return (None, None)
    # 2. Parse with BeautifulSoup (XML-aware parser)
    if VERBOSE:
        print("\n🎉 XML found! Parsing text and tables from XML body")
    soup = BeautifulSoup(xml, "lxml-xml")
    # 3. Extract front-matter text with headers
    text_blocks = []
    # 3.1. Title
    title = soup.find("article-title")
    if title:
        title_text = title.get_text(strip=True)
        if title_text:
            text_blocks.append(f"# TITLE\n{title_text}")
            text_blocks.append("\n")
    # 3.2. Abstract (drop a redundant 'ABSTRACT' heading inside the element)
    abstract = soup.find("abstract")
    if abstract:
        abstract_title = abstract.find("title")
        if abstract_title and abstract_title.get_text(strip=True).upper() == 'ABSTRACT':
            abstract_title.extract()  # remove the title
        text_blocks.append(f"# ABSTRACT\n{_section_to_text(abstract)}")
    # 3.3. Other metadata sections
    funding_statement = soup.find("funding-statement")
    if funding_statement:
        funding_text = funding_statement.get_text(strip=True)
        if funding_text:
            text_blocks.append(f"### FUNDING\n{funding_text}")
    all_custom_metas = soup.find_all("custom-meta")
    for custom_meta in all_custom_metas:
        # Guard against malformed <custom-meta> entries missing a name/value
        # child, which would otherwise raise AttributeError on .get_text().
        meta_name_tag = custom_meta.find("meta-name")
        meta_value_tag = custom_meta.find("meta-value")
        if meta_name_tag is None or meta_value_tag is None:
            continue
        meta_name = meta_name_tag.get_text(strip=True)
        meta_value = meta_value_tag.get_text(strip=True)
        if meta_name and meta_value:
            text_blocks.append(f"### {meta_name.upper()}\n{meta_value}")
    text_blocks.append("\n")
    # 4. Tables (captions + content); extracted so they don't pollute body text
    table_blocks = []
    for tbl in soup.find_all("table-wrap"):
        tbl.extract()
        processed_table = _preprocess_xml_table(tbl)
        if processed_table:
            table_blocks.append(processed_table)
    # 5. Main body (top-level sections + paragraphs)
    # excluded_section_types = ["supplementary-material", "orcid"]
    excluded_section_types = ["orcid"]
    body = soup.find("body")
    if body:
        all_sections = body.find_all("sec", recursive=False)
        for elem in all_sections:
            if elem.get("sec-type") in excluded_section_types:
                continue
            text_blocks.append(_section_to_text(elem))
            text_blocks.append("\n")
    return text_blocks, table_blocks
|