Fetch the full text body of a publication from Europe PMC by PMCID.
If a local path is provided, it will first check for a local XML file.
If not found, it will download the XML from Europe PMC.
**Parameters:**

- `pmcid` (`str`) – The PMCID of the publication (e.g., 'PMC123456').
- `path` (`Optional[str]`, default: `None`) – Local directory path to search for XML files.
- `dest` (`str`, default: `'/tmp'`) – Destination directory for downloaded/extracted XML files.
**Returns:**

- `tuple` – (text_blocks, table_blocks) where text_blocks is a list of strings representing the main text sections, and table_blocks is a list of strings representing the tables extracted from the XML.
Source code in gbcutils/europepmc.py
def get_fulltext_body(pmcid: str, path: Optional[str] = None, dest: str = '/tmp') -> tuple:
    """
    Fetch the full text body of a publication from Europe PMC by PMCID.

    The XML is located by trying, in order: a local directory, the
    Europe PMC FTP mirror, and finally the Europe PMC REST API.

    Args:
        pmcid (str): The PMCID of the publication (e.g., 'PMC123456').
        path (Optional[str]): Local directory path to search for XML files.
            Defaults to ``dest`` when not given.
        dest (str): Destination directory for downloaded/extracted XML files.

    Returns:
        tuple: (text_blocks, table_blocks) where text_blocks is a list of
        strings representing the main text sections, and table_blocks is a
        list of strings representing the tables extracted from the XML.
        Returns (None, None) when no XML could be obtained.
    """
    xml = None
    path = path or dest  # check for files downloaded from FTP or local XMLs
    if path:
        # find the matching record in the filesystem
        if VERBOSE: print(f"[local] Searching {path} for full text XML for {pmcid}")
        xml = _find_local_fulltext(pmcid, path, dest=dest)
    if not xml:
        # if not found locally, try the EuropePMC FTP
        if VERBOSE: print(f"[ftp] Searching EuropePMC FTP for full text XML for {pmcid}")
        xml = _find_europepmc_ftp_fulltext(pmcid, dest=dest)
    if not xml:
        # 1. Download the XML from the REST API.
        # A timeout is required: requests has no default and would otherwise
        # hang indefinitely on an unresponsive server.
        if VERBOSE: print(f"[api] Querying EuropePMC's API for full text XML for {pmcid}")
        url = f"{epmc_base_url}/{pmcid}/fullTextXML"
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            return (None, None)
        xml = response.text
    if not xml:
        return (None, None)
    # 2. Parse with BeautifulSoup (XML-aware parser)
    if VERBOSE:
        print("\n🎉 XML found! Parsing text and tables from XML body")
    soup = BeautifulSoup(xml, "lxml-xml")
    # 3. Extract front-matter text with headers
    text_blocks = []
    # 3.1. Title
    title = soup.find("article-title")
    if title:
        title_text = title.get_text(strip=True)
        if title_text:
            text_blocks.append(f"# TITLE\n{title_text}")
            text_blocks.append("\n")
    # 3.2. Abstract (drop a redundant 'ABSTRACT' heading inside the element)
    abstract = soup.find("abstract")
    if abstract:
        abstract_title = abstract.find("title")
        if abstract_title and abstract_title.get_text(strip=True).upper() == 'ABSTRACT':
            abstract_title.extract()  # remove the title
        text_blocks.append(f"# ABSTRACT\n{_section_to_text(abstract)}")
    # 3.3. Other metadata sections
    funding_statement = soup.find("funding-statement")
    if funding_statement:
        funding_text = funding_statement.get_text(strip=True)
        if funding_text:
            text_blocks.append(f"### FUNDING\n{funding_text}")
    all_custom_metas = soup.find_all("custom-meta")
    for custom_meta in all_custom_metas:
        # Guard against malformed <custom-meta> entries missing a name/value
        # child, which would otherwise raise AttributeError on .get_text().
        meta_name_tag = custom_meta.find("meta-name")
        meta_value_tag = custom_meta.find("meta-value")
        if meta_name_tag is None or meta_value_tag is None:
            continue
        meta_name = meta_name_tag.get_text(strip=True)
        meta_value = meta_value_tag.get_text(strip=True)
        if meta_name and meta_value:
            text_blocks.append(f"### {meta_name.upper()}\n{meta_value}")
    text_blocks.append("\n")
    # 4. Tables (captions + content); extracted so they don't pollute body text
    table_blocks = []
    for tbl in soup.find_all("table-wrap"):
        tbl.extract()
        processed_table = _preprocess_xml_table(tbl)
        if processed_table:
            table_blocks.append(processed_table)
    # 5. Main body (top-level sections + paragraphs)
    # excluded_section_types = ["supplementary-material", "orcid"]
    excluded_section_types = ["orcid"]
    body = soup.find("body")
    if body:
        all_sections = body.find_all("sec", recursive=False)
        for elem in all_sections:
            if elem.get("sec-type") in excluded_section_types:
                continue
            text_blocks.append(_section_to_text(elem))
            text_blocks.append("\n")
    return text_blocks, table_blocks
|