Return the metadata dict for article_id from sharded JSONL.gz files under basepath.
Expects lines like: {"id": "...", "meta": {...}}.
| Parameters: |
- article_id (str) – The article identifier (e.g., PMC ID).
- basepath (str, default: '') – The base path where the shard files are located.
- shards (int, default: default_shard_count) – The total number of shards (default: 128).
|
| Returns: |
- Optional[dict] – The metadata dictionary for the article_id, or None if not found.
|
Source code in gbcutils/metadata.py
def get_article_metadata(article_id: str, basepath: str = '', shards: int = default_shard_count) -> Optional[dict]:
    """
    Return the metadata dict for `article_id` from sharded JSONL.gz files under `basepath`.

    Each line of a shard file is expected to be a JSON object such as
    {"id": "...", "meta": {...}}. Records that carry the identifier under
    "pmc_id" instead of "id" are accepted as well. Lines that are not valid
    JSON, are not JSON objects, or have no identifier are skipped.

    Only one shard is kept in memory at a time: loading a new shard replaces
    the module-level cache. The cache entry is keyed on the shard key together
    with `basepath` and `shards`, so a call that points at a different
    location or shard layout never reuses a map loaded from another file.

    Args:
        article_id (str): The article identifier (e.g., PMC ID).
        basepath (str): The base path where the shard files are located.
        shards (int): The total number of shards (defaults to `default_shard_count`).

    Returns:
        The stored entry for `article_id` (the record's "meta" value, or the
        whole record when "meta" is missing or empty), or None if the
        identifier is not found or the shard file does not exist.
    """
    global _shard_cache
    k = shard_key(article_id, shards)
    # The shard file depends on basepath and shards as well as k, so all three
    # must take part in the cache key to avoid serving a stale shard.
    cache_key = (k, basepath, shards)
    if cache_key not in _shard_cache:
        shard_file = shard_path(k, basepath=basepath, shards=shards)
        shard_map = {}
        if os.path.exists(shard_file):
            with gzip.open(shard_file, 'rt', encoding='utf-8') as fh:
                for line in fh:
                    try:
                        rec = json.loads(line)
                    except ValueError:
                        # Malformed JSON line (json.JSONDecodeError is a
                        # ValueError): skip it and keep reading the shard.
                        continue
                    if not isinstance(rec, dict):
                        # Valid JSON but not an object; nothing to index.
                        continue
                    pid = rec.get('id')
                    if pid is None:
                        # Fall back to the alternative identifier key.
                        pid = rec.get('pmc_id')
                    if pid is not None:
                        shard_map[str(pid)] = rec.get('meta') or rec
        # Rebind rather than insert: only hold 1 shard at a time in memory.
        _shard_cache = {cache_key: shard_map}
    return _shard_cache[cache_key].get(str(article_id))
|