% Journal article: scoping review of clinical document metadata extraction
% (Journal of Biomedical Informatics, vol. 179, article no. 105050, 2026).
% The doi field holds the bare DOI (styles add the resolver prefix) and month
% uses the predefined three-letter macro so each style can format it.
@article{Miller:2026aa,
  abstract      = {Objectives
Clinical document metadata, such as document type, structure, author role, medical specialty, and encounter setting, is essential for accurate interpretation of information captured in clinical documents. However, vast documentation heterogeneity and drift over time challenge harmonization of document metadata. Automated extraction methods have emerged to coalesce metadata from disparate practices into target schema. This scoping review aims to catalog research on clinical document metadata extraction, identify methodological trends and applications, and highlight gaps warranting further investigation.
Methods
We followed the PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses Extension for Scoping Reviews) guidelines to identify articles from Ovid MEDLINE, Ovid EMBASE, Scopus, Web of Science and external sources that perform clinical document metadata extraction, either primarily as a methodology study, secondarily as a feature for a downstream application, or for analysis. We initially identified and screened 342 articles published between 2011 and 2025, then comprehensively reviewed 77 we deemed relevant to our study.
Results
Among the 77 articles included in our full text review, 49 were methodological, 22 used document metadata as features in a downstream application, and 6 analyzed document metadata composition. We observe myriad purposes for methodological study and application types. Available labelled public data remains sparse except for structural section datasets. Methods for extracting document metadata have progressed from largely rule-based and traditional machine learning with ample feature engineering to transformer-based architectures with minimal feature engineering.
Discussion and conclusion
Clinical document metadata extraction research has accelerated over recent years. The emergence of large language models has enabled broader exploration of generalizability across tasks and datasets, allowing the possibility of advanced clinical text processing systems. We anticipate that research will continue to expand into richer document metadata representations and integrate further into clinical applications and workflows.},
  author        = {Miller, Kurt and Lu, Qiuhao and Hersh, William and Roberts, Kirk and Bedrick, Steven and Wen, Andrew and Liu, Hongfang},
  bdsk-url-1    = {https://www.sciencedirect.com/science/article/pii/S1532046426000742},
  bdsk-url-2    = {https://doi.org/10.1016/j.jbi.2026.105050},
  date-added    = {2026-05-12 11:28:13 -0700},
  date-modified = {2026-05-28 09:27:13 -0700},
  doi           = {10.1016/j.jbi.2026.105050},
  issn          = {1532-0464},
  journal       = {Journal of Biomedical Informatics},
  keywords      = {Electronic health records, Clinical document metadata, Document attributes, Document structure, Information extraction, Clinical natural language processing},
  month         = jul,
  pages         = {105050},
  title         = {Clinical document metadata extraction: A scoping review},
  url           = {https://www.sciencedirect.com/science/article/pii/S1532046426000742},
  volume        = {179},
  year          = {2026}
}
