Skip to content

Pdf metadata summary

Code Example

import argparse
from pathlib import Path

from Docs2KG.parser.pdf.pdf2metadata import get_metadata_for_files
from Docs2KG.utils.constants import DATA_INPUT_DIR, DATA_OUTPUT_DIR
from Docs2KG.utils.get_logger import get_logger

logger = get_logger(__name__)

if __name__ == "__main__":
    # Walk a folder of pdf files (recursively, via rglob), collect their
    # metadata with get_metadata_for_files, and write it out as a CSV.

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_dir",
        type=str,
        help="Input directory of pdf files",
        default=DATA_INPUT_DIR,
    )
    args = parser.parse_args()
    data_input_dir = Path(args.input_dir)

    all_files = list(data_input_dir.rglob("*.pdf"))
    if not all_files:
        # Nothing to process: log at error level and abort with a specific
        # exception type (still an Exception subclass for existing handlers).
        logger.error("No pdf files found in the input directory")
        raise FileNotFoundError("No pdf files found in the input directory")

    all_metadata_df = get_metadata_for_files(all_files, log_summary=True)
    """
    Then you can save it to a file

    Example:
        all_metadata_df.to_csv(DATA_OUTPUT_DIR / "metadata.csv", index=False)

    Or you can use the metadata as the orchestrator
    So files can be directed to different processing pipelines
    And modules based on the metadata
    """
    # NOTE(review): assumes DATA_OUTPUT_DIR already exists; to_csv is not
    # expected to create missing parent directories - confirm.
    all_metadata_df.to_csv(DATA_OUTPUT_DIR / "metadata.csv", index=False)

all_metadata_df = get_metadata_for_files(all_files, log_summary=True) module-attribute

Then you can save it to a file

Example

all_metadata_df.to_csv(DATA_OUTPUT_DIR / "metadata.csv", index=False)

Or you can use the metadata as the orchestrator, so files can be directed to different processing pipelines and modules based on the metadata.