Skip to content

Pdf2text

PDF2Text

Bases: PDFParserBase

Source code in Docs2KG/parser/pdf/pdf2text.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
class PDF2Text(PDFParserBase):
    """Extract plain text or Markdown from a PDF, optionally saving per-page CSVs."""

    def __init__(self, *args, **kwargs):
        """
        Initialize the PDF2Text class

        All arguments are forwarded unchanged to the parent initializer. A
        ``texts`` sub-directory is then created under ``self.output_dir`` to
        hold the CSV files written by the extraction methods.
        """
        super().__init__(*args, **kwargs)

        self.text_output_dir = self.output_dir / "texts"
        self.text_output_dir.mkdir(parents=True, exist_ok=True)

    def extract2text(self, output_csv: bool = False) -> dict:
        """
        Extract text from the pdf file

        Args:
            output_csv (bool, optional): Whether to output the extracted data to a csv file. Defaults to False.

        Returns:
            text (str): The extracted text of all pages, concatenated in page order
            output_file (Path): The path to the output csv file, or None when output_csv is False
            df (pd.DataFrame): One row per page with the columns page_number and text
        """
        # The context manager closes the document once extraction is done so
        # the underlying file handle is not leaked.
        with fitz.open(self.pdf_file) as doc:
            # Extract each page exactly once and reuse the result for both the
            # per-page rows and the concatenated text.
            texts = [
                {"page_number": page.number, "text": page.get_text()}
                for page in doc
            ]
        text = "".join(row["text"] for row in texts)

        df = pd.DataFrame(texts)
        output_file = None
        if output_csv:
            output_file = self.text_output_dir / "text.csv"
            df.to_csv(output_file, index=False)
        return {"text": text, "output_file": output_file, "df": df}

    def extract2markdown(self, output_csv: bool = False) -> dict:
        """
        Convert the extracted text to markdown

        Args:
            output_csv (bool, optional): Whether to output the extracted data to a csv file. Defaults to False.

        Returns:
            md (str): The Markdown text of the whole document
            output_file (Path): Where the per-page Markdown csv is saved, or None when output_csv is False
            df (pd.DataFrame): One row per page with the columns page_number and text
        """
        # Keep the document open only for the duration of the conversions and
        # close it afterwards so the file handle is released.
        with fitz.open(self.pdf_file) as doc:
            md_text = pymupdf4llm.to_markdown(doc)
            logger.debug(f"Markdown text: {md_text}")

            # split the Markdown text into pages by converting each page on its own
            markdown_texts = []
            for page in doc:
                page_text = pymupdf4llm.to_markdown(doc=doc, pages=[page.number])
                logger.debug(f"Page {page.number} Markdown text: {page_text}")
                markdown_texts.append({"page_number": page.number, "text": page_text})
        df = pd.DataFrame(markdown_texts)

        output_file = None
        if output_csv:
            output_file = self.text_output_dir / "md.csv"
            df.to_csv(output_file, index=False)

        return {"md": md_text, "output_file": output_file, "df": df}

__init__(*args, **kwargs)

Initialize the PDF2Text class

Source code in Docs2KG/parser/pdf/pdf2text.py
12
13
14
15
16
17
18
19
20
def __init__(self, *args, **kwargs):
    """
    Set up the parser and prepare its text output directory.

    Positional and keyword arguments are handed straight to the parent
    initializer; afterwards a ``texts`` folder is ensured to exist below
    ``self.output_dir``.
    """
    super().__init__(*args, **kwargs)

    texts_dir = self.output_dir / "texts"
    self.text_output_dir = texts_dir
    # Create missing parent folders too; an already existing folder is fine.
    texts_dir.mkdir(exist_ok=True, parents=True)

extract2markdown(output_csv=False)

Convert the extracted text to markdown

Parameters:

Name Type Description Default
output_csv bool

Whether to output the extracted data to a csv file. Defaults to False.

False

Returns:

Name Type Description
md str

The Markdown text

output_file Path

Where the Markdown text is saved

df Dataframe

Each page for the Markdown text

Source code in Docs2KG/parser/pdf/pdf2text.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def extract2markdown(self, output_csv: bool = False) -> dict:
    """
    Convert the extracted text to markdown

    Args:
        output_csv (bool, optional): Whether to output the extracted data to a csv file. Defaults to False.

    Returns:
        md (str): The Markdown text of the whole document
        output_file (Path): Where the per-page Markdown csv is saved, or None when output_csv is False
        df (pd.DataFrame): One row per page with the columns page_number and text
    """
    # Keep the document open only for the duration of the conversions and
    # close it afterwards so the file handle is released.
    with fitz.open(self.pdf_file) as doc:
        md_text = pymupdf4llm.to_markdown(doc)
        logger.debug(f"Markdown text: {md_text}")

        # split the Markdown text into pages by converting each page on its own
        markdown_texts = []
        for page in doc:
            page_text = pymupdf4llm.to_markdown(doc=doc, pages=[page.number])
            logger.debug(f"Page {page.number} Markdown text: {page_text}")
            markdown_texts.append({"page_number": page.number, "text": page_text})
    df = pd.DataFrame(markdown_texts)

    output_file = None
    if output_csv:
        output_file = self.text_output_dir / "md.csv"
        df.to_csv(output_file, index=False)

    return {"md": md_text, "output_file": output_file, "df": df}

extract2text(output_csv=False)

Extract text from the pdf file

Args: output_csv (bool, optional): Whether to output the extracted data to a csv file. Defaults to False.

Returns:

Name Type Description
text str

The extracted text

output_file Path

The path to the output file

df Dataframe

The dataframe containing the text information

Source code in Docs2KG/parser/pdf/pdf2text.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def extract2text(self, output_csv: bool = False) -> dict:
    """
    Extract text from the pdf file

    Args:
        output_csv (bool, optional): Whether to output the extracted data to a csv file. Defaults to False.

    Returns:
        text (str): The extracted text of all pages, concatenated in page order
        output_file (Path): The path to the output csv file, or None when output_csv is False
        df (pd.DataFrame): One row per page with the columns page_number and text
    """
    # The context manager closes the document once extraction is done so
    # the underlying file handle is not leaked.
    with fitz.open(self.pdf_file) as doc:
        # Extract each page exactly once and reuse the result for both the
        # per-page rows and the concatenated text.
        texts = [
            {"page_number": page.number, "text": page.get_text()}
            for page in doc
        ]
    text = "".join(row["text"] for row in texts)

    df = pd.DataFrame(texts)
    output_file = None
    if output_csv:
        output_file = self.text_output_dir / "text.csv"
        df.to_csv(output_file, index=False)
    return {"text": text, "output_file": output_file, "df": df}