# Excel
#
# Code Example

import argparse
from pathlib import Path

from Docs2KG.kg.excel_layout_kg import ExcelLayoutKG
from Docs2KG.kg.semantic_kg import SemanticKG
from Docs2KG.kg.utils.json2triplets import JSON2Triplets
from Docs2KG.kg.utils.neo4j_connector import Neo4jLoader
from Docs2KG.modules.llm.sheet2metadata import Sheet2Metadata
from Docs2KG.parser.excel.excel2image import Excel2Image
from Docs2KG.parser.excel.excel2markdown import Excel2Markdown
from Docs2KG.parser.excel.excel2table import Excel2Table
from Docs2KG.utils.constants import DATA_INPUT_DIR

def parse_args(argv=None):
    """Parse the command-line options of the Excel example script.

    Args:
        argv: List of argument strings to parse. ``None`` (the default) lets
            argparse read the process command line.

    Returns:
        An ``argparse.Namespace`` with the attributes ``excel_file``,
        ``model_name``, ``neo4j_uri``, ``neo4j_username`` and
        ``neo4j_password``.
    """
    # Named ``parser`` (not ``argparse``) so the imported module is not shadowed.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--excel_file", type=str, default=None, help="The Excel File Absolute Path"
    )
    parser.add_argument(
        "--model_name", type=str, default="gpt-3.5-turbo", help="The model name"
    )
    parser.add_argument("--neo4j_uri", type=str, default="bolt://localhost:7687")
    parser.add_argument("--neo4j_username", type=str, default="neo4j")
    parser.add_argument("--neo4j_password", type=str, default="testpassword")
    return parser.parse_args(argv)


def main(argv=None):
    """Run the Excel example: parse the workbook, build the KG, load it into Neo4j.

    Plan of the attack:

    1. For each sheet, extract the description stuff, and tables will be kept
       still in csv
    2. Then create the kg mainly based on the description

    You can run this script with `python excel.py --excel_file <excel_file>`.
    When ``--excel_file`` is not given, the bundled sample
    ``DATA_INPUT_DIR / "excel" / "GCP_10002.xlsx"`` is used.

    Args:
        argv: Optional list of argument strings, forwarded to ``parse_args``.
    """
    args = parse_args(argv)

    if not args.excel_file:
        excel_file = DATA_INPUT_DIR / "excel" / "GCP_10002.xlsx"
    else:
        excel_file = Path(args.excel_file)

    # Parsing stage: tables, image/pdf renderings and markdown of the workbook.
    excel2table = Excel2Table(excel_file=excel_file)
    excel2table.extract_tables_from_excel()

    excel2image = Excel2Image(excel_file=excel_file)
    excel2image.excel2image_and_pdf()

    excel2markdown = Excel2Markdown(excel_file=excel_file)
    excel2markdown.extract2markdown()

    # Sheet metadata is extracted with the LLM selected by --model_name.
    sheet_2_metadata = Sheet2Metadata(
        excel2markdown.md_csv,
        llm_model_name=args.model_name,
    )
    sheet_2_metadata.extract_metadata()

    excel_layout_kg = ExcelLayoutKG(excel2markdown.output_dir, input_format="excel")
    excel_layout_kg.create_kg()
    # After this, you will have the layout.json in the `kg` folder

    # then we add the semantic knowledge graph
    semantic_kg = SemanticKG(
        excel2markdown.output_dir, llm_enabled=True, input_format="excel"
    )
    semantic_kg.add_semantic_kg()

    json_2_triplets = JSON2Triplets(excel2markdown.output_dir)
    json_2_triplets.transform()

    json_file_path = excel2markdown.output_dir / "kg" / "triplets_kg.json"

    neo4j_loader = Neo4jLoader(
        uri=args.neo4j_uri,
        username=args.neo4j_username,
        password=args.neo4j_password,
        json_file_path=json_file_path,
        clean=True,
    )
    # Always close the loader, even when loading the triplets fails.
    try:
        neo4j_loader.load_data()
    finally:
        neo4j_loader.close()


if __name__ == "__main__":
    main()