Skip to content

Sheet2metadata

Sheet2Metadata

Source code in Docs2KG/modules/llm/sheet2metadata.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
class Sheet2Metadata:
    """
    Add an LLM-generated summary and description to every row of a sheet CSV.

    The input CSV is read with pandas. For each row, the value of the ``text``
    column is sent to the LLM, and the ``summary`` and ``desc`` values it
    returns are stored in two columns of the same names. The result is written
    next to the input file with the suffix ``.json.csv``.
    """

    def __init__(self, markdown_file: Path, llm_model_name: str = "gpt-3.5-turbo-0125"):
        """
        1. Extract the descriptive part of the markdown
        2. Summarize the markdown

        Args:
            markdown_file (Path): The path to the input file. Despite the
                name it must have a ``.csv`` suffix. A plain string path is
                accepted as well.
            llm_model_name (str): The OpenAI LLM model name

        Raises:
            ValueError: If the file suffix is not ``.csv``.
        """
        # Coerce so that callers passing a plain string path work too.
        self.markdown_file = Path(markdown_file)
        if self.markdown_file.suffix != ".csv":
            raise ValueError("Only support csv")
        # Output path: "<name>.csv" -> "<name>.json.csv" in the same directory.
        self.json_csv_file = self.markdown_file.with_suffix(".json.csv")
        self.llm_model_name = llm_model_name
        # Running total of the cost reported by every LLM call of this instance.
        self.cost = 0

    @staticmethod
    def _to_text(value) -> str:
        """Turn one field of the LLM answer into a plain string for a CSV cell."""
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        if isinstance(value, list):
            # The model sometimes answers with a list of sentences; join them.
            return " ".join(str(item) for item in value)
        return str(value)

    def extract_metadata(self):
        """
        Fill the ``summary`` and ``desc`` columns and write the ``.json.csv`` file.

        Does nothing when the output file already exists. A row whose
        processing fails keeps empty strings in both columns, and the failure
        is logged together with the row index.
        """
        if self.json_csv_file.exists():
            logger.info(f"{self.json_csv_file} already exists")
            return
        logger.info(self.markdown_file)
        current_cost = self.cost
        df = pd.read_csv(self.markdown_file)
        # Create both columns up front: the output then always has them (even
        # for a sheet without rows) and failed rows default to empty strings.
        df["summary"] = ""
        df["desc"] = ""

        for index, row in tqdm(
            df.iterrows(), total=df.shape[0], desc="Summary and Description Extraction"
        ):
            try:
                summary, desc = self.openai_sheet_handler(row["text"])
            except Exception:
                # Best effort: keep the empty defaults and move on to the next row.
                logger.exception(f"Error at index {index}")
                continue
            logger.info(summary)
            logger.info(desc)
            df.loc[index, "summary"] = self._to_text(summary)
            df.loc[index, "desc"] = self._to_text(desc)

        df.to_csv(self.json_csv_file, index=False)
        # Only the cost accumulated during this call, not the instance total.
        logger.info(f"Cost: {self.cost - current_cost}")

    def openai_sheet_handler(self, markdown: str):
        """
        1. Use OpenAI to exclude the numerical part of the markdown, only keep the descriptive part
        2. Summarize the markdown

        Args:
            markdown (str): The markdown text that is embedded in the prompt

        Returns:
            A ``(summary, desc)`` tuple taken from the keys ``"summary"`` and
            ``"desc"`` of the JSON answer. A missing key yields ``""`` for that
            element only. ``("", "")`` is returned when the call fails or the
            answer is not a JSON object.
        """
        messages = [
            {
                "role": "system",
                "content": """
                    You are a helpful assistant, help us clean and understand the excel data
                    You will need to first summary the markdwon content, it is normally some descriptive text with
                    Table information.
                    """,
            },
            {
                "role": "user",
                "content": f"""
                            We will need you to do two things:
                            1. Summarize the following markdown content into a description about the data:
                                - What the data is about
                                - What's the main point of the data
                            2. Exclude the numerical part of the markdown, only keep the descriptive part.

                            The markdown content is: \n\n{markdown}

                            Return in JSON format with key "summary" and "desc"
                            """,
            },
        ]
        try:
            content = json.loads(self.llm_openai_call(messages))
            # .get() keeps the field the model did return when the other one
            # is missing, instead of discarding both.
            return content.get("summary", ""), content.get("desc", "")
        except Exception as e:
            # Deliberate best effort: callers treat ("", "") as "no metadata".
            logger.exception(e)
            return "", ""

    def llm_openai_call(self, messages: List[dict]) -> str:
        """
        Call the OpenAI API to get the response and add its cost to ``self.cost``.

        Args:
            messages (List[dict]): The messages to send to the OpenAI API

        Returns:
            response_json_str (str): The response from the OpenAI API
        """
        # openai_call returns a (response, cost) pair.
        result_json_str, cost = openai_call(messages, self.llm_model_name)
        self.cost += cost
        logger.debug(result_json_str)
        logger.debug(f"Cost: {self.cost}")
        return result_json_str

__init__(markdown_file, llm_model_name='gpt-3.5-turbo-0125')

  1. Extract the descriptive part of the markdown
  2. Summarize the markdown

Parameters:

Name Type Description Default
markdown_file Path

The path to the input file (despite the parameter name, it must have a .csv suffix)

required
llm_model_name str

The OpenAI LLM model name

'gpt-3.5-turbo-0125'
Source code in Docs2KG/modules/llm/sheet2metadata.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def __init__(self, markdown_file: Path, llm_model_name: str = "gpt-3.5-turbo-0125"):
    """
    1. Extract the descriptive part of the markdown
    2. Summarize the markdown

    Args:
        markdown_file (Path): The path to the input file. Despite the name
            it must have a ``.csv`` suffix. A plain string path is accepted
            as well.
        llm_model_name (str): The OpenAI LLM model name

    Raises:
        ValueError: If the file suffix is not ``.csv``.
    """
    # Coerce so that callers passing a plain string path work too.
    self.markdown_file = Path(markdown_file)
    if self.markdown_file.suffix != ".csv":
        raise ValueError("Only support csv")
    # Output path: "<name>.csv" -> "<name>.json.csv" in the same directory.
    self.json_csv_file = self.markdown_file.with_suffix(".json.csv")
    self.llm_model_name = llm_model_name
    # Running total of the cost reported by the LLM calls.
    self.cost = 0

llm_openai_call(messages)

Call the OpenAI API to get the response.

Args: messages (List[dict]) - the messages to send to the OpenAI API.

Returns:

Name Type Description
response_json_str str

The response from the OpenAI API

Source code in Docs2KG/modules/llm/sheet2metadata.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def llm_openai_call(self, messages: List[dict]) -> str:
    """
    Send the chat messages to the OpenAI API and track what the call cost.

    Args:
        messages (List[dict]): The messages to send to the OpenAI API

    Returns:
        response_json_str (str): The response from the OpenAI API
    """
    # openai_call hands back a (response, cost) pair.
    reply = openai_call(messages, self.llm_model_name)
    response_text, call_cost = reply
    # Accumulate on the instance so the total over several calls is available.
    self.cost += call_cost
    for debug_line in (response_text, f"Cost: {self.cost}"):
        logger.debug(debug_line)
    return response_text

openai_sheet_handler(markdown)

  1. Use OpenAI to exclude the numerical part of the markdown, only keep the descriptive part
  2. Summarize the markdown

Parameters:

Name Type Description Default
markdown str
required

Returns:

Source code in Docs2KG/modules/llm/sheet2metadata.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def openai_sheet_handler(self, markdown: str):
    """
    1. Use OpenAI to exclude the numerical part of the markdown, only keep the descriptive part
    2. Summarize the markdown

    Args:
        markdown (str): The markdown text that is embedded in the prompt

    Returns:
        A ``(summary, desc)`` tuple taken from the keys ``"summary"`` and
        ``"desc"`` of the JSON answer. A missing key yields ``""`` for that
        element only. ``("", "")`` is returned when the call fails or the
        answer is not a JSON object.
    """
    messages = [
        {
            "role": "system",
            "content": """
                You are a helpful assistant, help us clean and understand the excel data
                You will need to first summary the markdwon content, it is normally some descriptive text with
                Table information.
                """,
        },
        {
            "role": "user",
            "content": f"""
                        We will need you to do two things:
                        1. Summarize the following markdown content into a description about the data:
                            - What the data is about
                            - What's the main point of the data
                        2. Exclude the numerical part of the markdown, only keep the descriptive part.

                        The markdown content is: \n\n{markdown}

                        Return in JSON format with key "summary" and "desc"
                        """,
        },
    ]
    try:
        content = json.loads(self.llm_openai_call(messages))
        # .get() keeps the field the model did return when the other one is
        # missing, instead of discarding both.
        return content.get("summary", ""), content.get("desc", "")
    except Exception as e:
        # Deliberate best effort: callers treat ("", "") as "no metadata".
        logger.exception(e)
        return "", ""