mini project submission #16

Open · wants to merge 7 commits into main

Changes from all commits
Binary file added Project_documentation.pdf
Binary file not shown.
Empty file added data/blah.py
Empty file.
33 changes: 33 additions & 0 deletions data/read2.py
@@ -0,0 +1,33 @@
# Reading the contents of the PDF using pdfplumber

import pdfplumber

# Use a raw string to avoid issues with backslashes
file_path = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

# Open the PDF
with pdfplumber.open(file_path) as pdf:
    # Iterate through each page of the PDF
    for page_num, page in enumerate(pdf.pages):
        # Extract text from the page
        text = page.extract_text()
        print(f"Text from page {page_num + 1}:")
        print(text)

        # Extract tables from the page
        tables = page.extract_tables()  # extract_tables() returns every table on the page
        print(f"Tables from page {page_num + 1}:")
        for table in tables:
            for row in table:
                print(row)

        # Extract images from the page
        images = page.images  # page.images is a list of image metadata dicts
        if images:
            print(f"Images on page {page_num + 1}:")
            for image in images:
                print(f"Image on page {page_num + 1}: {image}")
            # Note: pdfplumber only exposes image metadata (position, width, height, etc.),
            # not the pixel data itself. To get the actual image, the region has to be
            # rendered, e.g. with page.to_image() — see the sketch after this file.
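Following up on that note: a minimal sketch of recovering the pixel data that page.images only describes, by cropping the page to each image's bounding box and rasterizing that region. It assumes pdfplumber's to_image() rendering dependencies are installed; the output filenames are made up for illustration.

# Sketch: save each embedded image as a PNG (assumes to_image() can render).
import pdfplumber

file_path = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

with pdfplumber.open(file_path) as pdf:
    for page_num, page in enumerate(pdf.pages):
        for i, image in enumerate(page.images):
            # Crop the page to the image's bounding box, then rasterize that region
            bbox = (image["x0"], image["top"], image["x1"], image["bottom"])
            page.crop(bbox).to_image(resolution=150).save(f"page{page_num + 1}_img{i}.png")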


78 changes: 78 additions & 0 deletions data/read4.py
@@ -0,0 +1,78 @@
import pdfplumber
import json

def process_main_heading(heading):
    # Return a dict so the structure is JSON-serializable
    return {"heading": heading.strip()}

def process_paragraph(h, paragraph):
    # Key each paragraph by its most recent heading
    return {h: paragraph.strip()}

def process_table(table):
    return {"type": "table", "content": table}


def detect_structure(text):
    paragraphs = text.split('\n\n')  # split the text into paragraphs
    structured_content = []  # empty list to hold the structured content
    headd = None  # most recent heading, initialized so the first paragraph cannot hit an unset name

    for paragraph in paragraphs:
        clean_paragraph = paragraph.strip()  # remove leading and trailing whitespace

        if clean_paragraph:
            if len(clean_paragraph.split()) < 5:  # TODO: add logic for headers of different font sizes
                headd = clean_paragraph
                structured_content.append(process_main_heading(clean_paragraph))
            else:
                structured_content.append(process_paragraph(headd, clean_paragraph))

    return structured_content


def extract_tables(page):
    tables = page.extract_tables()
    table_list = []
    for table in tables:
        table_list.append(process_table(table))
    return table_list


def main():
    # file_path = input("Enter the file path: ")
    file_path = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

    document_structure = []  # empty list to store the document structure

    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            print(f"Processing page {page_num + 1}:")

            text = page.extract_text()
            if text:
                # Feed the page text through detect_structure and collect the page's tables
                document_structure.extend(detect_structure(text))
            document_structure.extend(extract_tables(page))

    # Convert the structure to JSON
    json_output = json.dumps(document_structure, indent=4)

    # Save to file
    with open('output.json', 'w') as json_file:
        json_file.write(json_output)

    print("PDF content has been converted to JSON.")

if __name__ == "__main__":
    main()
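For reference, the heading-keyed format above yields entries of this shape (the values are made up for illustration, not taken from the actual SDS):

# Illustrative shape of detect_structure()'s output in read4.py (made-up values):
sample = [
    {"heading": "First-aid measures"},
    {"First-aid measures": "If inhaled, remove to fresh air. Get medical attention if symptoms persist."},
]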
62 changes: 62 additions & 0 deletions data/read5.py
@@ -0,0 +1,62 @@
import pdfplumber
import json

def process_paragraph(paragraph):
    return {"type": "paragraph", "content": paragraph.strip()}

def process_heading(heading):
    return {"type": "heading", "content": heading.strip()}

def process_table(table):
    return {"type": "table", "content": table}

def detect_structure(text):
    paragraphs = text.split('\n\n')
    structured_content = []

    for paragraph in paragraphs:
        clean_paragraph = paragraph.strip()
        if clean_paragraph:
            if len(clean_paragraph.split()) < 5:
                structured_content.append(process_heading(clean_paragraph))
            else:
                structured_content.append(process_paragraph(clean_paragraph))

    return structured_content

def extract_tables(page):
    tables = page.extract_tables()
    table_list = []
    for table in tables:
        table_list.append(process_table(table))
    return table_list

def main():
    # filepath = input("Enter the file path: ")
    filepath = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

    document_structure = []

    with pdfplumber.open(filepath) as pdf:
        for page_num, page in enumerate(pdf.pages):
            print(f"Processing page {page_num + 1}:")

            text = page.extract_text()
            if text:
                structured_text = detect_structure(text)
                document_structure.extend(structured_text)

            tables = extract_tables(page)
            document_structure.extend(tables)

    # Convert the structure to JSON
    json_output = json.dumps(document_structure, indent=4)

    # Save to file
    with open('output.json', 'w') as json_file:
        json_file.write(json_output)

    print("PDF content has been converted to JSON.")

if __name__ == "__main__":
    main()
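As a quick sanity check on read5.py's output, the generated file can be loaded back and summarized. A small sketch, assuming output.json sits in the working directory:

# Sketch: load output.json back and count the entry types it contains.
import json

with open('output.json') as f:
    entries = json.load(f)

for kind in ("heading", "paragraph", "table"):
    count = sum(1 for e in entries if e.get("type") == kind)
    print(f"{kind}: {count}")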
77 changes: 77 additions & 0 deletions data/read6.py
@@ -0,0 +1,77 @@
import pdfplumber
import json


def process_main_heading(heading):
    # Return a dict so the structure is JSON-serializable
    return {"main_heading": heading.strip()}

def process_sub_heading(subheading):
    return {"sub_heading": subheading.strip()}

def process_paragraph(h, paragraph):
    # Key each paragraph by its most recent main heading
    return {h: paragraph.strip()}

def process_table(table):
    return {"type": "table", "content": table}


def detect_structure(text, page):
    paragraphs = text.split('\n\n')
    structured_content = []
    headd = None  # most recent main heading

    # A plain string carries no font information, so bold text is looked up from
    # the page's word list instead. extract_words() returns dicts with "text"
    # plus any extra attributes requested here (font size and font name).
    words = page.extract_words(extra_attrs=["size", "fontname"])
    bold_words = {w["text"] for w in words if "Bold" in w["fontname"]}

    for paragraph in paragraphs:
        clean_paragraph = paragraph.strip()
        if clean_paragraph:
            # Treat a short line as a main heading when its first word is bold,
            # otherwise as a sub-heading
            is_bold = clean_paragraph.split()[0] in bold_words
            if len(clean_paragraph.split()) < 5 and is_bold:
                headd = clean_paragraph
                structured_content.append(process_main_heading(clean_paragraph))
            elif len(clean_paragraph.split()) < 5:
                structured_content.append(process_sub_heading(clean_paragraph))
            else:
                structured_content.append(process_paragraph(headd, clean_paragraph))

    return structured_content


def extract_tables(page):
    tables = page.extract_tables()
    table_list = []
    for table in tables:
        table_list.append(process_table(table))
    return table_list


def main():
    # filepath = input("Enter the file path: ")
    filepath = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

    document_structure = []

    with pdfplumber.open(filepath) as pdf:
        for page_num, page in enumerate(pdf.pages):
            print(f"Processing page {page_num + 1}:")

            text = page.extract_text()
            if text:
                structured_text = detect_structure(text, page)
                document_structure.extend(structured_text)

            tables = extract_tables(page)
            document_structure.extend(tables)

    # Convert the structure to JSON
    json_output = json.dumps(document_structure, indent=4)

    # Save to file
    with open('output.json', 'w') as json_file:
        json_file.write(json_output)

    print("PDF content has been converted to JSON.")

if __name__ == "__main__":
    main()
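The bold/size heuristic above needs calibrating against the document's actual fonts. A quick way to see which sizes occur is to tally them; a sketch (not part of the submission, reusing the same file path):

# Sketch: tally the font sizes on the first page to calibrate heading detection.
import pdfplumber
from collections import Counter

filepath = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

with pdfplumber.open(filepath) as pdf:
    words = pdf.pages[0].extract_words(extra_attrs=["size", "fontname"])
    sizes = Counter(round(w["size"], 1) for w in words)
    # The dominant size is usually body text; larger, rarer sizes are headings
    print(sizes.most_common())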
82 changes: 82 additions & 0 deletions data/read7.py
@@ -0,0 +1,82 @@
import pdfplumber
import json
import re

def process_paragraph(paragraph):
    return {"type": "paragraph", "content": paragraph.strip()}

def process_heading(heading):
    return {"type": "heading", "content": heading.strip()}

def process_table(table):
    return {"type": "table", "content": table}

def detect_structure(text):
    paragraphs = text.split('\n\n')  # split the text into paragraphs
    structured_content = []  # holds the structured content

    # Split each paragraph into sentences and strip out empty fragments
    sentences_per_paragraph = [paragraph.split('.') for paragraph in paragraphs]
    cleaned_sentences_per_paragraph = [
        [sentence.strip() for sentence in sentences if sentence.strip()]
        for sentences in sentences_per_paragraph
    ]

    for sentences in cleaned_sentences_per_paragraph:
        for sentence in sentences:
            # Flag the recurring SDS page-header lines (vendor banner and revision block)
            match = (re.search("ThermoFisher", sentence)
                     or re.search("SCIENTIFIC", sentence)
                     or re.search("SAFETY DATA SHEET", sentence)
                     or re.search(r"Creation Date\s+\d+-[a-zA-Z]{3}-\d+\s+"
                                  r"Revision Date\s+\d+-[a-zA-Z]{3}-\d+\s+"
                                  r"Revision Number\s+\d", sentence))
            if match:
                continue  # skip header boilerplate rather than structuring it
            print(f" {sentence}.")
            structured_content.append(process_paragraph(sentence))

    return structured_content


def extract_tables(page):
    tables = page.extract_tables()
    table_list = []
    for table in tables:
        table_list.append(process_table(table))
    return table_list


def main():
    # file_path = input("Enter the file path: ")
    file_path = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

    document_structure = []  # empty list to store the document structure

    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            print(f"Processing page {page_num + 1}:")

            text = page.extract_text()
            if text:
                document_structure.extend(detect_structure(text))
            document_structure.extend(extract_tables(page))

    json_output = json.dumps(document_structure, indent=4)
    with open('output.json', 'w') as json_file:
        json_file.write(json_output)
    print("PDF content has been converted to JSON.")

if __name__ == "__main__":
    main()
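The chain of re.search calls in detect_structure recompiles every pattern for every sentence. A small sketch of the same filter with precompiled patterns (the pattern strings are taken from read7.py; the helper name is made up):

# Sketch: the read7.py header filter with precompiled regex patterns.
import re

HEADER_PATTERNS = [
    re.compile(r"ThermoFisher"),
    re.compile(r"SCIENTIFIC"),
    re.compile(r"SAFETY DATA SHEET"),
    re.compile(r"Creation Date\s+\d+-[a-zA-Z]{3}-\d+\s+"
               r"Revision Date\s+\d+-[a-zA-Z]{3}-\d+\s+"
               r"Revision Number\s+\d"),
]

def is_header_line(sentence):
    # True when the sentence matches any recurring page-header pattern
    return any(p.search(sentence) for p in HEADER_PATTERNS)

print(is_header_line("SAFETY DATA SHEET"))            # True
print(is_header_line("Acetone is highly flammable"))  # False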
44 changes: 44 additions & 0 deletions data/read8.py
@@ -0,0 +1,44 @@
import pdfplumber

def process_pdf_with_font_details(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract characters with their properties
            chars = page.chars

            # Accumulator for the current word and its font details
            current_word = ""
            current_font_name = None
            current_font_size = None
            current_fill = None

            for char in chars:
                text = char['text']
                font_size = char['size']
                font_name = char['fontname']
                # pdfplumber exposes the fill colour under "non_stroking_color"
                fill_color = char.get('non_stroking_color')

                # Check for spaces to identify word boundaries
                if text.isspace():
                    if current_word:
                        # Print the accumulated word with its font details
                        print(f"'{current_word}' (Font size: {current_font_size}, "
                              f"Font style: {current_font_name}, Font color: {current_fill})")
                        current_word = ""
                        current_font_name = None
                        current_font_size = None
                        current_fill = None
                else:
                    current_word += text
                    # Store the font details of the current character
                    # (for mixed-font words this keeps the last character's details)
                    current_font_name = font_name
                    current_font_size = font_size
                    current_fill = fill_color

            # Print the last word on the page if one is still buffered
            if current_word:
                print(f"'{current_word}' (Font size: {current_font_size}, "
                      f"Font style: {current_font_name}, Font color: {current_fill})")

# Example usage
pdf_path = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

process_pdf_with_font_details(pdf_path)
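pdfplumber can also do this word grouping itself: extract_words() assembles chars into words and can carry font attributes along, splitting any word whose chars disagree on a requested attribute. A sketch under that assumption:

# Sketch: the same per-word font report via extract_words() with extra_attrs.
import pdfplumber

pdf_path = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        for w in page.extract_words(extra_attrs=["size", "fontname", "non_stroking_color"]):
            print(f"'{w['text']}' (Font size: {w['size']}, Font style: {w['fontname']}, "
                  f"Font color: {w['non_stroking_color']})")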