mini project submission #16

Open · wants to merge 7 commits into main

Changes from all commits
Binary file added Project_documentation.pdf
Binary file not shown.
Empty file added data/blah.py
Empty file.
33 changes: 33 additions & 0 deletions data/read2.py
@@ -0,0 +1,33 @@
# Reading the contents of the PDF using pdfplumber

import pdfplumber

# Use a raw string to avoid issues with backslashes
file_path = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

# Open the PDF
with pdfplumber.open(file_path) as pdf:
    # Iterate through each page of the PDF
    for page_num, page in enumerate(pdf.pages):
        # Extract text from the page
        text = page.extract_text()
        print(f"Text from page {page_num + 1}:")
        print(text)

        # Extract tables from the page
        tables = page.extract_tables()  # extract_tables() returns every table on the page
        print(f"Tables from page {page_num + 1}:")
        for table in tables:
            for row in table:
                print(row)

        # Extract images from the page
        images = page.images  # page.images is a list of image metadata dicts
        if images:
            print(f"Images on page {page_num + 1}:")
            for image in images:
                print(f"Image on page {page_num + 1}: {image}")
            # Note: pdfplumber only exposes image metadata (position, width, height, etc.),
            # not the pixel data itself. To get the actual image, the region has to be
            # rendered, e.g. with page.to_image() — see the sketch after this file.
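Following up on that note: a minimal sketch of recovering the pixel data that page.images only describes, by cropping the page to each image's bounding box and rasterizing that region. It assumes pdfplumber's to_image() rendering dependencies are installed; the output filenames are made up for illustration.

# Sketch: save each embedded image as a PNG (assumes to_image() can render).
import pdfplumber

file_path = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

with pdfplumber.open(file_path) as pdf:
    for page_num, page in enumerate(pdf.pages):
        for i, image in enumerate(page.images):
            # Crop the page to the image's bounding box, then rasterize that region
            bbox = (image["x0"], image["top"], image["x1"], image["bottom"])
            page.crop(bbox).to_image(resolution=150).save(f"page{page_num + 1}_img{i}.png")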


78 changes: 78 additions & 0 deletions data/read4.py
@@ -0,0 +1,78 @@
import pdfplumber
import json

def process_main_heading(heading):
    # Return a dict so the structure is JSON-serializable
    return {"heading": heading.strip()}

def process_paragraph(h, paragraph):
    # Key each paragraph by its most recent heading
    return {h: paragraph.strip()}

def process_table(table):
    return {"type": "table", "content": table}


def detect_structure(text):
    paragraphs = text.split('\n\n')  # split the text into paragraphs
    structured_content = []  # empty list to hold the structured content
    headd = None  # most recent heading, initialized so the first paragraph cannot hit an unset name

    for paragraph in paragraphs:
        clean_paragraph = paragraph.strip()  # remove leading and trailing whitespace

        if clean_paragraph:
            if len(clean_paragraph.split()) < 5:  # TODO: add logic for headers of different font sizes
                headd = clean_paragraph
                structured_content.append(process_main_heading(clean_paragraph))
            else:
                structured_content.append(process_paragraph(headd, clean_paragraph))

    return structured_content


def extract_tables(page):
    tables = page.extract_tables()
    table_list = []
    for table in tables:
        table_list.append(process_table(table))
    return table_list


def main():
    # file_path = input("Enter the file path: ")
    file_path = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

    document_structure = []  # empty list to store the document structure

    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            print(f"Processing page {page_num + 1}:")

            text = page.extract_text()
            if text:
                # Feed the page text through detect_structure and collect the page's tables
                document_structure.extend(detect_structure(text))
            document_structure.extend(extract_tables(page))

    # Convert the structure to JSON
    json_output = json.dumps(document_structure, indent=4)

    # Save to file
    with open('output.json', 'w') as json_file:
        json_file.write(json_output)

    print("PDF content has been converted to JSON.")

if __name__ == "__main__":
    main()
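For reference, the heading-keyed format above yields entries of this shape (the values are made up for illustration, not taken from the actual SDS):

# Illustrative shape of detect_structure()'s output in read4.py (made-up values):
sample = [
    {"heading": "First-aid measures"},
    {"First-aid measures": "If inhaled, remove to fresh air. Get medical attention if symptoms persist."},
]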
62 changes: 62 additions & 0 deletions data/read5.py
@@ -0,0 +1,62 @@
import pdfplumber
import json

def process_paragraph(paragraph):
    return {"type": "paragraph", "content": paragraph.strip()}

def process_heading(heading):
    return {"type": "heading", "content": heading.strip()}

def process_table(table):
    return {"type": "table", "content": table}

def detect_structure(text):
    paragraphs = text.split('\n\n')
    structured_content = []

    for paragraph in paragraphs:
        clean_paragraph = paragraph.strip()
        if clean_paragraph:
            if len(clean_paragraph.split()) < 5:
                structured_content.append(process_heading(clean_paragraph))
            else:
                structured_content.append(process_paragraph(clean_paragraph))

    return structured_content

def extract_tables(page):
    tables = page.extract_tables()
    table_list = []
    for table in tables:
        table_list.append(process_table(table))
    return table_list

def main():
    # filepath = input("Enter the file path: ")
    filepath = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

    document_structure = []

    with pdfplumber.open(filepath) as pdf:
        for page_num, page in enumerate(pdf.pages):
            print(f"Processing page {page_num + 1}:")

            text = page.extract_text()
            if text:
                structured_text = detect_structure(text)
                document_structure.extend(structured_text)

            tables = extract_tables(page)
            document_structure.extend(tables)

    # Convert the structure to JSON
    json_output = json.dumps(document_structure, indent=4)

    # Save to file
    with open('output.json', 'w') as json_file:
        json_file.write(json_output)

    print("PDF content has been converted to JSON.")

if __name__ == "__main__":
    main()
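As a quick sanity check on read5.py's output, the generated file can be loaded back and summarized. A small sketch, assuming output.json sits in the working directory:

# Sketch: load output.json back and count the entry types it contains.
import json

with open('output.json') as f:
    entries = json.load(f)

for kind in ("heading", "paragraph", "table"):
    count = sum(1 for e in entries if e.get("type") == kind)
    print(f"{kind}: {count}")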
77 changes: 77 additions & 0 deletions data/read6.py
@@ -0,0 +1,77 @@
import pdfplumber
import json


def process_main_heading(heading):
    # Return a dict so the structure is JSON-serializable
    return {"main_heading": heading.strip()}

def process_sub_heading(subheading):
    return {"sub_heading": subheading.strip()}

def process_paragraph(h, paragraph):
    # Key each paragraph by its most recent main heading
    return {h: paragraph.strip()}

def process_table(table):
    return {"type": "table", "content": table}


def detect_structure(text, page):
    paragraphs = text.split('\n\n')
    structured_content = []
    headd = None  # most recent main heading

    # A plain string carries no font information, so bold text is looked up from
    # the page's word list instead. extract_words() returns dicts with "text"
    # plus any extra attributes requested here (font size and font name).
    words = page.extract_words(extra_attrs=["size", "fontname"])
    bold_words = {w["text"] for w in words if "Bold" in w["fontname"]}

    for paragraph in paragraphs:
        clean_paragraph = paragraph.strip()
        if clean_paragraph:
            # Treat a short line as a main heading when its first word is bold,
            # otherwise as a sub-heading
            is_bold = clean_paragraph.split()[0] in bold_words
            if len(clean_paragraph.split()) < 5 and is_bold:
                headd = clean_paragraph
                structured_content.append(process_main_heading(clean_paragraph))
            elif len(clean_paragraph.split()) < 5:
                structured_content.append(process_sub_heading(clean_paragraph))
            else:
                structured_content.append(process_paragraph(headd, clean_paragraph))

    return structured_content


def extract_tables(page):
    tables = page.extract_tables()
    table_list = []
    for table in tables:
        table_list.append(process_table(table))
    return table_list


def main():
    # filepath = input("Enter the file path: ")
    filepath = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

    document_structure = []

    with pdfplumber.open(filepath) as pdf:
        for page_num, page in enumerate(pdf.pages):
            print(f"Processing page {page_num + 1}:")

            text = page.extract_text()
            if text:
                structured_text = detect_structure(text, page)
                document_structure.extend(structured_text)

            tables = extract_tables(page)
            document_structure.extend(tables)

    # Convert the structure to JSON
    json_output = json.dumps(document_structure, indent=4)

    # Save to file
    with open('output.json', 'w') as json_file:
        json_file.write(json_output)

    print("PDF content has been converted to JSON.")

if __name__ == "__main__":
    main()
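The bold/size heuristic above needs calibrating against the document's actual fonts. A quick way to see which sizes occur is to tally them; a sketch (not part of the submission, reusing the same file path):

# Sketch: tally the font sizes on the first page to calibrate heading detection.
import pdfplumber
from collections import Counter

filepath = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

with pdfplumber.open(filepath) as pdf:
    words = pdf.pages[0].extract_words(extra_attrs=["size", "fontname"])
    sizes = Counter(round(w["size"], 1) for w in words)
    # The dominant size is usually body text; larger, rarer sizes are headings
    print(sizes.most_common())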
82 changes: 82 additions & 0 deletions data/read7.py
@@ -0,0 +1,82 @@
import pdfplumber
import json
import re

def process_paragraph(paragraph):
    return {"type": "paragraph", "content": paragraph.strip()}

def process_heading(heading):
    return {"type": "heading", "content": heading.strip()}

def process_table(table):
    return {"type": "table", "content": table}

def detect_structure(text):
    paragraphs = text.split('\n\n')  # split the text into paragraphs
    structured_content = []  # holds the structured content

    # Split each paragraph into sentences and strip out empty fragments
    sentences_per_paragraph = [paragraph.split('.') for paragraph in paragraphs]
    cleaned_sentences_per_paragraph = [
        [sentence.strip() for sentence in sentences if sentence.strip()]
        for sentences in sentences_per_paragraph
    ]

    for sentences in cleaned_sentences_per_paragraph:
        for sentence in sentences:
            # Flag the recurring SDS page-header lines (vendor banner and revision block)
            match = (re.search("ThermoFisher", sentence)
                     or re.search("SCIENTIFIC", sentence)
                     or re.search("SAFETY DATA SHEET", sentence)
                     or re.search(r"Creation Date\s+\d+-[a-zA-Z]{3}-\d+\s+"
                                  r"Revision Date\s+\d+-[a-zA-Z]{3}-\d+\s+"
                                  r"Revision Number\s+\d", sentence))
            if match:
                continue  # skip header boilerplate rather than structuring it
            print(f" {sentence}.")
            structured_content.append(process_paragraph(sentence))

    return structured_content


def extract_tables(page):
    tables = page.extract_tables()
    table_list = []
    for table in tables:
        table_list.append(process_table(table))
    return table_list


def main():
    # file_path = input("Enter the file path: ")
    file_path = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

    document_structure = []  # empty list to store the document structure

    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            print(f"Processing page {page_num + 1}:")

            text = page.extract_text()
            if text:
                document_structure.extend(detect_structure(text))
            document_structure.extend(extract_tables(page))

    json_output = json.dumps(document_structure, indent=4)
    with open('output.json', 'w') as json_file:
        json_file.write(json_output)
    print("PDF content has been converted to JSON.")

if __name__ == "__main__":
    main()
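The chain of re.search calls in detect_structure recompiles every pattern for every sentence. A small sketch of the same filter with precompiled patterns (the pattern strings are taken from read7.py; the helper name is made up):

# Sketch: the read7.py header filter with precompiled regex patterns.
import re

HEADER_PATTERNS = [
    re.compile(r"ThermoFisher"),
    re.compile(r"SCIENTIFIC"),
    re.compile(r"SAFETY DATA SHEET"),
    re.compile(r"Creation Date\s+\d+-[a-zA-Z]{3}-\d+\s+"
               r"Revision Date\s+\d+-[a-zA-Z]{3}-\d+\s+"
               r"Revision Number\s+\d"),
]

def is_header_line(sentence):
    # True when the sentence matches any recurring page-header pattern
    return any(p.search(sentence) for p in HEADER_PATTERNS)

print(is_header_line("SAFETY DATA SHEET"))            # True
print(is_header_line("Acetone is highly flammable"))  # False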
44 changes: 44 additions & 0 deletions data/read8.py
@@ -0,0 +1,44 @@
import pdfplumber

def process_pdf_with_font_details(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract characters with their properties
            chars = page.chars

            # Accumulator for the current word and its font details
            current_word = ""
            current_font_name = None
            current_font_size = None
            current_fill = None

            for char in chars:
                text = char['text']
                font_size = char['size']
                font_name = char['fontname']
                # pdfplumber exposes the fill colour under "non_stroking_color"
                fill_color = char.get('non_stroking_color')

                # Check for spaces to identify word boundaries
                if text.isspace():
                    if current_word:
                        # Print the accumulated word with its font details
                        print(f"'{current_word}' (Font size: {current_font_size}, "
                              f"Font style: {current_font_name}, Font color: {current_fill})")
                        current_word = ""
                        current_font_name = None
                        current_font_size = None
                        current_fill = None
                else:
                    current_word += text
                    # Store the font details of the current character
                    # (for mixed-font words this keeps the last character's details)
                    current_font_name = font_name
                    current_font_size = font_size
                    current_fill = fill_color

            # Print the last word on the page if one is still buffered
            if current_word:
                print(f"'{current_word}' (Font size: {current_font_size}, "
                      f"Font style: {current_font_name}, Font color: {current_fill})")

# Example usage
pdf_path = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

process_pdf_with_font_details(pdf_path)
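pdfplumber can also do this word grouping itself: extract_words() assembles chars into words and can carry font attributes along, splitting any word whose chars disagree on a requested attribute. A sketch under that assumption:

# Sketch: the same per-word font report via extract_words() with extra_attrs.
import pdfplumber

pdf_path = r"C:\Users\HP\OneDrive\Desktop\data_preprocessor\data\acetone-acs-l (1).pdf"

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        for w in page.extract_words(extra_attrs=["size", "fontname", "non_stroking_color"]):
            print(f"'{w['text']}' (Font size: {w['size']}, Font style: {w['fontname']}, "
                  f"Font color: {w['non_stroking_color']})")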