-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdeepseek_processor.py
More file actions
118 lines (99 loc) · 4.64 KB
/
deepseek_processor.py
File metadata and controls
118 lines (99 loc) · 4.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import requests
import json
import logging
from datetime import datetime
class DeepseekProcessor:
    """Extract structured invoice fields from raw text with a Deepseek model.

    The model is queried over HTTP through an Ollama-style ``/api/generate``
    endpoint. Every public call returns a dict with the keys
    ``invoice_number``, ``invoice_date``, ``amount``, ``currency``,
    ``company_name`` and ``payment_status``; when anything goes wrong the
    values from :meth:`_get_default_response` are returned instead of raising.
    """

    # Maximum number of invoice characters embedded in the prompt.
    MAX_PROMPT_CHARS = 1500
    # Sampling settings sent to the model (low temperature for stable output).
    TEMPERATURE = 0.1
    NUM_PREDICT = 500

    def __init__(
        self,
        base_url: str = 'http://localhost:11434/api/generate',
        model_name: str = 'deepseek-r1:8b',
        timeout: float = 30,
    ):
        """Configure the endpoint, model tag and request timeout (seconds).

        All arguments are optional; the defaults reproduce the previously
        hard-coded configuration.
        """
        self.base_url = base_url
        self.model_name = model_name
        self.timeout = timeout

    def process_invoice(self, text: str) -> dict:
        """Process invoice text with the Deepseek model.

        Args:
            text: Raw invoice text; runs of whitespace are collapsed before
                the text is embedded in the prompt.

        Returns:
            The extracted invoice fields, or the default response when the
            text is empty, the request fails or the reply cannot be parsed.
            This method never raises.
        """
        # Nothing to extract: skip the model round-trip entirely.
        if not isinstance(text, str) or not text.strip():
            logging.warning("Deepseek processing skipped: empty invoice text")
            return self._get_default_response()

        try:
            clean_text = ' '.join(text.split())
            prompt = self._create_prompt(clean_text)
            response = self._get_model_response(prompt)
            return self._process_response(response)
        except Exception as e:
            # Top-level boundary: callers rely on always getting a dict back.
            logging.error("Deepseek processing error: %s", e)
            return self._get_default_response()

    def _create_prompt(self, text: str) -> str:
        """Create a structured prompt for invoice processing.

        Only the first ``MAX_PROMPT_CHARS`` characters of ``text`` are
        included in the prompt.
        """
        return (
            "You are an invoice data extraction expert. Extract the following information from this invoice text "
            "and return ONLY a JSON object with these fields:\n"
            "- invoice_number (string): The unique identifier for this invoice\n"
            "- invoice_date (string): The date in YYYY-MM-DD format\n"
            "- amount (number): The total amount as a decimal number\n"
            "- currency (string): The 3-letter currency code (e.g., AUD)\n"
            "- company_name (string): The name of the company issuing the invoice\n"
            "- payment_status (string): The payment status if available\n\n"
            f"Invoice text:\n{text[:self.MAX_PROMPT_CHARS]}\n\n"
            "Return ONLY the JSON object, no other text. Format numbers as plain decimals without calculations."
        )

    def _get_model_response(self, prompt: str) -> str:
        """Send ``prompt`` to the model endpoint and return the generated text.

        Raises:
            Exception: Any transport, HTTP-status or payload error is logged
                and re-raised for the caller to handle.
        """
        payload = {
            'model': self.model_name,
            'prompt': prompt,
            'stream': False,
            # Sampling parameters belong in the nested ``options`` object;
            # as top-level keys they are not applied by the generate API.
            'options': {
                'temperature': self.TEMPERATURE,
                'num_predict': self.NUM_PREDICT,
            },
        }
        try:
            response = requests.post(
                self.base_url,
                json=payload,
                timeout=self.timeout,
            )
            # Surface HTTP errors explicitly instead of as a confusing
            # KeyError on the missing 'response' field below.
            response.raise_for_status()
            return response.json()['response']
        except Exception as e:
            logging.error("Error getting model response: %s", e)
            raise

    def _process_response(self, response: str) -> dict:
        """Turn the model's raw reply into the validated invoice dict.

        Missing or ``null`` fields fall back to their per-field defaults; an
        unparseable amount becomes ``0.0`` without discarding the other
        fields. If no JSON object can be recovered at all, the default
        response is returned.
        """
        try:
            data = self._parse_json_object(response)
            if not isinstance(data, dict):
                raise ValueError("model response is not a JSON object")

            return {
                'invoice_number': self._as_text(data.get('invoice_number'), 'Unknown'),
                'invoice_date': data.get('invoice_date'),
                'amount': self._as_amount(data.get('amount')),
                'currency': self._as_text(data.get('currency'), 'AUD'),
                'company_name': self._as_text(data.get('company_name'), 'Unknown'),
                'payment_status': self._as_text(
                    data.get('payment_status'), 'not provided'
                ).lower(),
            }
        except Exception as e:
            logging.error("Error processing response: %s", e)
            return self._get_default_response()

    @staticmethod
    def _parse_json_object(response: str):
        """Locate and decode the JSON payload inside a free-form model reply.

        Raises:
            ValueError: If no decodable JSON is found (``json.JSONDecodeError``
                is a ``ValueError`` subclass).
        """
        import re

        # Reasoning models may prefix the answer with a <think>...</think> block.
        text = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()

        # Prefer the contents of a markdown code fence when one is present.
        if '```' in text:
            fenced = re.findall(r'```(?:json)?\s*({[^`]*})', text, re.DOTALL)
            if fenced:
                text = fenced[0]

        # Otherwise (or additionally) trim any prose around the outermost braces.
        start, end = text.find('{'), text.rfind('}')
        if start != -1 and end > start:
            text = text[start:end + 1]

        # Strict parse first so that valid JSON is never altered by the
        # heuristic clean-up below (e.g. "https://..." inside a string value).
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass

        repaired = re.sub(r'/\*.*?\*/', '', text, flags=re.DOTALL)  # block comments
        repaired = re.sub(r'(?<!:)//[^\n]*', '', repaired)  # line comments, not URL schemes
        repaired = re.sub(r',(\s*[}\]])', r'\1', repaired)  # trailing commas
        return json.loads(repaired)

    @staticmethod
    def _as_text(value, default: str) -> str:
        """Return ``value`` as a stripped string, or ``default`` if null/blank."""
        if value is None:
            return default
        text = str(value).strip()
        return text or default

    @staticmethod
    def _as_amount(value) -> float:
        """Coerce a model-supplied amount to a finite float, else ``0.0``.

        Accepts numbers and strings with thousands separators or currency
        decorations such as ``"$1,234.50"``.
        """
        import math
        import re

        # bool is an int subclass; True/False is never a meaningful amount.
        if value is None or isinstance(value, bool):
            return 0.0

        if isinstance(value, (int, float)):
            number = float(value)
        else:
            cleaned = str(value).replace(',', '').strip()
            try:
                number = float(cleaned)
            except ValueError:
                # Fall back to the first plain decimal embedded in the text.
                match = re.search(r'-?\d+(?:\.\d+)?', cleaned)
                if match is None:
                    logging.warning("Unparseable invoice amount: %r", value)
                    return 0.0
                number = float(match.group())

        return number if math.isfinite(number) else 0.0

    def _get_default_response(self) -> dict:
        """Return default response when processing fails."""
        return {
            'invoice_number': 'Unknown',
            'invoice_date': None,
            'amount': 0.0,
            'currency': 'AUD',
            'company_name': 'Unknown',
            'payment_status': 'not provided',
        }