Skip to content

Commit 81de613

Browse files
committed
receipt to total structure
1 parent bff20cd commit 81de613

File tree

18 files changed

+563
-194
lines changed

18 files changed

+563
-194
lines changed

project/backend/pom.xml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,7 @@
5656
<groupId>jakarta.json.bind</groupId>
5757
<artifactId>jakarta.json.bind-api</artifactId>
5858
</dependency>
59-
<dependency>
60-
<groupId>io.quarkus</groupId>
61-
<artifactId>quarkus-smallrye-openapi</artifactId>
62-
</dependency>
59+
6360
<dependency>
6461
<groupId>net.sourceforge.tess4j</groupId>
6562
<artifactId>tess4j</artifactId>

project/backend/src/main/java/dev/lueem/service/ExtractionService.java

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ public Response extractArticles(MultipartFormDataInput input) {
8181
String extractTotal = textUtils.extractTotal(cleanedContent);
8282
String extractDate = textUtils.extractDate(cleanedContent);
8383
String cuttedEnd = textUtils.extractArticlesUntilTotal(cleanedContent);
84+
int totalRowNumber = textUtils.findTotalRowNumber(cuttedEnd);
8485

8586
LOGGER.info("Extracted Total: " + extractTotal);
8687
LOGGER.info("Extracted Date: " + extractDate);
@@ -117,6 +118,18 @@ public Response extractArticles(MultipartFormDataInput input) {
117118
jsonResponseBuilder.add("Purchase_Date", "Unknown");
118119
}
119120

121+
if (totalRowNumber >= 0) {
122+
jsonResponseBuilder.add("Total_R_Extract", totalRowNumber);
123+
} else {
124+
jsonResponseBuilder.add("Total_R_Extract", "0");
125+
}
126+
127+
if (totalRowNumber != 0) {
128+
jsonResponseBuilder.add("Total_R_Open_Ai", articles.size());
129+
} else {
130+
jsonResponseBuilder.add("Total_R_Open_Ai", "0");
131+
}
132+
120133
// Add Corp
121134
if (corp != null) {
122135
jsonResponseBuilder.add("Corp", corp);
@@ -222,9 +235,9 @@ private JsonArray sanitizeArticlesJson(JsonArray articlesJson) {
222235
double quantity = jsonObject.containsKey("Quantity") && !jsonObject.isNull("Quantity")
223236
? jsonObject.getJsonNumber("Quantity").doubleValue()
224237
: 0.0;
225-
double discount = jsonObject.containsKey("Discount") && !jsonObject.isNull("Discount")
226-
? jsonObject.getJsonNumber("Discount").doubleValue()
227-
: 0.0;
238+
double discount = 0.0;//jsonObject.containsKey("Discount") && !jsonObject.isNull("Discount")
239+
// ? jsonObject.getJsonNumber("Discount").doubleValue()
240+
// : 0.0;
228241
double total = jsonObject.containsKey("Total") && !jsonObject.isNull("Total")
229242
? jsonObject.getJsonNumber("Total").doubleValue()
230243
: 0.0;
@@ -297,9 +310,10 @@ private List<Article> convertJsonArrayToArticles(JsonArray articlesJson) {
297310
article.setTotal(total);
298311

299312
// // Handle 'Category' field
300-
// String category = jsonObject.containsKey("Category") && !jsonObject.isNull("Category")
301-
// ? jsonObject.getString("Category")
302-
// : "Andere";
313+
// String category = jsonObject.containsKey("Category") &&
314+
// !jsonObject.isNull("Category")
315+
// ? jsonObject.getString("Category")
316+
// : "Andere";
303317
// article.setCategory(category);
304318

305319
// Correct data inconsistencies

project/backend/src/main/java/dev/lueem/util/TextUtils.java

Lines changed: 75 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import java.util.regex.Matcher;
77
import java.util.regex.Pattern;
88
import java.util.stream.Collectors;
9+
import java.util.logging.Logger;
910

1011
import jakarta.enterprise.context.ApplicationScoped;
1112

@@ -16,13 +17,20 @@
1617
@ApplicationScoped
1718
public class TextUtils {
1819

20+
private static final Logger LOGGER = Logger.getLogger(TextUtils.class.getName());
21+
1922
// Precompiled regex patterns for efficiency
2023
private static final Pattern COOP_PATTERN = Pattern.compile("Coop", Pattern.CASE_INSENSITIVE);
2124
private static final Pattern MIGROS_PATTERN = Pattern.compile("Migros", Pattern.CASE_INSENSITIVE);
22-
private static final Pattern TOTAL_PATTERN = Pattern.compile("Total CHF (\\d+\\.\\d{2})", Pattern.CASE_INSENSITIVE);
25+
private static final Pattern TOTAL_PATTERN = Pattern.compile("(?m)^\\bTotal CHF\\b.*", Pattern.CASE_INSENSITIVE);
2326
private static final Pattern DATE_PATTERN = Pattern.compile("\\b\\d{2}\\.\\d{2}\\.\\d{2}\\b");
24-
private static final Pattern RABATT_PATTERN = Pattern.compile("^Rabatt\\s+.*", Pattern.CASE_INSENSITIVE);
25-
private static final Pattern BON_PATTERN = Pattern.compile("^Bon\\s+.*", Pattern.CASE_INSENSITIVE);
27+
private static final Pattern RABATT_PATTERN = Pattern.compile("(?m)^\\bRabatt\\b.*", Pattern.CASE_INSENSITIVE);
28+
private static final Pattern BON_PATTERN = Pattern.compile("(?m)^\\bBon\\b.*", Pattern.CASE_INSENSITIVE);
29+
private static final Pattern HEADER_PATTERN = Pattern
30+
.compile("Artikel\\s+Menge\\s+Preis\\s+Aktion\\s+Total\\s+Zusatz", Pattern.CASE_INSENSITIVE);
31+
32+
private static final Pattern TERMINATOR_PATTERN = Pattern.compile("(?m)^\\b(?:Total CHF|Rabatt|Bon)\\b.*",
33+
Pattern.CASE_INSENSITIVE);
2634

2735
// Prefix used to locate the total in the receipt
2836
private static final String TOTAL_PREFIX = "Total CHF";
@@ -33,7 +41,8 @@ public class TextUtils {
3341
private static final DateTimeFormatter TIMESTAMP_FORMATTER = DateTimeFormatter.ofPattern("yyyyMMddHHmmss");
3442

3543
/**
36-
* Cleans up the provided content by removing empty lines and trimming whitespace.
44+
* Cleans up the provided content by removing empty lines and trimming
45+
* whitespace.
3746
*
3847
* @param content the raw text content from a receipt
3948
* @return a cleaned-up version of the text with no empty lines
@@ -71,70 +80,98 @@ public String extractCorp(String text) {
7180
*/
7281
public String extractTotal(String receipt) {
7382
Matcher matcher = TOTAL_PATTERN.matcher(receipt);
74-
return matcher.find() ? matcher.group(1) : "0.00";
83+
if (matcher.find()) {
84+
// Extrahiere den Betrag aus der Zeile "Total CHF X.XX"
85+
String totalLine = matcher.group();
86+
Matcher amountMatcher = Pattern.compile("\\d+\\.\\d{2}").matcher(totalLine);
87+
if (amountMatcher.find()) {
88+
return amountMatcher.group();
89+
}
90+
}
91+
return "0.00";
7592
}
7693

7794
/**
78-
* Extracts the date from the text. If no date is found, returns the current timestamp.
95+
* Extracts the date from the text. If no date is found, returns the current
96+
* timestamp.
7997
*
8098
* @param text the text to search for a date
81-
* @return the extracted date in the format "dd.MM.yy" or the current timestamp if not found
99+
* @return the extracted date in the format "dd.MM.yy", or the current
100+
* timestamp if no date was found
82101
*/
83102
public String extractDate(String text) {
84103
Matcher matcher = DATE_PATTERN.matcher(text);
85104
return matcher.find() ? matcher.group() : getCurrentTimestamp();
86105
}
87106

88107
/**
89-
* Extracts the articles section from the receipt, stopping before 'Total CHF', 'Rabatt', or 'Bon'.
108+
* Extracts the articles section from the receipt, stopping before 'Total CHF',
109+
* 'Rabatt', or 'Bon'.
90110
*
91111
* @param receipt the full receipt text
92112
* @return the substring of the receipt containing the articles
93113
*/
94114
public String extractArticlesUntilTotal(String receipt) {
95-
// Define patterns to search for 'Total CHF', 'Rabatt', and 'Bon'
96-
Matcher totalMatcher = Pattern.compile(Pattern.quote(TOTAL_PREFIX), Pattern.CASE_INSENSITIVE).matcher(receipt);
97-
Matcher rabattMatcher = RABATT_PATTERN.matcher(receipt);
98-
Matcher bonMatcher = BON_PATTERN.matcher(receipt);
99-
100-
int totalIndex = -1;
101-
int rabattIndex = -1;
102-
int bonIndex = -1;
115+
System.out.println(receipt);
116+
Matcher terminatorMatcher = TERMINATOR_PATTERN.matcher(receipt);
103117

104-
if (totalMatcher.find()) {
105-
totalIndex = totalMatcher.start();
106-
}
118+
int terminatorIndex = -1;
107119

108-
if (rabattMatcher.find()) {
109-
rabattIndex = rabattMatcher.start();
120+
if (terminatorMatcher.find()) {
121+
terminatorIndex = terminatorMatcher.start();
122+
String matchedLine = terminatorMatcher.group();
123+
LOGGER.info("Terminierende Zeile gefunden: \"" + matchedLine + "\" bei Index: " + terminatorIndex);
124+
} else {
125+
LOGGER.info("Keine terminierenden Schlüsselwörter gefunden.");
110126
}
111127

112-
if (bonMatcher.find()) {
113-
bonIndex = bonMatcher.start();
128+
// Extrahiere den Substring bis zum terminierenden Schlüsselwort
129+
if (terminatorIndex != -1) {
130+
String extracted = receipt.substring(0, terminatorIndex).trim();
131+
LOGGER.info("Extrahierter Artikelabschnitt: " + extracted);
132+
return extracted;
133+
} else {
134+
// Wenn keines der Schlüsselwörter gefunden wurde, gib den gesamten Beleg zurück
135+
LOGGER.info("Keine Schlüsselwörter gefunden. Gesamter Beleg wird zurückgegeben.");
136+
return receipt;
114137
}
138+
}
115139

116-
// Determine the earliest index among 'Total CHF', 'Rabatt', and 'Bon'
117-
int earliestIndex = receipt.length(); // Default to end of string
140+
public int findTotalRowNumber(String receipt) {
141+
String[] lines = receipt.split(System.lineSeparator());
118142

119-
if (totalIndex != -1 && totalIndex < earliestIndex) {
120-
earliestIndex = totalIndex;
143+
// Finde den Start der Artikelzeilen
144+
int firstArticleLine = findFirstArticleStart(receipt);
145+
if (firstArticleLine == -1) {
146+
LOGGER.warning("Kopfzeile für Artikelzeilen nicht gefunden. 'Total' kann nicht bestimmt werden.");
147+
return -1;
121148
}
122149

123-
if (rabattIndex != -1 && rabattIndex < earliestIndex) {
124-
earliestIndex = rabattIndex;
150+
for (int i = firstArticleLine; i < lines.length; i++) {
151+
Matcher matcher = TOTAL_PATTERN.matcher(lines[i]);
152+
if (matcher.find()) {
153+
LOGGER.info("\"Total CHF\" Zeile gefunden bei Zeile " + (i + 1));
154+
return i + 1;
155+
}
125156
}
126157

127-
if (bonIndex != -1 && bonIndex < earliestIndex) {
128-
earliestIndex = bonIndex;
129-
}
158+
LOGGER.warning("\"Total CHF\" Zeile nicht gefunden.");
159+
return -1;
160+
}
130161

131-
// Extract the substring up to the earliest keyword
132-
if (earliestIndex != receipt.length()) {
133-
return receipt.substring(0, earliestIndex).trim();
134-
} else {
135-
// If none of the keywords are found, return the entire receipt
136-
return receipt;
162+
private int findFirstArticleStart(String receipt) {
163+
String[] lines = receipt.split(System.lineSeparator());
164+
165+
for (int i = 0; i < lines.length; i++) {
166+
Matcher matcher = HEADER_PATTERN.matcher(lines[i]);
167+
if (matcher.find()) {
168+
LOGGER.info("Kopfzeile gefunden bei Zeile " + (i + 1));
169+
return i + 1;
170+
}
137171
}
172+
173+
LOGGER.warning("Kopfzeile nicht gefunden.");
174+
return -1;
138175
}
139176

140177
/**

0 commit comments

Comments
 (0)