6
6
import java .util .regex .Matcher ;
7
7
import java .util .regex .Pattern ;
8
8
import java .util .stream .Collectors ;
9
+ import java .util .logging .Logger ;
9
10
10
11
import jakarta .enterprise .context .ApplicationScoped ;
11
12
16
17
@ ApplicationScoped
17
18
public class TextUtils {
18
19
20
+ private static final Logger LOGGER = Logger .getLogger (TextUtils .class .getName ());
21
+
19
22
// Precompiled regex patterns for efficiency
20
23
private static final Pattern COOP_PATTERN = Pattern .compile ("Coop" , Pattern .CASE_INSENSITIVE );
21
24
private static final Pattern MIGROS_PATTERN = Pattern .compile ("Migros" , Pattern .CASE_INSENSITIVE );
22
- private static final Pattern TOTAL_PATTERN = Pattern .compile ("Total CHF ( \\ d+ \\ . \\ d{2}) " , Pattern .CASE_INSENSITIVE );
25
+ private static final Pattern TOTAL_PATTERN = Pattern .compile ("(?m)^ \\ bTotal CHF \\ b.* " , Pattern .CASE_INSENSITIVE );
23
26
private static final Pattern DATE_PATTERN = Pattern .compile ("\\ b\\ d{2}\\ .\\ d{2}\\ .\\ d{2}\\ b" );
24
- private static final Pattern RABATT_PATTERN = Pattern .compile ("^Rabatt\\ s+.*" , Pattern .CASE_INSENSITIVE );
25
- private static final Pattern BON_PATTERN = Pattern .compile ("^Bon\\ s+.*" , Pattern .CASE_INSENSITIVE );
27
+ private static final Pattern RABATT_PATTERN = Pattern .compile ("(?m)^\\ bRabatt\\ b.*" , Pattern .CASE_INSENSITIVE );
28
+ private static final Pattern BON_PATTERN = Pattern .compile ("(?m)^\\ bBon\\ b.*" , Pattern .CASE_INSENSITIVE );
29
+ private static final Pattern HEADER_PATTERN = Pattern
30
+ .compile ("Artikel\\ s+Menge\\ s+Preis\\ s+Aktion\\ s+Total\\ s+Zusatz" , Pattern .CASE_INSENSITIVE );
31
+
32
+ private static final Pattern TERMINATOR_PATTERN = Pattern .compile ("(?m)^\\ b(?:Total CHF|Rabatt|Bon)\\ b.*" ,
33
+ Pattern .CASE_INSENSITIVE );
26
34
27
35
// Prefix used to locate the total in the receipt
28
36
private static final String TOTAL_PREFIX = "Total CHF" ;
@@ -33,7 +41,8 @@ public class TextUtils {
33
41
private static final DateTimeFormatter TIMESTAMP_FORMATTER = DateTimeFormatter .ofPattern ("yyyyMMddHHmmss" );
34
42
35
43
/**
36
- * Cleans up the provided content by removing empty lines and trimming whitespace.
44
+ * Cleans up the provided content by removing empty lines and trimming
45
+ * whitespace.
37
46
*
38
47
* @param content the raw text content from a receipt
39
48
* @return a cleaned-up version of the text with no empty lines
@@ -71,70 +80,98 @@ public String extractCorp(String text) {
71
80
*/
72
81
public String extractTotal (String receipt ) {
73
82
Matcher matcher = TOTAL_PATTERN .matcher (receipt );
74
- return matcher .find () ? matcher .group (1 ) : "0.00" ;
83
+ if (matcher .find ()) {
84
+ // Extrahiere den Betrag aus der Zeile "Total CHF X.XX"
85
+ String totalLine = matcher .group ();
86
+ Matcher amountMatcher = Pattern .compile ("\\ d+\\ .\\ d{2}" ).matcher (totalLine );
87
+ if (amountMatcher .find ()) {
88
+ return amountMatcher .group ();
89
+ }
90
+ }
91
+ return "0.00" ;
75
92
}
76
93
77
94
/**
78
- * Extracts the date from the text. If no date is found, returns the current timestamp.
95
+ * Extracts the date from the text. If no date is found, returns the current
96
+ * timestamp.
79
97
*
80
98
* @param text the text to search for a date
81
- * @return the extracted date in the format "dd.MM.yy" or the current timestamp if not found
99
+ * @return the extracted date in the format "dd.MM.yy" oder den aktuellen
100
+ * Zeitstempel, wenn kein Datum gefunden wurde
82
101
*/
83
102
public String extractDate (String text ) {
84
103
Matcher matcher = DATE_PATTERN .matcher (text );
85
104
return matcher .find () ? matcher .group () : getCurrentTimestamp ();
86
105
}
87
106
88
107
/**
89
- * Extracts the articles section from the receipt, stopping before 'Total CHF', 'Rabatt', or 'Bon'.
108
+ * Extracts the articles section from the receipt, stopping before 'Total CHF',
109
+ * 'Rabatt', or 'Bon'.
90
110
*
91
111
* @param receipt the full receipt text
92
112
* @return the substring of the receipt containing the articles
93
113
*/
94
114
public String extractArticlesUntilTotal (String receipt ) {
95
- // Define patterns to search for 'Total CHF', 'Rabatt', and 'Bon'
96
- Matcher totalMatcher = Pattern .compile (Pattern .quote (TOTAL_PREFIX ), Pattern .CASE_INSENSITIVE ).matcher (receipt );
97
- Matcher rabattMatcher = RABATT_PATTERN .matcher (receipt );
98
- Matcher bonMatcher = BON_PATTERN .matcher (receipt );
99
-
100
- int totalIndex = -1 ;
101
- int rabattIndex = -1 ;
102
- int bonIndex = -1 ;
115
+ System .out .println (receipt );
116
+ Matcher terminatorMatcher = TERMINATOR_PATTERN .matcher (receipt );
103
117
104
- if (totalMatcher .find ()) {
105
- totalIndex = totalMatcher .start ();
106
- }
118
+ int terminatorIndex = -1 ;
107
119
108
- if (rabattMatcher .find ()) {
109
- rabattIndex = rabattMatcher .start ();
120
+ if (terminatorMatcher .find ()) {
121
+ terminatorIndex = terminatorMatcher .start ();
122
+ String matchedLine = terminatorMatcher .group ();
123
+ LOGGER .info ("Terminierende Zeile gefunden: \" " + matchedLine + "\" bei Index: " + terminatorIndex );
124
+ } else {
125
+ LOGGER .info ("Keine terminierenden Schlüsselwörter gefunden." );
110
126
}
111
127
112
- if (bonMatcher .find ()) {
113
- bonIndex = bonMatcher .start ();
128
+ // Extrahiere den Substring bis zum terminierenden Schlüsselwort
129
+ if (terminatorIndex != -1 ) {
130
+ String extracted = receipt .substring (0 , terminatorIndex ).trim ();
131
+ LOGGER .info ("Extrahierter Artikelabschnitt: " + extracted );
132
+ return extracted ;
133
+ } else {
134
+ // Wenn keines der Schlüsselwörter gefunden wurde, gib den gesamten Beleg zurück
135
+ LOGGER .info ("Keine Schlüsselwörter gefunden. Gesamter Beleg wird zurückgegeben." );
136
+ return receipt ;
114
137
}
138
+ }
115
139
116
- // Determine the earliest index among 'Total CHF', 'Rabatt', and 'Bon'
117
- int earliestIndex = receipt .length (); // Default to end of string
140
+ public int findTotalRowNumber ( String receipt ) {
141
+ String [] lines = receipt .split ( System . lineSeparator ());
118
142
119
- if (totalIndex != -1 && totalIndex < earliestIndex ) {
120
- earliestIndex = totalIndex ;
143
+ // Finde den Start der Artikelzeilen
144
+ int firstArticleLine = findFirstArticleStart (receipt );
145
+ if (firstArticleLine == -1 ) {
146
+ LOGGER .warning ("Kopfzeile für Artikelzeilen nicht gefunden. 'Total' kann nicht bestimmt werden." );
147
+ return -1 ;
121
148
}
122
149
123
- if (rabattIndex != -1 && rabattIndex < earliestIndex ) {
124
- earliestIndex = rabattIndex ;
150
+ for (int i = firstArticleLine ; i < lines .length ; i ++) {
151
+ Matcher matcher = TOTAL_PATTERN .matcher (lines [i ]);
152
+ if (matcher .find ()) {
153
+ LOGGER .info ("\" Total CHF\" Zeile gefunden bei Zeile " + (i + 1 ));
154
+ return i + 1 ;
155
+ }
125
156
}
126
157
127
- if ( bonIndex != - 1 && bonIndex < earliestIndex ) {
128
- earliestIndex = bonIndex ;
129
- }
158
+ LOGGER . warning ( " \" Total CHF \" Zeile nicht gefunden." );
159
+ return - 1 ;
160
+ }
130
161
131
- // Extract the substring up to the earliest keyword
132
- if (earliestIndex != receipt .length ()) {
133
- return receipt .substring (0 , earliestIndex ).trim ();
134
- } else {
135
- // If none of the keywords are found, return the entire receipt
136
- return receipt ;
162
+ private int findFirstArticleStart (String receipt ) {
163
+ String [] lines = receipt .split (System .lineSeparator ());
164
+
165
+ for (int i = 0 ; i < lines .length ; i ++) {
166
+ Matcher matcher = HEADER_PATTERN .matcher (lines [i ]);
167
+ if (matcher .find ()) {
168
+ LOGGER .info ("Kopfzeile gefunden bei Zeile " + (i + 1 ));
169
+ return i + 1 ;
170
+ }
137
171
}
172
+
173
+ LOGGER .warning ("Kopfzeile nicht gefunden." );
174
+ return -1 ;
138
175
}
139
176
140
177
/**
0 commit comments