Skip to content

Commit

Permalink
Update n26 parser to chunk data
Browse files Browse the repository at this point in the history
  • Loading branch information
dennisprudlo committed Apr 2, 2022
1 parent 89fa8b8 commit d3a781c
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 39 deletions.
31 changes: 11 additions & 20 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,14 @@
"author": "Dennis Prudlo",
"license": "MIT",
"devDependencies": {
"@types/pdfjs-dist": "2.7",
"ts-loader": "^9.2.8",
"typescript": "^4.6.3",
"webpack": "^5.70.0",
"webpack-cli": "^4.9.2"
},
"dependencies": {
"csv-parse": "^5.0.4",
"pdfjs-dist": "2.5.207",
"pdfjs-dist": "2.10.377",
"worker-loader": "^3.0.8"
}
}
122 changes: 108 additions & 14 deletions src/converters/bank-statement/N26PdfConverter.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import { TextContentItem } from "pdfjs-dist";
import { TextItem } from "pdfjs-dist/types/src/display/api";
import PdfParsableFile from "../../parsers/PdfParsableFile";
import BankStatementConverter from "../BankStatementConverter";

/**
* Responsible for converting pdf parsed data from N26 statements.
*/
export default class N26PdfConverter extends BankStatementConverter<PdfParsableFile, Array<TextContentItem>> {
export default class N26PdfConverter extends BankStatementConverter<PdfParsableFile, Array<TextItem>> {

/**
* Constructs the N26PdfConverter object
Expand All @@ -15,21 +15,114 @@ export default class N26PdfConverter extends BankStatementConverter<PdfParsableF
super(parsable);
}

public prepareToConvert () : Array<Array<TextContentItem>> {
/**
* Chunks the data into transactions context sets
* @returns The prepared context
*/
public prepareToConvert () : Array<Array<TextItem>> {
// NOTE(review): this listing is taken from a rendered commit diff, so the
// nested `page.contents.forEach` ... `return [];` lines below look like the
// removed pre-commit body printed next to its replacement — confirm against
// the repository before treating this span as compilable code.
//
// Overall flow, per page: locate the table headers, cut the item list down to
// the items between the 'Amount' header and the 'Bank Statement Nr.' item,
// then split those items into one array per transaction.
const data = this.parsable.data();
// Highest page number across all pages; matched below against the last part
// of the "<page> / <max>" numbering items.
const maxPageNumber = Math.max(...data.map(page => page.pageNumber));

// Collects one array of text items per detected transaction, across all pages.
let transactions: Array<Array<TextItem>> = [];

this.parsable.data().forEach(page => {
page.contents.forEach(item => {
console.log(item);
});
})
return [];
// Item indices of the landmarks on this page. 0 doubles as "not found yet",
// which is why each later check requires the previous landmark to be > 0.
let hotspots = {
pageNumbering: 0,
descriptionHeader: 0,
dateHeader: 0,
amountHeader: 0,
bankStatementHeader: 0
};

//
// Determine the indices of the headers
// The landmarks are only accepted in this order: page numbering,
// 'Description', 'Booking Date', 'Amount', 'Bank Statement Nr.'.
// A later match of the same landmark overwrites the earlier one.
for (let itemIndex = 0; itemIndex < page.contents.length; itemIndex++) {
const item = page.contents[itemIndex];

//
// Determine the last index of the items with the page numbering
// The numbering is three consecutive items: page number, '/', max page number.
if (item.str === maxPageNumber.toString()
&& itemIndex >= 2
&& page.contents[itemIndex - 1].str === '/'
&& page.contents[itemIndex - 2].str === page.pageNumber.toString()) {
hotspots.pageNumbering = itemIndex;
}

if (hotspots.pageNumbering > 0 && item.str === 'Description' && itemIndex > hotspots.pageNumbering) {
hotspots.descriptionHeader = itemIndex;
}

if (hotspots.descriptionHeader > 0 && item.str === 'Booking Date' && itemIndex > hotspots.descriptionHeader) {
hotspots.dateHeader = itemIndex;
}

if (hotspots.dateHeader > 0 && item.str === 'Amount' && itemIndex > hotspots.dateHeader) {
hotspots.amountHeader = itemIndex;
}

if (hotspots.amountHeader > 0 && item.str.startsWith('Bank Statement Nr.') && itemIndex > hotspots.amountHeader) {
hotspots.bankStatementHeader = itemIndex;
}
}

//
// Cutoff the data to the relevant parts
// `start` is always >= 1 (amountHeader is never negative), so the
// `start === 0` test cannot fire; a page without the expected landmarks is
// rejected through `end === 0` instead, because bankStatementHeader is only
// set once amountHeader is > 0.
const start = hotspots.amountHeader + 1;
const end = hotspots.bankStatementHeader;
if (start === 0 || end === 0 || start > end) {
return;
}

const cutoff = page.contents.slice(start, end);

//
// The font height for the amount text
// Compared with strict equality against item.height below.
const amountHeight = 13.8;

// -1 means "searching for the start of the next transaction". The initial
// value is 0, so the first item of `cutoff` is taken as the start of the
// first transaction without being checked.
let transactionBeginIndex = 0;
// NOTE(review): dateIndex and amountIndex are never read in this method.
let dateIndex = 0;
let amountIndex = 0;
for (let itemIndex = 0; itemIndex < cutoff.length; itemIndex++) {
const item = cutoff[itemIndex];

//
// If we have no transaction begin index, we need to find the start of a new transaction
// A start candidate is non-empty text with the amount height and no EOL flag.
if (transactionBeginIndex === -1) {
if (item.str.length > 0 && item.height === amountHeight && !item.hasEOL) {
transactionBeginIndex = itemIndex;
continue;
}
}

//
// If we have a transaction begin index we want to find the end of the transaction
// The closing item has the amount height and an empty, zero-height, non-EOL
// item directly before it. It either carries the EOL flag itself (regular)
// or is followed by an empty, zero-height item that does (last).
if (transactionBeginIndex !== -1) {
const previousItem = itemIndex > 0 ? cutoff[itemIndex - 1] : null;
const nextItem = itemIndex < (cutoff.length - 1) ? cutoff[itemIndex + 1] : null;
const hasEmptyPreviousItem = previousItem && previousItem.str === '' && previousItem.height === 0 && !previousItem.hasEOL;
const hasEmptyNextItemWithEOL = nextItem && nextItem.str === '' && nextItem.height === 0 && nextItem.hasEOL;
const isRegularAmount = hasEmptyPreviousItem && item.height === amountHeight && item.hasEOL;
const isLastAmount = hasEmptyPreviousItem && hasEmptyNextItemWithEOL && item.height === amountHeight && !item.hasEOL;

if (isRegularAmount || isLastAmount) {
// slice() excludes its end index, so the matched closing item itself is
// not part of the pushed range.
// NOTE(review): confirm that is intended — presumably this range is the
// context later handed to getAmount.
const transactionRange = cutoff.slice(transactionBeginIndex, itemIndex);
transactions.push(transactionRange);
// Go back to searching for the next transaction start.
transactionBeginIndex = -1;
continue;
}
}
}
});

return transactions;
}

/**
* Gets the date of the transaction
* @param context The parsed data context
* @returns The date of the transaction
*/
// NOTE(review): two signatures are listed because this text comes from a rendered
// diff (TextContentItem -> TextItem); presumably only the second exists in the file — confirm.
public getBookedAt (context: Array<TextContentItem>) : string|null {
public getBookedAt (context: Array<TextItem>) : string|null {
// Placeholder: `context` is not inspected and null is returned for every transaction.
return null;
}

Expand All @@ -38,7 +131,8 @@ export default class N26PdfConverter extends BankStatementConverter<PdfParsableF
* @param context The parsed data context
* @returns The amount of the transaction
*/
// NOTE(review): two signatures are listed because this text comes from a rendered
// diff (TextContentItem -> TextItem); presumably only the second exists in the file — confirm.
public getAmount (context: Array<TextContentItem>) : string {
public getAmount (context: Array<TextItem>) : string {
// Debug output of the raw transaction context.
console.log(context);
// Placeholder: returns the literal string 'null' rather than a parsed amount.
return 'null';
}

Expand All @@ -47,16 +141,16 @@ export default class N26PdfConverter extends BankStatementConverter<PdfParsableF
* @param context The parsed data context
* @returns The name of the transaction
*/
// NOTE(review): the first signature and `return 'null';` look like the removed side of a
// rendered diff, with the second signature and return as their replacement — confirm.
public getName (context: Array<TextContentItem>) : string {
return 'null';
public getName (context: Array<TextItem>) : string {
// The name is the text of the first item in the context. There is no guard for an
// empty context, in which case `context[0]` is undefined and this line throws.
return context[0].str;
}

/**
* Gets the description of the transaction
* @param context The parsed data context
* @returns The description of the transaction
*/
// NOTE(review): two signatures are listed because this text comes from a rendered
// diff (TextContentItem -> TextItem); presumably only the second exists in the file — confirm.
public getDescription (context: Array<TextContentItem>) : string|null {
public getDescription (context: Array<TextItem>) : string|null {
// Placeholder: `context` is not inspected and null is returned for every transaction.
return null;
}

Expand All @@ -65,7 +159,7 @@ export default class N26PdfConverter extends BankStatementConverter<PdfParsableF
* @param context The parsed data context
* @returns The category of the transaction
*/
// NOTE(review): two signatures are listed because this text comes from a rendered
// diff (TextContentItem -> TextItem); presumably only the second exists in the file — confirm.
public getCategory (context: Array<TextContentItem>) : string|null {
public getCategory (context: Array<TextItem>) : string|null {
// Placeholder: `context` is not inspected and null is returned for every transaction.
return null;
}
}
11 changes: 8 additions & 3 deletions src/parsers/PDFParsableFile.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import FileFacade from "../lib/FileFacade";
import Parsable from "../contracts/Parsable";
import StatementParser from "../lib/StatementParser";
import { getDocument as getPdfDocument, TextContentItem } from "pdfjs-dist";
import { getDocument as getPdfDocument } from "pdfjs-dist";
import { TextItem } from "pdfjs-dist/types/src/display/api";

/**
* Defines the parsed type of a pdf page
Expand All @@ -16,7 +17,7 @@ type PageContent = {
/**
* The contents of the page
*/
contents: Array<TextContentItem>
contents: Array<TextItem>
};

export default class PdfParsableFile implements Parsable {
Expand Down Expand Up @@ -68,7 +69,11 @@ export default class PdfParsableFile implements Parsable {

this.pages.push({
pageNumber,
contents: contents.items
contents: (contents.items as Array<TextItem>)
.map(item => {
item.str = item.str.trim()
return item;
})
});
}

Expand Down

0 comments on commit d3a781c

Please sign in to comment.