Skip to content

Commit

Permalink
Merge pull request #102 from codereverser/fix/various-fixes
Browse files (browse the repository at this point in the history)
Various fixes
  • Loading branch information
codereverser authored Dec 24, 2024
2 parents c01bbde + ea20723 commit 6eaa108
Show file tree
Hide file tree
Showing 15 changed files with 900 additions and 921 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/run-pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.8']
python-version: ['3.10']

steps:
- uses: actions/checkout@v3
Expand Down Expand Up @@ -41,6 +41,7 @@ jobs:
KFINTECH_CAS_FILE_NEW: ${{ secrets.KFINTECH_CAS_FILE_NEW }}
KFINTECH_CAS_PASSWORD: ${{ secrets.KFINTECH_CAS_PASSWORD }}
- name: Upload coverage report to codecov
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v5
with:
file: ./coverage.xml
files: ./coverage.xml
token: ${{ secrets.CODECOV_TOKEN }}
11 changes: 4 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
repos:
- repo: 'https://github.com/pre-commit/pre-commit-hooks'
rev: v4.4.0
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.287
rev: v0.8.4
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
- repo: 'https://github.com/psf/black'
rev: 23.7.0
hooks:
- id: black
- id: ruff-format
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# CASParser

[![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![code style: ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
[![GitHub](https://img.shields.io/github/license/codereverser/casparser)](https://github.com/codereverser/casparser/blob/main/LICENSE)
![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/codereverser/casparser/run-pytest.yml?branch=main)
[![codecov](https://codecov.io/gh/codereverser/casparser/branch/main/graph/badge.svg?token=DYZ7TXWRGI)](https://codecov.io/gh/codereverser/casparser)
Expand Down Expand Up @@ -73,13 +73,18 @@ csv_str = casparser.read_cas_pdf("/path/to/cas/file.pdf", "password", output="cs
"advisor": "string",
"rta_code": "string",
"rta": "string",
"type": "string",
"nominees": [
"string",
],
"open": "number",
"close": "number",
"close_calculated": "number",
"valuation": {
"date": "date",
"nav": "number",
"value": "number"
"value": "number",
"cost": "number",
},
"transactions": [
{
Expand Down
2 changes: 2 additions & 0 deletions casparser/analysis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
"FY2020-21": 301,
"FY2021-22": 317,
"FY2022-23": 331,
"FY2023-24": 348,
"FY2024-25": 365,
}


Expand Down
9 changes: 5 additions & 4 deletions casparser/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from rich.prompt import Prompt
from rich.table import Table

from . import read_cas_pdf, __version__
from . import __version__, read_cas_pdf
from .analysis.gains import CapitalGainsReport
from .enums import CASFileType
from .exceptions import GainsError, IncompleteCASError, ParserException
Expand Down Expand Up @@ -146,8 +146,9 @@ def print_summary(parsed_data: CASData, output_filename=None, include_zero_folio
console_row = {
"scheme": scheme_name,
"open": scheme["open"],
"close": format_number(scheme_close) if is_summary
else f"{format_number(scheme_close)}\n/\n{calc_close}",
"close": format_number(scheme_close)
if is_summary
else f"{format_number(scheme_close)}\n/\n{calc_close}",
"value": f"{formatINR(valuation['value'])}\n@\n{formatINR(valuation['nav'])}",
"txns": len(scheme["transactions"]),
"status": status,
Expand Down Expand Up @@ -384,4 +385,4 @@ def cli(output, summary, password, include_all, gains, gains_112a, force_pdfmine


if __name__ == "__main__":
cli(prog_name="casparser")
cli(prog_name="casparser")
20 changes: 9 additions & 11 deletions casparser/parsers/mupdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ def extract_blocks(page_dict):
for block in grouped_blocks:
lines = []
items = []
if len(block.get("lines", [])) == 0:
continue
bbox = block["lines"][0]["bbox"]
bbox = [0, 0, 0, 0]
if len(block.get("lines", [])) > 0:
bbox = block["lines"][0]["bbox"]
y0, y1 = bbox[1], bbox[3]
for line in sorted(block["lines"], key=lambda x: x["bbox"][1]):
if len(items) > 0 and not (
Expand Down Expand Up @@ -113,12 +113,10 @@ def parse_investor_info(page_dict, page_rect: fitz.Rect) -> InvestorInfo:
name = None
for block in blocks:
for line in block["lines"]:
for span in line["spans"]:
if span["bbox"][0] > width / 3:
continue
for span in filter(
lambda x: x["bbox"][0] <= width / 3 and x["text"].strip() != "", line["spans"]
):
txt = span["text"].strip()
if txt == "":
continue
if not email_found:
if m := re.search(r"^\s*email\s+id\s*:\s*(.+?)(?:\s|$)", txt, re.I):
email = m.group(1).strip()
Expand Down Expand Up @@ -156,9 +154,9 @@ def group_similar_rows(elements_list: List[Iterator[Any]]):
lines = []
for elements in elements_list:
sorted_elements = list(sorted(elements, key=itemgetter(1, 0)))
if len(sorted_elements) == 0:
continue
y0, y1 = sorted_elements[0][1], sorted_elements[0][3]
y0, y1 = 0, 0
if len(sorted_elements) > 0:
y0, y1 = sorted_elements[0][1], sorted_elements[0][3]
items = []
for el in sorted_elements:
x2, y2, x3, y3 = el[:4]
Expand Down
13 changes: 7 additions & 6 deletions casparser/parsers/pdfminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ def parse_investor_info(layout, width, height) -> InvestorInfo:
[
x
for x in layout
if isinstance(x, LTTextBoxHorizontal) and x.x1 < width / 1.5 and x.y1 > height / 2
if isinstance(x, LTTextBoxHorizontal)
and x.x1 < width / 1.5
and x.y1 > height / 2
and x.get_text().strip() != ""
],
key=lambda x: -x.y1,
)
Expand All @@ -33,8 +36,6 @@ def parse_investor_info(layout, width, height) -> InvestorInfo:
name = None
for el in text_elements:
txt = el.get_text().strip()
if txt == "":
continue
if not email_found:
if m := re.search(r"^\s*email\s+id\s*:\s*(.+?)(?:\s|$)", txt, re.I):
email = m.group(1).strip()
Expand Down Expand Up @@ -88,9 +89,9 @@ def group_similar_rows(elements_list: List[Iterator[LTTextBoxHorizontal]]):
lines = []
for elements in elements_list:
sorted_elements = list(sorted(elements, key=lambda x: (-x.y1, x.x0)))
if len(sorted_elements) == 0:
continue
y0, y1 = sorted_elements[0].y0, sorted_elements[0].y1
y0, y1 = 0, 0
if len(sorted_elements) > 0:
y0, y1 = sorted_elements[0].y0, sorted_elements[0].y1
items = []
for el in sorted_elements:
if len(items) > 0 and not (
Expand Down
13 changes: 9 additions & 4 deletions casparser/process/cas_detailed.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,18 @@
DESCRIPTION_TAIL_RE,
DETAILED_DATE_RE,
DIVIDEND_RE,
FOLIO_RE,
FOLIO_KV_RE,
FOLIO_RE,
NAV_RE,
NOMINEE_RE,
OPEN_UNITS_RE,
REGISTRAR_RE,
SCHEME_RE,
SCHEME_KV_RE,
SCHEME_RE,
TRANSACTION_RE1,
TRANSACTION_RE2,
TRANSACTION_RE3,
TRANSACTION_RE4,
VALUATION_RE,
)
from .utils import isin_search
Expand Down Expand Up @@ -99,7 +100,7 @@ def get_transaction_type(
txn_type = TransactionType.PURCHASE
elif units < 0:
if re.search(
"reversal|rejection|dishonoured|mismatch|insufficient\s+balance", description, re.I
r"reversal|rejection|dishonoured|mismatch|insufficient\s+balance", description, re.I
):
txn_type = TransactionType.REVERSAL
elif "switch" in description:
Expand Down Expand Up @@ -128,7 +129,7 @@ def get_parsed_scheme_name(scheme) -> str:


def parse_transaction(line) -> Optional[ParsedTransaction]:
for regex in (TRANSACTION_RE1, TRANSACTION_RE2, TRANSACTION_RE3):
for regex in (TRANSACTION_RE1, TRANSACTION_RE2, TRANSACTION_RE3, TRANSACTION_RE4):
if m := re.search(regex, line, re.DOTALL | re.MULTILINE | re.I):
groups = m.groups()
date = description = amount = units = nav = balance = None
Expand All @@ -138,6 +139,10 @@ def parse_transaction(line) -> Optional[ParsedTransaction]:
elif groups.count(None) == 2:
# Segregated Portfolio Entries
date, description, units, balance, *_ = groups
elif groups.count(None) == 1:
# Zero unit entries
date, description, amount, units, nav, balance = groups
units = "0.000"
elif groups.count(None) == 0:
# Normal entries
date, description, amount, units, nav, balance = groups
Expand Down
6 changes: 4 additions & 2 deletions casparser/process/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,12 @@

# Normal Transaction entries
TRANSACTION_RE1 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}\t\t{amt_re}\t\t{amt_re}"
# Zero unit transactions (ref: #88)
TRANSACTION_RE2 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t(?:{amt_re})*\t\t{amt_re}\t\t{amt_re}"
# Segregated portfolio entries
TRANSACTION_RE2 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re})*"
TRANSACTION_RE3 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re})*"
# Tax transactions
TRANSACTION_RE3 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re}\t\t{amt_re})*"
TRANSACTION_RE4 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re}\t\t{amt_re})*"
DESCRIPTION_TAIL_RE = r"(\n.+?)(\t\t|$)"
DIVIDEND_RE = r"(?:div\.|dividend|idcw).+?(reinvest)*.*?@\s*Rs\.\s*([\d\.]+)(?:\s+per\s+unit)?"
SCHEME_TAIL_RE = r"(\n.+?)(?:\t\t|$)"
Loading

0 comments on commit 6eaa108

Please sign in to comment.