diff --git a/optical character recognition for financial docs/Readme.md b/optical character recognition for financial docs/Readme.md new file mode 100644 index 00000000..641e44c3 --- /dev/null +++ b/optical character recognition for financial docs/Readme.md @@ -0,0 +1,11 @@ +**Financial Document OCR Tool
+
+Overview**
+The Financial Document OCR Tool is a Python-based utility designed to extract text from financial documents, including balance sheets, income statements, and other financial reports. The tool leverages the Tesseract OCR engine with deep learning-based LSTM (Long Short-Term Memory) models to accurately recognize and extract text from images. + +**Features**
+ + diff --git a/optical character recognition for financial docs/image.jpg b/optical character recognition for financial docs/image.jpg new file mode 100644 index 00000000..2e9f3a8d Binary files /dev/null and b/optical character recognition for financial docs/image.jpg differ diff --git a/optical character recognition for financial docs/ocr_code.ipynb b/optical character recognition for financial docs/ocr_code.ipynb new file mode 100644 index 00000000..4061444e --- /dev/null +++ b/optical character recognition for financial docs/ocr_code.ipynb @@ -0,0 +1,257 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "!pip install invoice2data" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "irjtibnRtaji", + "outputId": "2655643e-d156-4918-c58f-3f0bc58be1b9" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting invoice2data\n", + " Downloading invoice2data-0.4.5-py3-none-any.whl (149 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m149.0/149.0 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (from invoice2data) (9.4.0)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from invoice2data) (6.0.1)\n", + "Collecting dateparser (from invoice2data)\n", + " Downloading dateparser-1.2.0-py2.py3-none-any.whl (294 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from dateparser->invoice2data) (2.8.2)\n", + "Requirement already satisfied: pytz in /usr/local/lib/python3.10/dist-packages (from dateparser->invoice2data) (2023.4)\n", + "Requirement already satisfied: regex!=2019.02.19,!=2021.8.27 in /usr/local/lib/python3.10/dist-packages (from dateparser->invoice2data) (2023.12.25)\n", + "Requirement already satisfied: tzlocal in /usr/local/lib/python3.10/dist-packages (from dateparser->invoice2data) (5.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil->dateparser->invoice2data) (1.16.0)\n", + "Installing collected packages: dateparser, invoice2data\n", + "Successfully installed dateparser-1.2.0 invoice2data-0.4.5\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "!sudo apt install tesseract-ocr\n", + "!pip install pytesseract Pillow pdf2image pypdf\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MztfBgrFzXxY", + "outputId": "b38f7b2e-e52e-4486-e9d8-457077da64ba" + }, + "execution_count": 46, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Reading package lists... Done\n", + "Building dependency tree... Done\n", + "Reading state information... Done\n", + "The following additional packages will be installed:\n", + " tesseract-ocr-eng tesseract-ocr-osd\n", + "The following NEW packages will be installed:\n", + " tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd\n", + "0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.\n", + "Need to get 4,816 kB of archives.\n", + "After this operation, 15.6 MB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]\n", + "Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]\n", + "Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]\n", + "Fetched 4,816 kB in 1s (5,602 kB/s)\n", + "debconf: unable to initialize frontend: Dialog\n", + "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 3.)\n", + "debconf: falling back to frontend: Readline\n", + "debconf: unable to initialize frontend: Readline\n", + "debconf: (This frontend requires a controlling tty.)\n", + "debconf: falling back to frontend: Teletype\n", + "dpkg-preconfigure: unable to re-open stdin: \n", + "Selecting previously unselected package tesseract-ocr-eng.\n", + "(Reading database ... 121918 files and directories currently installed.)\n", + "Preparing to unpack .../tesseract-ocr-eng_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n", + "Unpacking tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n", + "Selecting previously unselected package tesseract-ocr-osd.\n", + "Preparing to unpack .../tesseract-ocr-osd_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n", + "Unpacking tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n", + "Selecting previously unselected package tesseract-ocr.\n", + "Preparing to unpack .../tesseract-ocr_4.1.1-2.1build1_amd64.deb ...\n", + "Unpacking tesseract-ocr (4.1.1-2.1build1) ...\n", + "Setting up tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n", + "Setting up tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n", + "Setting up tesseract-ocr (4.1.1-2.1build1) ...\n", + "Processing triggers for man-db (2.10.2-1) ...\n", + "Collecting pytesseract\n", + " Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (9.4.0)\n", + "Collecting pdf2image\n", + " Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n", + "Requirement already satisfied: pypdf in /usr/local/lib/python3.10/dist-packages (4.2.0)\n", + "Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (24.0)\n", + "Requirement already satisfied: typing_extensions>=4.0 in /usr/local/lib/python3.10/dist-packages (from pypdf) (4.11.0)\n", + "Installing collected packages: pytesseract, pdf2image\n", + "Successfully installed pdf2image-1.17.0 pytesseract-0.3.10\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "id": "vJGeEAjjs8-s" + }, + "outputs": [], + "source": [ + "\n", + "import pytesseract\n", + "from PIL import Image" + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Path to the image file\n", + "image_path = '/image.jpg'\n", + "\n", + "# Perform OCR on the image\n", + "def ocr_image(image_path):\n", + " # Open the image file\n", + " img = Image.open(image_path)\n", + "\n", + " # Use pytesseract to do OCR on the image\n", + " text = pytesseract.image_to_string(img)\n", + "\n", + " return text\n", + "\n", + "# Extract text from the image\n", + "text = ocr_image(image_path)\n", + "\n", + "# Print the extracted text\n", + "print(\"Extracted Text:\")\n", + "print(text)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TzeAJCSXvyxl", + "outputId": "4efe1c2d-92a2-42cd-86da-4bd6cdbc7fd1" + }, + "execution_count": 55, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Extracted Text:\n", + "Example Corporation\n", + "‘Balance Sheet\n", + "December 31, 2020\n", + "\n", + " \n", + "\n", + " \n", + "\n", + "ASSETS\n", + "‘Curent assets\n", + "‘Cash and cash equivalents $2200\n", + "‘Shorterm investments 40,000\n", + "Account receivable -net 39,500\n", + "Other receivables 11000\n", + "Inventory 3,000\n", + "Suppies 3.800\n", + "Prepaid expences 41500\n", + "‘Total curet assets 39.000\n", + "Investments 36.000\n", + "Property plat & equipment net\n", + "Land 5500\n", + "Land improvements 6.500\n", + "Buidings 180,000\n", + "Equipment 201,000\n", + "Less: accumuates depreciation 8.000)\n", + "Property. pant & equipment-net_ 337.000,\n", + "Imangbe assets\n", + "‘Goodwl 105,000\n", + "Other ntangbe assets 200,000\n", + "“ofl intangible assets 306,000\n", + "Other assets 3,000\n", + "Total asst $770.00\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "LIABILITIES\n", + "Curent abies\n", + "‘Short-term loans payable $500\n", + "Curent porton of ong-tom debt 15,000\n", + "‘Accounts payable 20,900\n", + "‘Accvved compensation and benefits 8500\n", + "Income taxes payable 6.100\n", + "Other acoved abies ‘4.000\n", + "Deterod revenues 41500\n", + "Tal curent taiities 61.000\n", + "Long term habits\n", + "Notes payable 20,000\n", + "Bonds payable 375,000\n", + "Deferred income taxes 25,000\n", + "Total ong tem habits 220,000\n", + "Total abies 481,000\n", + "\n", + "Commitments and contingencies es)\n", + "STOCKHOLDERS’ EQ\n", + "\n", + " \n", + "\n", + "commen stock 110,000\n", + "Retained earings 220,000\n", + "‘Accum other comprehensive income 9,000\n", + "Loss: Treasury stock ($0,000)\n", + "\n", + "Toa stockholders’ equity 289,000\n", + "\n", + "Total abies & stockholders’ equity $770,000,\n", + "\f\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "NiPIyCRWxzgV" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file