diff --git a/optical character recognition for financial docs/Readme.md b/optical character recognition for financial docs/Readme.md
new file mode 100644
index 00000000..641e44c3
--- /dev/null
+++ b/optical character recognition for financial docs/Readme.md
@@ -0,0 +1,11 @@
+# Financial Document OCR Tool
+
+## Overview
+The Financial Document OCR Tool is a Python-based utility designed to extract text from financial documents, including balance sheets, income statements, and other financial reports. The tool leverages the Tesseract OCR engine with deep learning-based LSTM (Long Short-Term Memory) models to accurately recognize and extract text from images.
+
+## Features
+- **Text Extraction:** Automatically extracts textual content from financial documents, including scanned images and PDF files.
+- **Accuracy:** Utilizes advanced OCR techniques to achieve high accuracy in text recognition, even with complex and poorly formatted documents.
+- **Customizable Configuration:** Allows customization of OCR parameters to optimize performance for different types of financial documents.
+- **Ease of Use:** Simple and intuitive interface for seamless integration into financial analysis pipelines and workflows.
+
diff --git a/optical character recognition for financial docs/image.jpg b/optical character recognition for financial docs/image.jpg
new file mode 100644
index 00000000..2e9f3a8d
Binary files /dev/null and b/optical character recognition for financial docs/image.jpg differ
diff --git a/optical character recognition for financial docs/ocr_code.ipynb b/optical character recognition for financial docs/ocr_code.ipynb
new file mode 100644
index 00000000..4061444e
--- /dev/null
+++ b/optical character recognition for financial docs/ocr_code.ipynb
@@ -0,0 +1,257 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install invoice2data"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "irjtibnRtaji",
+ "outputId": "2655643e-d156-4918-c58f-3f0bc58be1b9"
+ },
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting invoice2data\n",
+ " Downloading invoice2data-0.4.5-py3-none-any.whl (149 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m149.0/149.0 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (from invoice2data) (9.4.0)\n",
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from invoice2data) (6.0.1)\n",
+ "Collecting dateparser (from invoice2data)\n",
+ " Downloading dateparser-1.2.0-py2.py3-none-any.whl (294 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from dateparser->invoice2data) (2.8.2)\n",
+ "Requirement already satisfied: pytz in /usr/local/lib/python3.10/dist-packages (from dateparser->invoice2data) (2023.4)\n",
+ "Requirement already satisfied: regex!=2019.02.19,!=2021.8.27 in /usr/local/lib/python3.10/dist-packages (from dateparser->invoice2data) (2023.12.25)\n",
+ "Requirement already satisfied: tzlocal in /usr/local/lib/python3.10/dist-packages (from dateparser->invoice2data) (5.2)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil->dateparser->invoice2data) (1.16.0)\n",
+ "Installing collected packages: dateparser, invoice2data\n",
+ "Successfully installed dateparser-1.2.0 invoice2data-0.4.5\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!sudo apt install tesseract-ocr\n",
+ "!pip install pytesseract Pillow pdf2image pypdf\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "MztfBgrFzXxY",
+ "outputId": "b38f7b2e-e52e-4486-e9d8-457077da64ba"
+ },
+ "execution_count": 46,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Reading package lists... Done\n",
+ "Building dependency tree... Done\n",
+ "Reading state information... Done\n",
+ "The following additional packages will be installed:\n",
+ " tesseract-ocr-eng tesseract-ocr-osd\n",
+ "The following NEW packages will be installed:\n",
+ " tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd\n",
+ "0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.\n",
+ "Need to get 4,816 kB of archives.\n",
+ "After this operation, 15.6 MB of additional disk space will be used.\n",
+ "Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]\n",
+ "Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]\n",
+ "Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]\n",
+ "Fetched 4,816 kB in 1s (5,602 kB/s)\n",
+ "debconf: unable to initialize frontend: Dialog\n",
+ "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 3.)\n",
+ "debconf: falling back to frontend: Readline\n",
+ "debconf: unable to initialize frontend: Readline\n",
+ "debconf: (This frontend requires a controlling tty.)\n",
+ "debconf: falling back to frontend: Teletype\n",
+ "dpkg-preconfigure: unable to re-open stdin: \n",
+ "Selecting previously unselected package tesseract-ocr-eng.\n",
+ "(Reading database ... 121918 files and directories currently installed.)\n",
+ "Preparing to unpack .../tesseract-ocr-eng_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
+ "Unpacking tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n",
+ "Selecting previously unselected package tesseract-ocr-osd.\n",
+ "Preparing to unpack .../tesseract-ocr-osd_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
+ "Unpacking tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n",
+ "Selecting previously unselected package tesseract-ocr.\n",
+ "Preparing to unpack .../tesseract-ocr_4.1.1-2.1build1_amd64.deb ...\n",
+ "Unpacking tesseract-ocr (4.1.1-2.1build1) ...\n",
+ "Setting up tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n",
+ "Setting up tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n",
+ "Setting up tesseract-ocr (4.1.1-2.1build1) ...\n",
+ "Processing triggers for man-db (2.10.2-1) ...\n",
+ "Collecting pytesseract\n",
+ " Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)\n",
+ "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (9.4.0)\n",
+ "Collecting pdf2image\n",
+ " Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n",
+ "Requirement already satisfied: pypdf in /usr/local/lib/python3.10/dist-packages (4.2.0)\n",
+ "Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (24.0)\n",
+ "Requirement already satisfied: typing_extensions>=4.0 in /usr/local/lib/python3.10/dist-packages (from pypdf) (4.11.0)\n",
+ "Installing collected packages: pytesseract, pdf2image\n",
+ "Successfully installed pdf2image-1.17.0 pytesseract-0.3.10\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {
+ "id": "vJGeEAjjs8-s"
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "import pytesseract\n",
+ "from PIL import Image"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Path to the image to OCR. image.jpg is committed next to this notebook,\n",
+ "# so use a relative path instead of one anchored at the filesystem root.\n",
+ "image_path = 'image.jpg'\n",
+ "\n",
+ "\n",
+ "def ocr_image(image_path):\n",
+ "    \"\"\"Run Tesseract OCR on one image file and return the recognized text.\n",
+ "\n",
+ "    Args:\n",
+ "        image_path: Path of an image file that Pillow can open.\n",
+ "\n",
+ "    Returns:\n",
+ "        The string produced by pytesseract.image_to_string for the image.\n",
+ "\n",
+ "    Raises:\n",
+ "        FileNotFoundError: If image_path does not exist (raised by Image.open).\n",
+ "    \"\"\"\n",
+ "    # The context manager closes the underlying file handle as soon as OCR\n",
+ "    # finishes, even if pytesseract raises.\n",
+ "    with Image.open(image_path) as img:\n",
+ "        return pytesseract.image_to_string(img)\n",
+ "\n",
+ "\n",
+ "# Extract text from the image\n",
+ "text = ocr_image(image_path)\n",
+ "\n",
+ "# Print the extracted text\n",
+ "print(\"Extracted Text:\")\n",
+ "print(text)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "TzeAJCSXvyxl",
+ "outputId": "4efe1c2d-92a2-42cd-86da-4bd6cdbc7fd1"
+ },
+ "execution_count": 55,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Extracted Text:\n",
+ "Example Corporation\n",
+ "‘Balance Sheet\n",
+ "December 31, 2020\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ "\n",
+ "ASSETS\n",
+ "‘Curent assets\n",
+ "‘Cash and cash equivalents $2200\n",
+ "‘Shorterm investments 40,000\n",
+ "Account receivable -net 39,500\n",
+ "Other receivables 11000\n",
+ "Inventory 3,000\n",
+ "Suppies 3.800\n",
+ "Prepaid expences 41500\n",
+ "‘Total curet assets 39.000\n",
+ "Investments 36.000\n",
+ "Property plat & equipment net\n",
+ "Land 5500\n",
+ "Land improvements 6.500\n",
+ "Buidings 180,000\n",
+ "Equipment 201,000\n",
+ "Less: accumuates depreciation 8.000)\n",
+ "Property. pant & equipment-net_ 337.000,\n",
+ "Imangbe assets\n",
+ "‘Goodwl 105,000\n",
+ "Other ntangbe assets 200,000\n",
+ "“ofl intangible assets 306,000\n",
+ "Other assets 3,000\n",
+ "Total asst $770.00\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ "\n",
+ "LIABILITIES\n",
+ "Curent abies\n",
+ "‘Short-term loans payable $500\n",
+ "Curent porton of ong-tom debt 15,000\n",
+ "‘Accounts payable 20,900\n",
+ "‘Accvved compensation and benefits 8500\n",
+ "Income taxes payable 6.100\n",
+ "Other acoved abies ‘4.000\n",
+ "Deterod revenues 41500\n",
+ "Tal curent taiities 61.000\n",
+ "Long term habits\n",
+ "Notes payable 20,000\n",
+ "Bonds payable 375,000\n",
+ "Deferred income taxes 25,000\n",
+ "Total ong tem habits 220,000\n",
+ "Total abies 481,000\n",
+ "\n",
+ "Commitments and contingencies es)\n",
+ "STOCKHOLDERS’ EQ\n",
+ "\n",
+ " \n",
+ "\n",
+ "commen stock 110,000\n",
+ "Retained earings 220,000\n",
+ "‘Accum other comprehensive income 9,000\n",
+ "Loss: Treasury stock ($0,000)\n",
+ "\n",
+ "Toa stockholders’ equity 289,000\n",
+ "\n",
+ "Total abies & stockholders’ equity $770,000,\n",
+ "\f\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "NiPIyCRWxzgV"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file