Skip to content

Commit

Permalink
chore: testing
Browse files Browse the repository at this point in the history
  • Loading branch information
simjak committed Feb 25, 2024
1 parent 542d8db commit eeeb234
Showing 1 changed file with 71 additions and 31 deletions.
102 changes: 71 additions & 31 deletions dev/walkthrough.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -35,26 +35,14 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"ename": "JSONDecodeError",
"evalue": "Expecting value: line 1 column 1 (char 0)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m~/simonas/open-source/super-rag/.venv/lib/python3.11/site-packages/requests/models.py:971\u001b[0m, in \u001b[0;36mResponse.json\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 971\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcomplexjson\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloads\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 972\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m JSONDecodeError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 973\u001b[0m \u001b[38;5;66;03m# Catch JSON-related errors and raise as requests.JSONDecodeError\u001b[39;00m\n\u001b[1;32m 974\u001b[0m \u001b[38;5;66;03m# This aliases json.JSONDecodeError and simplejson.JSONDecodeError\u001b[39;00m\n",
"File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/json/__init__.py:346\u001b[0m, in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 343\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 344\u001b[0m parse_int \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m parse_float \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 345\u001b[0m parse_constant \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_pairs_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m kw):\n\u001b[0;32m--> 346\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_default_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 347\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/json/decoder.py:337\u001b[0m, in \u001b[0;36mJSONDecoder.decode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Return the Python representation of ``s`` (a ``str`` instance\u001b[39;00m\n\u001b[1;32m 334\u001b[0m \u001b[38;5;124;03mcontaining a JSON document).\u001b[39;00m\n\u001b[1;32m 335\u001b[0m \n\u001b[1;32m 336\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m--> 337\u001b[0m obj, end \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraw_decode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43midx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_w\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mend\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m end \u001b[38;5;241m=\u001b[39m _w(s, end)\u001b[38;5;241m.\u001b[39mend()\n",
"File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/json/decoder.py:355\u001b[0m, in \u001b[0;36mJSONDecoder.raw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m--> 355\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m JSONDecodeError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpecting value\u001b[39m\u001b[38;5;124m\"\u001b[39m, s, err\u001b[38;5;241m.\u001b[39mvalue) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 356\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj, end\n",
"\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[6], line 34\u001b[0m\n\u001b[1;32m 4\u001b[0m payload \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 5\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfiles\u001b[39m\u001b[38;5;124m\"\u001b[39m: [\n\u001b[1;32m 6\u001b[0m {\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 29\u001b[0m },\n\u001b[1;32m 30\u001b[0m }\n\u001b[1;32m 32\u001b[0m response \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mpost(url, json\u001b[38;5;241m=\u001b[39mpayload)\n\u001b[0;32m---> 34\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n",
"File \u001b[0;32m~/simonas/open-source/super-rag/.venv/lib/python3.11/site-packages/requests/models.py:975\u001b[0m, in \u001b[0;36mResponse.json\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 971\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m complexjson\u001b[38;5;241m.\u001b[39mloads(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtext, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 972\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m JSONDecodeError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 973\u001b[0m \u001b[38;5;66;03m# Catch JSON-related errors and raise as requests.JSONDecodeError\u001b[39;00m\n\u001b[1;32m 974\u001b[0m \u001b[38;5;66;03m# This aliases json.JSONDecodeError and simplejson.JSONDecodeError\u001b[39;00m\n\u001b[0;32m--> 975\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m RequestsJSONDecodeError(e\u001b[38;5;241m.\u001b[39mmsg, e\u001b[38;5;241m.\u001b[39mdoc, e\u001b[38;5;241m.\u001b[39mpos)\n",
"\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)"
"name": "stdout",
"output_type": "stream",
"text": [
"{'success': True, 'index_name': 'simonas-serverless-1536'}\n"
]
}
],
Expand Down Expand Up @@ -87,6 +75,7 @@
" \"encoder\": {\n",
" \"name\": \"text-embedding-ada-002\",\n",
" \"provider\": \"openai\",\n",
" \"dimensions\": 1536\n",
" },\n",
"}\n",
"\n",
Expand All @@ -97,7 +86,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 18,
"metadata": {},
"outputs": [
{
Expand All @@ -108,9 +97,9 @@
" \"success\": true,\n",
" \"data\": [\n",
" {\n",
" \"id\": \"79b04347-f56a-4987-8901-fe1415b592b3\",\n",
" \"id\": \"19baf12d-cc24-44a7-bbb8-b3068e2217e3\",\n",
" \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"document_id\": \"doc_879d9ef3-e2c1-45ef-ae2f-396065b5f18e\",\n",
" \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
" \"content\": \"Results in table 5 show that element-based chunking strategies o\\ufb00er the best question-answering accuracy, which is consistent with page retrieval and para- graph retrieval accuracy. Lastly, our approach stands out for its e\\ufb03ciency. Not only is element-based chunking generalizable without the need to select the chunk size, but when com- pared to the aggregation results that yield the highest retrieval scores. Element- based chunking achieves the highest retrieval scores with only half the number of chunks required compared to methods that do not consider the structure of the documents (62,529 v.s. 112,155). This can reduce the indexing cost and im- prove query latency because there are only half as many vectors to index for the vectordb that stores the chunks. This underscores the e\\ufb00ectiveness of our solu- tion in optimizing the balance between performance and computational resource requirements.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
Expand All @@ -119,7 +108,7 @@
" \"token_count\": 188,\n",
" \"page_number\": 9,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpsaw15pz9.pdf\",\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
" \"filetype\": \"application/pdf\",\n",
" \"languages\": [\n",
" \"eng\"\n",
Expand All @@ -129,9 +118,9 @@
" \"dense_embedding\": null\n",
" },\n",
" {\n",
" \"id\": \"1adb618f-2b6e-45cb-8ea8-a622d9498dc7\",\n",
" \"id\": \"20340d3d-f6f0-4dde-98e7-8193c77be46e\",\n",
" \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"document_id\": \"doc_7778c117-6100-423b-9c33-55acf42329e6\",\n",
" \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
" \"content\": \"More speci\\ufb01cally on document chunking methods for RAG, there are stan- dard approaches being considered such as chunking text into spans of a given token length (e.g. 128 and 256) or chunking based on sentences. Open source projects already allow simple processing of documents (e.g. Unstructured4, Lla- maindex5 or Langchain 6), without explicitly considering the table structure on which these chunking strategies are applied. Even though di\\ufb00erent approaches are available, an exhaustive evaluation of chunking applied to RAG and speci\\ufb01cally to \\ufb01nancial reporting, except for some limited chunking analysis [14,36], is non-existent. In our work, we compare a broad range of chunking approaches in addition to more simple ones and provide an analysis of the outcomes of di\\ufb00erent methods when asking questions about di\\ufb00erent aspects of the reports.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
Expand All @@ -140,7 +129,7 @@
" \"token_count\": 199,\n",
" \"page_number\": 3,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpxr1ag3dr.pdf\",\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
" \"filetype\": \"application/pdf\",\n",
" \"languages\": [\n",
" \"eng\"\n",
Expand All @@ -150,9 +139,51 @@
" \"dense_embedding\": null\n",
" },\n",
" {\n",
" \"id\": \"cb1a0fd4-e5ad-4cb4-b5ff-5da531934fab\",\n",
" \"id\": \"90e306b2-5b1b-42ef-9c5c-fec52a073f84\",\n",
" \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
" \"content\": \"9 10 the lowest paragraph-level scores among all. On the other hand, element-based chunking strategies showed more consistent results. A fascinating discovery is that when various chunking strategies are com- bined, it results in enhanced retrieval scores, achieving superior performance at both the page level (84.4%) and paragraph level (with ROGUE at 0.568% and BLEU at 0.452%). This \\ufb01nding addresses an unresolved question: how to improve the accuracy of RAG. The element based method provides the highest scores and it also provides a mechanism to chunk documents without the need to \\ufb01ne tune hyper-parameters like the number of tokens in a chunk. This suggests the element based method is more generalizable and can be applied to new types of documents.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
" \"chunk_index\": 12,\n",
" \"title\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\",\n",
" \"token_count\": 160,\n",
" \"page_number\": 9,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
" \"filetype\": \"application/pdf\",\n",
" \"languages\": [\n",
" \"eng\"\n",
" ],\n",
" \"parent_id\": \"53ffedc9520f52ef2c8e4568301c8530\"\n",
" },\n",
" \"dense_embedding\": null\n",
" },\n",
" {\n",
" \"id\": \"07ea8e9d-e989-4b4a-a1e9-14506b5c82ff\",\n",
" \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"document_id\": \"doc_879d9ef3-e2c1-45ef-ae2f-396065b5f18e\",\n",
" \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
" \"content\": \"Retrieval Accuracy Secondly, we evaluate the capabilities of each chunking strategy in terms of retrieval accuracy. We use the page numbers in the ground truth to calculate the page-level retrieval accuracy, and we use ROGUE [24] and BLEU [32] scores to evaluate the accuracy of paragraph-level retrieval compared to the ground truth evidence paragraphs. As shown in Table 4, when compared to Unstructured element-based chunk- ing strategies, basic chunking strategies seem to have higher page-level retrieval accuracy but lower paragraph-level accuracy on average. Additionally, basic chunking strategies also lack consistency between page-level and paragraph-level accuracy; higher page-level accuracy doesn\\u2019t ensure higher paragraph-level ac- curacy. For example, Base 128 has the second highest page-level accuracy but\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
" \"chunk_index\": 12,\n",
" \"title\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\",\n",
" \"token_count\": 154,\n",
" \"page_number\": 9,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
" \"filetype\": \"application/pdf\",\n",
" \"languages\": [\n",
" \"eng\"\n",
" ],\n",
" \"parent_id\": \"53ffedc9520f52ef2c8e4568301c8530\"\n",
" },\n",
" \"dense_embedding\": null\n",
" },\n",
" {\n",
" \"id\": \"d8eca566-f0ab-4a36-a674-3f0b99bf3807\",\n",
" \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
" \"content\": \"Chunking strategy Base 128 Base 256 Base 512 Keywords Chipper Summary Chipper Pre\\ufb01x & Table Description Chipper Furthermore, we would like to study the impact of RAG con\\ufb01guration and ele- meant type based chunking.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
Expand All @@ -161,7 +192,7 @@
" \"token_count\": 53,\n",
" \"page_number\": 1,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpsaw15pz9.pdf\",\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
" \"filetype\": \"application/pdf\",\n",
" \"languages\": [\n",
" \"eng\"\n",
Expand Down Expand Up @@ -199,6 +230,7 @@
"query_response = requests.post(query_url, json=query_payload)\n",
"\n",
"# NOTE: Filter out fields before given to LLM\n",
"# Include title, content, source, page_number, chunk_index\n",
"formatted_json = json.dumps(query_response.json(), indent=4)\n",
"print(formatted_json)"
]
Expand Down Expand Up @@ -285,9 +317,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'detail': [{'type': 'missing', 'loc': ['body', 'files'], 'msg': 'Field required', 'input': {'file_url': 'https://arxiv.org/pdf/2210.03629.pdf', 'vector_database': {'type': 'pinecone', 'config': {'api_key': 'f4adc79e-ad40-4426-a78a-9878e2ed4a79', 'host': 'https://simonas-serverless-1536-75c816a.svc.apw5-4e34-81fa.pinecone.io'}}, 'index_name': 'simonas-serverless-1536', 'encoder': 'openai'}, 'url': 'https://errors.pydantic.dev/2.6/v/missing'}, {'type': 'model_attributes_type', 'loc': ['body', 'encoder'], 'msg': 'Input should be a valid dictionary or object to extract fields from', 'input': 'openai', 'url': 'https://errors.pydantic.dev/2.6/v/model_attributes_type'}]}\n"
]
}
],
"source": [
"# Delete the index\n",
"query_url = f\"{API_URL}/api/v1/delete\"\n",
Expand All @@ -302,7 +342,7 @@
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"cohere\",\n",
" \"encoder\": \"openai\",\n",
"}\n",
"\n",
"delete_response = requests.delete(query_url, json=delete_payload)\n",
Expand Down

0 comments on commit eeeb234

Please sign in to comment.