Skip to content

Commit 680fe32

Browse files
committed
add langchain documentation
1 parent 139a897 commit 680fe32

File tree

5 files changed

+60
-0
lines changed

5 files changed

+60
-0
lines changed

data/scraping_scripts/create_vector_stores.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,29 @@
1+
"""
2+
Vector Store Creation Script
3+
4+
Purpose:
5+
This script processes various data sources (e.g., transformers, peft, trl, llama_index, openai_cookbooks, langchain)
6+
to create vector stores using Chroma and LlamaIndex. It reads data from JSONL files, creates document embeddings,
7+
and stores them in persistent Chroma databases for efficient retrieval.
8+
9+
Usage:
10+
python create_vector_stores.py <source1> <source2> ...
11+
12+
Example:
13+
python create_vector_stores.py transformers peft llama_index
14+
15+
The script accepts one or more source names as command-line arguments. Valid source names are:
16+
transformers, peft, trl, llama_index, openai_cookbooks, langchain
17+
18+
For each specified source, the script will:
19+
1. Read data from the corresponding JSONL file
20+
2. Create document embeddings
21+
3. Store the embeddings in a Chroma vector database
22+
4. Save a dictionary of documents for future reference
23+
24+
Note: Ensure that the input JSONL files are present in the 'data' directory.
25+
"""
26+
127
import argparse
228
import json
329
import os
@@ -27,6 +53,10 @@
2753
"input_file": "data/openai_cookbooks_data.jsonl",
2854
"db_name": "chroma-db-openai_cookbooks",
2955
},
56+
"langchain": {
57+
"input_file": "data/langchain_data.jsonl",
58+
"db_name": "chroma-db-langchain",
59+
},
3060
}
3161

3262

data/scraping_scripts/github_to_markdown_ai_docs.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@
6767
"repo": "openai-cookbook",
6868
"path": "examples",
6969
},
70+
"langchain": {
71+
"owner": "langchain-ai",
72+
"repo": "langchain",
73+
"path": "docs/docs",
74+
},
7075
}
7176

7277
# GitHub Personal Access Token (replace with your own token)

data/scraping_scripts/process_md_files.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,18 @@
110110
"included_root_files": [],
111111
"url_extension": ".ipynb",
112112
},
113+
"langchain": {
114+
"base_url": "https://python.langchain.com/v0.2/docs/",
115+
"input_directory": "data/langchain_md_files",
116+
"output_file": "data/langchain_data.jsonl",
117+
"source_name": "langchain",
118+
"use_include_list": True,
119+
"included_dirs": ["how_to", "versions", "tutorials", "integrations"],
120+
"excluded_dirs": [],
121+
"excluded_root_files": [],
122+
"included_root_files": ["security.md", "concepts.mdx", "introduction.mdx"],
123+
"url_extension": "",
124+
},
113125
}
114126

115127

scripts/main.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
AVAILABLE_SOURCES,
1111
AVAILABLE_SOURCES_UI,
1212
CONCURRENCY_COUNT,
13+
custom_retriever_langchain,
1314
custom_retriever_llama_index,
1415
custom_retriever_openai_cookbooks,
1516
custom_retriever_peft,
@@ -46,6 +47,11 @@ def update_query_engine_tools(selected_sources):
4647
"openai_cookbooks_info",
4748
"""Useful for questions asking about accomplishing common tasks with the OpenAI API. Returns example code and guides stored in Jupyter notebooks, including info about ChatGPT GPT actions, OpenAI Assistants API, and How to fine-tune OpenAI's GPT-4o and GPT-4o-mini models with the OpenAI API.""",
4849
),
50+
"LangChain Docs": (
51+
custom_retriever_langchain,
52+
"langchain_info",
53+
"""Useful for questions asking about the LangChain framework. It is the documentation of the LangChain framework, includes info about building chains, agents, and tools, using memory, prompts, callbacks, etc.""",
54+
),
4955
}
5056

5157
for source in selected_sources:

scripts/setup.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ def setup_database(db_collection, dict_file_name):
7777
"chroma-db-openai_cookbooks",
7878
"document_dict_openai_cookbooks.pkl",
7979
)
80+
custom_retriever_langchain = setup_database(
81+
"chroma-db-langchain",
82+
"document_dict_langchain.pkl",
83+
)
8084

8185
# Constants
8286
CONCURRENCY_COUNT = int(os.getenv("CONCURRENCY_COUNT", 64))
@@ -88,6 +92,7 @@ def setup_database(db_collection, dict_file_name):
8892
"TRL Docs",
8993
"LlamaIndex Docs",
9094
"OpenAI Cookbooks",
95+
"LangChain Docs",
9196
# "Towards AI Blog",
9297
# "RAG Course",
9398
]
@@ -98,6 +103,7 @@ def setup_database(db_collection, dict_file_name):
98103
"trl",
99104
"llama_index",
100105
"openai_cookbooks",
106+
"langchain",
101107
# "towards_ai_blog",
102108
# "rag_course",
103109
]
@@ -114,6 +120,7 @@ def setup_database(db_collection, dict_file_name):
114120
"custom_retriever_trl",
115121
"custom_retriever_llama_index",
116122
"custom_retriever_openai_cookbooks",
123+
"custom_retriever_langchain",
117124
"CONCURRENCY_COUNT",
118125
"MONGODB_URI",
119126
"AVAILABLE_SOURCES_UI",

0 commit comments

Comments
 (0)