Remove link file storage
gadhagod committed Jul 20, 2023
1 parent d541311 commit 24d12c6
Showing 6 changed files with 169 additions and 81 deletions.
6 changes: 1 addition & 5 deletions .github/workflows/store-embeddings.yml
@@ -13,11 +13,7 @@ jobs:
      - name: Install requirements
        run: pip install -r requirements.txt
      - name: Generate links
        run: python3 generate_links.py
      - name: Show links
        run: cat links.txt
      - name: Create and store embeddings
        run: python3 links_to_docs.py
        run: python3 generate_links.py --reset
        env:
          ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
41 changes: 36 additions & 5 deletions constants.py
@@ -1,12 +1,46 @@
from sys import argv
from time import sleep
from os import getenv
from rockset import RocksetClient, Regions
from rockset import RocksetClient, Regions, exceptions
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Rockset as RocksetStore
from sql import ingest_tranformation

rockset_api_key = getenv("ROCKSET_API_KEY")
openai_api_key = getenv("OPENAI_API_KEY")

rockset = RocksetClient(Regions.rs2, rockset_api_key)

def collection_exists():
    try:
        rockset.Collections.get(collection="hyrule-compendium-ai")
    except exceptions.NotFoundException:
        return False
    return True

def collection_is_ready():
    return rockset.Collections.get(collection="hyrule-compendium-ai").data.status == "READY"

def delete_collection():
    print("Deleting collection \"commons.hyrule-compendium-ai\"")
    rockset.Collections.delete(collection="hyrule-compendium-ai")

def create_collection():
    print("Creating collection \"commons.hyrule-compendium-ai\"")
    rockset.Collections.create_s3_collection(name="hyrule-compendium-ai", field_mapping_query=ingest_tranformation)

if "--reset" in argv:
    if collection_exists():
        delete_collection()
        while collection_exists():
            sleep(1)

    create_collection()
    while not collection_exists():
        sleep(1)
    while not collection_is_ready():
        sleep(1)

openai = OpenAIEmbeddings(
    openai_api_key=openai_api_key,
    model="text-embedding-ada-002"
@@ -17,7 +51,4 @@
"hyrule-compendium-ai",
"text",
"embedding"
)

# test connectivity
store.add_texts(["Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."])
)
156 changes: 120 additions & 36 deletions generate_links.py
@@ -1,9 +1,76 @@
from sys import setrecursionlimit
from requests import get, exceptions
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from constants import store, rockset as rs

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 120,
    length_function = len,
    add_start_index = True,
)

class LinkNode():
    def __init__(self, link, next=None):
        self.link = link
        self.next = next

class LinkQueue():
    def __init__(self, init_value=None):
        self.first = LinkNode(init_value, None) if init_value is not None else None
        self.last = self.first

    def remove(self):
        if self.first is self.last: # one item in queue
            link = self.first.link
            self.first = None
            self.last = None
            return link
        prev_first = self.first
        self.first = self.first.next
        return prev_first.link

    def add(self, link):
        node = LinkNode(link)
        if self.first is None and self.last is None: # empty queue
            self.first = node
        else:
            self.last.next = node
        self.last = node

    def is_empty(self):
        return self.first is None

    def add_elem_links(self, a_elems):
        for i in a_elems:
            self.add(i["href"])

    def __str__(self) -> str:
        if self.is_empty():
            return "[]"
        res = ""
        curr = self.first
        while curr is not None:
            res += f"{curr.link}, "
            curr = curr.next
        return f"[{res[:-2]}]"


class Scraper():
    def _is_valid(link: str):
    def _cleanse(self, link):
        paramLoc = link.find("?")
        if paramLoc > 0:
            link = link[:paramLoc]
        hashLoc = link.find("#")
        if hashLoc > 0:
            link = link[:hashLoc]
        if link.startswith("/"):
            link = "https://zelda.fandom.com" + link
        return link

    def _is_valid(self, link: str):
        return (
            not link.startswith("#") and
            (link.startswith("https://zelda.fandom.com/wiki") or link.startswith("/")) and
@@ -22,42 +89,59 @@ def _is_valid(link: str):
not "Guidelines" in link and
not "Help" in link and
not "Template" in link

)

def _is_category(self, link):
return "Category:" in link

def __init__(self):
setrecursionlimit(1000000)
self.cnt = 0
self.link_file = open("links.txt", "w+")
self.scraped_links = set()
self.scrape("https://zelda.fandom.com/wiki/Main_Page")
self.link_file.close()
print(len(self.scraped_links))
def _scrape(self, link):
soup = BeautifulSoup(get(link).text, "html.parser")

if self._is_category(link): # we do not need to generate embeddings for this page
rs.Documents.add_documents(
collection="hyrule-compendium-ai",
data=[{
"source": link, # make sure we do not scrape this page again
"embedding": None
}]
)
else:
page_title = soup.find("title").get_text()
page_text = soup.find(class_="page__main").get_text().replace("\n\n", "\n")
docs = text_splitter.create_documents([page_text],[{"source": link}])
store.add_texts(
texts=[f"This information is about {page_title}. {doc.page_content}" for doc in docs],
metadatas=[doc.metadata for doc in docs]
)

return soup

    def _has_been_scraped(self, link):
        return len(rs.sql("""
            SELECT
                1
            FROM
                commons."hyrule-compendium-ai"
            WHERE
                source = :link
            """,
            params={"link": str(link)}).results
        ) > 0

    def scrape(self, link):
        if (self.cnt > 5000):
            return
        print(f"Scraping {link} ...")
        try:
            soup = BeautifulSoup(get(link).text, "html.parser")
        except exceptions.RequestException as e:
            print(e)
            return # skip
        links = soup.find_all("a", {"href": lambda value: value})
        for i in links:
            href = i["href"]
            paramLoc = href.find("?")
            if paramLoc > 0:
                href = href[:paramLoc]
            hashLoc = href.find("#")
            if hashLoc > 0:
                href = href[:hashLoc]
            if href.startswith("/"):
                href = "https://zelda.fandom.com" + href
            if (href not in self.scraped_links) and Scraper._is_valid(href):
                if "Category:" not in href:
                    self.link_file.write(href + "\n")
                self.scraped_links.add(href)
                self.scrape(href)
    def __init__(self):
        self.first = True
        links = LinkQueue("https://zelda.fandom.com/wiki/Main_Page")
        while not links.is_empty():
            curr_link = self._cleanse(links.remove())
            if self.first or (self._is_valid(curr_link) and not self._has_been_scraped(curr_link)):
                print(f"Scraping {curr_link}...")
                try:
                    soup = self._scrape(curr_link)
                except exceptions.RequestException as e:
                    print(f"Skipping {curr_link}: {e}")
                    return # skip

                links.add_elem_links(soup.find_all("a", {"href": lambda value: value}))
                self.first = False

Scraper()
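
A minimal usage sketch of the LinkQueue class committed above, shown outside the scraper for clarity; the wiki URLs here are hypothetical placeholders, and the behaviour assumes the class exactly as defined in this diff:

# Sketch: FIFO behaviour of LinkQueue with three hypothetical links.
queue = LinkQueue("https://zelda.fandom.com/wiki/Main_Page")
queue.add("https://zelda.fandom.com/wiki/Link")          # appended at the tail
queue.add("https://zelda.fandom.com/wiki/Master_Sword")

print(queue)             # [https://zelda.fandom.com/wiki/Main_Page, https://zelda.fandom.com/wiki/Link, https://zelda.fandom.com/wiki/Master_Sword]
print(queue.remove())    # https://zelda.fandom.com/wiki/Main_Page -- the oldest link comes out first
print(queue.is_empty())  # False: two links are still queued
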
35 changes: 0 additions & 35 deletions links_to_docs.py

This file was deleted.

8 changes: 8 additions & 0 deletions sql/__init__.py
@@ -0,0 +1,8 @@
from os.path import dirname, join
from rockset.model.field_mapping_query import FieldMappingQuery

ingest_tranformation = FieldMappingQuery(
    sql=open(
        join(dirname(__file__), "ingest-transformation.sql")
    ).read()
)
4 changes: 4 additions & 0 deletions sql/ingest-transformation.sql
@@ -0,0 +1,4 @@
SELECT
    *, VECTOR_ENFORCE(embeddings, 1536, 'float') as embeddings
FROM
    _input
