Remove link file storage
gadhagod committed Jul 20, 2023
1 parent d541311 commit 24d12c6
Showing 6 changed files with 169 additions and 81 deletions.
6 changes: 1 addition & 5 deletions .github/workflows/store-embeddings.yml
@@ -13,11 +13,7 @@ jobs:
      - name: Install requirements
        run: pip install -r requirements.txt
      - name: Generate links
        run: python3 generate_links.py
      - name: Show links
        run: cat links.txt
      - name: Create and store embeddings
        run: python3 links_to_docs.py
        run: python3 generate_links.py --reset
        env:
          ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
41 changes: 36 additions & 5 deletions constants.py
@@ -1,12 +1,46 @@
from sys import argv
from time import sleep
from os import getenv
from rockset import RocksetClient, Regions
from rockset import RocksetClient, Regions, exceptions
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Rockset as RocksetStore
from sql import ingest_tranformation

rockset_api_key = getenv("ROCKSET_API_KEY")
openai_api_key = getenv("OPENAI_API_KEY")

rockset = RocksetClient(Regions.rs2, rockset_api_key)

def collection_exists():
    try:
        rockset.Collections.get(collection="hyrule-compendium-ai")
    except exceptions.NotFoundException:
        return False
    return True

def collection_is_ready():
    return rockset.Collections.get(collection="hyrule-compendium-ai").data.status == "READY"

def delete_collection():
    print("Deleting collection \"commons.hyrule-compendium-ai\"")
    rockset.Collections.delete(collection="hyrule-compendium-ai")

def create_collection():
    print("Creating collection \"commons.hyrule-compendium-ai\"")
    rockset.Collections.create_s3_collection(name="hyrule-compendium-ai", field_mapping_query=ingest_tranformation)

if "--reset" in argv:
    if collection_exists():
        delete_collection()
        while collection_exists():
            sleep(1)

    create_collection()
    while not collection_exists():
        sleep(1)
    while not collection_is_ready():
        sleep(1)

openai = OpenAIEmbeddings(
    openai_api_key=openai_api_key,
    model="text-embedding-ada-002"
@@ -17,7 +51,4 @@
"hyrule-compendium-ai",
"text",
"embedding"
)

# test connectivity
store.add_texts(["Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum."])
)
156 changes: 120 additions & 36 deletions generate_links.py
@@ -1,9 +1,76 @@
from sys import setrecursionlimit
from requests import get, exceptions
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from constants import store, rockset as rs

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 120,
    length_function = len,
    add_start_index = True,
)

class LinkNode():
    def __init__(self, link, next=None):
        self.link = link
        self.next = next

class LinkQueue():
    def __init__(self, init_value=None):
        self.first = LinkNode(init_value, None) if init_value is not None else None
        self.last = self.first

    def remove(self):
        if self.first is self.last: # one item in queue
            link = self.first.link
            self.first = None
            self.last = None
            return link
        prev_first = self.first
        self.first = self.first.next
        return prev_first.link

    def add(self, link):
        node = LinkNode(link)
        if self.first is None and self.last is None: # empty queue
            self.first = node
        else:
            self.last.next = node
        self.last = node

    def is_empty(self):
        return self.first is None

    def add_elem_links(self, a_elems):
        for i in a_elems:
            self.add(i["href"])

    def __str__(self) -> str:
        if self.is_empty():
            return "[]"
        res = ""
        curr = self.first
        while curr is not None:
            res += f"{curr.link}, "
            curr = curr.next
        return f"[{res[:-2]}]"


class Scraper():
    def _is_valid(link: str):
    def _cleanse(self, link):
        paramLoc = link.find("?")
        if paramLoc > 0:
            link = link[:paramLoc]
        hashLoc = link.find("#")
        if hashLoc > 0:
            link = link[:hashLoc]
        if link.startswith("/"):
            link = "https://zelda.fandom.com" + link
        return link

    def _is_valid(self, link: str):
        return (
            not link.startswith("#") and
            (link.startswith("https://zelda.fandom.com/wiki") or link.startswith("/")) and
@@ -22,42 +89,59 @@ def _is_valid(link: str):
not "Guidelines" in link and
not "Help" in link and
not "Template" in link

)

def _is_category(self, link):
return "Category:" in link

def __init__(self):
setrecursionlimit(1000000)
self.cnt = 0
self.link_file = open("links.txt", "w+")
self.scraped_links = set()
self.scrape("https://zelda.fandom.com/wiki/Main_Page")
self.link_file.close()
print(len(self.scraped_links))
def _scrape(self, link):
soup = BeautifulSoup(get(link).text, "html.parser")

if self._is_category(link): # we do not need to generate embeddings for this page
rs.Documents.add_documents(
collection="hyrule-compendium-ai",
data=[{
"source": link, # make sure we do not scrape this page again
"embedding": None
}]
)
else:
page_title = soup.find("title").get_text()
page_text = soup.find(class_="page__main").get_text().replace("\n\n", "\n")
docs = text_splitter.create_documents([page_text],[{"source": link}])
store.add_texts(
texts=[f"This information is about {page_title}. {doc.page_content}" for doc in docs],
metadatas=[doc.metadata for doc in docs]
)

return soup

    def _has_been_scraped(self, link):
        return len(rs.sql("""
            SELECT
                1
            FROM
                commons."hyrule-compendium-ai"
            WHERE
                source = :link
            """,
            params={"link": str(link)}).results
        ) > 0

    def scrape(self, link):
        if (self.cnt > 5000):
            return
        print(f"Scraping {link} ...")
        try:
            soup = BeautifulSoup(get(link).text, "html.parser")
        except exceptions.RequestException as e:
            print(e)
            return # skip
        links = soup.find_all("a", {"href": lambda value: value})
        for i in links:
            href = i["href"]
            paramLoc = href.find("?")
            if paramLoc > 0:
                href = href[:paramLoc]
            hashLoc = href.find("#")
            if hashLoc > 0:
                href = href[:hashLoc]
            if href.startswith("/"):
                href = "https://zelda.fandom.com" + href
            if (href not in self.scraped_links) and Scraper._is_valid(href):
                if "Category:" not in href:
                    self.link_file.write(href + "\n")
                self.scraped_links.add(href)
                self.scrape(href)
    def __init__(self):
        self.first = True
        links = LinkQueue("https://zelda.fandom.com/wiki/Main_Page")
        while not links.is_empty():
            curr_link = self._cleanse(links.remove())
            if self.first or (self._is_valid(curr_link) and not self._has_been_scraped(curr_link)):
                print(f"Scraping {curr_link}...")
                try:
                    soup = self._scrape(curr_link)
                except exceptions.RequestException as e:
                    print(f"Skipping {curr_link}: {e}")
                    return # skip

                links.add_elem_links(soup.find_all("a", {"href": lambda value: value}))
                self.first = False

Scraper()
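
A minimal usage sketch of the LinkQueue class committed above, shown outside the scraper for clarity; the wiki URLs here are hypothetical placeholders, and the behaviour assumes the class exactly as defined in this diff:

# Sketch: FIFO behaviour of LinkQueue with three hypothetical links.
queue = LinkQueue("https://zelda.fandom.com/wiki/Main_Page")
queue.add("https://zelda.fandom.com/wiki/Link")          # appended at the tail
queue.add("https://zelda.fandom.com/wiki/Master_Sword")

print(queue)             # [https://zelda.fandom.com/wiki/Main_Page, https://zelda.fandom.com/wiki/Link, https://zelda.fandom.com/wiki/Master_Sword]
print(queue.remove())    # https://zelda.fandom.com/wiki/Main_Page -- the oldest link comes out first
print(queue.is_empty())  # False: two links are still queued
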
35 changes: 0 additions & 35 deletions links_to_docs.py

This file was deleted.

8 changes: 8 additions & 0 deletions sql/__init__.py
@@ -0,0 +1,8 @@
from os.path import dirname, join
from rockset.model.field_mapping_query import FieldMappingQuery

ingest_tranformation = FieldMappingQuery(
    sql=open(
        join(dirname(__file__), "ingest-transformation.sql")
    ).read()
)
4 changes: 4 additions & 0 deletions sql/ingest-transformation.sql
@@ -0,0 +1,4 @@
SELECT
    *, VECTOR_ENFORCE(embeddings, 1536, 'float') as embeddings
FROM
    _input
