Merge pull request #51 from h1alexbel/47
feat(#47): extract step
h1alexbel authored Sep 9, 2024
2 parents ddc52df + 852411d commit 8892942
Showing 10 changed files with 304 additions and 99 deletions.
64 changes: 64 additions & 0 deletions README.md
@@ -70,6 +70,70 @@ just filter repos.csv

You should expect to have `sr-data/experiment/after-filter.csv`.

### Extract headings

From each README file we extract all its headings (the text after `#`).
We remove English stop words from each heading, lemmatize each word,
keep only words matching the `^[a-zA-Z]+$` regex, and take the most
common words (up to 5) across the README headings (a condensed sketch
of this pipeline follows the example below).

For instance, this README:

```markdown
# Building web applications in Java with Spring Boot 3
...

## Agenda
...

## Who am I?
...

## Prerequisites
...

## Outcomes
...

## What is Spring?
...

## Resources
...

### Dan Vega
...

### Spring
...

### Documentation
...

### Books
...

### Podcasts
...

### YouTube
...
```

Will be transformed to:

```text
['spring', 'build', 'web', 'application', 'java']
```
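
For illustration, here is a condensed sketch of that pipeline. It is a sketch only, assuming NLTK with the `stopwords`, `wordnet`, `punkt_tab`, and `averaged_perceptron_tagger_eng` resources, and an abridged heading list from the example above:

```python
import re
from collections import Counter

import nltk
from nltk import WordNetLemmatizer, pos_tag, word_tokenize
from nltk.corpus import stopwords, wordnet

for resource in ("stopwords", "wordnet", "punkt_tab",
                 "averaged_perceptron_tagger_eng"):
    nltk.download(resource)


def to_wordnet(tag):
    # Map a Penn Treebank tag prefix to a WordNet POS; default to noun.
    return {"J": wordnet.ADJ, "V": wordnet.VERB, "R": wordnet.ADV}.get(
        tag[0], wordnet.NOUN
    )


headings = [
    "Building web applications in Java with Spring Boot 3",
    "What is Spring?",
    "Resources",
    "Spring",
]
stop = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
words = []
for heading in headings:
    for token, tag in pos_tag(word_tokenize(heading)):
        word = token.lower()
        # Keep alphabetic, non-stop-word tokens only, then lemmatize.
        if word not in stop and re.match(r"^[a-zA-Z]+$", word):
            words.append(lemmatizer.lemmatize(word, to_wordnet(tag)))
print([word for word, _ in Counter(words).most_common(5)])
```

The committed task in `sr-data/src/sr_data/tasks/extract.py` (shown later in this diff) applies the same steps to each repository row of a pandas frame.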

To run this:

```bash
just extract after-filter.csv
```

You should expect to have `sr-data/experiment/after-extract.csv`.

## How to contribute

Make sure that you have [Python 3.10+], [just], and [npm] installed on your
4 changes: 4 additions & 0 deletions justfile
@@ -82,6 +82,10 @@ test-collect:
filter repos out="experiment/after-filter.csv":
cd sr-data && poetry poe filter --repos {{repos}} --out {{out}}

# Extract headings from README files.
extract repos out="experiment/after-extract.csv":
cd sr-data && poetry poe extract --repos {{repos}} --out {{out}}

# Build paper with LaTeX.
paper:
latexmk --version
1 change: 1 addition & 0 deletions pyproject.toml
@@ -32,6 +32,7 @@ python = "^3.10 || ^3.11 || ^3.12"
sr-data = { path = "./sr-data" }
sr-train = { path = "./sr-train" }
sr-detector = { path = "./sr-detector" }
nltk = "^3.9.1"

[tool.poetry.group.dev.dependencies]
pytest = "^8.2.2"
6 changes: 5 additions & 1 deletion sr-data/pyproject.toml
@@ -34,8 +34,8 @@ langdetect = "^1.0.9"
pandas = "^2.2.2"
beautifulsoup4 = "^4.12.3"
markdown = "^3.6"
openai = "^1.35.10"
requests = "^2.32.3"
nltk = "^3.9.1"

[tool.poetry.group.dev.dependencies]
pytest = "^8.2.2"
@@ -44,6 +44,10 @@ pytest = "^8.2.2"
script = "sr_data.tasks.filter:main(repos, out)"
args = [{name = "repos"}, {name = "out"}]

[tool.poe.tasks.extract]
script = "sr_data.tasks.extract:main(repos, out)"
args = [{name = "repos"}, {name = "out"}]
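# Illustrative direct invocation (paths are placeholders):
#   poetry poe extract --repos <repos.csv> --out <out.csv>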

[tool.poe.tasks.embed]
script = "sr_data.tasks.embed:main(key, checkpoint, csv, out)"
args = [ {name = "key"}, {name = "checkpoint"}, {name = "csv"}, {name = "out"} ]
40 changes: 0 additions & 40 deletions sr-data/src/sr_data/all.py

This file was deleted.

144 changes: 144 additions & 0 deletions sr-data/src/sr_data/tasks/extract.py
@@ -0,0 +1,144 @@
"""
Extract README headings (#).
"""
# The MIT License (MIT)
#
# Copyright (c) 2024 Aliaksei Bialiauski
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import re
from collections import Counter

import nltk
import pandas as pd
from nltk import WordNetLemmatizer, word_tokenize
from nltk.corpus import stopwords, wordnet

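# Fetch the NLTK resources needed for tokenization, POS tagging,
# stop-word removal, and lemmatization.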
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()


def main(repos, out):
print("Extracting headings from README files...")
frame = pd.read_csv(repos)
frame["headings"] = frame["readme"].apply(headings)
before = len(frame)
frame = frame.dropna(subset=["headings"])
print(f"Removed {before - len(frame)} repositories that don't have at least one heading (#)")
frame["headings"] = frame["headings"].apply(
lambda readme: remove_stop_words(readme, stopwords.words("english"))
)
frame["headings"] = frame["headings"].apply(
lambda headings: [lemmatize(heading) for heading in headings]
)
frame["headings"] = frame["headings"].apply(
lambda headings: filter(headings, r"^[a-zA-Z]+$")
)
frame["top"] = frame["headings"].apply(
lambda headings: top_words(headings, 5)
)
frame.to_csv(out, index=False)


def top_words(headings, amount):
"""
Calculate top words in headings
:param headings: README headings
:param amount: Amount of top words to find
:return: Array of top words
"""
words = [word for heading in headings for word in word_tokenize(heading)]
return [word for word, _ in Counter(words).most_common(amount)]


def filter(headings, regex):
"""
Filter headings
:param headings: README headings
:param regex: Regex
:return: Filtered headings
"""
words = [word for heading in headings for word in word_tokenize(heading)]
return [word for word in words if re.match(regex, word)]


def lemmatize(heading):
"""
Lemmatize heading.
:param heading: README heading
:return: Lemmatized README heading
"""
tokens = word_tokenize(heading)
tags = nltk.pos_tag(tokens)
return " ".join(
[lemmatizer.lemmatize(token.lower(), wordnet_pos(tag)) for token, tag in tags]
)


def wordnet_pos(tag):
"""
NLTK POS tag to WordNet POS tag
:param tag: NLTK POS tag
:return: WordNet POS tag
"""
    if tag.startswith("J"):
        return wordnet.ADJ
    elif tag.startswith("V"):
        return wordnet.VERB
    elif tag.startswith("N"):
        return wordnet.NOUN
    elif tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def remove_stop_words(headings, words):
"""
Remove stop words
:param headings: Headings
:param words: Stop words
:return: Headings without stop words
"""
clean = []
for head in headings:
clean.append(
' '.join([word for word in head.split() if word not in words])
)
return clean


def headings(readme):
"""
Extract all headings
:param readme: README content
:return: All README headings
"""
pattern = re.compile("(#+\\s.+)")
result = None
hashless = []
for match in pattern.findall(readme):
hashless.append(match.replace("#", "").strip())
if len(hashless) != 0:
result = hashless
return result
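
For a quick sanity check of the new helpers, a hypothetical REPL snippet (not part of this commit; assumes the package is installed and the NLTK resources above are available):

```python
from nltk.corpus import stopwords

from sr_data.tasks.extract import headings, remove_stop_words, top_words

readme = "# Building web apps\n\n## What is Spring?\n"
heads = headings(readme)  # ['Building web apps', 'What is Spring?']
clean = remove_stop_words(heads, stopwords.words("english"))
print(top_words(clean, 5))
```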
5 changes: 3 additions & 2 deletions sr-data/src/sr_data/tasks/filter.py
@@ -45,8 +45,9 @@ def main(repos, out):
non_null = start - len(frame)
after_null = len(frame)
print(f"Skipped {non_null} repositories with empty README files")
frame["readme"] = frame["readme"].apply(md_to_text)
frame = frame[frame["readme"].apply(english)]
frame["readme_text"] = frame["readme"].apply(md_to_text)
frame = frame[frame["readme_text"].apply(english)]
frame = frame.drop(columns=["readme_text"])
non_english = after_null - len(frame)
print(f"Skipped {non_english} non-english repositories")
print(f"Total skipped: {non_null + non_english}")
21 changes: 0 additions & 21 deletions sr-data/src/sr_data/verifications/__init__.py

This file was deleted.

35 changes: 0 additions & 35 deletions sr-data/src/sr_data/verifications/install.py

This file was deleted.
