Merge pull request #51 from h1alexbel/47
feat(#47): extract step
h1alexbel authored Sep 9, 2024
2 parents ddc52df + 852411d commit 8892942
Showing 10 changed files with 304 additions and 99 deletions.
64 changes: 64 additions & 0 deletions README.md
@@ -70,6 +70,70 @@ just filter repos.csv

You should expect to have `sr-data/experiment/after-filter.csv`.

### Extract headings

From each README file we extract all its headings (the text after `#`).
We remove English stop words from each heading, lemmatize each word,
keep only words matching the `^[a-zA-Z]+$` regex, and take the most
common words (up to 5) across the README headings (a condensed sketch
of this pipeline follows the example below).

For instance, this README:

```markdown
# Building web applications in Java with Spring Boot 3
...

## Agenda
...

## Who am I?
...

## Prerequisites
...

## Outcomes
...

## What is Spring?
...

## Resources
...

### Dan Vega
...

### Spring
...

### Documentation
...

### Books
...

### Podcasts
...

### YouTube
...
```

Will be transformed to:

```text
['spring', 'build', 'web', 'application', 'java']
```
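
For illustration, here is a condensed sketch of that pipeline. It is a sketch only, assuming NLTK with the `stopwords`, `wordnet`, `punkt_tab`, and `averaged_perceptron_tagger_eng` resources, and an abridged heading list from the example above:

```python
import re
from collections import Counter

import nltk
from nltk import WordNetLemmatizer, pos_tag, word_tokenize
from nltk.corpus import stopwords, wordnet

for resource in ("stopwords", "wordnet", "punkt_tab",
                 "averaged_perceptron_tagger_eng"):
    nltk.download(resource)


def to_wordnet(tag):
    # Map a Penn Treebank tag prefix to a WordNet POS; default to noun.
    return {"J": wordnet.ADJ, "V": wordnet.VERB, "R": wordnet.ADV}.get(
        tag[0], wordnet.NOUN
    )


headings = [
    "Building web applications in Java with Spring Boot 3",
    "What is Spring?",
    "Resources",
    "Spring",
]
stop = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
words = []
for heading in headings:
    for token, tag in pos_tag(word_tokenize(heading)):
        word = token.lower()
        # Keep alphabetic, non-stop-word tokens only, then lemmatize.
        if word not in stop and re.match(r"^[a-zA-Z]+$", word):
            words.append(lemmatizer.lemmatize(word, to_wordnet(tag)))
print([word for word, _ in Counter(words).most_common(5)])
```

The committed task in `sr-data/src/sr_data/tasks/extract.py` (shown later in this diff) applies the same steps to each repository row of a pandas frame.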

To run this:

```bash
just extract after-filter.csv
```

You should expect to have `sr-data/experiment/after-extract.csv`.

## How to contribute

Make sure that you have [Python 3.10+], [just], and [npm] installed on your
4 changes: 4 additions & 0 deletions justfile
@@ -82,6 +82,10 @@ test-collect:
filter repos out="experiment/after-filter.csv":
cd sr-data && poetry poe filter --repos {{repos}} --out {{out}}

# Extract headings from README files.
extract repos out="experiment/after-extract.csv":
cd sr-data && poetry poe extract --repos {{repos}} --out {{out}}

# Build paper with LaTeX.
paper:
latexmk --version
1 change: 1 addition & 0 deletions pyproject.toml
@@ -32,6 +32,7 @@ python = "^3.10 || ^3.11 || ^3.12"
sr-data = { path = "./sr-data" }
sr-train = { path = "./sr-train" }
sr-detector = { path = "./sr-detector" }
nltk = "^3.9.1"

[tool.poetry.group.dev.dependencies]
pytest = "^8.2.2"
6 changes: 5 additions & 1 deletion sr-data/pyproject.toml
@@ -34,8 +34,8 @@ langdetect = "^1.0.9"
pandas = "^2.2.2"
beautifulsoup4 = "^4.12.3"
markdown = "^3.6"
openai = "^1.35.10"
requests = "^2.32.3"
nltk = "^3.9.1"

[tool.poetry.group.dev.dependencies]
pytest = "^8.2.2"
@@ -44,6 +44,10 @@ pytest = "^8.2.2"
script = "sr_data.tasks.filter:main(repos, out)"
args = [{name = "repos"}, {name = "out"}]

[tool.poe.tasks.extract]
script = "sr_data.tasks.extract:main(repos, out)"
args = [{name = "repos"}, {name = "out"}]
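# Illustrative direct invocation (paths are placeholders):
#   poetry poe extract --repos <repos.csv> --out <out.csv>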

[tool.poe.tasks.embed]
script = "sr_data.tasks.embed:main(key, checkpoint, csv, out)"
args = [ {name = "key"}, {name = "checkpoint"}, {name = "csv"}, {name = "out"} ]
40 changes: 0 additions & 40 deletions sr-data/src/sr_data/all.py

This file was deleted.

144 changes: 144 additions & 0 deletions sr-data/src/sr_data/tasks/extract.py
@@ -0,0 +1,144 @@
"""
Extract README headings (#).
"""
# The MIT License (MIT)
#
# Copyright (c) 2024 Aliaksei Bialiauski
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import re
from collections import Counter

import nltk
import pandas as pd
from nltk import WordNetLemmatizer, word_tokenize
from nltk.corpus import stopwords, wordnet

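# Fetch the NLTK resources needed for tokenization, POS tagging,
# stop-word removal, and lemmatization.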
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()


def main(repos, out):
print("Extracting headings from README files...")
frame = pd.read_csv(repos)
frame["headings"] = frame["readme"].apply(headings)
before = len(frame)
frame = frame.dropna(subset=["headings"])
print(f"Removed {before - len(frame)} repositories that don't have at least one heading (#)")
frame["headings"] = frame["headings"].apply(
lambda readme: remove_stop_words(readme, stopwords.words("english"))
)
frame["headings"] = frame["headings"].apply(
lambda headings: [lemmatize(heading) for heading in headings]
)
frame["headings"] = frame["headings"].apply(
lambda headings: filter(headings, r"^[a-zA-Z]+$")
)
frame["top"] = frame["headings"].apply(
lambda headings: top_words(headings, 5)
)
frame.to_csv(out, index=False)


def top_words(headings, amount):
"""
Calculate top words in headings
:param headings: README headings
:param amount: Amount of top words to find
:return: Array of top words
"""
words = [word for heading in headings for word in word_tokenize(heading)]
return [word for word, _ in Counter(words).most_common(amount)]


def filter(headings, regex):
"""
Filter headings
:param headings: README headings
:param regex: Regex
:return: Filtered headings
"""
words = [word for heading in headings for word in word_tokenize(heading)]
return [word for word in words if re.match(regex, word)]


def lemmatize(heading):
"""
Lemmatize heading.
:param heading: README heading
:return: Lemmatized README heading
"""
tokens = word_tokenize(heading)
tags = nltk.pos_tag(tokens)
return " ".join(
[lemmatizer.lemmatize(token.lower(), wordnet_pos(tag)) for token, tag in tags]
)


def wordnet_pos(tag):
"""
NLTK POS tag to WordNet POS tag
:param tag: NLTK POS tag
:return: WordNet POS tag
"""
    if tag.startswith("J"):
        return wordnet.ADJ
    elif tag.startswith("V"):
        return wordnet.VERB
    elif tag.startswith("N"):
        return wordnet.NOUN
    elif tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def remove_stop_words(headings, words):
"""
Remove stop words
:param headings: Headings
:param words: Stop words
:return: Headings without stop words
"""
clean = []
for head in headings:
clean.append(
' '.join([word for word in head.split() if word not in words])
)
return clean


def headings(readme):
"""
Extract all headings
:param readme: README content
:return: All README headings
"""
pattern = re.compile("(#+\\s.+)")
result = None
hashless = []
for match in pattern.findall(readme):
hashless.append(match.replace("#", "").strip())
if len(hashless) != 0:
result = hashless
return result
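
For a quick sanity check of the new helpers, a hypothetical REPL snippet (not part of this commit; assumes the package is installed and the NLTK resources above are available):

```python
from nltk.corpus import stopwords

from sr_data.tasks.extract import headings, remove_stop_words, top_words

readme = "# Building web apps\n\n## What is Spring?\n"
heads = headings(readme)  # ['Building web apps', 'What is Spring?']
clean = remove_stop_words(heads, stopwords.words("english"))
print(top_words(clean, 5))
```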
5 changes: 3 additions & 2 deletions sr-data/src/sr_data/tasks/filter.py
@@ -45,8 +45,9 @@ def main(repos, out):
non_null = start - len(frame)
after_null = len(frame)
print(f"Skipped {non_null} repositories with empty README files")
frame["readme"] = frame["readme"].apply(md_to_text)
frame = frame[frame["readme"].apply(english)]
frame["readme_text"] = frame["readme"].apply(md_to_text)
frame = frame[frame["readme_text"].apply(english)]
frame = frame.drop(columns=["readme_text"])
non_english = after_null - len(frame)
print(f"Skipped {non_english} non-english repositories")
print(f"Total skipped: {non_null + non_english}")
21 changes: 0 additions & 21 deletions sr-data/src/sr_data/verifications/__init__.py

This file was deleted.

35 changes: 0 additions & 35 deletions sr-data/src/sr_data/verifications/install.py

This file was deleted.
