Skip to content

Commit 8acf86a

Browse files
committed
added text summarizer python
1 parent 60a8076 commit 8acf86a

File tree

2 files changed

+67
-0
lines changed

2 files changed

+67
-0
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
nltk
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import nltk
2+
nltk.download('punkt_tab')
3+
nltk.download('stopwords')
4+
from nltk.corpus import stopwords
5+
from nltk.tokenize import word_tokenize, sent_tokenize
6+
import string
7+
8+
9+
def summarize_text_nltk(text, num_sentences=3):
    """
    Summarizes the input text using a frequency-based method with NLTK.

    Each non-stopword, alphanumeric token is weighted by its frequency in the
    whole text (normalized by the most frequent token). A sentence's score is
    the sum of the weights of the tokens it contains. The highest-scoring
    sentences are returned in their original order.

    Args:
        text (str): The text to be summarized.
        num_sentences (int): The desired number of sentences in the summary.

    Returns:
        str: The summarized text. An empty string is returned when ``text``
        is empty or ``num_sentences`` is not positive. The whitespace-
        normalized text is returned unchanged when it has no more than
        ``num_sentences`` sentences.
    """
    # Guard before any tokenization: a non-positive count can never select
    # a sentence, and a negative value would otherwise be interpreted as a
    # "drop from the end" slice further down.
    if not text or num_sentences <= 0:
        return ""

    # Collapse all runs of whitespace (including newlines) to single spaces.
    text = " ".join(text.split())
    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text

    stop_words = set(stopwords.words('english'))
    word_freq = {}
    for word in word_tokenize(text.lower()):
        if word.isalnum() and word not in stop_words:
            word_freq[word] = word_freq.get(word, 0) + 1

    # Nothing to rank on (only stopwords/punctuation): max() below would
    # raise on an empty sequence, so fall back to the leading sentences.
    if not word_freq:
        return " ".join(sentences[:num_sentences])

    max_freq = max(word_freq.values())
    weights = {word: count / max_freq for word, count in word_freq.items()}

    # Score by sentence position rather than by sentence text so that a
    # repeated sentence is neither double-counted nor emitted more often
    # than it was selected.
    scores = {}
    for index, sentence in enumerate(sentences):
        hits = [
            weights[word]
            for word in word_tokenize(sentence.lower())
            if word in weights
        ]
        # Sentences without any weighted token are never candidates.
        if hits:
            scores[index] = sum(hits)

    # sorted() is stable, so ties keep their original (document) order.
    ranked = sorted(scores, key=scores.get, reverse=True)
    chosen = sorted(ranked[:num_sentences])
    return " ".join(sentences[index] for index in chosen)
54+
55+
# Demo input: a short paragraph to run through the summarizer.
input_text = """
Hacktoberfest is a month-long celebration of open source software, where developers from around
the world contribute to various projects. It encourages participation in the open source community
and helps improve software quality through collaborative efforts. Participants can earn rewards by
making contributions, such as pull requests, to eligible repositories on GitHub.
"""

# Reduce the paragraph to its single highest-scoring sentence.
summary = summarize_text_nltk(input_text, num_sentences=1)

# Show the original text first, then the summary on the following line.
print(input_text, summary, sep="\n")

0 commit comments

Comments
 (0)