-
Notifications
You must be signed in to change notification settings - Fork 16
Create a pull request #12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,8 @@ | ||
| # TextMining | ||
|
|
||
| This is the base repo for the text mining and analysis project for Software Design at Olin College. | ||
|
|
||
| # commonwords.py | ||
| Required packages: | ||
|
|
||
| $ pip install wikipedia | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,86 @@ | ||
| import wikipedia | ||
| import math | ||
|
|
||
def text_to_word(text):
    """Split *text* into a list of lowercase words with punctuation removed.

    Hyphens are turned into spaces, so a hyphenated compound yields
    several words. Parentheses, apostrophes, periods and commas are
    deleted outright. The cleaned text is then lowercased and split on
    whitespace.

    >>> text_to_word("Well-known (U.S.) poet's work, etc.")
    ['well', 'known', 'us', 'poets', 'work', 'etc']
    """
    # A single translation table replaces the chain of str.replace calls:
    # stripping one more character is a one-character change here, and the
    # text is scanned once instead of once per character.
    table = str.maketrans("-", " ", "()'.,")
    return text.translate(table).lower().split()
|
|
||
def histogram(unsorted_words):
    """Count how often each word occurs in *unsorted_words*.

    Returns a plain dict that maps every distinct word to its number of
    occurrences. Keys appear in the order the words are first seen.
    """
    # Imported locally so this function does not depend on an edit to the
    # file's import section.
    from collections import Counter

    # Counter does the tallying; converting back keeps the return type a
    # plain dict, exactly as before.
    return dict(Counter(unsorted_words))
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. (Also a minor suggestion) |
||
|
|
||
def matching_summary(search_1, search_2):
    """Score the words shared by the Wikipedia summaries of two topics.

    The summaries of ``search_1`` and ``search_2`` are fetched with
    ``wikipedia.summary``, tokenized with ``text_to_word`` and counted with
    ``histogram``. Every word that occurs in both summaries and is not a
    stop word (see the set below) is stored in the returned dict, mapped
    to the geometric mean of its two counts. Keys follow the order in
    which the words first appear in the first summary.

    A one-line header naming the two search terms is printed as well.
    """
    print('Common words in the summary of', search_1, '&', search_2)

    histogram_1 = histogram(text_to_word(wikipedia.summary(search_1)))
    histogram_2 = histogram(text_to_word(wikipedia.summary(search_2)))

    # All filtered words live in one frozenset so that each membership test
    # is a single hash lookup instead of a scan over seven separate lists.
    stop_words = frozenset({
        # prepositions and similar
        'as', 'by', 'in', 'at', 'on', 'with', 'of', 'for', 'to', 'through',
        'after', 'from', 'over', 'until', 'during', 'under', 'all',
        # pronouns
        'i', 'me', 'my', 'he', 'his', 'him', 'she', 'her', 'hers', 'we',
        'our', 'ours', 'it', 'its', 'they', 'them', 'their',
        # numbers
        'one', 'two', 'three', 'four', 'five', 'first', 'second', 'third',
        # articles and other filler
        'also', 'an', 'a', 'the', 'non', 'most',
        # common verbs
        'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had',
        'get', 'got', 'gotten', 'can', 'could', 'make', 'made',
        # conjunctions and question words
        'and', 'but', 'or', 'yet', 'that', 'which', 'where', 'what',
        'when', 'why', 'how',
        # stray symbols left over after tokenizing
        '–', '&',
    }) | frozenset('abcdefghijklmnopqrstuvwxyz')  # single letters

    # Walk the distinct words of the first summary once. The histogram keeps
    # first-occurrence order, so the result is ordered the same way as when
    # looping over the raw token list, without re-scoring repeated words.
    common_count = {}
    for word, count_1 in histogram_1.items():
        count_2 = histogram_2.get(word, 0)
        # A zero count means the word is missing from the second summary.
        if count_2 and word not in stop_words:
            common_count[word] = math.sqrt(count_1 * count_2)

    return common_count
|
|
||
import matplotlib.pyplot as plt

# Ask the user for the two Wikipedia topics to compare.
word1 = input('1st word: ')
word2 = input('2nd word: ')
forxlabel = 'Common Words in Wikipedia Summary: ' + word1 + ' & ' + word2
forylabel = 'Geometric Mean of Frequency'

# Turn the dictionary of common words into a bar chart: one bar per word,
# the word itself as the (rotated) tick label, and a label on each axis.
dictionary = matching_summary(word1, word2)
positions = range(len(dictionary))
fig = plt.figure()
# Materialize the dict views as lists so matplotlib receives plain sequences.
plt.bar(positions, list(dictionary.values()), align='center')
plt.xticks(positions, list(dictionary.keys()), rotation=80)
plt.xlabel(forxlabel)
plt.ylabel(forylabel)

# Write the image before opening the interactive window: plt.show() blocks
# until the window is closed, and saving first means the file does not
# depend on the state of the figure afterwards.
fig.savefig('image', dpi=600, bbox_inches='tight')
plt.show()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| import wikipedia | ||
| import math | ||
|
|
||
def text_to_word(text):
    """Split *text* into a list of lowercase words.

    Hyphens are turned into spaces; parentheses and apostrophes are
    deleted. The cleaned text is then lowercased and split on whitespace.
    Other punctuation (periods, commas, ...) is left attached to the words.

    >>> text_to_word("Semi-Formal (it's) Test.")
    ['semi', 'formal', 'its', 'test.']
    """
    # One translation table instead of a chain of str.replace calls: the
    # text is scanned once and the character set is easy to extend.
    table = str.maketrans("-", " ", "()'")
    return text.translate(table).lower().split()
|
|
||
def histogram(unsorted_words):
    """Count how often each word occurs in *unsorted_words*.

    Returns a plain dict that maps every distinct word to its number of
    occurrences. Keys appear in the order the words are first seen.
    """
    # Imported locally so this function does not depend on an edit to the
    # file's import section.
    from collections import Counter

    # Counter does the tallying; converting back keeps the return type a
    # plain dict, exactly as before.
    return dict(Counter(unsorted_words))
|
|
||
| #histogram(text_to_word('Subeen-is-(an)-idiot')) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can remove the unnecessary comments before submitting. |
||
|
|
||
def matching_summary():
    """Prompt for two topics and score the words their summaries share.

    Reads two search terms from standard input, fetches each summary with
    ``wikipedia.summary``, tokenizes it with ``text_to_word`` and counts
    the words with ``histogram``. Returns a dict mapping every word found
    in both summaries to the geometric mean of its two counts. Keys follow
    the order in which the words first appear in the first summary.
    """
    print('Relationship between the given words will be presented')
    search_1 = input('1st word: ')
    search_2 = input('2nd word: ')
    histogram_1 = histogram(text_to_word(wikipedia.summary(search_1)))
    histogram_2 = histogram(text_to_word(wikipedia.summary(search_2)))

    # Every histogram count is at least 1, so the geometric mean is non-zero
    # exactly when the word is present in both histograms. Iterating the
    # histogram visits each distinct word once, in first-occurrence order.
    return {
        word: math.sqrt(count_1 * histogram_2[word])
        for word, count_1 in histogram_1.items()
        if word in histogram_2
    }
|
|
||
# matching_summary() returns its result instead of printing it, so display
# it here; otherwise the script prompts for two words and then shows nothing.
print(matching_summary())
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would put more documentation in the README file.
Besides the required packages, you could add a description, relevant links, and usage instructions.