-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy path19_pandas1_series2.py
37 lines (30 loc) · 1.12 KB
/
19_pandas1_series2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Pandas Series are similar to lists, but have many more functions
import pandas as pd
from pandas import Series
import nltk
# Let's get that wordcount series back.
# Read the whole text inside a context manager so the file is always
# closed, even if the read raises part-way through.
with open("holmes.txt", "r", encoding="utf8") as textfile:
    holmesstring = textfile.read()
# Tokenize the text, then keep only tokens made up entirely of
# alphanumeric characters (this drops punctuation tokens) and lowercase
# them so that e.g. "The" and "the" are counted as the same word.
words = nltk.word_tokenize(holmesstring)
words = [word.lower() for word in words if word.isalnum()]
wordSeries = Series(words)
# value_counts() yields a Series indexed by word whose values are the
# number of occurrences, sorted from most to least frequent.
wordCounts = wordSeries.value_counts()
# Unlike lists, the index of a Series can hold labels such as words,
# in addition to numbers. Here the index is the set of counted words.
print(wordCounts.index)
# You can get data by index label:
holmesNum = wordCounts["holmes"]
print(f"holmes occurs {holmesNum} times.")
# You can also get it by position. Use .iloc for that: a plain
# wordCounts[0] on a word-labelled index depends on a positional
# fallback that pandas has deprecated.
theNum = wordCounts.iloc[0]
print(f"The most common word occurs {theNum} times")
# You can use get to look up a label.
# It returns None if the label doesn't exist instead of raising,
# which is handy in avoiding errors.
print(wordCounts.get("chimp"))
# Series treat data in a very similar way to R (the statistical
# programming language), so arithmetic is applied element-wise
# across the whole Series (vector operations).
print(wordCounts * 2)
print(wordCounts + wordCounts)
print(wordCounts - wordCounts)