-
Notifications
You must be signed in to change notification settings - Fork 0
/
kindleVocab.py
136 lines (117 loc) · 4.76 KB
/
kindleVocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import sqlite3
import pandas as pd
# Module-level side effect: setting pandas' 'mode.chained_assignment' option to
# None silences the chained-assignment (SettingWithCopy) warning for the whole
# process, not just for this module.
pd.set_option('mode.chained_assignment', None)
class kindleVocab(object):
    '''Fetch the vocabulary looked up in Kindle books from a ``vocab.db`` file.

    The database is read once in ``__init__`` into ``self.df``, a DataFrame
    with the columns listed in ``_COLUMNS``. The ``set*`` methods narrow
    ``self.df`` in place (by language, then by book); the ``get*``/``find*``
    methods only read it.

    Attributes:
        SQLCode: query joining WORDS, LOOKUPS and BOOK_INFO.
        df: the current (possibly filtered) vocabulary table. It is an empty
            frame with the expected columns when the database could not be
            loaded.
        isTitleSet: True once a single book has been selected.
        lang: the language chosen through ``setLanguage`` (unset before that).
    '''

    # One row per lookup: the word, its usage sentence and the book it is from.
    SQLCode = '''SELECT w.stem, w.word, w.lang, w.id, l.usage, l.book_key, b.title, b.authors
FROM WORDS w
JOIN LOOKUPS l ON w.id = l.word_key
JOIN BOOK_INFO b ON l.book_key = b.id;
'''

    # Columns of ``self.df`` as produced by ``createDataframe``.
    _COLUMNS = ['stem', 'word', 'lang', 'usage', 'title', 'authors', 'id']

    def __init__(self):
        '''Connect to ``vocab.db`` and load it into ``self.df``.

        Failures are reported on stdout instead of being raised. In that case
        ``self.df`` stays an empty frame so the instance remains usable.
        '''
        # Define the state up front so a failed load never leaves the
        # instance without ``df`` / ``isTitleSet``.
        self.isTitleSet = False
        self.df = pd.DataFrame(columns=self._COLUMNS)

        file_path = self.getPath()
        # sqlite3.connect() silently creates a missing file; check first so a
        # typo or wrong directory does not leave an empty vocab.db behind.
        if not os.path.isfile(file_path):
            print(f"{file_path} does not exist. "
                  "Please add vocab.db to the current working directory.")
            print("Database connection failed")
            return

        try:
            self.db = sqlite3.connect(file_path)
            try:
                self.df = self.createDataframe(self.SQLCode, self.db)
            finally:
                # Close the connection whether or not the query succeeded.
                self.db.close()
            print("\nDatabase connection opened and DataFrame created\n")
        # pandas wraps driver errors raised while executing the query in its
        # own DatabaseError, so sqlite3.Error alone would miss them.
        except (sqlite3.Error, pd.io.sql.DatabaseError) as e:
            print(e)
            print("Database connection failed")

    def getPath(self):
        '''Return the path of ``vocab.db`` inside the current working directory.

        Returns:
            The absolute path, or the bare file name (resolved relative to the
            working directory by the OS) if the working directory cannot be
            determined.
        '''
        filename = "vocab.db"
        try:
            return os.path.join(os.getcwd(), filename)
        except OSError:
            # os.getcwd() fails when the working directory was removed.
            return filename

    def createDataframe(self, SQLCode, db):
        '''Run the query and shape the result into the vocabulary table.

        Args:
            SQLCode: the SELECT statement to execute.
            db: an open database connection accepted by ``pd.read_sql_query``.

        Returns:
            A DataFrame sorted by title, without the database keys, with a
            new ``id`` column numbering the titles 1..n in sorted order and
            with capitalized stems.

        Side effects:
            Resets ``self.isTitleSet`` to False.
        '''
        frame = pd.read_sql_query(SQLCode, db)
        # The database keys are replaced by a small, human-friendly book id.
        frame = frame.drop(columns=['book_key', 'id'])
        frame = frame.sort_values(by='title').reset_index(drop=True)

        unique_titles = sorted(set(frame['title']))
        title_to_id = {title: number
                       for number, title in enumerate(unique_titles, start=1)}
        frame['id'] = frame['title'].map(title_to_id)
        frame['stem'] = frame['stem'].str.capitalize()

        self.isTitleSet = False
        return frame

    @staticmethod
    def _titleTable(frame):
        '''Return the distinct (id, title) pairs of *frame*, indexed by id.'''
        # Deduplicating the pair keeps every id next to its own title.
        return frame[['id', 'title']].drop_duplicates().set_index('id')

    def _selectRows(self, mask):
        '''Keep only the rows of ``self.df`` selected by *mask* and mark a title as set.'''
        # Non-in-place reset avoids mutating a slice of the previous frame.
        self.df = self.df[mask].reset_index(drop=True)
        self.isTitleSet = True

    def getLanguage(self):
        '''Return the distinct languages in the table.

        Raises KeyError after ``setLanguage`` has been called, because that
        method drops the ``lang`` column.
        '''
        return self.df["lang"].unique()

    def setLanguage(self, lang):
        '''Keep only the rows of *lang* and drop the ``lang`` and ``word`` columns.

        Because the ``lang`` column is dropped, this can only be called once
        per instance; a second call raises KeyError.

        Returns:
            A confirmation message.
        '''
        self.lang = lang
        matching = self.df[self.df["lang"] == self.lang]
        self.df = matching.drop(columns=['lang', 'word'])
        return f"Language set to {self.lang}"

    def getTitle(self):
        '''Return the distinct book titles as a DataFrame indexed by ``id``.'''
        return self._titleTable(self.df)

    def isTitle(self, title):
        '''Return True if *title* is present in the table, else False.'''
        # .any() is well defined for zero matches, unlike the truth value of
        # an empty comparison array.
        return bool((self.df['title'] == title).any())

    def isId(self, id):
        '''Return True if the book *id* is present in the table, else False.'''
        return bool((self.df['id'] == id).any())

    def setTitleByName(self, title):
        '''Narrow the table to the book called *title*.

        Returns:
            "Title set to <title>" on success, "Invalid title, please try
            again" if the title is unknown, or "Title already set" if a book
            was selected before.
        '''
        if self.isTitleSet:
            return "Title already set"
        if not self.isTitle(title):
            return "Invalid title, please try again"
        self._selectRows(self.df["title"] == title)
        return f"Title set to {self.getSelectedTitle()}"

    def setTitleById(self, id):
        '''Narrow the table to the book with the given *id*.

        Unlike ``setTitleByName`` this does not consult ``isTitleSet``; once a
        book is selected, any other id is simply reported as invalid.

        Returns:
            "Title set to <title>" on success, otherwise "Invalid title,
            please try again".
        '''
        if not self.isId(id):
            return "Invalid title, please try again"
        self._selectRows(self.df["id"] == id)
        return f"Title set to {self.getSelectedTitle()}"

    def getSelectedTitle(self):
        '''Return the title of the first row (the selected book once one is set).

        Raises IndexError if the table is empty.
        '''
        return self.df["title"].iloc[0]

    def getSelectedAuthor(self):
        '''Return the authors of the first row (the selected book once one is set).

        Raises IndexError if the table is empty.
        '''
        return self.df["authors"].iloc[0]

    def getAuthor(self):
        '''Return the distinct book authors.'''
        return self.df["authors"].unique()

    def getShape(self):
        '''Return the ``(rows, columns)`` shape of the table.'''
        return self.df.shape

    def findTitle(self, substring):
        '''Find the books whose title contains *substring*, ignoring case.

        *substring* is matched literally, so characters such as ``+`` or
        ``(`` need no escaping.

        Returns:
            A DataFrame of matching titles indexed by ``id``, or None if
            nothing matches.
        '''
        # na=False keeps the mask strictly boolean when a title is missing.
        mask = self.df['title'].str.contains(
            substring, case=False, na=False, regex=False)
        result = self.df[mask]
        if result.empty:
            return None
        return self._titleTable(result)