# yelp_original

import json
import pandas as pd 
import time
import numpy as np
# Show full column contents when printing frames (no truncation of long text).
# NOTE(review): -1 as the "no limit" value is, to my knowledge, deprecated since
# pandas 1.0 in favour of None — confirm against the pandas version in use.
pd.set_option('display.max_colwidth', -1)
import json
import pandas as pd 
import time
import numpy as np
# Repeats the display.max_colwidth setting made earlier in this file; setting
# the same option to the same value again has no additional effect.
pd.set_option('display.max_colwidth', -1)
##GENERAL
import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from collections import Counter
import csv
import re
import concurrent.futures
##SPACY
import spacy
nlp = spacy.load('en_core_web_sm')
#nlp = spacy.load('en_core_web_md')
#nlp = spacy.load('en_core_web_lg')
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.pipeline import SentenceSegmenter
from spacy import displacy

# open json file
#DATA LOADING AND PROFILING
##Reading without Chunk Size Parameter
# Baseline run: the whole line-delimited JSON file is read in one call and the
# resulting frame is profiled for memory use. Timings are reported in minutes.
data_preprocessing_start_time = time.time()

#Reading Full File
fullfile_start = time.time()
path = '/Users/ankitkothari/Documents/ONGOING_PROJECTS/optimum/yelp_academic_dataset_review.json'
data = pd.read_json(path, lines=True)
full_file_read_time = (time.time() - fullfile_start) / 60

print(f'Time Taken for fullfile to read {full_file_read_time:03.2f} mins')

print(data.head())
print(f' Memory usage in MB {data.memory_usage(deep=True).sort_values()/(1024*1024)}')
# DataFrame.info() writes its report straight to stdout and returns None, so
# interpolating it into an f-string printed "data types None". Print the label
# and then let info() emit the report itself.
print('data types')
data.info(memory_usage="deep")
print(f'data size {data.size}')

# Mean deep memory footprint of the columns of each broad dtype family.
# memory_usage() is called with its defaults here, so the index entry is part
# of the averaged values as well.
for dtype in ['float', 'int', 'object']:
    selected_dtype = data.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print(f"Average memory usage for {dtype} columns: {mean_usage_mb:03.2f} MB")

data_preprocessing_time = (time.time() - data_preprocessing_start_time) / 60
print(f'Time Taken for Data Preprocessing WITHOUT Optimization  {data_preprocessing_time:03.2f} mins')

#Filtering Time
# Times a filter + derive-column + group-by pipeline on the 1-star reviews.
filter_start_time = time.time()
# Take an explicit copy: the boolean-mask selection is otherwise a slice of
# `data`, and adding the 'month' column to it triggers SettingWithCopyWarning.
only_rating_1 = data[data['stars'] == 1].copy()
print(only_rating_1.shape)
# Vectorised month extraction.
# NOTE(review): assumes 'date' was parsed to a datetime64 column by read_json
# (the original per-row `x.month` access already needed datetime-like values).
only_rating_1['month'] = only_rating_1['date'].dt.month
# Number of 1-star reviews per (business, month).
group_by_rating_1 = only_rating_1.groupby(['business_id', 'month']).agg(
    {
        'stars': 'count',
    })
original_filter = (time.time() - filter_start_time) / 60
print(f'Time Taken for Filtering without Memory Optimization  {original_filter:03.2f} mins')
# Pivot months into columns (outside the timed section) for display.
group_by_rating_1 = group_by_rating_1.unstack().reset_index()
print(group_by_rating_1.head())
###Aggregation
# Times two group-by aggregations over the full review frame.
aggregation_start_time = time.time()

# Mean star rating per business. Named aggregation replaces the nested-dict
# renamer ({'stars': {'Mean': np.mean}}), which pandas no longer accepts; the
# result has a flat 'Mean' column.
grouped = data.groupby(['business_id']).agg(Mean=('stars', 'mean'))
grouped = grouped.reset_index()


# Total 'cool' and 'funny' votes per star rating.
group_by_rating = data.groupby(['stars']).agg(
    {
        'cool': 'sum',
        'funny': 'sum',
    })
group_by_rating = group_by_rating.reset_index()

# Convert to minutes: the message below reports "mins", and every other timing
# in this script is divided by 60 as well.
aggregation_time = (time.time() - aggregation_start_time) / 60
print(grouped.head(25))
print(group_by_rating)
print(grouped.shape)
print(f'Time Taken for Aggregarion with No optimization  {aggregation_time:03.2f} mins')


##Text Cleaning
# Lookup table from English contractions and a few chat abbreviations
# ("w/", "w/o", "cnt", "u") to their expanded form. Most keys contain an
# apostrophe or a slash, so lookups only succeed on text that still has its
# punctuation. The literal previously listed "don't" twice; the duplicate
# entry (same value) has been removed.
contraction_mapping = {"ain't": "is not","don't": "do not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not", "doesn't": "does not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have", "w/":"with", "cnt":"cannot", "w/o":"without","u":"you"}

def spacy_preprocessing(text):
    """Clean one piece of review text for downstream NLP.

    Steps, in order:

    1. Expand contractions/abbreviations using ``contraction_mapping``.
    2. Blank out stand-alone single letters, replace every non-letter with a
       space, squeeze runs of four or more identical characters down to one,
       and collapse repeated whitespace.
    3. Tokenise with the module-level spaCy pipeline ``nlp`` and drop every
       token whose lower-cased text is in ``nlp.Defaults.stop_words``.

    Args:
        text: Raw text to clean.

    Returns:
        The surviving tokens, lower-cased and joined by single spaces.
    """
    def _expand(match):
        # Try the word as written first (the table has both "I'd" and "i'd"),
        # then its lower-cased form so "Don't"/"DON'T" are expanded too.
        word = match.group(0)
        return contraction_mapping.get(
            word, contraction_mapping.get(word.lower(), word))

    # Expansion has to happen BEFORE punctuation is removed: nearly every key
    # in contraction_mapping contains an apostrophe or '/', so looking tokens
    # up after the substitutions below could never match them.
    text = re.sub(r"[A-Za-z'/]+", _expand, text)

    #text = re.sub(r"\S*\w*.(com)\S*", "",text) #replaces any email or websitw with space
    text = re.sub(r"\b([a-zA-Z]{1})\b", " ", text) #replaces single random characters in the text with space
    text = re.sub(r"[^a-zA-Z]"," ",text) #replaces special characters with spaces
    text = re.sub(r"(.)\1{3,}", r"\1", text) #replaces multiple character with a word with one like pooooost will be post
    text = re.sub(r"\s{2,}", r" ", text) #replaces multiple space in the line with single space

    # The previous if/elif tested the "-PRON-" lemma both ways and appended in
    # either case, so the stop-word test was the only effective filter.
    # Compare the lower-cased token so capitalised stop words ("The", "This")
    # are dropped as well.
    # NOTE(review): assumes the entries of nlp.Defaults.stop_words are
    # lower-case, as in spaCy's English list — confirm for other languages.
    stop_words = nlp.Defaults.stop_words
    clean_text = []
    for token in nlp(text):
        lowered = token.text.lower()
        if lowered not in stop_words:
            clean_text.append(lowered)

    # lstrip() removes the leading blank left when the first kept token is
    # whitespace-only.
    clean_string = " ".join(clean_text).lstrip()
    return clean_string
# Row counts at which the text-cleaning step is timed.
checkpoints = [
    100,
    1_000,
    10_000,
    100_000,
]
def text_profile(checkpoint):
    """Time ``spacy_preprocessing`` over the first ``checkpoint`` rows of ``data``.

    Args:
        checkpoint: Number of leading rows of the module-level ``data`` frame
            to clean.

    Returns:
        A tuple ``(frame, minutes)``: a copy of the selected rows with an added
        ``'clean'`` column, and the elapsed wall-clock time in minutes.
    """
    start = time.time()
    # Slice first, then copy. Copying the whole frame only to keep its first
    # rows duplicated the entire dataset in memory on every call and counted
    # that copy in the measured time.
    temp = data.iloc[0:checkpoint].copy()
    temp['clean'] = temp['text'].apply(spacy_preprocessing)
    print(temp.shape)
    end = (time.time() - start) / 60
    return temp, end

# Run the profile once per checkpoint size and keep only the elapsed minutes
# (second element of text_profile's return value).
time_profile = []
for checkpoint in checkpoints:
    _, elapsed_minutes = text_profile(checkpoint)
    time_profile.append(elapsed_minutes)
print(time_profile)