Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .env_example
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
HOST=0.0.0.0
HOST='0.0.0.0'
PORT=8050
USER_FILE_PATH='file/path/user.json'
MATCH_FILE_PATH='file/path/matches.json'
ASSETS_PATH='app/assets/'
GEOLITE_DB_PATH='data/GeoLite2-City.mmdb'
251 changes: 119 additions & 132 deletions app/analytics/MatchAnalytics.py
Original file line number Diff line number Diff line change
@@ -1,132 +1,119 @@
import pandas as pd
import re
import json


def prepare_uploaded_match_data(file_path="../data/app_uploaded_files/matches.json"):
__validate_upload_file_type(file_path)
__validate_match_file_upload(file_path)

with open(file_path, 'r') as file:
# match upload data is a list of dictionaries
match_upload_data = json.load(file)

events = []
for interaction, all_actions in enumerate(match_upload_data):
# action type is like, match, chats, blocks, overarching "action"
for action_type, actions in all_actions.items():
# action is the metadata assoc. one event of the action type
for action in actions:
action["interaction_id"] = interaction
events.append(action)

return pd.DataFrame(events).sort_values("timestamp")


def date_count_distribution(events):
chat_events = events[events["type"] == "chats"]
chats_per_interaction = chat_events.groupby('interaction_id').size()

# convert the Series to a DataFrame with specified column names, have to reset the index
interaction_counts = chats_per_interaction.to_frame().reset_index()
interaction_counts.columns = ['interaction_id', 'outgoing_messages']
return interaction_counts


def activity_by_date(events):
events['activity_date'] = pd.DatetimeIndex(events["timestamp"]).date

event_type_counts_by_date = events.groupby(['activity_date', 'type']).size()

# creating a DataFrame from the Series, have to reset the index
action_type_freq_per_day = pd.DataFrame(event_type_counts_by_date).reset_index()
action_type_freq_per_day.columns = ['activity_date', 'type', 'count']

return action_type_freq_per_day


def analyze_double_likes(events):
like_events = events[events["type"] == "like"]
multi_like_event_count = len(like_events.groupby('interaction_id').filter(lambda x: len(x) > 1))
single_like_event_count = len(like_events) - multi_like_event_count

single_vs_double_likes = pd.DataFrame(
[['Single Like', single_like_event_count], ['Multiple Likes', multi_like_event_count]],
columns=["Like Frequency", "Count"])

return single_vs_double_likes


def total_counts(events):
distinct_interaction_count = len(pd.unique(events['interaction_id']))
like_event_count = len(events[events['type'] == "like"])
match_event_count = len(events[events['type'] == "match"])

chat_events = events[events['type'] == "chats"]
chat_event_count = len(chat_events.interaction_id.unique())

totals = pd.DataFrame(
[['Distinct Interactions', distinct_interaction_count], ['Outgoing Likes', like_event_count],
['Matches', match_event_count],
['Chats', chat_event_count]],
columns=["action_type", "count"])
return totals


def commented_outgoing_likes(events):
likes_w_comments = __build_comments_list(events)

return pd.DataFrame(likes_w_comments, columns=["Comments"])


def like_comment_ratios(events):
likes_w_comments = __build_comments_list(events)
likes_wo_comment = len(events) - len(likes_w_comments)

likes_w_wo_comments = pd.DataFrame(
[['Likes with Comments', len(likes_w_comments)], ['Likes without Comments', likes_wo_comment]],
columns=["Likes With/ Without Comments", "Count"])
return likes_w_wo_comments


def phone_number_shares(events):
chats_w_messages = events.where(events["type"] == "chats")
chats_w_messages = chats_w_messages[chats_w_messages['body'].notna()]
total_messages_w_chats = len(chats_w_messages)

message_bodies = chats_w_messages['body']

phone_number_shared = []
for message in message_bodies:
# finds common phone number formats in the message: XXX-XXX-XXXX, XXX.XXX.XXXX, (XXX) XXX-XXXX
message_containing_number = re.findall(r"\(?\d{3}\)?[-.\s]\d{3}[-.\s]\d{4}", message)

if len(message_containing_number) >= 1:
phone_number_shared.append(message_containing_number)

phone_number_share_ratios = pd.DataFrame([['Gave Phone Number', len(phone_number_shared)],
['Did Not Give Phone Number', total_messages_w_chats]],
columns=["Message Outcomes", "Count"])
return phone_number_share_ratios

def __build_comments_list(events):
likes_w_comments = []
like_events = events["like"].dropna()
for value in like_events:
# likes are an array with a single element (most of the time)
# TODO: handle multiple likes, it's rare but possible
like_event = value[0]
if like_event.get('comment') is not None:
likes_w_comments.append(like_event.get('comment'))

return likes_w_comments


def __validate_upload_file_type(file_path):
if not file_path.endswith('.json'):
raise ValueError("Invalid file type. Please upload a JSON file.")


def __validate_match_file_upload(file_path):
if 'match' not in file_path:
raise ValueError("Invalid file name. Please upload a file with 'match' in the file name.")
import json, os
from datetime import datetime, timedelta

class MatchAnalytics:
def __init__(self):
self.match_file_path = os.environ.get("MATCH_FILE_PATH")

if self.match_file_path is None:
raise Exception("MATCH_FILE_PATH environment varviable is not set.")

if '.json' not in self.match_file_path:
raise Exception("The match file needs to be a JSON file.")

with open(self.match_file_path, 'r') as file:
match_data = json.load(file)
self.match_data = match_data

def get_match_data(self):
all_matches = []
for entry in self.match_data:
matches = entry.get("match", [])
all_matches.extend(matches)
return all_matches

def get_block_data(self):
all_blocks = []
for entry in self.match_data:
blocks = entry.get("block", [])
all_blocks.extend(blocks)
return all_blocks

def get_likes_data(self):
all_likes = []
for entry in self.match_data:
likes = entry.get("like", [])
all_likes.extend(likes)
return all_likes

def get_chat_data(self):
all_chats = []
for entry in self.match_data:
chats = entry.get("chats", [])
all_chats.extend(chats)
return all_chats

def get_message_count_last_12_months(self):
now = datetime.now()
one_year_ago = now - timedelta(days=365)

msg_counts_per_month = []
for entry in self.match_data:
match = entry.get("match", [])
chats = entry.get("chats", [])
if match:
match_time = datetime.fromisoformat(match[0]["timestamp"])
if match_time >= one_year_ago:
month = match_time.strftime("%Y-%m")
msg_counts_per_month.append({
"month": month,
"message_count": len(chats)
})
return msg_counts_per_month

def get_response_latency(self):
latency_data = []
for entry in self.match_data:
match = entry.get("match", [])
chats = entry.get("chats", [])

if match and chats:
match_time = datetime.fromisoformat(match[0]["timestamp"])
first_message_time = datetime.fromisoformat(chats[0]["timestamp"])
latency = (first_message_time - match_time).total_seconds() / (3600 * 24)

latency_data.append({
"match_time": match_time,
"first_message_time": first_message_time,
"latency_days": latency
})
return latency_data

def get_match_durations(self):
durations = []
for entry in self.match_data:
match = entry.get("match", [])
block = entry.get("block", [])

if match and block:
match_time = datetime.fromisoformat(match[0]["timestamp"])
block_time = datetime.fromisoformat(block[0]["timestamp"])
duration_days = (block_time - match_time).days

durations.append({
"match_time": match_time,
"block_time": block_time,
"duration_days": duration_days
})
return durations

def get_match_rm_counts(self):
records = []

for entry in self.match_data:
match_time = entry.get("match", [{}])[0].get("timestamp")
block_time = entry.get("block", [{}])[0].get("timestamp")
chats = entry.get("chats", [])

if match_time and block_time:
match_dt = datetime.fromisoformat(match_time)
block_dt = datetime.fromisoformat(block_time)
delta_days = (block_dt - match_dt).days
message_count = len(chats)

records.append({
"message_count": message_count,
"duration_days": delta_days
})

return records
File renamed without changes.
Loading