-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add capability to parse user sessions by browser session, tab session…
…, and url/domain of app (#33) * Utility function to parse a continuous log to set of logs representing user sessions * Add checking tab Id is missing from log * WIP -- * Provide a configurable options to parse user sessions * Address PR review, create Session and Sessions classes * Add modified __init__.py file * refactor(sessions): refactors the session and sessions classes to reconcile the mismatched initialization approach and methods linting and formatting are also bundled into the commit * style(lint): lints files after rebase --------- Co-authored-by: Madeline Diep <MDiep@FCMD.local> Co-authored-by: Evan Jones <evan.a.jones3@gmail.com>
- Loading branch information
1 parent
6f63548
commit 4fc4606
Showing
14 changed files
with
6,149 additions
and
368 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import json | ||
|
||
|
||
class Session:
    """
    A single user session parsed from UserALE log data.

    Wraps a dictionary whose first entry maps the session name to the
    collection of logs belonging to that session.
    """

    def __init__(self, session=None):
        """
        Initializes a Session object. This object contains
        metadata for the associated Session.

        :param session: a dictionary whose first key is the session name and
            whose first value is the collection of logs of that session.
            When omitted (or empty) an empty session is created.
        """
        # Build a fresh dict per call instead of sharing a mutable default.
        if session is None:
            session = {}

        self.session = session
        if session:
            # Only the first key/value pair describes the session.
            self.session_name, self.logs = next(iter(session.items()))
        else:
            # An empty session has no name and no logs.
            self.session_name, self.logs = None, []
        self.num_logs = len(self.logs)

    def __str__(self):
        """
        Serializes the underlying session dictionary as a JSON string.
        """
        return json.dumps(self.session)

    def get_session_name(self):
        """
        Gets the name of a given session.

        :return: The session name of the given session
            (None for an empty session).
        """
        return self.session_name

    def get_num_logs(self):
        """
        Gets the number of logs within a given session.

        :return: The number of logs within the given session.
        """
        return self.num_logs

    def get_logs(self):
        """
        Gets the logs within a given session.

        :return: The logs within the given session.
        """
        return self.logs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
# | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from enum import Enum | ||
from typing import Dict, Optional | ||
|
||
from distill.sessions.session import Session | ||
from distill.sessions.utils import ( | ||
chunk_by_domain, | ||
chunk_by_idle_time, | ||
chunk_by_tabId, | ||
flatten_dict, | ||
group_by_user, | ||
) | ||
|
||
|
||
class Sessions_Type(Enum):
    """
    Identifies the grouping that was used to parse logs into sessions.
    """

    # Set by Sessions.create_from_logs when group_by_type == "tab".
    TAB = "tab"
    # Set by Sessions.create_from_logs when group_by_type == "domain".
    DOMAIN = "domain"
    # Initial type of a Sessions collection (no tab/domain grouping applied).
    DEFAULT = "default"
|
||
|
||
class Sessions:
    """
    A collection of Session objects.
    """

    def __init__(self, logs: Optional[Dict] = None, sessions=None, **kwargs):
        """
        Sessions initialization function.

        :param logs: Optional, UserALE logs. If provided (and non-empty),
            the logs are parsed into sessions on initialization and replace
            any value passed through ``sessions``.
        :param sessions: Optional, a list of already parsed Session objects.
            Defaults to a new empty list.
        :param kwargs: Additional arguments passed to ``create_from_logs``
            if logs are provided.
        """
        # A fresh list per instance avoids sharing one mutable default
        # between every Sessions object.
        self.sessions = [] if sessions is None else sessions
        self.sessions_type = Sessions_Type.DEFAULT

        # Chunk on initialization
        if logs:
            self.create_from_logs(logs, **kwargs)

    def __iter__(self):
        """
        Allows Sessions to be iterable.
        """
        return iter(self.sessions)

    def __len__(self):
        """
        Allows Sessions to return the number of sessions it contains.
        """
        return len(self.sessions)

    def __str__(self):
        """
        Creates a readable string for Sessions.

        Each session is rendered with ``str()``, stripped of its enclosing
        brackets, and the pieces are joined with "," inside a single pair of
        curly brackets. An empty collection renders as "{}".
        """
        return "{" + ",".join(str(session)[1:-1] for session in self.sessions) + "}"

    def get_sessions_type(self):
        """
        Gets the type of grouping/parsing that was used to create the sessions.

        :return: The sessions type.
        """
        return self.sessions_type

    def get_session_list(self):
        """
        Returns a list of Session objects in Sessions.

        :return: A list of session objects.
        """
        return self.sessions

    def get_session_names(self):
        """
        Returns the session names (key of each session dictionary).

        :return: A list of session names
        """
        return [session.get_session_name() for session in self.sessions]

    def create_from_logs(
        self, logs, inactive_interval_s=60, group_by_type="None", url_re="."
    ):
        """
        Separate a raw log into sets of user sessions and store them.

        Logs are first grouped by user, sorted by time, optionally split by
        browser tab or by application domain, and finally separated wherever
        the idle time exceeds the specified inactive_interval (in seconds).
        By default, the interval is 60 seconds. The resulting Session
        objects replace ``self.sessions`` and ``self.sessions_type`` is
        updated to reflect the grouping used; nothing is returned.

        :param logs: UserALE logs, passed to ``group_by_user``
        :param inactive_interval_s: Threshold of inactivity (no logged activity)
            in seconds
        :param group_by_type: "tab" to group by browser tab, "domain" to
            group by URL/domain; any other value applies idle-time chunking only
        :param url_re: Regular expression passed to ``chunk_by_domain``
            (only used when group_by_type is "domain")
        """
        data_by_users = group_by_user(logs)

        chunk_data_by_users_sessions = {}
        for user in data_by_users:
            # Sort the logs of each user so the chunking sees them in
            # chronological order; fall back to "endTime" when a log has no
            # "clientTime".
            sorted_data = sorted(
                data_by_users[user],
                key=lambda item: item.get("clientTime", item.get("endTime")),
            )

            if group_by_type == "tab":
                # Separate by browser tab
                chunked_group = chunk_by_tabId(sorted_data)
            elif group_by_type == "domain":
                # Separate by domain application
                chunked_group = chunk_by_domain(sorted_data, url_re)
            else:
                chunked_group = None

            if chunked_group is None:
                # No extra grouping: only split on idle time that exceeds
                # the configured interval.
                chunk_data_by_users_sessions[user] = chunk_by_idle_time(
                    sorted_data, inactive_interval_s
                )
            else:
                # For each group, split on idle time that exceeds the
                # configured interval.
                chunk_data_by_users_sessions[user] = {
                    g_id: chunk_by_idle_time(
                        chunked_group[g_id], inactive_interval_s
                    )
                    for g_id in chunked_group
                }

        # Flatten the structure into a collection of sessions
        flattened_results = flatten_dict(chunk_data_by_users_sessions)

        # Update the sessions
        self.sessions = [
            Session({name: flattened_results[name]}) for name in flattened_results
        ]

        # Set the sessions type; reset to DEFAULT so a re-parse without
        # tab/domain grouping does not keep a stale type from an earlier call.
        if group_by_type == "tab":
            self.sessions_type = Sessions_Type.TAB
        elif group_by_type == "domain":
            self.sessions_type = Sessions_Type.DOMAIN
        else:
            self.sessions_type = Sessions_Type.DEFAULT
Oops, something went wrong.