Skip to content

Commit

Permalink
Add capability to parse user sessions by browser session, tab session…
Browse files Browse the repository at this point in the history
…, and url/domain of app (#33)

* Add a utility function to parse a continuous log into a set of logs
representing user sessions

* Add a check for logs that are missing a tab ID

* WIP --

* Provide configurable options for parsing user sessions

* Address PR review, create Session and Sessions classes

* Add modified __init__.py file

* refactor(sessions): refactors the session and sessions classes to reconcile the mismatched initialization approach and methods

linting and formatting are also bundled into the commit

* style(lint): lints files after rebase

---------

Co-authored-by: Madeline Diep <MDiep@FCMD.local>
Co-authored-by: Evan Jones <evan.a.jones3@gmail.com>
  • Loading branch information
3 people authored Mar 11, 2024
1 parent 6f63548 commit 4fc4606
Show file tree
Hide file tree
Showing 14 changed files with 6,149 additions and 368 deletions.
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,24 +30,24 @@ repos:
- id: end-of-file-fixer
- id: mixed-line-ending
repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.5.0
- repo: https://github.com/commitizen-tools/commitizen
rev: 3.8.0 # automatically updated by Commitizen
rev: v3.18.0 # automatically updated by Commitizen
hooks:
- id: commitizen
stages: [commit-msg]
- hooks:
- id: flake8
repo: https://github.com/pycqa/flake8
rev: 6.1.0
rev: 7.0.0
- hooks:
- id: black
repo: https://github.com/psf/black
rev: 23.7.0
rev: 24.2.0
- hooks:
- args:
- --profile
- black
id: isort
repo: https://github.com/PyCQA/isort
rev: 5.12.0
rev: 5.13.2
7 changes: 2 additions & 5 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.


# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

Expand All @@ -41,7 +40,5 @@ sphinx:

# Optionally set the version of Python and requirements required to build your docs
python:
install:
- requirements: docs/requirements.txt


install:
- requirements: docs/requirements.txt
4 changes: 4 additions & 0 deletions distill/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,17 @@
)
from distill.segmentation.segmentation_error import SegmentationError
from distill.segmentation.segments import Segments
from distill.sessions.session import Session
from distill.sessions.sessions import Sessions
from distill.utils.crud import epoch_to_datetime, getUUID

__all__ = [
"Segment",
"Segment_Type",
"Segments",
"SegmentationError",
"Sessions",
"Session",
"graph",
"createDiGraph",
"sankey",
Expand Down
15 changes: 15 additions & 0 deletions distill/sessions/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
65 changes: 65 additions & 0 deletions distill/sessions/session.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json


class Session:
    """
    A single user session parsed from UserALE log data.

    A session is described by a one-entry mapping whose key is the session
    name and whose value is the collection of logs belonging to it.
    """

    def __init__(self, session=None):
        """
        Initializes a Session object. This object contains
        metadata for the associated Session.

        :param session: A mapping with a single key/value pair: the session
            name mapped to its logs. If the mapping has more than one entry,
            only the first one is used for the name and logs. When omitted
            or empty, the session has no name (``None``) and no logs.
        """
        # Use a None sentinel instead of a mutable ``{}`` default so that
        # instances never share (and accidentally mutate) the same dict.
        if session is None:
            session = {}

        self.session = session

        if session:
            # Only the first entry describes the session.
            name, logs = next(iter(session.items()))
        else:
            # An empty mapping has no first entry; indexing it would raise
            # IndexError, so fall back to an unnamed, empty session.
            name, logs = None, []

        self.session_name = name
        self.logs = logs
        self.num_logs = len(logs)

    def __str__(self):
        """
        Serializes the underlying session mapping to a JSON string.
        """
        return json.dumps(self.session)

    def get_session_name(self):
        """
        Gets the name of a given session.
        :return: The session name of the given session.
        """
        return self.session_name

    def get_num_logs(self):
        """
        Gets the number of logs within a given session.
        :return: The number of logs within the given session.
        """
        return self.num_logs

    def get_logs(self):
        """
        Gets the logs within a given session.
        :return: The logs within the given session.
        """
        return self.logs
182 changes: 182 additions & 0 deletions distill/sessions/sessions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from enum import Enum
from typing import Dict, Optional

from distill.sessions.session import Session
from distill.sessions.utils import (
chunk_by_domain,
chunk_by_idle_time,
chunk_by_tabId,
flatten_dict,
group_by_user,
)


class Sessions_Type(Enum):
    """
    The grouping strategy that was used to parse logs into sessions.
    """

    # Logs were additionally grouped by browser tab.
    TAB = "tab"
    # Logs were additionally grouped by application domain/URL.
    DOMAIN = "domain"
    # No additional grouping was requested.
    DEFAULT = "default"


class Sessions:
    """
    A collection of Session objects.
    """

    def __init__(self, logs: Optional[Dict] = None, sessions=None, **kwargs):
        """
        Sessions initialization function.

        :param logs: Optional, Userale logs in the form of dictionary. If
            provided (and non-empty), the logs are parsed into sessions on
            initialization, replacing any ``sessions`` passed in.
        :param sessions: Optional list of already-parsed Session objects.
            Defaults to a new empty list.
        :param kwargs: Additional arguments passed to the user session parsing
            function (``create_from_logs``), if logs are provided.
        """
        # Use a None sentinel instead of a mutable ``[]`` default so that
        # separate instances never share the same list object.
        self.sessions = [] if sessions is None else sessions
        self.sessions_type = Sessions_Type.DEFAULT

        # Chunk on initialization
        if logs:
            self.create_from_logs(logs, **kwargs)

    def __iter__(self):
        """
        Allows Sessions to be iterable.
        """
        return iter(self.sessions)

    def __len__(self):
        """
        Allows Sessions to return the number of sessions it contains.
        """
        return len(self.sessions)

    def __str__(self):
        """
        Creates a readable string for Sessions.

        The string form of every session is stripped of its enclosing
        brackets and the pieces are merged, comma separated, into a single
        pair of brackets. An empty collection is rendered as ``"{}"``.
        """
        # Drop the first and last character (the brackets) of each session
        bodies = (str(session)[1:-1] for session in self.sessions)
        # A session with an empty body contributes nothing; skipping it
        # avoids emitting stray commas.
        return "{" + ",".join(body for body in bodies if body) + "}"

    def get_sessions_type(self):
        """
        Gets the type of a groupings/parsing that were used to create the sessions.
        :return: The sessions type.
        """
        return self.sessions_type

    def get_session_list(self):
        """
        Returns a list of Session objects in Sessions.
        :return: A list of session objects.
        """
        return self.sessions

    def get_session_names(self):
        """
        Returns the session names (key of the session dictionary).
        :return: A list of session names
        """
        return [session.get_session_name() for session in self.sessions]

    def create_from_logs(
        self, logs, inactive_interval_s=60, group_by_type="None", url_re="."
    ):
        """
        Separate a raw log to sets of user sessions.

        Logs are first grouped per user and sorted by ``clientTime`` (falling
        back to ``endTime`` when ``clientTime`` is absent). Depending on
        ``group_by_type`` they are then optionally grouped by browser tab or
        by domain, and every resulting group is split wherever the idle time
        between logs exceeds ``inactive_interval_s``.

        The parsed sessions replace ``self.sessions`` and ``self.sessions_type``
        is updated to match the grouping that was used. Nothing is returned.

        :param logs: Userale logs in the form of dictionary
        :param inactive_interval_s: Threshold of inactivity (no logged activity)
            in seconds. By default, the interval is 60 seconds.
        :param group_by_type: ``"tab"`` to group by browser tab, ``"domain"``
            to group by domain; any other value (including the default
            ``"None"``) applies no additional grouping.
        :param url_re: Regular expression handed to the domain grouping; only
            used when ``group_by_type`` is ``"domain"``.
        """
        data_by_users = group_by_user(logs)

        chunk_data_by_users_sessions = {}
        for user in data_by_users:
            # Sort the logs associated by each users so we can create sets
            # accordingly
            sorted_data = sorted(
                data_by_users[user],
                key=lambda item: item.get("clientTime", item.get("endTime")),
            )

            if group_by_type == "tab":
                # Separate by browser tab
                chunked_group = chunk_by_tabId(sorted_data)
            elif group_by_type == "domain":
                # Separate by domain application
                chunked_group = chunk_by_domain(sorted_data, url_re)
            else:
                # No extra grouping: only detect idle time between the logs
                # that exceeds the threshold
                chunk_data_by_users_sessions[user] = chunk_by_idle_time(
                    sorted_data, inactive_interval_s
                )
                continue

            # For each group, detect if there is an idle time between the
            # logs that exceeds the threshold
            chunk_data_by_users_sessions[user] = {
                g_id: chunk_by_idle_time(chunked_group[g_id], inactive_interval_s)
                for g_id in chunked_group
            }

        # Flatten the structure into a collection of sessions
        flattened_results = flatten_dict(chunk_data_by_users_sessions)

        # Update the sessions
        self.sessions = [
            Session({name: flattened_results[name]}) for name in flattened_results
        ]

        # Set the sessions type. It is reset to DEFAULT when no grouping was
        # requested so that re-parsing never leaves a stale type behind.
        if group_by_type == "tab":
            self.sessions_type = Sessions_Type.TAB
        elif group_by_type == "domain":
            self.sessions_type = Sessions_Type.DOMAIN
        else:
            self.sessions_type = Sessions_Type.DEFAULT
Loading

0 comments on commit 4fc4606

Please sign in to comment.