From fac23a6418bf53dc6547f075482a3d7902ffff57 Mon Sep 17 00:00:00 2001 From: Adam Sachs Date: Sat, 7 Oct 2023 14:44:36 -0400 Subject: [PATCH] add gvl data category mapping and functions (#175) * add gvl data category mapping and functions * Update src/fideslang/gvl/gvl_data_category_mapping.json Co-authored-by: Dawn Pattison * Update src/fideslang/gvl/gvl_data_category_mapping.json Co-authored-by: Dawn Pattison * Update src/fideslang/gvl/gvl_data_category_mapping.json Co-authored-by: Dawn Pattison * Update src/fideslang/gvl/gvl_data_category_mapping.json Co-authored-by: Dawn Pattison * Update src/fideslang/gvl/gvl_data_category_mapping.json Co-authored-by: Dawn Pattison --------- Co-authored-by: Dawn Pattison --- src/fideslang/__init__.py | 3 + src/fideslang/gvl/__init__.py | 54 +++++- .../gvl/gvl_data_category_mapping.json | 171 ++++++++++++++++++ src/fideslang/gvl/models.py | 25 +++ tests/fideslang/gvl/test_gvl.py | 25 ++- 5 files changed, 266 insertions(+), 12 deletions(-) create mode 100644 src/fideslang/gvl/gvl_data_category_mapping.json diff --git a/src/fideslang/__init__.py b/src/fideslang/__init__.py index d211dd0c..4295434d 100644 --- a/src/fideslang/__init__.py +++ b/src/fideslang/__init__.py @@ -11,11 +11,14 @@ # export our GVL utilities from .gvl import ( + GVL_DATA_CATEGORIES, GVL_PURPOSES, GVL_SPECIAL_PURPOSES, + MAPPED_GVL_DATA_CATEGORIES, MAPPED_PURPOSES, MAPPED_PURPOSES_BY_DATA_USE, MAPPED_SPECIAL_PURPOSES, + data_category_id_to_data_categories, data_use_to_purpose, purpose_to_data_use, ) diff --git a/src/fideslang/gvl/__init__.py b/src/fideslang/gvl/__init__.py index 438d47d5..2a298eab 100644 --- a/src/fideslang/gvl/__init__.py +++ b/src/fideslang/gvl/__init__.py @@ -1,9 +1,13 @@ +# pylint: disable=too-many-locals + import os from json import load from os.path import dirname, join from typing import Dict, List, Optional -from .models import Feature, MappedPurpose, Purpose +from .models import Feature, GVLDataCategory, MappedDataCategory, 
MappedPurpose, Purpose + +### (Special) Purposes PURPOSE_MAPPING_FILE = join( dirname(__file__), @@ -11,24 +15,33 @@ "gvl_data_use_mapping.json", ) -FEATURE_MAPPING_FILE = join( - dirname(__file__), - "", - "gvl_feature_mapping.json", -) - - GVL_PURPOSES: Dict[int, Purpose] = {} MAPPED_PURPOSES: Dict[int, MappedPurpose] = {} - GVL_SPECIAL_PURPOSES: Dict[int, Purpose] = {} MAPPED_SPECIAL_PURPOSES: Dict[int, MappedPurpose] = {} +MAPPED_PURPOSES_BY_DATA_USE: Dict[str, MappedPurpose] = {} + +### (Special) Features +FEATURE_MAPPING_FILE = join( + dirname(__file__), + "", + "gvl_feature_mapping.json", +) GVL_FEATURES: Dict[int, Feature] = {} GVL_SPECIAL_FEATURES: Dict[int, Feature] = {} FEATURES_BY_NAME: Dict[str, Feature] = {} -MAPPED_PURPOSES_BY_DATA_USE: Dict[str, MappedPurpose] = {} + +### Data Categories + +DATA_CATEGORY_MAPPING_FILE = join( + dirname(__file__), + "", + "gvl_data_category_mapping.json", +) +GVL_DATA_CATEGORIES: Dict[int, GVLDataCategory] = {} +MAPPED_GVL_DATA_CATEGORIES: Dict[int, MappedDataCategory] = {} def _load_data() -> None: @@ -67,6 +80,17 @@ def _load_data() -> None: GVL_SPECIAL_FEATURES[special_feature.id] = special_feature FEATURES_BY_NAME[special_feature.name] = special_feature + with open( + os.path.join(os.curdir, DATA_CATEGORY_MAPPING_FILE), encoding="utf-8" + ) as data_category_mapping_file: + data_category_data = load(data_category_mapping_file) + + for raw_data_category in data_category_data.values(): + data_category = GVLDataCategory.parse_obj(raw_data_category) + mapped_data_category = MappedDataCategory.parse_obj(raw_data_category) + GVL_DATA_CATEGORIES[data_category.id] = data_category + MAPPED_GVL_DATA_CATEGORIES[mapped_data_category.id] = mapped_data_category + def purpose_to_data_use(purpose_id: int, special_purpose: bool = False) -> List[str]: """ @@ -108,4 +132,14 @@ def feature_id_to_feature_name( return feature.name +def data_category_id_to_data_categories(data_category_id: int) -> List[str]: + """ + Utility function to 
return the fideslang data categories associated with the + given GVL data category ID. + + Raises a KeyError if an invalid GVL data category ID is provided. + """ + return MAPPED_GVL_DATA_CATEGORIES[data_category_id].fides_data_categories + + _load_data() diff --git a/src/fideslang/gvl/gvl_data_category_mapping.json b/src/fideslang/gvl/gvl_data_category_mapping.json new file mode 100644 index 00000000..9de39789 --- /dev/null +++ b/src/fideslang/gvl/gvl_data_category_mapping.json @@ -0,0 +1,171 @@ +{ + "1": { + "id": 1, + "name": "IP addresses", + "description": "Your IP address is a number assigned by your Internet Service Provider to any Internet connection. It is not always specific to your device and is not always a stable identifier. It is used to route information on the Internet and display online content (including ads) on your connected device.", + "fides_data_categories": [ + "user.device.ip_address" + ] + }, + "2": { + "id": 2, + "name": "Device characteristics", + "description": "Technical characteristics about the device you are using that are not unique to you, such as the language, the time zone or the operating system.", + "fides_data_categories": [ + "user.device", + "user.sensor", + "user.user_sensor", + "user.telemetry" + ] + }, + "3": { + "id": 3, + "name": "Device identifiers", + "description": "A device identifier is a unique string of characters assigned to your device or browser by means of a cookie or other storage technologies. It may be created or accessed to recognise your device e.g. across web pages from the same site or across multiple sites or apps.", + "fides_data_categories": [ + "user.device.cookie_id", + "user.device.device_id", + "user.device.cookie" + ] + }, + "4": { + "id": 4, + "name": "Probabilistic identifiers", + "description": "A probabilistic identifier can be created by combining characteristics associated with your device (the type of browser or operating system used) and the IP address of the Internet connection. 
If you give your agreement, additional characteristics (e.g. the installed font or screen resolution) can also be combined to improve precision of the probabilistic identifier. Such an identifier is considered \"probabilistic\" because several devices can share the same characteristics and Internet connection. It may be used to recognise your device across e.g. web pages from the same site or across multiple sites or apps.", + "fides_data_categories": ["user.unique_id.pseudonymous"] + }, + "5": { + "id": 5, + "name": "Authentication-derived identifiers", + "description": "Where an identifier is created on the basis of authentication data, such as contact details associated with online accounts you have created on websites or apps (e.g. e-mail address, phone number) or customer identifiers (e.g. identifier provided by your telecom operator), that identifier may be used to recognise you across websites, apps and devices when you are logged-in with the same contact details.", + "fides_data_categories": [ + "user.authorization", + "user.authorization.biometric", + "user.authorization.credentials", + "user.authorization.password" + ] + }, + "6": { + "id": 6, + "name": "Browsing and interaction data", + "description": "Your online activity such as the websites you visit, apps you are using, the content you search for on this service, or your interactions with content or ads, such as the number of times you have seen a specific content or ad or whether you clicked on it.", + "fides_data_categories": [ + "user.behavior.purchase_history", + "user.behavior", + "user.behavior.browsing_history", + "user.behavior.media_consumption", + "user.behavior.search_history", + "user.social" + ] + }, + "7": { + "id": 7, + "name": "User-provided data", + "description": "The information you may have provided by way of declaration via a form (e.g. feedback, a comment) or when creating an account (e.g. 
your age, your occupation).", + "fides_data_categories": [ + "user.account", + "user.account.username", + "user.account.settings", + "user.authorization", + "user.authorization.biometric", + "user.authorization.credentials", + "user.authorization.password", + "user.biometric", + "user.biometric.fingerprint", + "user.biometric.health", + "user.biometric.retinal", + "user.biometric.voice", + "user.contact", + "user.contact.organization", + "user.contact.address", + "user.contact.address.street", + "user.contact.address.city", + "user.contact.address.postal_code", + "user.contact.address.state", + "user.contact.address.country", + "user.contact.email", + "user.contact.phone_number", + "user.contact.fax_number", + "user.contact.url", + "user.job_title", + "user.financial", + "user.financial.bank_account", + "user.financial.credit_card", + "user.unique_id", + "user.name", + "user.name.first", + "user.name.last", + "user.criminal_history", + "user.content", + "user.content.private", + "user.content.public", + "user.content.self_image", + "user.demographic.age_range", + "user.demographic.date_of_birth", + "user.demographic.religious_belief", + "user.demographic.gender", + "user.demographic.race_ethnicity", + "user.demographic.sexual_orientation", + "user.demographic.political_opinion", + "user.demographic.language", + "user.demographic.marital_status", + "user.government_id", + "user.government_id.national_identification_number", + "user.government_id.passport_number", + "user.government_id.drivers_license_number", + "user.government_id.immigration", + "user.government_id.vehicle_registration", + "user.government_id.birth_certificate", + "user.health_and_medical", + "user.health_and_medical.record_id", + "user.health_and_medical.insurance_beneficiary_id", + "user.health_and_medical.genetic", + "user.sensor", + "user.childrens", + "user.workplace" + ] + }, + "8": { + "id": 8, + "name": "Non-precise location data", + "description": "An approximation of your location, 
expressed as an area with a radius of at least 500 meters. Your approximate location can be deduced from e.g. the IP address of your connection.", + "fides_data_categories": [ + "user.location.imprecise" + ] + }, + "9": { + "id": 9, + "name": "Precise location data", + "description": "Your precise location within a radius of less than 500 meters based on your GPS coordinates. It may be used only with your acceptance.", + "fides_data_categories": [ + "user.location.precise" + ] + }, + "10": { + "id": 10, + "name": "Users' profiles", + "description": "Certain characteristics (e.g. your possible interests, your purchase intentions, your consumer profile) may be inferred or modeled from your previous online activity (e.g. the content you viewed or the service you used, your time spent on various online content and services) or the information you have provided (e.g. your age, your occupation).", + "fides_data_categories": [ + "user.demographic", + "user.demographic.age_range", + "user.demographic.date_of_birth", + "user.demographic.gender", + "user.demographic.language", + "user.demographic.marital_status", + "user.demographic.political_opinion", + "user.demographic.profile", + "user.demographic.race_ethnicity", + "user.demographic.religious_belief", + "user.demographic.sexual_orientation" + ] + }, + "11": { + "id": 11, + "name": "Privacy choices", + "description": "Your preferences regarding the processing of your data, based on the information you have received.", + "fides_data_categories": [ + "user.privacy_preferences" + ] + } +} + diff --git a/src/fideslang/gvl/models.py b/src/fideslang/gvl/models.py index 5a59f7a8..9f0155cf 100644 --- a/src/fideslang/gvl/models.py +++ b/src/fideslang/gvl/models.py @@ -38,3 +38,28 @@ class Feature(BaseModel): description: str = Field( description="Description of the GVL feature or special feature." 
) + + +class GVLDataCategory(BaseModel): + """ + Pydantic model for GVL data category records + """ + + id: int = Field( + description="Official GVL data category ID. Used for linking with vendor records" + ) + name: str = Field(description="Name of the GVL data category.") + description: str = Field(description="Description of the GVL data category.") + + +class MappedDataCategory(GVLDataCategory): + """ + Extension of the base GVL data category model to include properties related to fideslang mappings. + + This is separated from the base GVL data category model to keep that model a "pristine" representation + of GVL source data. + """ + + fides_data_categories: List[str] = Field( + description="The fideslang default taxonomy data categories that are associated with the GVL data category." + ) diff --git a/tests/fideslang/gvl/test_gvl.py b/tests/fideslang/gvl/test_gvl.py index 348cd2ad..20b6fee7 100644 --- a/tests/fideslang/gvl/test_gvl.py +++ b/tests/fideslang/gvl/test_gvl.py @@ -1,12 +1,13 @@ import pytest from fideslang.gvl import ( - purpose_to_data_use, GVL_FEATURES, GVL_SPECIAL_FEATURES, Feature, - feature_name_to_feature, + data_category_id_to_data_categories, feature_id_to_feature_name, + feature_name_to_feature, + purpose_to_data_use, ) @@ -64,3 +65,23 @@ def test_feature_id_to_feature_name(): ) assert feature_id_to_feature_name(feature_id=1001) is None + + + +def test_data_category_id_to_data_categories(): + assert data_category_id_to_data_categories(1) == [ + "user.device.ip_address" + ] + + # let's test one other data category just to be comprehensive + assert data_category_id_to_data_categories(5) == [ + "user.authorization", + "user.authorization.biometric", + "user.authorization.credentials", + "user.authorization.password" + ] + + + # assert invalid categories raise KeyErrors + with pytest.raises(KeyError): + data_category_id_to_data_categories(12)