Skip to content

Commit

Permalink
fixes to pseudonym encoding for URL_SAFE_TOKEN in bulk mode (#455)
Browse files Browse the repository at this point in the history
* split encode method

* more tests

* fix legacy impl of bulk pseudonym URL_SAFE_TOKEN encoding

* fix conversion from PseudonymizedIdentity --> Pseudonym

* revert iml changes

* rename

* rename test

* eliminate inaccurate tests

* make changes work, even w LEGACY format
  • Loading branch information
eschultink authored Jul 31, 2023
1 parent 46bce96 commit 16c92bd
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 35 deletions.
17 changes: 14 additions & 3 deletions java/core/src/main/java/co/worklytics/psoxy/HashUtils.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package co.worklytics.psoxy;

import com.google.common.annotations.VisibleForTesting;
import lombok.NoArgsConstructor;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
Expand All @@ -8,16 +9,26 @@
import java.nio.charset.StandardCharsets;
import java.util.Base64;

/**
* NOTE: this class is used for LEGACY pseudonymization implementation, not DEFAULT since v0.4
*
*
*/
@NoArgsConstructor(onConstructor_ = @Inject) // in lieu of provider
public class HashUtils {

/**
 * Computes the SHA-256 digest of the given fragments, concatenated in order with no
 * separator, and returns that digest as encoded by {@link #encode(byte[])}.
 *
 * @param fragments pieces that are joined to form the digest input
 * @return the encoded digest
 */
public String hash(String... fragments) {
    String concatenated = String.join("", fragments);
    byte[] digest = DigestUtils.sha256(concatenated);
    return encode(digest);
}

@VisibleForTesting
public String encode(byte[] bytes) {
// No padding saves us the '=' character
// https://www.baeldung.com/java-base64-encode-and-decode#2-java-8-base64-encoding-without-padding
String hash = new String(
String encoded = new String(
Base64.getEncoder()
.withoutPadding()
.encode(DigestUtils.sha256(String.join("", fragments))),
.encode(bytes),
StandardCharsets.UTF_8);

// To avoid urlencoding issues (especially with handlebars/template rendering)
Expand All @@ -26,6 +37,6 @@ public String hash(String... fragments) {
// See: https://handlebarsjs.com/guide/#html-escaping
// https://en.wikipedia.org/wiki/Base64#Base64_table
// https://en.wikipedia.org/wiki/Percent-encoding#Types_of_URI_characters
return StringUtils.replaceChars(hash, "/+", "_.");
return StringUtils.replaceChars(encoded, "/+", "_.");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import com.avaulta.gateway.pseudonyms.Pseudonym;
import com.fasterxml.jackson.annotation.JsonInclude;
import lombok.*;
import org.apache.commons.lang3.StringUtils;

import java.util.Base64;

/**
* pseudonymized form of an account identifier
Expand Down Expand Up @@ -72,10 +75,40 @@ public Pseudonym asPseudonym() {

//q: what to do w original, if anything?

// Decode each component independently: first as URL-safe base64 and, if that fails, as the
// legacy encoding (standard base64 alphabet with '/' --> '_' and '+' --> '.').
// A fresh decoder is obtained for every attempt, so a legacy-encoded hash can no longer
// change which alphabet is tried first for reversible.
// decode(String) is used instead of decode(getBytes()) so the result does not depend on
// the platform default charset.
byte[] decodedHash, decodedReversible;
if (hash != null) {
    try {
        decodedHash = Base64.getUrlDecoder().decode(hash);
    } catch (IllegalArgumentException e) {
        //q: should we log this?
        decodedHash = Base64.getDecoder().decode(StringUtils.replaceChars(hash, "_.", "/+"));
    }
} else {
    decodedHash = null;
}

if (reversible != null) {
    try {
        decodedReversible = Base64.getUrlDecoder().decode(reversible);
    } catch (IllegalArgumentException e) {
        //q: should we log this?
        decodedReversible = Base64.getDecoder().decode(StringUtils.replaceChars(reversible, "_.", "/+"));
    }
} else {
    decodedReversible = null;
}



return Pseudonym.builder()
.hash(hash == null ? null : hash.getBytes())
.hash(decodedHash)
.domain(domain)
.reversible(reversible == null ? null : reversible.getBytes())
.reversible(decodedReversible)
.build();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,34 @@
import co.worklytics.test.MockModules;
import co.worklytics.test.TestModules;
import co.worklytics.test.TestUtils;
import com.avaulta.gateway.pseudonyms.Pseudonym;
import com.avaulta.gateway.pseudonyms.PseudonymEncoder;
import com.avaulta.gateway.pseudonyms.PseudonymImplementation;
import com.avaulta.gateway.pseudonyms.impl.Base64UrlSha256HashPseudonymEncoder;
import com.avaulta.gateway.pseudonyms.impl.UrlSafeTokenPseudonymEncoder;
import com.avaulta.gateway.tokens.DeterministicTokenizationStrategy;
import com.avaulta.gateway.tokens.impl.Sha256DeterministicTokenizationStrategy;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import dagger.Component;
import lombok.SneakyThrows;
import org.apache.commons.codec.binary.Base64;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.function.Function;


import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
Expand Down Expand Up @@ -223,15 +229,58 @@ void handle_rename() {
}
}

/**
 * Sanitizes the example HRIS CSV with a LEGACY-implementation pseudonymizer and
 * URL_SAFE_TOKEN pseudonym format, duplicating the original EMPLOYEE_ID column so the
 * output can serve as a lookup table.
 */
@Test
@SneakyThrows
void handle_duplicates_legacy() {
    CsvRules rules = CsvRules.builder()
        .pseudonymFormat(PseudonymEncoder.Implementations.URL_SAFE_TOKEN)
        .columnToPseudonymize("EMPLOYEE_ID")
        .columnToRedact("EMPLOYEE_EMAIL")
        .columnToRedact("EFFECTIVE_ISOWEEK")
        .columnsToDuplicate(Map.of("EMPLOYEE_ID", "EMPLOYEE_ID_ORIG"))
        .build();

    // pseudonymizer explicitly configured for the LEGACY pseudonym implementation
    Pseudonymizer legacyPseudonymizer = pseudonymizerImplFactory.create(
        Pseudonymizer.ConfigurationOptions.builder()
            .pseudonymizationSalt("salt")
            .defaultScopeId("hris")
            .pseudonymImplementation(PseudonymImplementation.LEGACY)
            .build());

    // this is a lookup-table use case (for customers to use in own data warehouse)
    final String expectedCsv = "EMPLOYEE_ID,DEPARTMENT,EMPLOYEE_ID_ORIG\r\n" +
        "t~SappwO4KZKGprqqUNruNreBD2BVR98nEM6NRCu3R2dM,Engineering,1\r\n" +
        "t~mfsaNYuCX__xvnRz4gJp_t0zrDTC5DkuCJvMkubugsI,Sales,2\r\n" +
        "t~-ZdDGUuOMK-Oy7_PJ3pf9SYX12-3tKPdLHfYbjVGcGk,Engineering,3\r\n" +
        "t~-fs1T64Micz8SkbILrABgEv4kSg-tFhvhP35HGSLdOo,Engineering,4\r\n";

    File hrisExample = new File(getClass().getResource("/csv/hris-example.csv").getFile());

    try (FileReader reader = new FileReader(hrisExample)) {
        byte[] sanitized = columnarFileSanitizerImpl.sanitize(reader, rules, legacyPseudonymizer);

        assertEquals(expectedCsv, new String(sanitized));
    }
}


@Test
@SneakyThrows
void handle_duplicates() {
Pseudonymizer defaultPseudonymizer =
pseudonymizerImplFactory.create(Pseudonymizer.ConfigurationOptions.builder()
.pseudonymizationSalt("salt")
.defaultScopeId("hris")
.build());


//this is a lookup-table use case (for customers to use in own data warehouse)
final String EXPECTED = "EMPLOYEE_ID,DEPARTMENT,EMPLOYEE_ID_ORIG\r\n" +
"t~U2FwcHdPNEtaS0dwcnFxVU5ydU5yZUJEMkJWUjk4bkVNNk5SQ3UzUjJkTQ,Engineering,1\r\n" +
"t~bWZzYU5ZdUNYX194dm5SejRnSnBfdDB6ckRUQzVEa3VDSnZNa3VidWdzSQ,Sales,2\r\n" +
"t~LlpkREdVdU9NSy5PeTdfUEozcGY5U1lYMTIuM3RLUGRMSGZZYmpWR2NHaw,Engineering,3\r\n" +
"t~LmZzMVQ2NE1pY3o4U2tiSUxyQUJnRXY0a1NnLnRGaHZoUDM1SEdTTGRPbw,Engineering,4\r\n";
"t~0zPKqEd-CtbCLB1ZSwX6Zo7uAWUvkpfHGzv9-cuYwZc,Engineering,1\r\n" +
"t~-hN_i1M1DeMAicDVp6LhFgW9lH7r3_LbOpTlXYWpXVI,Sales,2\r\n" +
"t~4W7Sl-LI6iMzNNngivs5dLMiVw-7ob3Cyr3jn8NureY,Engineering,3\r\n" +
"t~BOg00PLoiEEKyGzije3FJlKBzM6_Vjk87VJI9lTIA2o,Engineering,4\r\n";

CsvRules rules = CsvRules.builder()
.pseudonymFormat(PseudonymEncoder.Implementations.URL_SAFE_TOKEN)
Expand All @@ -244,7 +293,7 @@ void handle_duplicates() {
File inputFile = new File(getClass().getResource("/csv/hris-example.csv").getFile());

try (FileReader in = new FileReader(inputFile)) {
byte[] result = columnarFileSanitizerImpl.sanitize(in, rules, pseudonymizer);
byte[] result = columnarFileSanitizerImpl.sanitize(in, rules, defaultPseudonymizer);

assertEquals(EXPECTED, new String(result));
}
Expand Down Expand Up @@ -334,4 +383,38 @@ void shuffle() {
assertEquals(EXPECTED, new String(result));
}
}

/**
 * Checks that the hash of a DEFAULT-implementation pseudonymized identity matches the
 * output of Base64UrlSha256HashPseudonymEncoder, and that the URL_SAFE_TOKEN encoding of
 * the same identity is that value with a "t~" prefix (plus "@domain" when a domain is set).
 */
@ParameterizedTest
@ValueSource(strings = {
    "blah",
    "blah@acme.com"
})
void pre_v0_4_30_bulk_pseudonym_URL_SAFE_TOKEN_ENCODING(String identifier) {
    pseudonymizer = pseudonymizerImplFactory.create(
        Pseudonymizer.ConfigurationOptions.builder()
            .pseudonymizationSalt("salt")
            .defaultScopeId("hris")
            .pseudonymImplementation(PseudonymImplementation.DEFAULT)
            .build());

    Base64UrlSha256HashPseudonymEncoder hashEncoder = new Base64UrlSha256HashPseudonymEncoder();
    DeterministicTokenizationStrategy tokenizationStrategy =
        new Sha256DeterministicTokenizationStrategy("salt");

    // In releases up to v0.4.30 (i.e., prior to v0.4.31), ColumnarBulkDataSanitizerImpl.java
    // encoded pseudonyms this way when pseudonym format == URL_SAFE_TOKEN and
    // pseudonym implementation == DEFAULT.
    // see: https://github.com/Worklytics/psoxy/blob/ec0d324e0c45a6b97167b0907aa50bfdb8a45189/java/core/src/main/java/co/worklytics/psoxy/storage/impl/ColumnarBulkDataSanitizerImpl.java#L149C17-L149C17
    PseudonymizedIdentity identity = pseudonymizer.pseudonymize(identifier);
    String legacyEncoded = identity.getHash();

    // the same value must come out of the encoder when it is given the token bytes directly
    byte[] tokenBytes = tokenizationStrategy.getToken(identifier, Function.identity());
    Pseudonym hashOnlyPseudonym = Pseudonym.builder().hash(tokenBytes).build();
    assertEquals(legacyEncoded, hashEncoder.encode(hashOnlyPseudonym));

    // check that the non-legacy encoding is just the legacy encoding with a "t~" prefix
    // (and an "@domain" suffix, when the identity has a domain)
    UrlSafeTokenPseudonymEncoder urlSafeTokenPseudonymEncoder = new UrlSafeTokenPseudonymEncoder();
    String domain = identity.getDomain();
    String domainSuffix = (domain == null) ? "" : ("@" + domain);
    assertEquals("t~" + legacyEncoded + domainSuffix,
        urlSafeTokenPseudonymEncoder.encode(identity.asPseudonym()));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@

import com.avaulta.gateway.pseudonyms.Pseudonym;
import com.avaulta.gateway.pseudonyms.PseudonymEncoder;
import org.apache.commons.lang3.StringUtils;

import java.nio.charset.StandardCharsets;
import java.util.Base64;

/**
* implementation of defacto encoding used by BulkDataSanitizerImpl as of v0.4.30
* implementation of the de facto encoding used by BulkDataSanitizerImpl as of v0.4.30 when
* configured with PSEUDONYM_FORMAT=URL_SAFE_TOKEN and PSEUDONYM_IMPLEMENTATION=default
*
* e.g., `Pseudonym` encoded as the base64-url-safe encoding of its SHA-256 hash, with no prefix
*
*/
public class Sha256PseudonymEncoder implements PseudonymEncoder {
public class Base64UrlSha256HashPseudonymEncoder implements PseudonymEncoder {


@Override
Expand All @@ -30,21 +33,19 @@ public Pseudonym decode(String input) {
/**
 * Tests whether the input has the length expected of a pseudonym produced by this encoder.
 * Only the UTF-8 byte length is inspected; the characters themselves are not validated.
 *
 * @param possiblePseudonym candidate string; may be null
 * @return true if the input is non-null and exactly 43 bytes long in UTF-8
 */
@Override
public boolean canBeDecoded(String possiblePseudonym) {
    if (possiblePseudonym == null) {
        return false;
    }

    //NOTE: this couples it to the SHA-256 hash; otherwise has nothing specific to SHA-256
    // - and even this is only based on the expected length in bytes of SHA-256 hash
    int utf8Length = possiblePseudonym.getBytes(StandardCharsets.UTF_8).length;

    //43 rather than 32, bc of base64 encoding without padding
    return utf8Length == 43;
}

//base64 encoding, to match implementation in HashUtils.java from psoxy-core v0.4.30
String base64Encode(byte[] bytes) {
String encoded = new String(
Base64.getEncoder()
return new String(Base64.getUrlEncoder()
.withoutPadding()
.encode(bytes),
StandardCharsets.UTF_8);
return StringUtils.replaceChars(encoded, "/+", "_.");
.encode(bytes));
}

byte[] base64decode(String input) {
return Base64.getDecoder()
.decode(StringUtils.replaceChars(input, "_.", "/+"));
return Base64.getUrlDecoder().decode(input);
}
}
Original file line number Diff line number Diff line change
@@ -1,33 +1,32 @@
package com.avaulta.gateway.pseudonyms.impl;

import com.avaulta.gateway.pseudonyms.Pseudonym;
import com.avaulta.gateway.pseudonyms.PseudonymEncoder;
import com.avaulta.gateway.tokens.impl.Sha256DeterministicTokenizationStrategy;
import org.apache.commons.lang3.RandomStringUtils;
import org.junit.jupiter.api.Test;
import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import java.util.Random;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.function.Function;

import static org.junit.jupiter.api.Assertions.*;

class Sha256PseudonymEncoderTest {
class Base64UrlSha256HashPseudonymEncoderTest {

Base64UrlSha256HashPseudonymEncoder encoder = new Base64UrlSha256HashPseudonymEncoder();

Sha256PseudonymEncoder encoder = new Sha256PseudonymEncoder();


@ParameterizedTest
@ValueSource(strings = {
// examples taken from https://github.com/Worklytics/psoxy/blob/b483e3788d5457398d55cad7934de959b74c7900/java/core/src/test/java/co/worklytics/psoxy/storage/impl/BulkDataSanitizerImplTest.java#L228-L239
"SappwO4KZKGprqqUNruNreBD2BVR98nEM6NRCu3R2dM",
"mfsaNYuCX__xvnRz4gJp_t0zrDTC5DkuCJvMkubugsI",
".ZdDGUuOMK.Oy7_PJ3pf9SYX12.3tKPdLHfYbjVGcGk",
".fs1T64Micz8SkbILrABgEv4kSg.tFhvhP35HGSLdOo"
"szTDtLRsbo-JneQPAYEYN7g5hjcvfttONtzUv5hFWZo",
})
void canBeDecoded(String encoded) {
assertTrue(encoder.canBeDecoded(encoded));

//just ensure doesn't throw anything
encoder.decode(encoded);
}


Expand Down Expand Up @@ -60,6 +59,6 @@ void roundtrip(String identifier) {
assertTrue(encoder.canBeDecoded(encoded));
assertEquals(new String(pseudonym.getHash()),
new String(encoder.decode(encoded).getHash()));

}

}

0 comments on commit 16c92bd

Please sign in to comment.