diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md new file mode 100644 index 000000000..e3b95ca10 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md @@ -0,0 +1,3 @@ +Fixes issue: + +Changes proposed in this pull request: diff --git a/.gitignore b/.gitignore index 2ebfd0e14..a1e47c933 100644 --- a/.gitignore +++ b/.gitignore @@ -89,6 +89,7 @@ venv/ ENV/ env.bak/ venv.bak/ +*venv/ # Spyder project settings .spyderproject diff --git a/Gopkg.lock b/Gopkg.lock index e2d03b0aa..168ef6faa 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -2,20 +2,20 @@ [[projects]] - digest = "1:b62a3c5b37db602bf1158e921da1a762315a4c37855fd418a14498aa87a342d5" + digest = "1:3d3a509c5ba327e8573bb57f9da8430c63a46a06886eb1d2ffc8af4e76f31c72" name = "cloud.google.com/go" packages = ["civil"] pruneopts = "UT" - revision = "0ebda48a7f143b1cce9eb37a8c1106ac762a3430" - version = "v0.34.0" + revision = "457ea5c15ccf3b87db582c450e80101989da35f7" + version = "v0.40.0" [[projects]] - digest = "1:b92928b73320648b38c93cacb9082c0fe3f8ac3383ad9bd537eef62c380e0e7a" + digest = "1:6b1426cad7057b717351eacf5b6fe70f053f11aac1ce254bbf2fd72c031719eb" name = "contrib.go.opencensus.io/exporter/ocagent" packages = ["."] pruneopts = "UT" - revision = "00af367e65149ff1f2f4b93bbfbb84fd9297170d" - version = "v0.2.0" + revision = "dcb33c7f3b7cfe67e8a2cea10207ede1b7c40764" + version = "v0.4.12" [[projects]] digest = "1:487dc37a77bbba996bf4ddae0bff1c69fde98027d507e75eca317ca7c94483c3" @@ -39,7 +39,7 @@ version = "v1.1.4" [[projects]] - digest = "1:b7ae7b7962d3c2656f3eb7d8543932de1cb31ba28dc1433b5ce74613f9694318" + digest = "1:88a413bd074ae78a3dfd59b8458439a17c393e859b234b4fcf2d2088da7907e9" name = "github.com/Azure/azure-event-hubs-go" packages = [ ".", @@ -48,16 +48,16 @@ "storage", ] pruneopts = "UT" - revision = "aca3e9cfe138951ffb815665621095482a674ee9" - version = "v1.1.2" + revision = 
"d3d1b70a113ea8a9f2d7443ff7d99ab40f9b6eca" + version = "v1.3.1" [[projects]] - digest = "1:d2ccb697dc13c8fbffafa37baae97594d5592ae8f7e113471084137315536e2b" + digest = "1:279540310125d2b219920588d7e2edb2a85b3317b528839166e896ce6b6f211c" name = "github.com/Azure/azure-pipeline-go" packages = ["pipeline"] pruneopts = "UT" - revision = "b8e3409182fd52e74f7d7bdfbff5833591b3b655" - version = "v0.1.8" + revision = "55fedc85a614dcd0e942a66f302ae3efb83d563c" + version = "v0.1.9" [[projects]] digest = "1:fd0485bc9bbf77bbfefed5b67fc45899b130c78b544127d5f1efde7a0b768b0b" @@ -73,15 +73,15 @@ version = "v23.2.0" [[projects]] - digest = "1:b8ac7e4464ce21f7487c663aa69b1b3437740bb10ab12d4dc7aa9b02422571a1" + digest = "1:b15d5bdadce5d98f1e06353508a4029fccfeb68761957b3a2a8cb38ebb8caca4" name = "github.com/Azure/azure-storage-blob-go" packages = ["azblob"] pruneopts = "UT" - revision = "45d0c5e3638e2b539942f93c48e419f4f2fc62e4" - version = "0.4.0" + revision = "678206e7e6e55abf0265a6440135e92005176ebf" + version = "v0.6.0" [[projects]] - digest = "1:9e8ebf3883dce8687221c4022bdb3b0bfdcde430b911be9cc7e52478a026893c" + digest = "1:c1e04f3b97fbb6e25891c10d3d2d906dcd5807774a3a68d0e32e2b7bda9b5c7f" name = "github.com/Azure/go-autorest" packages = [ "autorest", @@ -94,35 +94,35 @@ "tracing", ] pruneopts = "UT" - revision = "be17756531f50014397912b7aa557ec335e39b98" - version = "v11.3.0" + revision = "562d3769ef2f0f56bc52749babd3e88367e28588" + version = "v11.9.0" [[projects]] - digest = "1:ed77032e4241e3b8329c9304d66452ed196e795876e14be677a546f36b94e67a" + digest = "1:6d8a3b164679872fa5a4c44559235f7fb109c7b5cd0f456a2159d579b76cc9ba" name = "github.com/DataDog/zstd" packages = ["."] pruneopts = "UT" - revision = "c7161f8c63c045cbc7ca051dcc969dd0e4054de2" - version = "v1.3.5" + revision = "809b919c325d7887bff7bd876162af73db53e878" + version = "v1.4.0" [[projects]] branch = "master" - digest = "1:8d270b7938356d0f7262169aaff9a272a1b747ede5bcd5040bb5cc9c012e57e9" + digest = 
"1:c79ff0c1cb49b28ad509306045dd8d5e46072354c0b4dbaff75434f3e630ce8d" name = "github.com/Microsoft/presidio-genproto" packages = ["golang"] pruneopts = "UT" - revision = "7f79038dbc10904f279cfa87eb49d84f5c3a9b24" + revision = "1734e2635c253f79e4c44398315d92fe9d084601" [[projects]] - digest = "1:a59a467c541a1bf8b06e4fad6113028c959be6573b78ceca9f8020cd0d2127fc" + digest = "1:2ec153af6a806c3d63d4299f2549bcb29d75d9703097341be309a46db3481488" name = "github.com/Shopify/sarama" packages = ["."] pruneopts = "UT" - revision = "879f631812a30a580659e8035e7cda9994bb99ac" - version = "v1.20.0" + revision = "ea9ab1c316850bee881a07bb2555ee8a685cd4b6" + version = "v1.22.1" [[projects]] - digest = "1:355da6e69ecab4bb211ddd598a3c26d4c156802dac22722953734b7f792289e6" + digest = "1:927a701594a3a785378229c81bde8e83349bc7bd78c44a6ae103f37fd6da7d01" name = "github.com/aws/aws-sdk-go" packages = [ "aws", @@ -152,6 +152,7 @@ "private/protocol", "private/protocol/eventstream", "private/protocol/eventstream/eventstreamapi", + "private/protocol/json/jsonutil", "private/protocol/query", "private/protocol/query/queryutil", "private/protocol/rest", @@ -161,8 +162,8 @@ "service/sts", ] pruneopts = "UT" - revision = "1f8a24693bc965514ee0d7aadbabe0ceed184a88" - version = "v1.16.16" + revision = "b0b59fd2ceb03908e5d3bcd1449b46ce75508f4b" + version = "v1.20.7" [[projects]] digest = "1:526d64d0a3ac6c24875724a9355895be56a21f89a5d3ab5ba88d91244269a7d8" @@ -181,17 +182,19 @@ version = "v1.2.1" [[projects]] - digest = "1:65b0d980b428a6ad4425f2df4cd5410edd81f044cf527bd1c345368444649e58" + digest = "1:fdb4ed936abeecb46a8c27dcac83f75c05c87a46d9ec7711411eb785c213fa02" name = "github.com/census-instrumentation/opencensus-proto" packages = [ "gen-go/agent/common/v1", + "gen-go/agent/metrics/v1", "gen-go/agent/trace/v1", + "gen-go/metrics/v1", "gen-go/resource/v1", "gen-go/trace/v1", ] pruneopts = "UT" - revision = "7f2434bc10da710debe5c4315ed6d4df454b4024" - version = "v0.1.0" + revision = 
"a105b96453fe85139acc07b68de48f2cbdd71249" + version = "v0.2.0" [[projects]] digest = "1:ffe9824d294da03b391f44e1ae8281281b4afc1bdaa9588c9097785e3af10cec" @@ -203,14 +206,14 @@ [[projects]] branch = "master" - digest = "1:0fd9da444782c2defb1352dc098f55b8b42c538787e29e45677a1dc40ff0ab11" + digest = "1:2e702b60af5efe4b3b0e3b40fddfb6beba6adca1e0a7ccd25cc5c21b5e7df9ef" name = "github.com/denisenkom/go-mssqldb" packages = [ ".", "internal/cp", ] pruneopts = "UT" - revision = "4e0d7dc8888fbb59764060e99b7b68e77a6f9698" + revision = "eb9f6a1743f30383c8168cc520ca5db1f744d6f4" [[projects]] digest = "1:76dc72490af7174349349838f2fe118996381b31ea83243812a97e5a0fd5ed55" @@ -222,19 +225,19 @@ [[projects]] branch = "master" - digest = "1:7d0b66300f67891562442ba782b7927859bf9274afd36c4651371262396bbb65" + digest = "1:6695eade5deeae68a4fc0755a005cc894ee7b47f46cdfe75340cd7803b0d23d1" name = "github.com/disintegration/imaging" packages = ["."] pruneopts = "UT" - revision = "9458da53d1e65e098d48467a4317c403327e4424" + revision = "465faf0892b5c7b3325643b0e47282e1331672e7" [[projects]] digest = "1:1f0c7ab489b407a7f8f9ad16c25a504d28ab461517a971d341388a56156c1bd7" name = "github.com/eapache/go-resiliency" packages = ["breaker"] pruneopts = "UT" - revision = "ea41b0fad31007accc7f806884dcdf3da98b79ce" - version = "v1.1.0" + revision = "5efd2ed019fd331ec2defc6f3bd98882f1e3e636" + version = "v1.2.0" [[projects]] branch = "master" @@ -253,12 +256,12 @@ version = "v1.1.0" [[projects]] - digest = "1:f1f2bd73c025d24c3b93abf6364bccb802cf2fdedaa44360804c67800e8fab8d" + digest = "1:ac425d784b13d49b37a5bbed3ce022677f8f3073b216f05d6adcb9303e27fa0f" name = "github.com/evanphx/json-patch" packages = ["."] pruneopts = "UT" - revision = "72bf35d0ff611848c1dc9df0f976c81192392fa5" - version = "v4.1.0" + revision = "026c730a0dcc5d11f93f1cf1cc65b01247ea7b6f" + version = "v4.5.0" [[projects]] digest = "1:abeb38ade3f32a92943e5be54f55ed6d6e3b6602761d74b4aab4c9dd45c18abd" @@ -270,43 +273,43 @@ [[projects]] branch 
= "master" - digest = "1:237e20f314113702902d275bf57103693f8a4d3bfcf43a4cd02163ee3430c90e" + digest = "1:cc93dd54278f6c0dd4b43588b3cf2c07b09b2a6dd4e2617edf81aa8946054c1e" name = "github.com/gin-contrib/cors" packages = ["."] pruneopts = "UT" - revision = "5e7acb10687f94a88d0d8e96297818fff2da8f88" + revision = "5f50d4fb4e0306dcacc6f8e9bea2dcee784dbbdf" [[projects]] - branch = "master" - digest = "1:36fe9527deed01d2a317617e59304eb2c4ce9f8a24115bcc5c2e37b3aee5bae4" + digest = "1:3ee1d175a75b911a659fbd860060874c4f503e793c5870d13e5a0ede529a63cf" name = "github.com/gin-contrib/sse" packages = ["."] pruneopts = "UT" - revision = "22d885f9ecc78bf4ee5d72b937e4bbcdc58e8cae" + revision = "54d8467d122d380a14768b6b4e5cd7ca4755938f" + version = "v0.1.0" [[projects]] branch = "master" - digest = "1:264ce7a5d411d8d4304965d87ba016e379ed2d6bce26e62340112c108a786f38" + digest = "1:27283820a5e1b25ce0363e099eb233183a5de7f76177eba1688a71e605cad06c" name = "github.com/gin-contrib/zap" packages = ["."] pruneopts = "UT" - revision = "0672bb1dbf3af725a3d294a73bd92dab67cb8adc" + revision = "3cc18cd8fce3ca00df79ca44f73fed1ed5d1fe2f" [[projects]] - digest = "1:d5083934eb25e45d17f72ffa86cae3814f4a9d6c073c4f16b64147169b245606" + digest = "1:d8bd2a337f6ff2188e08f72c614f2f3f0fd48e6a7b37a071b197e427d77d3a47" name = "github.com/gin-gonic/gin" packages = [ ".", "binding", - "json", + "internal/json", "render", ] pruneopts = "UT" - revision = "b869fe1415e4b9eb52f247441830d502aece2d4d" - version = "v1.3.0" + revision = "b75d67cd51eb53c3c3a2fc406524c940021ffbda" + version = "v1.4.0" [[projects]] - digest = "1:ad53d1f710522a38d1f0e5e0a55a194b1c6b2cd8e84313568e43523271f0cf62" + digest = "1:c950e574951c7199fb3d990d0e7a61996f40f8e646ba7cf8a557878d4c737f53" name = "github.com/go-redis/redis" packages = [ ".", @@ -318,94 +321,83 @@ "internal/util", ] pruneopts = "UT" - revision = "22be8a3eaf992c828cecb69dc07348313bf08d2e" - version = "v6.15.1" + revision = "75795aa4236dc7341eefac3bbe945e68c99ef9df" + version 
= "v6.15.3" [[projects]] branch = "master" - digest = "1:23dca8a35ce10bf5caf35e92f6a2b5e633f55cbd28259c2207eb31d5b44b0c02" + digest = "1:783d985ffb1affe60a210e8620e25a8cd41475940ce19a3a7bab25c795b899fe" name = "github.com/go-sql-driver/mysql" packages = ["."] pruneopts = "UT" - revision = "c45f530f8e7fe40f4687eaa50d0c8c5f1b66f9e0" - -[[projects]] - digest = "1:436e8c1845d92384995e9c93470f639b886dbbc4b49c7babf544f9cc06361198" - name = "github.com/go-xorm/builder" - packages = ["."] - pruneopts = "UT" - revision = "03eb88feccce3e477c318ce7f6f1b386544ab20b" - version = "v0.3.3" - -[[projects]] - digest = "1:ec14b8c3b10e27599d7053a97bd28ef36e59cc4247f83b474bca43aaa971eab9" - name = "github.com/go-xorm/core" - packages = ["."] - pruneopts = "UT" - revision = "c10e21e7e1cec20e09398f2dfae385e58c8df555" - version = "v0.6.0" + revision = "877a9775f06853f611fb2d4e817d92479242d1cd" [[projects]] branch = "master" - digest = "1:683ffbf5c4f58c718a45c517884bf34110d8ddcb0f5a2b8309ce1630215fb5b3" + digest = "1:245c431f1b323b7c23de483f61bcae360c5278d191cb8e23327476a2c160c79f" name = "github.com/go-xorm/xorm" packages = ["."] pruneopts = "UT" - revision = "1cd2662be938bfee0e34af92fe448513e0560fb1" + revision = "4c806608ab1d39d93b8bdc2778acdf2c42735c04" [[projects]] - digest = "1:b402bb9a24d108a9405a6f34675091b036c8b056aac843bf6ef2389a65c5cf48" + digest = "1:4d02824a56d268f74a6b6fdd944b20b58a77c3d70e81008b3ee0c4f1a6777340" name = "github.com/gogo/protobuf" packages = [ "proto", "sortkeys", ] pruneopts = "UT" - revision = "4cbf7e384e768b4e01799441fdf2a706a5635ae7" - version = "v1.2.0" + revision = "ba06b47c162d49f2af050fb4c75bcbc86a159d5c" + version = "v1.2.1" [[projects]] branch = "master" - digest = "1:97239b8255df64c18138842365b135975e7402112beb593e139de1b91303d5bc" + digest = "1:420701248ee765a9945373092ec9b57a3a21099ce975063dbac100d228ec0bfe" name = "github.com/golang/protobuf" packages = [ + "jsonpb", "proto", "protoc-gen-go/descriptor", + "protoc-gen-go/generator", + 
"protoc-gen-go/generator/internal/remap", + "protoc-gen-go/plugin", "ptypes", "ptypes/any", "ptypes/duration", + "ptypes/struct", "ptypes/timestamp", "ptypes/wrappers", ] pruneopts = "UT" - revision = "347cf4a86c1cb8d262994d8ef5924d4576c5b331" + revision = "b285ee9cfc6c881bb20c0d8dc73370ea9b9ec90f" [[projects]] - branch = "master" - digest = "1:4a0c6bb4805508a6287675fac876be2ac1182539ca8a32468d8128882e9d5009" + digest = "1:e4f5819333ac698d294fe04dbf640f84719658d5c7ce195b10060cc37292ce79" name = "github.com/golang/snappy" packages = ["."] pruneopts = "UT" - revision = "2e65f85255dbc3072edf28d6b5b8efc472979f5a" + revision = "2a8bb927dd31d8daada140a5d09578521ce5c36a" + version = "v0.0.1" [[projects]] - branch = "master" digest = "1:0bfbe13936953a98ae3cfe8ed6670d396ad81edf069a806d2f6515d7bb6950df" name = "github.com/google/btree" packages = ["."] pruneopts = "UT" revision = "4030bb1f1f0c35b30ca7009e9ebd06849dd45306" + version = "v1.0.0" [[projects]] - branch = "master" - digest = "1:3ee90c0d94da31b442dde97c99635aaafec68d0b8a3c12ee2075c6bdabeec6bb" + digest = "1:a6181aca1fd5e27103f9a920876f29ac72854df7345a39f3b01e61c8c94cc8af" name = "github.com/google/gofuzz" packages = ["."] pruneopts = "UT" - revision = "24818f796faf91cd76ec7bddd72458fbced7a6c1" + revision = "f140a6486e521aad38f5917de355cbf147cc0496" + version = "v1.0.0" [[projects]] - digest = "1:65c4414eeb350c47b8de71110150d0ea8a281835b1f386eacaa3ad7325929c21" + digest = "1:d1a3774c1f8336a21669d6da87a7bafb4d6171a84752268b7011e767d6722c2b" name = "github.com/googleapis/gnostic" packages = [ "OpenAPIv2", @@ -413,19 +405,19 @@ "extensions", ] pruneopts = "UT" - revision = "7c663266750e7d82587642f65e60bc4083f1f84e" - version = "v0.2.0" + revision = "e73c7ec21d36ddb0711cb36d1502d18363b5c2c9" + version = "v0.3.0" [[projects]] branch = "master" - digest = "1:86c1210529e69d69860f2bb3ee9ccce0b595aa3f9165e7dd1388e5c612915888" + digest = "1:5fc0e23b254a1bd7d8d2d42fa093ba33471d08f52fe04afd3713adabb5888dc3" name = 
"github.com/gregjones/httpcache" packages = [ ".", "diskcache", ] pruneopts = "UT" - revision = "c63ab54fda8f77302f8d414e19933f2b6026a089" + revision = "901d90724c7919163f472a9812253fb26761123d" [[projects]] branch = "master" @@ -437,7 +429,27 @@ "util/metautils", ] pruneopts = "UT" - revision = "4832df01553a810b8e3404b95743d01c9ab5313f" + revision = "27f3801344b24dd6e9c608692368947f674a8298" + +[[projects]] + digest = "1:c20c9a82345346a19916a0086e61ea97425172036a32b8a8975490da6a129fda" + name = "github.com/grpc-ecosystem/grpc-gateway" + packages = [ + "internal", + "runtime", + "utilities", + ] + pruneopts = "UT" + revision = "cd0c8ef3533e9c04e6520cac37a81fe262fb0b34" + version = "v1.9.2" + +[[projects]] + digest = "1:67474f760e9ac3799f740db2c489e6423a4cde45520673ec123ac831ad849cb8" + name = "github.com/hashicorp/golang-lru" + packages = ["simplelru"] + pruneopts = "UT" + revision = "7087cb70de9f7a8bc0a10c375cb0d2280a8edf9c" + version = "v0.5.1" [[projects]] digest = "1:c0d19ab64b32ce9fe5cf4ddceba78d5bc9807f0016db6b1183599da3dcc24d10" @@ -459,12 +471,12 @@ version = "v1.0.0" [[projects]] - digest = "1:8eb1de8112c9924d59bf1d3e5c26f5eaa2bfc2a5fcbb92dc1c2e4546d695f277" + digest = "1:a0cefd27d12712af4b5018dc7046f245e1e3b5760e2e848c30b171b570708f9b" name = "github.com/imdario/mergo" packages = ["."] pruneopts = "UT" - revision = "9f23e2d6bd2a77f959b2bf6acdbefd708a83a4a4" - version = "v0.3.6" + revision = "7c29201646fa3de8506f701213473dd407f19646" + version = "v0.3.7" [[projects]] digest = "1:bb81097a5b62634f3e9fec1014657855610c82d19b9a40c17612e32651e35dca" @@ -482,12 +494,12 @@ version = "1.0.0" [[projects]] - digest = "1:3e551bbb3a7c0ab2a2bf4660e7fcad16db089fdcfbb44b0199e62838038623ea" + digest = "1:f5a2051c55d05548d2d4fd23d244027b59fbd943217df8aa3b5e170ac2fd6e1b" name = "github.com/json-iterator/go" packages = ["."] pruneopts = "UT" - revision = "1624edc4454b8682399def8740d46db5e4362ba4" - version = "v1.1.5" + revision = "0ff49de124c6f76f8494e194af75bde0f1a49a29" + 
version = "v1.1.6" [[projects]] branch = "master" @@ -499,22 +511,23 @@ [[projects]] branch = "master" - digest = "1:7cefc4f7f6a411c2598d3344563e4d23fd4e4d88fd1591831fe39cccff41ad28" + digest = "1:2abaafc9cb59897a71c84f1daf4131d59c0dd349c671206274ace759730fc1a0" name = "github.com/lib/pq" packages = [ ".", "oid", + "scram", ] pruneopts = "UT" - revision = "9eb73efc1fcc404148b56765b0d3f61d9a5ef8ee" + revision = "2ff3cb3adc01768e0a552b3a02575a6df38a9bea" [[projects]] - digest = "1:c568d7727aa262c32bdf8a3f7db83614f7af0ed661474b24588de635c20024c7" + digest = "1:5a0ef768465592efca0412f7e838cdc0826712f8447e70e6ccc52eb441e9ab13" name = "github.com/magiconair/properties" packages = ["."] pruneopts = "UT" - revision = "c2353362d570a7bfa228149c62842019201cfb71" - version = "v1.8.0" + revision = "de8848e004dd33dc07a2947b3d76f618a7fc7ef1" + version = "v1.8.1" [[projects]] digest = "1:4e878df5f4e9fd625bf9c9aac77ef7cbfa4a74c01265505527c23470c0e40300" @@ -525,12 +538,12 @@ version = "v1.1.0" [[projects]] - digest = "1:0981502f9816113c9c8c4ac301583841855c8cf4da8c72f696b3ebedf6d0e4e5" + digest = "1:9b90c7639a41697f3d4ad12d7d67dfacc9a7a4a6e0bbfae4fc72d0da57c28871" name = "github.com/mattn/go-isatty" packages = ["."] pruneopts = "UT" - revision = "6ca4dbf54d38eea1a992b3c722a76a5d1c4cb25c" - version = "v0.0.4" + revision = "1311e847b0cb909da63b5fecfb5370aa66236465" + version = "v0.0.8" [[projects]] digest = "1:4a49346ca45376a2bba679ca0e83bec949d780d4e927931317904bad482943ec" @@ -565,47 +578,47 @@ version = "1.0.1" [[projects]] - digest = "1:6411dc2c8891eb05c1d0599abf571e81f72cbc97ebd223b44d45712f4f1799c2" + digest = "1:f3b5bf575d84780832516c53d135b4917fe891ca67c472959578b7e09277e4b2" name = "github.com/otiai10/gosseract" packages = ["."] pruneopts = "UT" - revision = "b026a6fd291f00736db60738f4e83a79d26359cb" - version = "v2.2.0" + revision = "5bb1d6fc20fa3fafb3236d6c93c393369e4b38d9" + version = "v2.2.1" [[projects]] - digest = 
"1:95741de3af260a92cc5c7f3f3061e85273f5a81b5db20d4bd68da74bd521675e" + digest = "1:93131d8002d7025da13582877c32d1fc302486775a1b06f62241741006428c5e" name = "github.com/pelletier/go-toml" packages = ["."] pruneopts = "UT" - revision = "c01d1270ff3e442a8a57cddc1c92dc1138598194" - version = "v1.2.0" + revision = "728039f679cbcd4f6a54e080d2219a4c4928c546" + version = "v1.4.0" [[projects]] branch = "master" - digest = "1:3bf17a6e6eaa6ad24152148a631d18662f7212e21637c2699bff3369b7f00fa2" + digest = "1:89da0f0574bc94cfd0ac8b59af67bf76cdd110d503df2721006b9f0492394333" name = "github.com/petar/GoLLRB" packages = ["llrb"] pruneopts = "UT" - revision = "53be0d36a84c2a886ca057d34b6aa4468df9ccb4" + revision = "33fb24c13b99c46c93183c291836c573ac382536" [[projects]] - digest = "1:0e7775ebbcf00d8dd28ac663614af924411c868dca3d5aa762af0fae3808d852" + digest = "1:a8c2725121694dfbf6d552fb86fe6b46e3e7135ea05db580c28695b916162aad" name = "github.com/peterbourgon/diskv" packages = ["."] pruneopts = "UT" - revision = "5f041e8faa004a95c88a202771f4cc3e991971e6" - version = "v2.0.1" + revision = "0be1b92a6df0e4f5cb0a5d15fb7f643d0ad93ce6" + version = "v3.0.0" [[projects]] - digest = "1:e39a5ee8fcbec487f8fc68863ef95f2b025e0739b0e4aa55558a2b4cf8f0ecf0" + digest = "1:259f9b7645983a7a823318d78aa96dec68af8891f706493ac1ec04d819cb977c" name = "github.com/pierrec/lz4" packages = [ ".", "internal/xxh32", ] pruneopts = "UT" - revision = "635575b42742856941dbc767b44905bb9ba083f6" - version = "v2.0.7" + revision = "d705d4371bfccdf47f10e45584e896026c83616f" + version = "v2.2.3" [[projects]] digest = "1:cf31692c14422fa27c83a05292eb5cbe0fb2775972e8f1f8446a71549bd8980b" @@ -652,15 +665,15 @@ version = "v1.2.0" [[projects]] - digest = "1:d707dbc1330c0ed177d4642d6ae102d5e2c847ebd0eb84562d0dc4f024531cfc" + digest = "1:bb495ec276ab82d3dd08504bbc0594a65de8c3b22c6f2aaa92d05b73fbf3a82e" name = "github.com/spf13/afero" packages = [ ".", "mem", ] pruneopts = "UT" - revision = "a5d6946387efe7d64d09dcba68cdd523dc1273a3" 
- version = "v1.2.0" + revision = "588a75ec4f32903aa5e39a2619ba6a4631e28424" + version = "v1.2.2" [[projects]] digest = "1:08d65904057412fc0270fc4812a1c90c594186819243160dc779a402d4b6d0bc" @@ -671,12 +684,12 @@ version = "v1.3.0" [[projects]] - digest = "1:68ea4e23713989dc20b1bded5d9da2c5f9be14ff9885beef481848edd18c26cb" + digest = "1:1b753ec16506f5864d26a28b43703c58831255059644351bbcb019b843950900" name = "github.com/spf13/jwalterweatherman" packages = ["."] pruneopts = "UT" - revision = "4a4406e478ca629068e7768fc33f3f044173c0a6" - version = "v1.0.0" + revision = "94f6ae3ed3bceceafa716478c5fbf8d29ca601a1" + version = "v1.1.0" [[projects]] digest = "1:c1b1102241e7f645bc8e0c22ae352e8f0dc6484b6cb4d132fa9f24174e0119e2" @@ -687,20 +700,20 @@ version = "v1.0.3" [[projects]] - digest = "1:de37e343c64582d7026bf8ab6ac5b22a72eac54f3a57020db31524affed9f423" + digest = "1:11118bd196646c6515fea3d6c43f66162833c6ae4939bfb229b9956d91c6cf17" name = "github.com/spf13/viper" packages = ["."] pruneopts = "UT" - revision = "6d33b5a963d922d182c91e8a1c88d81fd150cfd4" - version = "v1.3.1" + revision = "b5bf975e5823809fb22c7644d008757f78a4259e" + version = "v1.4.0" [[projects]] branch = "master" - digest = "1:525ac3364813b4688df380594e562133e07830dfce0722effda64b37634c13d0" + digest = "1:d6bb6f3240a488ffe5bb6952b513569d009927dcb20ff94885f87b76cef2b698" name = "github.com/streadway/amqp" packages = ["."] pruneopts = "UT" - revision = "a314942b2fd9dde7a3f70ba3f1062848ce6eb392" + revision = "75d898a42a940fbc854dfd1a4199eabdc00cf024" [[projects]] digest = "1:ac83cf90d08b63ad5f7e020ef480d319ae890c208f8524622a2f3136e2686b02" @@ -722,24 +735,27 @@ version = "v1.3.0" [[projects]] - digest = "1:03aa6e485e528acb119fb32901cf99582c380225fc7d5a02758e08b180cb56c3" + digest = "1:d0072748c62defde1ad99dde77f6ffce492a0e5aea9204077e497c7edfb86653" name = "github.com/ugorji/go" packages = ["codec"] pruneopts = "UT" - revision = "b4c50a2b199d93b13dc15e78929cfb23bfdf21ab" - version = "v1.1.1" + revision = 
"2adff0894ba3bc2eeb9f9aea45fefd49802e1a13" + version = "v1.1.4" [[projects]] - digest = "1:2ae8314c44cd413cfdb5b1df082b350116dd8d2fff973e62c01b285b7affd89e" + digest = "1:4c93890bbbb5016505e856cb06b5c5a2ff5b7217584d33f2a9071ebef4b5d473" name = "go.opencensus.io" packages = [ ".", - "exemplar", "internal", "internal/tagencoding", + "metric/metricdata", + "metric/metricproducer", + "plugin/ocgrpc", "plugin/ochttp", "plugin/ochttp/propagation/b3", "plugin/ochttp/propagation/tracecontext", + "resource", "stats", "stats/internal", "stats/view", @@ -750,16 +766,16 @@ "trace/tracestate", ] pruneopts = "UT" - revision = "b7bf3cdb64150a8c8c53b769fdeb2ba581bd4d4b" - version = "v0.18.0" + revision = "43463a80402d8447b7fce0d2c58edf1687ff0b58" + version = "v0.19.3" [[projects]] - digest = "1:3c1a69cdae3501bf75e76d0d86dc6f2b0a7421bc205c0cb7b96b19eed464a34d" + digest = "1:a5158647b553c61877aa9ae74f4015000294e47981e6b8b07525edcbb0747c81" name = "go.uber.org/atomic" packages = ["."] pruneopts = "UT" - revision = "1ea20fb1cbb1cc08cbd0d913a96dead89aa18289" - version = "v1.3.2" + revision = "df976f2515e274675050de7b3f42545de80594fd" + version = "v1.4.0" [[projects]] digest = "1:60bf2a5e347af463c42ed31a493d817f8a72f102543060ed992754e689805d1a" @@ -770,7 +786,7 @@ version = "v1.1.0" [[projects]] - digest = "1:adccce69c151272d5053505aee552c6a1ac4e7bf6d18f0206ed7453187f6284d" + digest = "1:2a1fe9905518611e9ce56cd7aaefb35fca78d8a721ef5eb6540e5fdd436f45bb" name = "go.uber.org/zap" packages = [ ".", @@ -782,12 +798,12 @@ "zaptest/observer", ] pruneopts = "UT" - revision = "ff33455a0e382e8a81d14dd7c922020b6b5e7982" - version = "v1.9.1" + revision = "27376062155ad36be76b0f12cf1572a221d3a48c" + version = "v1.10.0" [[projects]] branch = "master" - digest = "1:189a9d376615810591d8682d1ef59f24c7130478257de2d9be0160d7b365be7c" + digest = "1:dc803ed7501a41bdf346841d923546876f666b250a83db19e1b6bd2aa9901938" name = "golang.org/x/crypto" packages = [ "md4", @@ -796,23 +812,24 @@ "ssh/terminal", ] 
pruneopts = "UT" - revision = "ff983b9c42bc9fbf91556e191cc8efb585c16908" + revision = "cc06ce4a13d484c0101a9e92913248488a75786d" [[projects]] branch = "master" - digest = "1:acadbbf02e5c744709c60ac929e63258a69a3404ab36ed0c1a245cc28f056220" + digest = "1:7f184383ff6745f500aa38f11a7bcb1a49e7a388e6a2ea853b6c529c98090e1a" name = "golang.org/x/image" packages = [ "bmp", + "ccitt", "tiff", "tiff/lzw", ] pruneopts = "UT" - revision = "cd38e8056d9b27bb2f265effa37fb0ea6b8a7f0f" + revision = "7e034cad644213bc79b336b52fce73624259aeca" [[projects]] branch = "master" - digest = "1:8ecb828bb550a8c6b7d75b8261a42c369461311616ebe5451966d067f5f993bf" + digest = "1:92bb0e8dc506ab66c513d409a6f7fe182be85127cf4093742d4e993b43e0ef9c" name = "golang.org/x/net" packages = [ "context", @@ -821,53 +838,58 @@ "http2", "http2/hpack", "idna", + "internal/socks", "internal/timeseries", + "proxy", "trace", + "websocket", ] pruneopts = "UT" - revision = "be1c187aa6c66b9daa1d9461c228d17e9dd2cab7" + revision = "3b0461eec859c4b73bb64fdc8285971fd33e3938" [[projects]] branch = "master" - digest = "1:5276e08fe6a1dfdb65b4f46a2e5d5c9e00be6e499105e441049c3c04a0c83b36" + digest = "1:8d1c112fb1679fa097e9a9255a786ee47383fa2549a3da71bcb1334a693ebcfe" name = "golang.org/x/oauth2" packages = [ ".", "internal", ] pruneopts = "UT" - revision = "d668ce993890a79bda886613ee587a69dd5da7a6" + revision = "0f29369cfe4552d0e4bcddc57cc75f4d7e672a33" [[projects]] branch = "master" - digest = "1:04a5b0e4138f98eef79ce12a955a420ee358e9f787044cc3a553ac3c3ade997e" + digest = "1:a2fc247e64b5dafd3251f12d396ec85f163d5bb38763c4997856addddf6e78d8" name = "golang.org/x/sync" packages = [ "errgroup", "semaphore", ] pruneopts = "UT" - revision = "37e7f081c4d4c64e13b10787722085407fe5d15f" + revision = "112230192c580c3556b8cee6403af37a4fc5f28c" [[projects]] branch = "master" - digest = "1:5ee4df7ab18e945607ac822de8d10b180baea263b5e8676a1041727543b9c1e4" + digest = "1:fe40fbf915905f8a2397b321b3f10190edbdf5d293f087d01d7eb3a6d1a4adca" name = 
"golang.org/x/sys" packages = [ "unix", "windows", ] pruneopts = "UT" - revision = "48ac38b7c8cbedd50b1613c0fccacfc7d88dfcdf" + revision = "c5567b49c5d04a5f83870795b8c0e2df43a8ce32" [[projects]] - digest = "1:a2ab62866c75542dd18d2b069fec854577a20211d7c0ea6ae746072a1dccdd18" + digest = "1:8d8faad6b12a3a4c819a3f9618cb6ee1fa1cfc33253abeeea8b55336721e3405" name = "golang.org/x/text" packages = [ "collate", "collate/build", "internal/colltab", "internal/gen", + "internal/language", + "internal/language/compact", "internal/tag", "internal/triegen", "internal/ucd", @@ -880,8 +902,8 @@ "unicode/rangetable", ] pruneopts = "UT" - revision = "f21a4dfb5e38f5895301dc265a8def02365cc3d0" - version = "v0.3.0" + revision = "342b2e1fbaa52c93f31447ad2c6abc048c63e475" + version = "v0.3.2" [[projects]] branch = "master" @@ -889,18 +911,18 @@ name = "golang.org/x/time" packages = ["rate"] pruneopts = "UT" - revision = "85acf8d2951cb2a3bde7632f9ff273ef0379bcbd" + revision = "9d24e82272b4f38b78bc8cff74fa936d31ccd8ef" [[projects]] digest = "1:5f003878aabe31d7f6b842d4de32b41c46c214bb629bb485387dbcce1edf5643" name = "google.golang.org/api" packages = ["support/bundler"] pruneopts = "UT" - revision = "19e022d8cf43ce81f046bae8cc18c5397cc7732f" - version = "v0.1.0" + revision = "02490b97dff7cfde1995bd77de808fd27053bc87" + version = "v0.7.0" [[projects]] - digest = "1:9e29a0ec029d012437d88da3ccccf18adcdce069cab08d462056c2c6bb006505" + digest = "1:7e8b9c5ae49011b12ae8473834ac1a7bb8ac029ba201270c723e4c280c9e4855" name = "google.golang.org/appengine" packages = [ "cloudsql", @@ -913,19 +935,23 @@ "urlfetch", ] pruneopts = "UT" - revision = "e9657d882bb81064595ca3b56cbe2546bbabf7b1" - version = "v1.4.0" + revision = "b2f4a3cf3c67576a2ee09e1fe62656a5086ce880" + version = "v1.6.1" [[projects]] branch = "master" - digest = "1:077c1c599507b3b3e9156d17d36e1e61928ee9b53a5b420f10f28ebd4a0b275c" + digest = "1:3565a93b7692277a5dea355bc47bd6315754f3246ed07a224be6aec28972a805" name = 
"google.golang.org/genproto" - packages = ["googleapis/rpc/status"] + packages = [ + "googleapis/api/httpbody", + "googleapis/rpc/status", + "protobuf/field_mask", + ] pruneopts = "UT" - revision = "ae2f86662275e140f395167f1dab7081a5bd5fa8" + revision = "6af8c5fc6601ab6b41cd32742a65ce2f5bd9db57" [[projects]] - digest = "1:8c8ed249fa6a8db070bf2082f02052c697695fa5e1558b4e28dd0fb5f15f70a2" + digest = "1:456a209c8f2449983f13bad0eb015b6169dbbfe90cf38be241259ae1f715df47" name = "google.golang.org/grpc" packages = [ ".", @@ -942,6 +968,7 @@ "grpclog", "internal", "internal/backoff", + "internal/balancerload", "internal/binarylog", "internal/channelz", "internal/envconfig", @@ -963,8 +990,8 @@ "tap", ] pruneopts = "UT" - revision = "df014850f6dee74ba2fc94874043a9f3f75fbfd8" - version = "v1.17.0" + revision = "501c41df7f472c740d0674ff27122f3f48c80ce7" + version = "v1.21.1" [[projects]] digest = "1:cbc72c4c4886a918d6ab4b95e347ffe259846260f99ebdd8a198c2331cf2b2e9" @@ -1033,7 +1060,7 @@ [[projects]] branch = "release-1.13" - digest = "1:1ff3647c207e3f7a6b96f2669f4dbab7b7ce8dc4c0a5371dff0b634143ac28df" + digest = "1:daf8a959a8731c620d4c6ba4cd41b9ed6477bfc81dc7ab4fe1e47c8209811ee8" name = "k8s.io/apimachinery" packages = [ "pkg/api/errors", @@ -1076,7 +1103,7 @@ "third_party/forked/golang/reflect", ] pruneopts = "UT" - revision = "2b1284ed4c93a43499e781493253e2ac5959c4fd" + revision = "86fb29eff6288413d76bd8506874fddd9fccdff0" [[projects]] digest = "1:509f442b58ab9907cb05c7410f48f9ee6795402caef5dd53d19ad493543593d2" @@ -1178,31 +1205,31 @@ version = "v10.0.0" [[projects]] - digest = "1:e2999bf1bb6eddc2a6aa03fe5e6629120a53088926520ca3b4765f77d7ff7eab" + digest = "1:c283ca5951eb7d723d3300762f96ff94c2ea11eaceb788279e2b7327f92e4f2a" name = "k8s.io/klog" packages = ["."] pruneopts = "UT" - revision = "a5bc97fbc634d635061f3146511332c7e313a55a" - version = "v0.1.0" + revision = "d98d8acdac006fb39831f1b25640813fef9c314f" + version = "v0.3.3" [[projects]] branch = "master" - digest = 
"1:03a96603922fc1f6895ae083e1e16d943b55ef0656b56965351bd87e7d90485f" + digest = "1:22abb5d4204ab1a0dcc9cda64906a31c43965ff5159e8b9f766c9d2a162dbed5" name = "k8s.io/kube-openapi" packages = ["pkg/util/proto"] pruneopts = "UT" - revision = "0317810137be915b9cf888946c6e115c1bfac693" + revision = "db7b694dc208eead64d38030265f702db593fcf2" [[projects]] - digest = "1:936255313723e7ba7e67aa01e8e0517e90195bd401cdee0a63c4c96d57d0425d" + digest = "1:e58fa5292aca459bdd6feaf01ccc591b4a6f21fbc1fa8d9975209135cc6c4816" name = "pack.ag/amqp" packages = [ ".", "internal/testconn", ] pruneopts = "UT" - revision = "a77984cb83aafae2bc3fcdf6f0ef75c93b87eea5" - version = "v0.10.2" + revision = "279d72ee259701e0e0e58d98a52a82ba172a2f5f" + version = "v0.11.2" [[projects]] digest = "1:7719608fe0b52a4ece56c2dde37bedd95b938677d1ab0f84b8a7852e4c59f849" @@ -1212,6 +1239,22 @@ revision = "fd68e9863619f6ec2fdd8625fe1f02e7c877e480" version = "v1.1.0" +[[projects]] + digest = "1:69760bb625770798aa43843e5a493097a9f864646697f5ee29015cc0565524a6" + name = "xorm.io/builder" + packages = ["."] + pruneopts = "UT" + revision = "5175e98d9e97da33b5b8234760b151d867cb2620" + version = "v0.3.5" + +[[projects]] + digest = "1:3a0128b50d38343b108e97b5b7d86d384df2cbd06ec7db69cb37e62d1e349009" + name = "xorm.io/core" + packages = ["."] + pruneopts = "UT" + revision = "a31f53637037b3461649b8a9a03e13b26d31f12d" + version = "v0.6.3" + [solve-meta] analyzer-name = "dep" analyzer-version = 1 diff --git a/Makefile b/Makefile index 342cd10dd..850ad605c 100644 --- a/Makefile +++ b/Makefile @@ -28,8 +28,8 @@ $(BINS): vendor .PHONY: docker-build-deps docker-build-deps: - -docker pull $(DOCKER_REGISTRY)/$(GOLANG_DEPS):$(PRESIDIO_DEPS_LABEL) - -docker pull $(DOCKER_REGISTRY)/$(PYTHON_DEPS):$(PRESIDIO_DEPS_LABEL) + -docker pull $(DOCKER_REGISTRY)/$(GOLANG_DEPS):$(PRESIDIO_DEPS_LABEL) ||: + -docker pull $(DOCKER_REGISTRY)/$(PYTHON_DEPS):$(PRESIDIO_DEPS_LABEL) ||: docker build -t 
$(DOCKER_REGISTRY)/$(GOLANG_DEPS):$(PRESIDIO_DEPS_LABEL) -f Dockerfile.golang.deps . docker build -t $(DOCKER_REGISTRY)/$(PYTHON_DEPS):$(PRESIDIO_DEPS_LABEL) -f Dockerfile.python.deps . diff --git a/README.MD b/README.MD index c3fbe73a5..9557b68ae 100644 --- a/README.MD +++ b/README.MD @@ -26,7 +26,7 @@ Presidio can be integrated into any data pipeline for intelligent PII scrubbing. ## Features -***Unstsructured text anonymization*** +***Unstructured text anonymization*** Presidio automatically detects Personal-Identifiable Information (PII) in unstructured text, annonymizes it based on one or more anonymization mechanisms, and returns a string with no personal identifiable data. For example: diff --git a/SECURITY.MD b/SECURITY.MD new file mode 100644 index 000000000..f4357d6b6 --- /dev/null +++ b/SECURITY.MD @@ -0,0 +1,31 @@ +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [many more](https://opensource.microsoft.com/). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets Microsoft's [definition](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)) of a security vulnerability, please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** Instead, please report them to the Microsoft Security Response Center at [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://technet.microsoft.com/en-us/security/dn606155). 
+ +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
diff --git a/charts/presidio/templates/analyzer-deployment.yaml b/charts/presidio/templates/analyzer-deployment.yaml index 47d7dc3c9..5448f8d59 100644 --- a/charts/presidio/templates/analyzer-deployment.yaml +++ b/charts/presidio/templates/analyzer-deployment.yaml @@ -26,10 +26,10 @@ spec: - containerPort: {{ .Values.analyzer.service.internalPort }} resources: requests: - memory: "2000Mi" + memory: "1500Mi" cpu: "1500m" limits: - memory: "5000Mi" + memory: "3000Mi" cpu: "2000m" env: - name: PRESIDIO_NAMESPACE diff --git a/charts/presidio/templates/api-ingress.yaml b/charts/presidio/templates/api-ingress.yaml index 82fad7fa3..10a5ebf13 100644 --- a/charts/presidio/templates/api-ingress.yaml +++ b/charts/presidio/templates/api-ingress.yaml @@ -1,4 +1,4 @@ -{{- if .Values.api.ingress.enabled -}} +{{- if and (.Values.api.ingress.enabled) (or (eq .Values.api.ingress.class "nginx") (eq .Values.api.ingress.class "traefik")) -}} {{- $serviceName := include "presidio.api.fullname" . -}} {{- $servicePort := .Values.api.service.externalPort -}} apiVersion: extensions/v1beta1 @@ -21,4 +21,4 @@ spec: backend: serviceName: {{ $serviceName }} servicePort: {{ $servicePort }} -{{- end -}} \ No newline at end of file +{{- end -}} diff --git a/charts/presidio/templates/api-istio-gateway.yaml b/charts/presidio/templates/api-istio-gateway.yaml new file mode 100644 index 000000000..92f6222b3 --- /dev/null +++ b/charts/presidio/templates/api-istio-gateway.yaml @@ -0,0 +1,38 @@ +{{- if and (.Values.api.ingress.enabled) (eq .Values.api.ingress.class "istio") -}} +{{- $serviceName := include "presidio.api.fullname" . 
-}} +{{- $servicePort := .Values.api.service.externalPort -}} +{{- $servicePortName := .Values.api.service.name -}} +apiVersion: networking.istio.io/v1alpha3 +kind: Gateway +metadata: + name: {{ $serviceName }} +spec: + selector: + istio: ingressgateway # use istio default controller + servers: + - port: + number: 80 + name: http + protocol: HTTP + hosts: + - "*" +--- +apiVersion: networking.istio.io/v1alpha3 +kind: VirtualService +metadata: + name: {{ $serviceName }} +spec: + hosts: + - "*" + gateways: + - {{ $serviceName }} + http: + - match: + - uri: + prefix: /api/ + route: + - destination: + host: {{ $serviceName }} + port: + number: {{ $servicePort }} +{{- end -}} \ No newline at end of file diff --git a/charts/presidio/values.yaml b/charts/presidio/values.yaml index 7abcf148a..1d9e5cfd5 100644 --- a/charts/presidio/values.yaml +++ b/charts/presidio/values.yaml @@ -30,8 +30,9 @@ api: #readinessProbe: # initialDelaySeconds: 20 + # supported types are nginx, traefik and istio ingress: - enabled: true + enabled: false class: nginx analyzer: diff --git a/docs/custom_fields.md b/docs/custom_fields.md index c16fe98c2..700bacb0d 100644 --- a/docs/custom_fields.md +++ b/docs/custom_fields.md @@ -132,5 +132,4 @@ Presidio supports custom fields using either online via a simple REST API or by b. Reference and add the new class to the `RecognizerRegistry` module, in the `load_predefined_recognizers` method, which registers all code based recognizers. - - \ No newline at end of file + c. Note that if by adding the new recognizer, the memory or CPU consumption of the analyzer is expected to grow (such as in the case of adding a new model based recognizer), you should consider updating the pod's resources allocation in [analyzer-deployment.yaml](../charts/presidio/templates/analyzer-deployment.yaml) diff --git a/docs/development.md b/docs/development.md index 1fd2c786f..943bfd7c8 100644 --- a/docs/development.md +++ b/docs/development.md @@ -22,25 +22,9 @@ 5. 
Install [tesseract](https://github.com/tesseract-ocr/tesseract/wiki) OCR framework. -6. Protobuf generator tools (Optional) - - - `https://github.com/golang/protobuf` - - - `https://grpc.io/docs/tutorials/basic/python.html` - - To generate proto files, clone [presidio-genproto](https://github.com/Microsoft/presidio-genproto) and run the following commands in `$GOPATH/src/github.com/Microsoft/presidio-genproto/src` folder - - ```sh - python -m grpc_tools.protoc -I . --python_out=../python --grpc_python_out=../python ./*.proto - ``` - - ```sh - protoc -I . --go_out=plugins=grpc:../golang ./*.proto - ``` - ## Setting up the environment - Python -1. Build and install [re2](https://github.com/google/re2) +1. Build and install [re2](https://github.com/google/re2) (Optional. Presidio will use `regex` instead of `pyre2` if `re2` is not installed) ```sh re2_version="2018-12-01" @@ -94,11 +78,51 @@ Install the Python packages for the analyzer in the `presidio-analyzer` folder, pylint analyzer pip freeze ``` + +## Changing Presidio's API +Presidio leverages [protobuf](https://github.com/golang/protobuf) to create API classes and services across multiple environments. The proto files are stored on a different [Github repo](https://github.com/Microsoft/presidio-genproto) + +Follow these steps to change Presidio's API: +1. Fork the [presidio-genproto](https://github.com/Microsoft/presidio-genproto) repo into `YOUR_ORG/presidio-genproto` +2. Clone the repo into the `$GOPATH/src/github.com/YOUR_ORG/presidio-genproto` folder +3. Make the desired changes to the .proto files in /src +4. Make sure you have [protobuf](https://github.com/golang/protobuf) installed +5. Generate the Go and Python files. Run the following commands in the `src` folder of `presidio-genproto`: + + ```sh + python -m grpc_tools.protoc -I . --python_out=../python --grpc_python_out=../python ./*.proto + + protoc -I . --go_out=plugins=grpc:../golang ./*.proto + ``` + + 5. 
Copy all the files in the `python` folder into `presidio-analyzer/analyzer`. All generated files end with `*pb2.py` or `*pb2_grpc.py` + 6. Change the constraint on `Gopkg.toml` which directs to the location of `presidio-genproto` +From: + +```yaml +[[constraint]] + branch = "master" + name = "github.com/Microsoft/presidio-genproto" +``` + +To: + +```yaml +[[constraint]] + branch = "YOUR_GENPROTO_BRANCH" + name = "github.com/YOUR_ORG/presidio-genproto" + +``` + 7. Update `Gopkg.lock` by calling `dep ensure` or `dep ensure --update github.com/YOUR_ORG/presidio-genproto` + 8. Push all the changes (generated python files, `Gopkg.toml` and `Gopkg.lock`) into your presidio repo + +For more info, see https://grpc.io/docs/tutorials/basic/python.html + ## Development notes - Build the bins with `make build` -- Build the base containers with `make docker-build-deps DOCKER_REGISTRY=${DOCKER_REGISTRY} PRESIDIO_DEPS_LABEL=${PRESIDIO_DEPS_LABEL}` +- Build the base containers with `make docker-build-deps DOCKER_REGISTRY=${DOCKER_REGISTRY} PRESIDIO_DEPS_LABEL=${PRESIDIO_DEPS_LABEL}` (If you do not specify a valid, logged-in registry, a warning will be echoed to the standard output) - Build the the Docker image with `make docker-build DOCKER_REGISTRY=${DOCKER_REGISTRY} PRESIDIO_DEPS_LABEL=${PRESIDIO_DEPS_LABEL} PRESIDIO_LABEL=${PRESIDIO_LABEL}` - Push the Docker images with `make docker-push DOCKER_REGISTRY=${DOCKER_REGISTRY} PRESIDIO_LABEL=${PRESIDIO_LABEL}` - Run the tests with `make test` diff --git a/docs/index.md b/docs/index.md index 326662eba..209446210 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,4 +16,6 @@ New to Presidio? Read this material to quickly get up and running. 
- [Analyzer service tutorial](tutorial_analyzer.md) - [Calling the different services](tutorial_service.md) - [Adding custom fields](custom_fields.md) -- [Presidio Build and Release](build_release.md) +- [Presidio build and release](build_release.md) +- [Interpretability traces](interpretability_logs.md) +- [Presidio logging and monitoring design concepts](monitoring_logging.md) \ No newline at end of file diff --git a/docs/install.md b/docs/install.md index 362c97e3b..1e6fa7611 100644 --- a/docs/install.md +++ b/docs/install.md @@ -24,6 +24,8 @@ sleep 30 # Wait for the analyzer model to load docker run --rm --name presidio-api --network mynetwork -d -p 8080:8080 -e WEB_PORT=8080 -e ANALYZER_SVC_ADDRESS=presidio-analyzer:3000 -e ANONYMIZER_SVC_ADDRESS=presidio-anonymizer:3001 -e RECOGNIZERS_STORE_SVC_ADDRESS=presidio-recognizers-store:3004 ${DOCKER_REGISTRY}/presidio-api:${PRESIDIO_LABEL} ``` +**NOTE: Building the deps images currently takes some time** (~70 minutes, depending on the build machine). We are working on improving the build time through improving the build and providing pre-built dependencies. + --- ## Presidio As a Service @@ -31,6 +33,7 @@ docker run --rm --name presidio-api --network mynetwork -d -p 8080:8080 -e WEB_P ### Requirements - Kubernetes 1.9+ with RBAC enabled. + - Note the pod's resources requirements (CPU and memory) and plan the cluster accordingly. - Helm ### Default installation using pre-made scripts @@ -50,6 +53,10 @@ Follow the installation guide at the [Readme page](https://github.com/Microsoft/ 3. Optional - Ingress controller for presidio API. - [Traefik](https://docs.traefik.io/user-guide/kubernetes/) - [NGINX](https://docs.microsoft.com/en-us/azure/aks/ingress-tls) + - [Istio](https://istio.io/docs/tasks/traffic-management/ingress/) + + **Note** that presidio is not deployed with an ingress controller by default. 
+ To change this behavior, deploy the helm chart with *api.ingress.enabled=true* and specify the type of ingress controller to be used with *api.ingress.class=nginx* (supported classes are: nginx, traefik or istio). 4. Verify that Redis and Traefik/NGINX are installed correctly @@ -61,3 +68,86 @@ Follow the installation guide at the [Readme page](https://github.com/Microsoft/ ``` 6. For more options over the deployment, follow the [Development guide](https://github.com/Microsoft/presidio/blob/master/docs/development.md) + +## Install presidio-analyzer as a Python package +If you're interested in running the analyzer alone, you can install it as a standalone python package by packaging it into a `wheel` file. + +#### Creating the wheel file: +In the presidio-analyzer folder, run: + +```sh +python setup.py bdist_wheel +``` + +#### Installing the wheel file +1. Copy the created wheel file (from the `dist` folder of presidio-analyzer) into a clean virtual environment + +2. Install the `wheel` package + +```sh +pip install wheel +``` + +2. Install the presidio-analyzer wheel file + +```sh +pip install WHEEL_FILE +``` + +Where `WHEEL_FILE` is the path to the created wheel file + +3. Install the Spacy model from Github (not installed during the standard installation) + +```sh +pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz +``` + +Note that if you skip this step, the Spacy model will be installed lazily during the first call to the `AnalyzerEngine` + + +4. 
*Optional* : install `re2` and `pyre2`: + +- Install [re2](https://github.com/google/re2): + + ```sh + re2_version="2018-12-01" + wget -O re2.tar.gz https://github.com/google/re2/archive/${re2_version}.tar.gz + mkdir re2 + tar --extract --file "re2.tar.gz" --directory "re2" --strip-components 1 + cd re2 && make install + ``` + +- Install `pyre2`'s fork: + + ``` + pip install https://github.com/torosent/pyre2/archive/release/0.2.23.zip + ``` + + Note: If you don't install `re2`, Presidio will use the `regex` package for regular expressions handling + +5. Test the installation + + To test, run Python on the virtual env you've installed the presidio-analyzer in. + Then, make sure this code returns an answer: + + ```python + from analyzer import AnalyzerEngine + + engine = AnalyzerEngine() + + text = "My name is David and I live in Miami" + + response = engine.analyze(correlation_id=0, + text = text, + entities=[], + language='en', + all_fields=True, + score_threshold=0.5) + + for item in response: + print("Start = {}, end = {}, entity = {}, confidence = {}".format(item.start, + item.end, + item.entity_type, + item.score)) + + ``` diff --git a/docs/interpretability_logs.md b/docs/interpretability_logs.md new file mode 100644 index 000000000..20b63d45b --- /dev/null +++ b/docs/interpretability_logs.md @@ -0,0 +1,57 @@ +# Interpretability Traces + +## Background +Presidio offers interpretability traces, which allows you to investigate a specific api request, by exposing a `correlation-id` as part of the api response headers. + +The interpretability traces explain why a specific PII was detected. For example: which recognizer detected the entity, which regex / ML model were used, which context words improved the score, etc. + +## How it works +The current implementation of the `App Tracer` class writes the traces into the `stdout`. This can be easily customized to have your traces written to different destination of your choice. 
+ +Each trace contains a `correlation-id` which correlates to a specific api request. The api returns a `x-correlation-id` header which you can use to the `correlation-id` and query the `stdout` logs. + +By having the traces written into the `stdout` it's very easy to configure a monitoring solution to ease the process of reading processing the tracing logs in a distributed system. Read our [monitoring guide](monitoring_logging.md) for more information. + +## Examples +For the a request with the following text: +``` +My name is Bart Simpson, my Credit card is: 4095-2609-9393-4932, my phone is 425 8829090 +``` + +The following traces will be written: +``` +[2019-07-14 14:22:32,409][InterpretabilityMock][INFO][00000000-0000-0000-0000-000000000000][nlp artifacts:{'entities': (Bart Simpson, 4095, 425), 'tokens': ['My', 'name', 'is', 'Bart', 'Simpson', ',', 'my', 'Credit', 'card', 'is', ':', '4095', '-', '2609', '-', '9393', '-', '4932', ',', ' ', 'my', 'phone', 'is', '425', '8829090'], 'lemmas': ['My', 'name', 'be', 'Bart', 'Simpson', ',', 'my', 'Credit', 'card', 'be', ':', '4095', '-', '2609', '-', '9393', '-', '4932', ',', ' ', 'my', 'phone', 'be', '425', '8829090'], 'tokens_indices': [0, 3, 8, 11, 16, 23, 25, 28, 35, 40, 42, 44, 48, 49, 53, 54, 58, 59, 63, 65, 66, 69, 75, 78, 82], 'keywords': ['bart', 'simpson', 'credit', 'card', '4095', '2609', '9393', '4932', ' ', 'phone', '425', '8829090']}] + +[2019-07-14 14:22:32,417][InterpretabilityMock][INFO][00000000-0000-0000-0000-000000000000][["{'entity_type': 'CREDIT_CARD', 'start': 44, 'end': 63, 'score': 1.0, 'analysis_explanation': {'recognizer': 'CreditCardRecognizer', 'pattern_name': 'All Credit Cards (weak)', 'pattern': '\\\\b((4\\\\d{3})|(5[0-5]\\\\d{2})|(6\\\\d{3})|(1\\\\d{3})|(3\\\\d{3}))[- ]?(\\\\d{3,4})[- ]?(\\\\d{3,4})[- ]?(\\\\d{3,5})\\\\b', 'original_score': 0.3, 'score': 1.0, 'textual_explanation': None, 'score_context_improvement': 0.7, 'supportive_context_word': 'credit', 'validation_result': True}}", 
"{'entity_type': 'PERSON', 'start': 11, 'end': 23, 'score': 0.85, 'analysis_explanation': {'recognizer': 'SpacyRecognizer', 'pattern_name': None, 'pattern': None, 'original_score': 0.85, 'score': 0.85, 'textual_explanation': \"Identified as PERSON by Spacy's Named Entity Recognition\", 'score_context_improvement': 0, 'supportive_context_word': '', 'validation_result': None}}", "{'entity_type': 'PHONE_NUMBER', 'start': 78, 'end': 89, 'score': 0.85, 'analysis_explanation': {'recognizer': 'UsPhoneRecognizer', 'pattern_name': 'Phone (medium)', 'pattern': '\\\\b(\\\\d{3}[-\\\\.\\\\s]\\\\d{3}[-\\\\.\\\\s]??\\\\d{4})\\\\b', 'original_score': 0.5, 'score': 0.85, 'textual_explanation': None, 'score_context_improvement': 0.35, 'supportive_context_word': 'phone', 'validation_result': None}}"]] +``` + +The format of the traces is: `[Date Time][Interpretability][Log Level][Unique Correlation ID][Trace Message]` + +## Custom traces +Currently the traces are written automatically. It means that when you add a new recognizer, a generic interpretability traces will be written. + +However, it's possible to write custom data to the traces if you wish to. + +For exmple, the [spacy_recognizer.py](https://github.com/microsoft/presidio/blob/master/presidio-analyzer/analyzer/predefined_recognizers/spacy_recognizer.py) implemented a custom trace as follows: +```python +SPACY_DEFAULT_EXPLANATION = "Identified as {} by Spacy's Named Entity Recognition" + +def build_spacy_explanation(recognizer_name, original_score, entity): + explanation = AnalysisExplanation( + recognizer=recognizer_name, + original_score=original_score, + textual_explanation=SPACY_DEFAULT_EXPLANATION.format(entity)) + return explanation +``` + +The `textual_explanation` field in `AnalysisExplanation` class allows you to add your own custom text into the final trace which will be written. + +## Enabling/Disabling Traces +Interpretability traces are enabled by default. 
Disable App Tracing by setting the `enabled` constructor parameter to `False`. +PII entities are not stored in the Traces by default. Enable this by either setting the environment variable `ENABLE_TRACE_PII` to `True`, or setting it directly on the command line using the `enable-trace-pii` argument as follows: +```bash +pipenv run python __main__.py serve --grpc-port 3001 --enable-trace-pii True +``` + +## Notes +* Interpretability traces explain why PIIs were detected, but not why they were not detected. diff --git a/docs/monitoring_logging.md b/docs/monitoring_logging.md new file mode 100644 index 000000000..243412a3f --- /dev/null +++ b/docs/monitoring_logging.md @@ -0,0 +1,193 @@ +# Presidio logging and monitoring design concepts + +Presidio accommodates several ways to collect logs, metrics, and traces using cloud-native standards enabled by its runtime, Kubernetes. +The following document describes some use-cases suited for different environments and requirements from the logging system which have been tested by the presidio team. + +## Logging and Monitoring Basics + +- ***Logs*** are text-based records of events that occur while the application is running. +- ***Metrics*** are numerical values that can be analyzed. Different types of metrics include: + - **Node-level and Container metrics** including CPU, memory, network, disk, and file system usage. + - **Application metrics** include any metrics that are relevant to understanding the behavior of a service as well as custom metrics that are specific to the domain + - **Dependent service metrics** include external services or endpoints statistics for latency and error rate. +- ***Distributed Tracing*** - is used to profile and monitor applications built using a microservices architecture using a correlation ID. + +Visit the [Architecture center](https://docs.microsoft.com/en-us/azure/architecture/microservices/logging-monitoring) to learn about implementing logging and monitoring in microservices. 
+ +## Technology Options + +The following section covers three logging technology stacks that include potential scenarios in public and private clouds: +- Azure Monitor +- EFK (Elastic, FluentD, Kibana) +- Kubernetes service mesh (Istio and Linkerd) + +### Azure Kubernetes Service (AKS) and Azure Monitor logging and metrics +When deploying presidio to AKS, [Azure Monitor](https://docs.microsoft.com/en-us/azure/azure-monitor/overview) provides the easiest way to manage and query logs and metrics using OOTB tooling. +There are a number of ways to enable Azure Monitor on either a new or an exising cluster using the portal, CLI and Terraform, [read more](https://docs.microsoft.com/en-us/azure/azure-monitor/insights/container-insights-onboard). + +##### Enabling Azure Monitor on AKS using the CLI + + +```sh +az aks enable-addons -a monitoring -n MyExistingManagedCluster -g MyExistingManagedClusterRG +``` + +##### Example - Viewing Analyzer Logs + +Run the following KQL query in Azure Logs: + +```sql +let startTimestamp = ago(1d); +let ContainerIDs = KubePodInventory +| where TimeGenerated > startTimestamp +| where ClusterId =~ "[YOUR_CLUSTER_ID]" +| distinct ContainerID; +ContainerLog +| where ContainerID in (ContainerIDs) and Image contains "analyzer" +| project LogEntrySource, LogEntry, TimeGenerated, Computer, Image, Name, ContainerID +| order by TimeGenerated desc +| limit 200 +``` + +### Logging with Elasticsearch, Kibana, FluentD + +Logs in presidio are outputted to stderr and stdout as a standard of logging in 12 factor/microservices applications. +to store logs for long term retention and exploration during failures and RCA, use [elasticsearch](https://github.com/elastic/elasticsearch) or other document databases that are optimized to act as a search engine (solr, splunk, etc). elasticsearch logs are easily queried and visualized using [kibana](https://github.com/elastic/kibana) or [grafana](https://github.com/grafana/grafana). 
+Shipping logs from a microservices platform such as kubernetes to the logs database is done using a log processor/forwarder such as the CNCF project [FluentD](https://www.fluentd.org/). + +##### Enabling EFK on AKS + +The following section describes deploying an EFK (Elastic, FluentD, Kibana) stack to a development AKS cluster. +**Note that:** the following scripts do **not** fit a production environment in terms of security and scale. + +- Install elasticsearch + +```sh +helm install stable/elasticsearch --name=elasticsearch --namespace logging --set client.replicas=1 --set master.replicas=1 --set cluster.env.MINIMUM_MASTER_NODES=1 --set cluster.env.RECOVER_AFTER_MASTER_NODES=1 --set cluster.env.EXPECTED_MASTER_NODES=1 --set data.replicas=1 --set data.heapSize=300m --set master.persistence.storageClass=managed-premium --set data.persistence.storageClass=managed-premium +``` + +- Install fluent-bit (a lightweight fluentD log-forwarder) + +```sh +helm install stable/fluent-bit --name=fluent-bit --namespace=logging --set backend.type=es --set backend.es.host=elasticsearch-client +``` + +- Install kibana + +```sh +helm install stable/kibana --version 3.0.0 --name=kibana --namespace=logging --set env.ELASTICSEARCH_URL=http://elasticsearch-client:9200 --set files."kibana\.yml"."elasticsearch\.hosts"=http://elasticsearch-client:9200 --set service.type=NodePort --set service.nodePort=31000 +``` + +##### Example - Viewing Analyzer Logs + +- Open the Kibana dashboard + +```sh +kubectl -n logging port-forward $(kubectl -n logging get pod -l app=kibana -o jsonpath='{.items[0].metadata.name}') 5601:5601 +``` + +- Open your browser at http://localhost:5601 + +- After initialization of the kibana index, switch to the "Discover" tab and search for presidio-specific logs. + +- Search for 'presidio-analyzer' to view logs generated by the analyzer and different recognizers. 
+ +### Service Level Metrics and Distributed Tracing + +Metrics and tracing provided by kubernetes and by the applications deployed in the cluster are best suited to be exported to a time-series database such as CNCF Prometheus. Shipping of metrics and traces to the database is done using a log forwarder such as FluentD. +When using a service mesh such as istio or linkerd, cluster and service level telemetry are shipped to the database by the mesh using the sidecar containers, which also add a distributed correlation ID to identify the flow of events across services. + +### Using Istio + +##### Enabling Istio on AKS + +To enable istio on AKS, refer to the [aks documentation](https://docs.microsoft.com/en-us/azure/aks/istio-install). +To enable istio on your kubernetes cluster, refer to the official [quick-start guide](https://istio.io/docs/setup/kubernetes/install/kubernetes/) or your specific kubernetes hosting solution guide. + +##### Example - Presidio Service Metrics + +- Make sure presidio namespace is tagged for istio sidecar injection and that presidio is deployed using the istio ingress. + + ```sh + kubectl label namespace presidio istio-injection=enabled + + export REGISTRY=mcr.microsoft.com + export TAG=latest + + helm install --name presidio-demo --set registry=$REGISTRY,tag=$TAG,api.ingress.enabled=true,api.ingress.class=istio charts/presidio --namespace presidio + ``` + +- Open the grafana dashboard + + ```sh + kubectl -n istio-system port-forward $(kubectl -n istio-system get pod -l app=grafana -o jsonpath='{.items[0].metadata.name}') 3000:3000 + ``` + +- Open your browser at http://localhost:3000 + +- Istio's grafana comes with several built-in dashboards, for instance: + + * **[Istio Mesh Dashboard](http://localhost:3000/dashboard/db/istio-mesh-dashboard)** - global view of the Mesh along with services and workloads in the mesh. 
+ * **[Service Dashboards](http://localhost:3000/dashboard/db/istio-service-dashboard)** - metrics for the service and then client workloads (workloads that are calling this service) and service workloads (workloads that are providing this service) for that service. + * **[Workload Dashboards](http://localhost:3000/dashboard/db/istio-workload-dashboard)** - metrics for each workload and then inbound workloads (workloads that are sending requests to this workload) and outbound services (services to which this workload sends requests) for that workload. + + +- Alternatively, open the Prometheus dashboard to query the database directly + +```sh +kubectl -n istio-system port-forward $(kubectl -n istio-system get pod -l app=prometheus -o jsonpath='{.items[0].metadata.name}') 9090:9090 +``` + +- Open your browser at http://localhost:9090 + +- Search Prometheus for presidio containers telemetry + +##### Example - Presidio Service Dependencies + +- Open the [Kiali](https://www.kiali.io/) service mesh observability dashboard + +```sh +kubectl port-forward -n istio-system $(kubectl get pod -n istio-system -l app=kiali -o jsonpath='{.items[0].metadata.name}') 20001:20001 +``` + +- Open your browser at http://localhost:20001/kiali/console/ + +- View workload, application and service health and the dependency graph between presidio services with network and performance KPIs. + + +##### Example - Presidio Distributed Metrics + +- Open the [Jaeger](https://www.jaegertracing.io/) e2e distributed tracing tool + +```sh +kubectl port-forward -n istio-system $(kubectl get pod -n istio-system -l app=jaeger -o jsonpath='{.items[0].metadata.name}') 16686:16686 +``` + +- Open your browser at http://localhost:16686 + +- Note that jaeger has a sample rate of around 1/100, tracing may take time to show. + +- Use the search tab to find specific requests and the dependencies tab to view presidio service relations. 
+ +### Using Linkerd + +##### Enabling Linkerd on AKS + +To enable linkerd on your kubernetes cluster, refer to the official [quick-start guide](https://linkerd.io/2/getting-started/) or your specific kubernetes hosting solution guide. + +##### Example - Presidio Service Mesh Dashboard and Metrics + +- Open the linkerd dashboard + +```sh +linkerd dashboard +``` + +- The browser is opened + +- Linkerd dashboards are built on top of prometheus and provide an overview of services health and KPIs. + grafana is opened when clicking a service for greater insights, featuring the following dashboards: + + * **Top Line Metrics** - "golden" KPIs for top services + * **Deployment Detail** - per deployment KPIs + * **Pod Details** - per pod KPIs diff --git a/pkg/presidio/presidio.go b/pkg/presidio/presidio.go index 5c26d8f60..0f4b1cd31 100644 --- a/pkg/presidio/presidio.go +++ b/pkg/presidio/presidio.go @@ -19,7 +19,7 @@ type ServicesAPI interface { SetupDatasinkService() SetupRecognizerStoreService() SetupCache() cache.Cache - AnalyzeItem(ctx context.Context, text string, template *types.AnalyzeTemplate) ([]*types.AnalyzeResult, error) + AnalyzeItem(ctx context.Context, text string, template *types.AnalyzeTemplate) (*types.AnalyzeResponse, error) AnonymizeItem(ctx context.Context, analyzeResults []*types.AnalyzeResult, text string, anonymizeTemplate *types.AnonymizeTemplate) (*types.AnonymizeResponse, error) AnonymizeImageItem(ctx context.Context, image *types.Image, analyzeResults []*types.AnalyzeResult, diff --git a/pkg/presidio/services/services.go b/pkg/presidio/services/services.go index e09407c41..a79262d43 100644 --- a/pkg/presidio/services/services.go +++ b/pkg/presidio/services/services.go @@ -249,18 +249,18 @@ func (services *Services) GetRecognizersHash( } //AnalyzeItem - search for PII -func (services *Services) AnalyzeItem(ctx context.Context, text string, template *types.AnalyzeTemplate) ([]*types.AnalyzeResult, error) { +func (services *Services) AnalyzeItem(ctx 
context.Context, text string, template *types.AnalyzeTemplate) (*types.AnalyzeResponse, error) { analyzeRequest := &types.AnalyzeRequest{ AnalyzeTemplate: template, Text: text, } - results, err := services.AnalyzerService.Apply(ctx, analyzeRequest) + response, err := services.AnalyzerService.Apply(ctx, analyzeRequest) if err != nil { return nil, err } - return results.AnalyzeResults, nil + return response, nil } //AnonymizeItem - anonymize text diff --git a/pkg/server/server.go b/pkg/server/server.go index f320edf42..23d435cf3 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -10,7 +10,7 @@ import ( "time" "github.com/gin-contrib/cors" - "github.com/gin-contrib/zap" + ginzap "github.com/gin-contrib/zap" "github.com/gin-gonic/gin" "golang.org/x/sync/errgroup" @@ -93,6 +93,17 @@ func WriteResponse( c.JSON(statusCode, responseBody) } +//WriteResponseWithRequestID writes a response and adds a request id header +func WriteResponseWithRequestID( + c *gin.Context, + statusCode int, + requestID string, + responseBody interface{}, +) { + c.Header("X-Correlation-Id", requestID) + WriteResponse(c, statusCode, responseBody) +} + //AbortWithError aborts the request and returns the error in the response body func AbortWithError(c *gin.Context, statusCode int, diff --git a/presidio-analyzer/Dockerfile b/presidio-analyzer/Dockerfile index 4f8cee97a..0373c6b2a 100644 --- a/presidio-analyzer/Dockerfile +++ b/presidio-analyzer/Dockerfile @@ -14,7 +14,7 @@ RUN pipenv install --dev --sequential && \ #---------------------------- -FROM ${REGISTRY}/presidio-python-deps +FROM ${REGISTRY}/presidio-python-deps:${PRESIDIO_DEPS_LABEL} ARG NAME=presidio-analyzer ADD ./${NAME}/analyzer /usr/bin/${NAME}/analyzer diff --git a/presidio-analyzer/VERSION b/presidio-analyzer/VERSION new file mode 100644 index 000000000..8a9ecc2ea --- /dev/null +++ b/presidio-analyzer/VERSION @@ -0,0 +1 @@ +0.0.1 \ No newline at end of file diff --git a/presidio-analyzer/analyzer/__init__.py 
b/presidio-analyzer/analyzer/__init__.py index 46daf1f7a..eb2c386d3 100644 --- a/presidio-analyzer/analyzer/__init__.py +++ b/presidio-analyzer/analyzer/__init__.py @@ -6,6 +6,7 @@ sys.path.append(os.path.dirname(os.path.dirname( os.path.abspath(__file__))) + "/analyzer") +from analyzer.analysis_explanation import AnalysisExplanation # noqa from analyzer.pattern import Pattern # noqa: F401 from analyzer.entity_recognizer import EntityRecognizer # noqa: F401 from analyzer.local_recognizer import LocalRecognizer # noqa: F401 diff --git a/presidio-analyzer/analyzer/__main__.py b/presidio-analyzer/analyzer/__main__.py index 14f6145e7..510f00217 100644 --- a/presidio-analyzer/analyzer/__main__.py +++ b/presidio-analyzer/analyzer/__main__.py @@ -18,6 +18,8 @@ sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) from analyzer_engine import AnalyzerEngine # noqa +from recognizer_registry.recognizer_registry import RecognizerRegistry # noqa +from nlp_engine.spacy_nlp_engine import SpacyNlpEngine # noqa WELCOME_MESSAGE = r""" @@ -58,12 +60,19 @@ def __init__(self, cli_ctx=None): welcome_message=WELCOME_MESSAGE) -def serve_command_handler(env_grpc_port=False, grpc_port=3000): +def serve_command_handler(enable_trace_pii, + env_grpc_port=False, + grpc_port=3000): server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) + registry = RecognizerRegistry() + nlp_engine = SpacyNlpEngine() analyze_pb2_grpc.add_AnalyzeServiceServicer_to_server( - AnalyzerEngine(), server) + AnalyzerEngine(registry=registry, + nlp_engine=nlp_engine, + enable_trace_pii=enable_trace_pii), + server) if env_grpc_port: port = os.environ.get('GRPC_PORT') @@ -108,8 +117,15 @@ def load_command_table(self, args): return super(CommandsLoader, self).load_command_table(args) def load_arguments(self, command): + enable_trace_pii = os.environ.get('ENABLE_TRACE_PII') + if enable_trace_pii is None: + enable_trace_pii = False + with ArgumentsContext(self, 'serve') as ac: 
ac.argument('env_grpc_port', default=False, required=False) + ac.argument('enable_trace_pii', + default=enable_trace_pii, + required=False) ac.argument('grpc_port', default=3001, type=int, required=False) with ArgumentsContext(self, 'analyze') as ac: ac.argument('env_grpc_port', default=False, required=False) diff --git a/presidio-analyzer/analyzer/analysis_explanation.py b/presidio-analyzer/analyzer/analysis_explanation.py new file mode 100644 index 000000000..2c9142469 --- /dev/null +++ b/presidio-analyzer/analyzer/analysis_explanation.py @@ -0,0 +1,51 @@ +class AnalysisExplanation: + + # pylint: disable=too-many-instance-attributes + def __init__(self, recognizer, original_score, pattern_name=None, + pattern=None, validation_result=None, + textual_explanation=None): + """ + AnalysisExplanation is a class that holds tracing information + to explain why PII entities where indentified as such + :param recognizer: name of recognizer that made the decision + :param original_score: recognizer's confidence in result + :param pattern_name: name of pattern + (if decision was made by a PatternRecognizer) + :param pattern: regex pattern that was applied (if PatternRecognizer) + :param validation_result: result of a validation (e.g. 
checksum) + :param textual_explanation: Free text for describing + a decision of a logic or model + """ + + self.recognizer = recognizer + self.pattern_name = pattern_name + self.pattern = pattern + self.original_score = original_score + self.score = original_score + self.textual_explanation = textual_explanation + self.score_context_improvement = 0 + self.supportive_context_word = '' + self.validation_result = validation_result + + def __repr__(self): + return str(self.__dict__) + + def set_improved_score(self, score): + """ Updated the score of the entity and compute the + improvment fromt the original scoree + """ + self.score = score + self.score_context_improvement = self.score - self.original_score + + def set_supportive_context_word(self, word): + """ Sets the context word which helped increase the score + """ + self.supportive_context_word = word + + def append_textual_explanation_line(self, text): + """Appends a new line to textual_explanation field""" + if self.textual_explanation is None: + self.textual_explanation = text + else: + self.textual_explanation = "{}\n{}".format( + self.textual_explanation, text) diff --git a/presidio-analyzer/analyzer/analyze_pb2.py b/presidio-analyzer/analyzer/analyze_pb2.py index 7f5a1aeff..d5d10bec9 100644 --- a/presidio-analyzer/analyzer/analyze_pb2.py +++ b/presidio-analyzer/analyzer/analyze_pb2.py @@ -21,7 +21,7 @@ name='analyze.proto', package='types', syntax='proto3', - serialized_pb=_b('\n\ranalyze.proto\x12\x05types\x1a\x0c\x63ommon.proto\x1a\x0etemplate.proto\"m\n\x11\x41nalyzeApiRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x19\n\x11\x61nalyzeTemplateId\x18\x02 \x01(\t\x12/\n\x0f\x61nalyzeTemplate\x18\x03 \x01(\x0b\x32\x16.types.AnalyzeTemplate\"O\n\x0e\x41nalyzeRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12/\n\x0f\x61nalyzeTemplate\x18\x02 \x01(\x0b\x32\x16.types.AnalyzeTemplate\"?\n\x0f\x41nalyzeResponse\x12,\n\x0e\x61nalyzeResults\x18\x01 
\x03(\x0b\x32\x14.types.AnalyzeResult2J\n\x0e\x41nalyzeService\x12\x38\n\x05\x41pply\x12\x15.types.AnalyzeRequest\x1a\x16.types.AnalyzeResponse\"\x00\x62\x06proto3') + serialized_pb=_b('\n\ranalyze.proto\x12\x05types\x1a\x0c\x63ommon.proto\x1a\x0etemplate.proto\"m\n\x11\x41nalyzeApiRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x19\n\x11\x61nalyzeTemplateId\x18\x02 \x01(\t\x12/\n\x0f\x61nalyzeTemplate\x18\x03 \x01(\x0b\x32\x16.types.AnalyzeTemplate\"O\n\x0e\x41nalyzeRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12/\n\x0f\x61nalyzeTemplate\x18\x02 \x01(\x0b\x32\x16.types.AnalyzeTemplate\"R\n\x0f\x41nalyzeResponse\x12,\n\x0e\x61nalyzeResults\x18\x01 \x03(\x0b\x32\x14.types.AnalyzeResult\x12\x11\n\trequestId\x18\x02 \x01(\t2J\n\x0e\x41nalyzeService\x12\x38\n\x05\x41pply\x12\x15.types.AnalyzeRequest\x1a\x16.types.AnalyzeResponse\"\x00\x62\x06proto3') , dependencies=[common__pb2.DESCRIPTOR,template__pb2.DESCRIPTOR,]) @@ -125,6 +125,13 @@ message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='requestId', full_name='types.AnalyzeResponse.requestId', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), ], extensions=[ ], @@ -138,7 +145,7 @@ oneofs=[ ], serialized_start=246, - serialized_end=309, + serialized_end=328, ) _ANALYZEAPIREQUEST.fields_by_name['analyzeTemplate'].message_type = template__pb2._ANALYZETEMPLATE @@ -178,8 +185,8 @@ file=DESCRIPTOR, index=0, options=None, - serialized_start=311, - serialized_end=385, + serialized_start=330, + serialized_end=404, methods=[ _descriptor.MethodDescriptor( name='Apply', diff --git a/presidio-analyzer/analyzer/analyzer_engine.py b/presidio-analyzer/analyzer/analyzer_engine.py index 3dddf8df1..7157bf14c 100644 --- 
a/presidio-analyzer/analyzer/analyzer_engine.py +++ b/presidio-analyzer/analyzer/analyzer_engine.py @@ -1,51 +1,107 @@ -import logging -import os +import json +import uuid import analyze_pb2 import analyze_pb2_grpc import common_pb2 -from analyzer import RecognizerRegistry -from analyzer.nlp_engine import SpacyNlpEngine - -loglevel = os.environ.get("LOG_LEVEL", "INFO") -logging.basicConfig( - format='%(asctime)s:%(levelname)s:%(message)s', level=loglevel) +from analyzer.logger import Logger +from analyzer.app_tracer import AppTracer DEFAULT_LANGUAGE = "en" +logger = Logger() class AnalyzerEngine(analyze_pb2_grpc.AnalyzeServiceServicer): - def __init__(self, registry=RecognizerRegistry(), - nlp_engine=SpacyNlpEngine()): + def __init__(self, registry=None, nlp_engine=None, + app_tracer=None, enable_trace_pii=False, + default_score_threshold=None): + """ + AnalyzerEngine class: Orchestrating the detection of PII entities + and all related logic + :param registry: instance of type RecognizerRegistry + :param nlp_engine: instance of type NlpEngine + (for example SpacyNlpEngine) + :param app_tracer: instance of type AppTracer, + used to trace the logic used during each request + :param enable_trace_pii: bool, + defines whether PII values should be traced or not. 
+ :param default_score_threshold: Minimum confidence value + for detected entities to be returned + """ + if not nlp_engine: + from analyzer.nlp_engine import SpacyNlpEngine + nlp_engine = SpacyNlpEngine() + if not registry: + from analyzer import RecognizerRegistry + registry = RecognizerRegistry() + if not app_tracer: + app_tracer = AppTracer() + # load nlp module self.nlp_engine = nlp_engine # prepare registry self.registry = registry # load all recognizers - registry.load_predefined_recognizers() + if not registry.recognizers: + registry.load_predefined_recognizers() + + self.app_tracer = app_tracer + self.enable_trace_pii = enable_trace_pii + + if default_score_threshold is None: + self.default_score_threshold = 0 + else: + self.default_score_threshold = default_score_threshold # pylint: disable=unused-argument def Apply(self, request, context): - logging.info("Starting Apply") + """ + GRPC entry point to Presidio-Analyzer + :param request: Presidio Analyzer resuest of type AnalyzeRequest + :param context: + :return: List of [AnalyzeResult] + """ + logger.info("Starting Analyzer's Apply") + entities = AnalyzerEngine.__convert_fields_to_entities( request.analyzeTemplate.fields) language = AnalyzerEngine.get_language_from_request(request) - results = self.analyze(request.text, entities, language, - request.analyzeTemplate.allFields) + + threshold = request.analyzeTemplate.resultsScoreThreshold + all_fields = request.analyzeTemplate.allFields + + # correlation is used to group all traces related to on request + + correlation_id = str(uuid.uuid4()) + results = self.analyze(correlation_id=correlation_id, + text=request.text, + entities=entities, + language=language, + all_fields=all_fields, + score_threshold=threshold, + trace=True) # Create Analyze Response Object response = analyze_pb2.AnalyzeResponse() + response.requestId = correlation_id # pylint: disable=no-member response.analyzeResults.extend( AnalyzerEngine.__convert_results_to_proto(results)) - 
logging.info("Found %d results", len(results)) + + logger.info("Found %d results", len(results)) return response @staticmethod def __remove_duplicates(results): + """ + Removes each result which has a span contained in a + result's span with ahigher score + :param results: List[RecognizerResult] + :return: List[RecognizerResult] + """ # bug# 597: Analyzer remove duplicates doesn't handle all cases of one # result as a substring of the other results = sorted(results, @@ -62,7 +118,8 @@ def __remove_duplicates(results): # If result is equal to or substring of # one of the other results if result.start >= filtered.start \ - and result.end <= filtered.end: + and result.end <= filtered.end \ + and result.entity_type == filtered.entity_type: valid_result = False break @@ -71,6 +128,23 @@ def __remove_duplicates(results): return filtered_results + def __remove_low_scores(self, results, score_threshold=None): + """ + Removes results for which the confidence is lower than the threshold + :param results: List of RecognizerResult + :param score_threshold: float value for minimum possible confidence + :return: List[RecognizerResult] + """ + if score_threshold is None: + score_threshold = self.default_score_threshold + + new_results = [] + for result in results: + if result.score >= score_threshold: + new_results.append(result) + + return new_results + @classmethod def get_language_from_request(cls, request): language = request.analyzeTemplate.language @@ -78,21 +152,27 @@ def get_language_from_request(cls, request): language = DEFAULT_LANGUAGE return language - def analyze(self, text, entities, language, all_fields): + def analyze(self, correlation_id, text, entities, language, all_fields, + score_threshold=None, trace=False): """ analyzes the requested text, searching for the given entities in the given language + :param correlation_id: cross call ID for this request :param text: the text to analyze :param entities: the text to search :param language: the language of the text 
:param all_fields: a Flag to return all fields of the requested language + :param score_threshold: A minimum value for which + to return an identified entity + :param trace: Should tracing of the response occur or not :return: an array of the found entities in the text """ - recognizers = self.registry.get_recognizers(language=language, - entities=entities, - all_fields=all_fields) + recognizers = self.registry.get_recognizers( + language=language, + entities=entities, + all_fields=all_fields) if all_fields: if entities: @@ -108,6 +188,11 @@ def analyze(self, text, entities, language, all_fields): # run the nlp pipeline over the given text, store the results in # a NlpArtifacts instance nlp_artifacts = self.nlp_engine.process_text(text, language) + + if self.enable_trace_pii and trace: + self.app_tracer.trace(correlation_id, "nlp artifacts:" + + nlp_artifacts.to_json()) + results = [] for recognizer in recognizers: # Lazy loading of the relevant recognizers @@ -120,10 +205,24 @@ def analyze(self, text, entities, language, all_fields): if current_results: results.extend(current_results) - return AnalyzerEngine.__remove_duplicates(results) + if trace: + self.app_tracer.trace(correlation_id, json.dumps( + [result.to_json() for result in results])) + + # Remove duplicates or low score results + results = AnalyzerEngine.__remove_duplicates(results) + results = self.__remove_low_scores(results, score_threshold) + + return results @staticmethod def __list_entities(recognizers): + """ + Returns a List[str] of unique entity names supported + by the provided recognizers + :param recognizers: list of EntityRecognizer + :return: List[str] + """ entities = [] for recognizer in recognizers: ents = [entity for entity in recognizer.supported_entities] @@ -133,15 +232,20 @@ def __list_entities(recognizers): @staticmethod def __convert_fields_to_entities(fields): - # Convert fields to entities - will be changed once the API - # will be changed - entities = [] - for field in fields: 
- entities.append(field.name) - return entities + """ + Converts the Field object to the name of the entity + :param fields: List of Fields in AnalyzeTemplate + :return: List[str] with field names + """ + return [field.name for field in fields] @staticmethod def __convert_results_to_proto(results): + """ + Converts a List[RecognizerResult] to List[AnalyzeResult] + :param results: List[RecognizerResult] + :return: List[AnalyzeResult] + """ proto_results = [] for result in results: res = common_pb2.AnalyzeResult() diff --git a/presidio-analyzer/analyzer/app_tracer.py b/presidio-analyzer/analyzer/app_tracer.py new file mode 100644 index 000000000..0154eaeaf --- /dev/null +++ b/presidio-analyzer/analyzer/app_tracer.py @@ -0,0 +1,25 @@ +from analyzer.logger import Logger + + +class AppTracer: + """This class provides the ability to log/trace the system's decisions, + such as which modules were used for detection, + which logic was utilized, what results were given and potentially why. + This can be useful for analyzing the detection accuracy of the system.""" + def __init__(self, enabled=True): + + self.logger = Logger('Interpretability') + self.logger.set_level("INFO") + self.enabled = enabled + + def trace(self, request_id, trace_data): + """ + Writes a value associated with a decision + for a specific request into the trace, + for further inspection if needed. + :param request_id: A unique ID, to correlate across calls. + :param trace_data: A string to write. 
+ :return: + """ + if self.enabled: + self.logger.info("[%s][%s]", request_id, trace_data) diff --git a/presidio-analyzer/analyzer/entity_recognizer.py b/presidio-analyzer/analyzer/entity_recognizer.py index 1f4dae7ef..49a06d5dc 100644 --- a/presidio-analyzer/analyzer/entity_recognizer.py +++ b/presidio-analyzer/analyzer/entity_recognizer.py @@ -1,15 +1,15 @@ -import logging -import os from abc import abstractmethod import copy +from analyzer.logger import Logger + class EntityRecognizer: MIN_SCORE = 0 MAX_SCORE = 1.0 CONTEXT_SIMILARITY_THRESHOLD = 0.65 CONTEXT_SIMILARITY_FACTOR = 0.35 - MIN_SCORE_WITH_CONTEXT_SIMILARITY = 0.6 + MIN_SCORE_WITH_CONTEXT_SIMILARITY = 0.4 CONTEXT_PREFIX_COUNT = 5 CONTEXT_SUFFIX_COUNT = 0 @@ -36,11 +36,9 @@ def __init__(self, supported_entities, name=None, supported_language="en", self.version = version self.is_loaded = False - loglevel = os.environ.get("LOG_LEVEL", "INFO") - self.logger = logging.getLogger(__name__) - self.logger.setLevel(loglevel) + self.logger = Logger() self.load() - logging.info("Loaded recognizer: %s", self.name) + self.logger.info("Loaded recognizer: %s", self.name) self.is_loaded = True @abstractmethod @@ -97,7 +95,7 @@ def from_dict(cls, entity_recognizer_dict): return cls(**entity_recognizer_dict) def enhance_using_context(self, text, raw_results, - nlp_artifacts, predefined_context_words): + nlp_artifacts, recognizer_context_words): """ using the surrounding words of the actual word matches, look for specific strings that if found contribute to the score of the result, improving the confidence that the match is @@ -109,7 +107,7 @@ def enhance_using_context(self, text, raw_results, :param nlp_artifacts: The nlp artifacts contains elements such as lemmatized tokens for better accuracy of the context enhancement process - :param predefined_context_words: The words the current recognizer + :param recognizer_context_words: The words the current recognizer supports (words to lookup) """ # create a deep copy of the 
results object so we can manipulate it @@ -120,79 +118,82 @@ def enhance_using_context(self, text, raw_results, self.logger.warning('[%s]. NLP artifacts were not provided', self.name) return results - if predefined_context_words is None or predefined_context_words == []: + if recognizer_context_words is None or recognizer_context_words == []: self.logger.info("recognizer '%s' does not support context " "enhancement", self.name) return results for result in results: - # extract lemmatized context from the surronding of the match - context = self.__extract_context( + # extract lemmatized context from the surrounding of the match + + word = text[result.start:result.end] + + surrounding_words = self.__extract_surrounding_words( nlp_artifacts=nlp_artifacts, - word=text[result.start:result.end], + word=word, start=result.start) - context_similarity = self.__calculate_context_similarity( - context, predefined_context_words) - if context_similarity >= \ - self.CONTEXT_SIMILARITY_THRESHOLD: + supportive_context_word = self.__find_supportive_word_in_context( + surrounding_words, recognizer_context_words) + if supportive_context_word != "": result.score += \ - context_similarity * self.CONTEXT_SIMILARITY_FACTOR + self.CONTEXT_SIMILARITY_FACTOR result.score = max( result.score, self.MIN_SCORE_WITH_CONTEXT_SIMILARITY) result.score = min( result.score, EntityRecognizer.MAX_SCORE) + + # Update the explainability object with context information + # helped improving the score + result.analysis_explanation.set_supportive_context_word( + supportive_context_word) + result.analysis_explanation.set_improved_score(result.score) return results @staticmethod def __context_to_keywords(context): return context.split(' ') - def __calculate_context_similarity(self, - context_text, - context_list): - """Context similarity is 1 if there's exact match between a keyword in + def __find_supportive_word_in_context(self, + context_list, + recognizer_context_list): + """A word is considered a 
supportive context word if + there's exact match between a keyword in context_text and any keyword in context_list - :param context_text words before and after the matched enitity within + :param context_list words before and after the matched entity within a specified window size - :param context_list a list of words considered as context keywords - manually specified by the recognizer's author + :param recognizer_context_list a list of words considered as + context keywords manually specified by the recognizer's author """ + word = "" # If the context list is empty, no need to continue - if context_list is None: - return 0 + if context_list is None or recognizer_context_list is None: + return word - # Take the context text and break it into individual keywords - lemmatized_keywords = self.__context_to_keywords(context_text) - if lemmatized_keywords is None: - return 0 - - similarity = 0.0 - for predefined_context_word in context_list: + for predefined_context_word in recognizer_context_list: # result == true only if any of the predefined context words # is found exactly or as a substring in any of the collected # context words result = \ - next((True for keyword in lemmatized_keywords + next((True for keyword in context_list if predefined_context_word in keyword), False) if result: self.logger.debug("Found context keyword '%s'", predefined_context_word) - similarity = 1 + word = predefined_context_word break - return similarity + return word @staticmethod def __add_n_words(index, n_words, lemmas, lemmatized_filtered_keywords, - prefix, is_backward): """ Prepare a string of context words, which surrounds a lemma at a given index. 
The words will be collected only if exist @@ -201,14 +202,14 @@ def __add_n_words(index, :param index: index of the lemma that its surrounding words we want :param n_words: number of words to take :param lemmas: array of lemmas - :param lemmatized_filtered_keywords: the array of filter - lemmas, - :param prefix: string to be attached to the results as a prefix + :param lemmatized_filtered_keywords: the array of filtered + lemmas from the original sentence, :param is_backward: if true take the preceeding words, if false, take the successing words """ i = index - # The entity itself is no intrest to us...however we want to + context_words = [] + # The entity itself is no interest to us...however we want to # consider it anyway for cases were it is attached with no spaces # to an interesting context word, so we allow it and add 1 to # the number of collected words @@ -218,38 +219,33 @@ def __add_n_words(index, while 0 <= i < len(lemmas) and remaining > 0: lower_lemma = lemmas[i].lower() if lower_lemma in lemmatized_filtered_keywords: + context_words.append(lower_lemma) remaining -= 1 - prefix += ' ' + lower_lemma - i = i-1 if is_backward else i+1 - return prefix + return context_words def __add_n_words_forward(self, index, n_words, lemmas, - lemmatized_filtered_keywords, - prefix): + lemmatized_filtered_keywords): return self.__add_n_words( index, n_words, lemmas, lemmatized_filtered_keywords, - prefix, False) def __add_n_words_backward(self, index, n_words, lemmas, - lemmatized_filtered_keywords, - prefix): + lemmatized_filtered_keywords): return self. 
__add_n_words( index, n_words, lemmas, lemmatized_filtered_keywords, - prefix, True) @staticmethod @@ -276,16 +272,16 @@ def find_index_of_match_token(word, start, tokens, tokens_indices): if not found: raise ValueError("Did not find word '" + word + "' " - "in the list of tokens altough it " + "in the list of tokens although it " "is expected to be found") return i - def __extract_context(self, nlp_artifacts, word, start): - """ Extracts words surronding another given word. + def __extract_surrounding_words(self, nlp_artifacts, word, start): + """ Extracts words surrounding another given word. The text from which the context is extracted is given in the nlp doc :param nlp_artifacts: An abstraction layer which holds different - items which are result of a NLP pipeline + items which are the result of a NLP pipeline execution on a given text :param word: The word to look for context around :param start: The start index of the word in the original text @@ -313,20 +309,22 @@ def __extract_context(self, nlp_artifacts, word, start): nlp_artifacts.tokens_indices) # index i belongs to the PII entity, take the preceding n words - # and the successing m words into a context string - context_str = '' - context_str = \ + # and the successing m words into a context list + + backward_context = \ self.__add_n_words_backward(token_index, EntityRecognizer.CONTEXT_PREFIX_COUNT, nlp_artifacts.lemmas, - lemmatized_keywords, - context_str) - context_str = \ + lemmatized_keywords) + forward_context = \ self.__add_n_words_forward(token_index, EntityRecognizer.CONTEXT_SUFFIX_COUNT, nlp_artifacts.lemmas, - lemmatized_keywords, - context_str) - - self.logger.debug('Context sentence is: %s', context_str) - return context_str + lemmatized_keywords) + + context_list = [] + context_list.extend(backward_context) + context_list.extend(forward_context) + context_list = list(set(context_list)) + self.logger.debug('Context list is: %s', " ".join(context_list)) + return context_list diff --git 
a/presidio-analyzer/analyzer/logger.py b/presidio-analyzer/analyzer/logger.py new file mode 100644 index 000000000..12598ca61 --- /dev/null +++ b/presidio-analyzer/analyzer/logger.py @@ -0,0 +1,80 @@ +import logging +import os + + +class Logger: + """A wrapper class for logger""" + def __init__(self, logger_name=None): + if logger_name: + logger = logging.getLogger(logger_name) + else: + logger = logging.getLogger() + + if not logger.handlers: + loglevel = os.environ.get("LOG_LEVEL", "INFO") + ch = logging.StreamHandler() + formatter = logging.Formatter( + '[%(asctime)s][%(name)s][%(levelname)s]%(message)s') + ch.setFormatter(formatter) + logger.addHandler(ch) + logger.setLevel(loglevel) + + self.__logger = logger + + def set_level(self, level): + self.__logger.setLevel(level) + + def debug(self, msg, *args, **kwargs): + """ + Log 'msg % args' with severity 'DEBUG'. + + To pass exception information, use the keyword argument exc_info with + a true value, e.g. + + logger.debug("Houston, we have a %s", "thorny problem", exc_info=1) + """ + self.__logger.debug(msg, *args, **kwargs) + + def info(self, msg, *args, **kwargs): + """ + Log 'msg % args' with severity 'INFO'. + + To pass exception information, use the keyword argument exc_info with + a true value, e.g. + + logger.info("Houston, we have a %s", "interesting problem", exc_info=1) + """ + self.__logger.info(msg, *args, **kwargs) + + def warning(self, msg, *args, **kwargs): + """ + Log 'msg % args' with severity 'WARNING'. + + To pass exception information, use the keyword argument exc_info with + a true value, e.g. + + logger.warning("Houston, we have a %s", "bit of a problem", exc_info=1) + """ + self.__logger.warning(msg, *args, **kwargs) + + def error(self, msg, *args, **kwargs): + """ + Log 'msg % args' with severity 'ERROR'. + + To pass exception information, use the keyword argument exc_info with + a true value, e.g. 
+ + logger.error("Houston, we have a %s", "major problem", exc_info=1) + """ + self.__logger.error(msg, *args, **kwargs) + + def critical(self, msg, *args, **kwargs): + """ + Log 'msg % args' with severity 'CRITICAL'. + + To pass exception information, use the keyword argument exc_info with + a true value, e.g. + + logger.critical("Houston, we have a %s", "major disaster", exc_info=1) + """ + self.__logger.critical(msg, *args, **kwargs) diff --git a/presidio-analyzer/analyzer/nlp_engine/nlp_artifacts.py b/presidio-analyzer/analyzer/nlp_engine/nlp_artifacts.py index ed18ff1ee..276bc2414 100644 --- a/presidio-analyzer/analyzer/nlp_engine/nlp_artifacts.py +++ b/presidio-analyzer/analyzer/nlp_engine/nlp_artifacts.py @@ -33,3 +33,6 @@ def set_keywords(nlp_engine, lemmas, language): keywords = \ [item for sublist in keywords for item in sublist] return keywords + + def to_json(self): + return str(self.__dict__) diff --git a/presidio-analyzer/analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/analyzer/nlp_engine/spacy_nlp_engine.py index f94782cc5..f30685a1b 100644 --- a/presidio-analyzer/analyzer/nlp_engine/spacy_nlp_engine.py +++ b/presidio-analyzer/analyzer/nlp_engine/spacy_nlp_engine.py @@ -1,9 +1,9 @@ -import logging -import os - import spacy +from spacy.cli import download +from analyzer.logger import Logger from analyzer.nlp_engine import NlpArtifacts, NlpEngine +logger = Logger() class SpacyNlpEngine(NlpEngine): @@ -14,11 +14,10 @@ class SpacyNlpEngine(NlpEngine): """ def __init__(self): - loglevel = os.environ.get("LOG_LEVEL", "INFO") - self.logger = logging.getLogger(__name__) - self.logger.setLevel(loglevel) + logger.info("Loading NLP model...") - self.logger.info("Loading NLP model...") + # Download model lazily if it wasn't previously installed + download('en_core_web_lg') self.nlp = {"en": spacy.load("en_core_web_lg", disable=['parser', 'tagger'])} diff --git a/presidio-analyzer/analyzer/pattern_recognizer.py 
b/presidio-analyzer/analyzer/pattern_recognizer.py index 9dabf2e80..0b0cb8637 100644 --- a/presidio-analyzer/analyzer/pattern_recognizer.py +++ b/presidio-analyzer/analyzer/pattern_recognizer.py @@ -3,7 +3,8 @@ from analyzer import LocalRecognizer, \ Pattern, \ RecognizerResult, \ - EntityRecognizer + EntityRecognizer, \ + AnalysisExplanation # Import 're2' regex engine if installed, if not- import 'regex' try: @@ -83,21 +84,31 @@ def __black_list_to_regex(black_list): regex = r"(?:^|(?<= ))(" + '|'.join(black_list) + r")(?:(?= )|$)" return Pattern(name="black_list", regex=regex, score=1.0) - # pylint: disable=unused-argument, no-self-use - def validate_result(self, pattern_text, pattern_result): + # pylint: disable=unused-argument, no-self-use, assignment-from-none + def validate_result(self, pattern_text): """ Validates the pattern logic, for example by running checksum on a detected pattern. :param pattern_text: the text to validated. Only the part in text that was detected by the regex engine - :param pattern_result: The output of a specific pattern - detector that needs to be validated - :return: the updated result of the pattern. - For example, if a validation logic increased or decreased the score - that was given by a regex pattern. + :return: A bool indicating whether the validation was successful. 
""" - return pattern_result + return None + + @staticmethod + def build_regex_explanation( + recognizer_name, + pattern_name, + pattern, + original_score, + validation_result): + explanation = AnalysisExplanation(recognizer=recognizer_name, + original_score=original_score, + pattern_name=pattern_name, + pattern=pattern, + validation_result=validation_result) + return explanation def __analyze_patterns(self, text): """ @@ -130,11 +141,29 @@ def __analyze_patterns(self, text): score = pattern.score - res = RecognizerResult(self.supported_entities[0], start, end, - score) - res = self.validate_result(current_match, res) - if res and res.score > EntityRecognizer.MIN_SCORE: - results.append(res) + validation_result = self.validate_result(current_match) + description = PatternRecognizer.build_regex_explanation( + self.name, + pattern.name, + pattern.regex, + score, + validation_result + ) + pattern_result = RecognizerResult( + self.supported_entities[0], + start, + end, + score, + description) + + if validation_result is not None: + if validation_result: + pattern_result.score = EntityRecognizer.MAX_SCORE + else: + pattern_result.score = EntityRecognizer.MIN_SCORE + + if pattern_result.score > EntityRecognizer.MIN_SCORE: + results.append(pattern_result) return results diff --git a/presidio-analyzer/analyzer/predefined_recognizers/credit_card_recognizer.py b/presidio-analyzer/analyzer/predefined_recognizers/credit_card_recognizer.py index 1b5312e25..aa3654243 100644 --- a/presidio-analyzer/analyzer/predefined_recognizers/credit_card_recognizer.py +++ b/presidio-analyzer/analyzer/predefined_recognizers/credit_card_recognizer.py @@ -1,6 +1,5 @@ from analyzer import Pattern from analyzer import PatternRecognizer -from analyzer.entity_recognizer import EntityRecognizer # pylint: disable=line-too-long REGEX = r'\b((4\d{3})|(5[0-5]\d{2})|(6\d{3})|(1\d{3})|(3\d{3}))[- ]?(\d{3,4})[- ]?(\d{3,4})[- ]?(\d{3,5})\b' # noqa: E501 @@ -30,15 +29,11 @@ def __init__(self): 
super().__init__(supported_entity="CREDIT_CARD", patterns=patterns, context=CONTEXT) - def validate_result(self, pattern_text, pattern_result): + def validate_result(self, pattern_text): sanitized_value = CreditCardRecognizer.__sanitize_value(pattern_text) - res = CreditCardRecognizer.__luhn_checksum(sanitized_value) - if res == 0: - pattern_result.score = EntityRecognizer.MAX_SCORE - else: - pattern_result.score = EntityRecognizer.MIN_SCORE + checksum = CreditCardRecognizer.__luhn_checksum(sanitized_value) - return pattern_result + return checksum == 0 @staticmethod def __luhn_checksum(sanitized_value): diff --git a/presidio-analyzer/analyzer/predefined_recognizers/crypto_recognizer.py b/presidio-analyzer/analyzer/predefined_recognizers/crypto_recognizer.py index a6e7b4cf1..69c57cf3f 100644 --- a/presidio-analyzer/analyzer/predefined_recognizers/crypto_recognizer.py +++ b/presidio-analyzer/analyzer/predefined_recognizers/crypto_recognizer.py @@ -1,7 +1,6 @@ from hashlib import sha256 from analyzer import Pattern from analyzer import PatternRecognizer -from analyzer.entity_recognizer import EntityRecognizer # Copied from: # http://rosettacode.org/wiki/Bitcoin/address_validation#Python @@ -19,12 +18,12 @@ def __init__(self): super().__init__(supported_entity="CRYPTO", patterns=patterns, context=CONTEXT) - def validate_result(self, pattern_text, pattern_result): + def validate_result(self, pattern_text): # try: bcbytes = CryptoRecognizer.__decode_base58(pattern_text, 25) - if bcbytes[-4:] == sha256(sha256(bcbytes[:-4]).digest()).digest()[:4]: - pattern_result.score = EntityRecognizer.MAX_SCORE - return pattern_result + result = bcbytes[-4:] == sha256(sha256(bcbytes[:-4]) + .digest()).digest()[:4] + return result @staticmethod def __decode_base58(bc, length): diff --git a/presidio-analyzer/analyzer/predefined_recognizers/domain_recognizer.py b/presidio-analyzer/analyzer/predefined_recognizers/domain_recognizer.py index 94c553ece..1a578c330 100644 --- 
a/presidio-analyzer/analyzer/predefined_recognizers/domain_recognizer.py +++ b/presidio-analyzer/analyzer/predefined_recognizers/domain_recognizer.py @@ -2,7 +2,6 @@ from analyzer import Pattern from analyzer import PatternRecognizer -from analyzer.entity_recognizer import EntityRecognizer # pylint: disable=line-too-long REGEX = r'\b(((([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,86}[a-zA-Z0-9]))\.(([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,73}[a-zA-Z0-9]))\.(([a-zA-Z0-9]{2,12}\.[a-zA-Z0-9]{2,12})|([a-zA-Z0-9]{2,25})))|((([a-zA-Z0-9])|([a-zA-Z0-9][a-zA-Z0-9\-]{0,162}[a-zA-Z0-9]))\.(([a-zA-Z0-9]{2,12}\.[a-zA-Z0-9]{2,12})|([a-zA-Z0-9]{2,25}))))\b' # noqa: E501' # noqa: E501 @@ -19,10 +18,6 @@ def __init__(self): super().__init__(supported_entity="DOMAIN_NAME", patterns=patterns, context=CONTEXT) - def validate_result(self, pattern_text, pattern_result): + def validate_result(self, pattern_text): result = tldextract.extract(pattern_text) - if result.fqdn != '': - pattern_result.score = EntityRecognizer.MAX_SCORE - else: - pattern_result.score = EntityRecognizer.MIN_SCORE - return pattern_result + return result.fqdn != '' diff --git a/presidio-analyzer/analyzer/predefined_recognizers/email_recognizer.py b/presidio-analyzer/analyzer/predefined_recognizers/email_recognizer.py index 444dacbf2..5925608a4 100644 --- a/presidio-analyzer/analyzer/predefined_recognizers/email_recognizer.py +++ b/presidio-analyzer/analyzer/predefined_recognizers/email_recognizer.py @@ -2,7 +2,6 @@ from analyzer import Pattern from analyzer import PatternRecognizer -from analyzer.entity_recognizer import EntityRecognizer # pylint: disable=line-too-long REGEX = r"\b((([!#$%&'*+\-/=?^_`{|}~\w])|([!#$%&'*+\-/=?^_`{|}~\w][!#$%&'*+\-/=?^_`{|}~\.\w]{0,}[!#$%&'*+\-/=?^_`{|}~\w]))[@]\w+([-.]\w+)*\.\w+([-.]\w+)*)\b" # noqa: E501 @@ -19,11 +18,6 @@ def __init__(self): super().__init__(supported_entity="EMAIL_ADDRESS", patterns=patterns, context=CONTEXT) - def validate_result(self, pattern_text, 
pattern_result): + def validate_result(self, pattern_text): result = tldextract.extract(pattern_text) - - if result.fqdn != '': - pattern_result.score = EntityRecognizer.MAX_SCORE - else: - pattern_result.score = EntityRecognizer.MIN_SCORE - return pattern_result + return result.fqdn != '' diff --git a/presidio-analyzer/analyzer/predefined_recognizers/iban_recognizer.py b/presidio-analyzer/analyzer/predefined_recognizers/iban_recognizer.py index 817e659b7..4a39c102d 100644 --- a/presidio-analyzer/analyzer/predefined_recognizers/iban_recognizer.py +++ b/presidio-analyzer/analyzer/predefined_recognizers/iban_recognizer.py @@ -2,7 +2,6 @@ from analyzer.predefined_recognizers.iban_patterns import regex_per_country from analyzer import Pattern, PatternRecognizer -from analyzer.entity_recognizer import EntityRecognizer # Import 're2' regex engine if installed, if not- import 'regex' try: @@ -33,19 +32,18 @@ def __init__(self): patterns=patterns, context=CONTEXT) - def validate_result(self, pattern_text, pattern_result): + def validate_result(self, pattern_text): pattern_text = pattern_text.replace(' ', '') is_valid_checksum = (IbanRecognizer.__generate_iban_check_digits( pattern_text) == pattern_text[2:4]) - - score = EntityRecognizer.MIN_SCORE + # score = EntityRecognizer.MIN_SCORE + result = False if is_valid_checksum: if IbanRecognizer.__is_valid_format(pattern_text): - score = EntityRecognizer.MAX_SCORE + result = True elif IbanRecognizer.__is_valid_format(pattern_text.upper()): - score = IBAN_GENERIC_SCORE - pattern_result.score = score - return pattern_result + result = None + return result @staticmethod def __number_iban(iban): diff --git a/presidio-analyzer/analyzer/predefined_recognizers/spacy_recognizer.py b/presidio-analyzer/analyzer/predefined_recognizers/spacy_recognizer.py index 0f9dac2a3..213221145 100644 --- a/presidio-analyzer/analyzer/predefined_recognizers/spacy_recognizer.py +++ b/presidio-analyzer/analyzer/predefined_recognizers/spacy_recognizer.py 
@@ -1,7 +1,9 @@ -from analyzer import RecognizerResult, LocalRecognizer +from analyzer import RecognizerResult, LocalRecognizer, AnalysisExplanation NER_STRENGTH = 0.85 SUPPORTED_ENTITIES = ["DATE_TIME", "NRP", "LOCATION", "PERSON"] +SPACY_DEFAULT_EXPLANATION = \ + "Identified as {} by Spacy's Named Entity Recognition" class SpacyRecognizer(LocalRecognizer): @@ -15,7 +17,16 @@ def load(self): # preprocessed nlp artifacts pass + @staticmethod + def build_spacy_explanation(recognizer_name, original_score, entity): + explanation = AnalysisExplanation( + recognizer=recognizer_name, + original_score=original_score, + textual_explanation=SPACY_DEFAULT_EXPLANATION.format(entity)) + return explanation + # pylint: disable=unused-argument + def analyze(self, text, entities, nlp_artifacts=None): results = [] if not nlp_artifacts: @@ -29,9 +40,14 @@ def analyze(self, text, entities, nlp_artifacts=None): if entity in self.supported_entities: for ent in ner_entities: if SpacyRecognizer.__check_label(entity, ent.label_): - results.append( - RecognizerResult(entity, ent.start_char, - ent.end_char, NER_STRENGTH)) + explanation = SpacyRecognizer.build_spacy_explanation( + self.__class__.__name__, + NER_STRENGTH, + ent.label_) + spacy_result = RecognizerResult( + entity, ent.start_char, + ent.end_char, NER_STRENGTH, explanation) + results.append(spacy_result) return results diff --git a/presidio-analyzer/analyzer/predefined_recognizers/uk_nhs_recognizer.py b/presidio-analyzer/analyzer/predefined_recognizers/uk_nhs_recognizer.py index 2b3dc287d..0aa5fd8e9 100644 --- a/presidio-analyzer/analyzer/predefined_recognizers/uk_nhs_recognizer.py +++ b/presidio-analyzer/analyzer/predefined_recognizers/uk_nhs_recognizer.py @@ -18,9 +18,8 @@ def __init__(self): super().__init__(supported_entity="UK_NHS", patterns=patterns, context=CONTEXT) - def validate_result(self, pattern_text, pattern_result): + def validate_result(self, pattern_text): text = NhsRecognizer.__sanitize_value(pattern_text) - 
multiplier = 10 total = 0 for c in text: @@ -31,8 +30,7 @@ def validate_result(self, pattern_text, pattern_result): remainder = total % 11 check_digit = 11 - remainder - pattern_result.score = 1.0 if check_digit == 11 else 0 - return pattern_result + return check_digit == 11 @staticmethod def __sanitize_value(text): diff --git a/presidio-analyzer/analyzer/predefined_recognizers/us_driver_license_recognizer.py b/presidio-analyzer/analyzer/predefined_recognizers/us_driver_license_recognizer.py index d671d81d0..77002569c 100644 --- a/presidio-analyzer/analyzer/predefined_recognizers/us_driver_license_recognizer.py +++ b/presidio-analyzer/analyzer/predefined_recognizers/us_driver_license_recognizer.py @@ -14,13 +14,12 @@ # pylint: disable=line-too-long,abstract-method WA_WEAK_REGEX = r'\b((?=.*\d)([A-Z][A-Z0-9*]{11})|(?=.*\*)([A-Z][A-Z0-9*]{11}))\b' # noqa: E501 -WA_VERY_WEAK_REGEX = r'\b([A-Z]{12})\b' ALPHANUMERIC_REGEX = r'\b([A-Z][0-9]{3,6}|[A-Z][0-9]{5,9}|[A-Z][0-9]{6,8}|[A-Z][0-9]{4,8}|[A-Z][0-9]{9,11}|[A-Z]{1,2}[0-9]{5,6}|H[0-9]{8}|V[0-9]{6}|X[0-9]{8}|A-Z]{2}[0-9]{2,5}|[A-Z]{2}[0-9]{3,7}|[0-9]{2}[A-Z]{3}[0-9]{5,6}|[A-Z][0-9]{13,14}|[A-Z][0-9]{18}|[A-Z][0-9]{6}R|[A-Z][0-9]{9}|[A-Z][0-9]{1,12}|[0-9]{9}[A-Z]|[A-Z]{2}[0-9]{6}[A-Z]|[0-9]{8}[A-Z]{2}|[0-9]{3}[A-Z]{2}[0-9]{4}|[A-Z][0-9][A-Z][0-9][A-Z]|[0-9]{7,8}[A-Z])\b' # noqa: E501 -DIGITS_REGEX = r'\b([0-9]{1,9}|[0-9]{4,10}|[0-9]{6,10}|[0-9]{1,12}|[0-9]{12,14}|[0-9]{16})\b' # noqa: E501 +DIGITS_REGEX = r'\b([0-9]{6,14}|[0-9]{16})\b' # noqa: E501 LICENSE_CONTEXT = [ - "driver", "license", "permit", "id", "lic", "identification", "card", - "cards", "dl", "dls", "cdls", "id", "lic#" + "driver", "license", "permit", "lic", "identification", + "dl", "dls", "cdls", "id", "lic#", "driving" ] @@ -31,8 +30,6 @@ class UsLicenseRecognizer(PatternRecognizer): def __init__(self): patterns = [Pattern('Driver License - WA (weak) ', WA_WEAK_REGEX, 0.4), - Pattern('Driver License - WA (very weak) ', - WA_VERY_WEAK_REGEX, 0.01), 
Pattern('Driver License - Alphanumeric (weak) ', ALPHANUMERIC_REGEX, 0.3), Pattern('Driver License - Digits (very weak)', diff --git a/presidio-analyzer/analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/analyzer/recognizer_registry/recognizer_registry.py index 28325ff9e..776f30a78 100644 --- a/presidio-analyzer/analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/analyzer/recognizer_registry/recognizer_registry.py @@ -58,7 +58,8 @@ def load_predefined_recognizers(self): UsPhoneRecognizer(), UsSsnRecognizer(), SpacyRecognizer()]) - def get_recognizers(self, language, entities=None, all_fields=False): + def get_recognizers(self, language, entities=None, + all_fields=False): """ Returns a list of the recognizer, which supports the specified name and language. @@ -94,7 +95,8 @@ def get_recognizers(self, language, entities=None, all_fields=False): if not subset: logging.warning("Entity %s doesn't have the corresponding" " recognizer in language : %s", - entity, language) + entity, + language) else: to_return.extend(subset) diff --git a/presidio-analyzer/analyzer/recognizer_result.py b/presidio-analyzer/analyzer/recognizer_result.py index 4667dcbfc..6261451c1 100644 --- a/presidio-analyzer/analyzer/recognizer_result.py +++ b/presidio-analyzer/analyzer/recognizer_result.py @@ -1,6 +1,10 @@ +from . import AnalysisExplanation + + class RecognizerResult: - def __init__(self, entity_type, start, end, score): + def __init__(self, entity_type, start, end, score, + analysis_explanation: AnalysisExplanation = None): """ Recognizer Result represents the findings of the detected entity of the analyzer in the text. 
@@ -8,8 +12,18 @@ def __init__(self, entity_type, start, end, score): :param start: the start location of the detected entity :param end: the end location of the detected entity :param score: the score of the detection + :param analysis_explanation: contains the explanation of why this + entity was identified """ self.entity_type = entity_type self.start = start self.end = end self.score = score + self.analysis_explanation = analysis_explanation + + def append_analysis_explenation_text(self, text): + if self.analysis_explanation: + self.analysis_explanation.append_textual_explanation_line(text) + + def to_json(self): + return str(self.__dict__) diff --git a/presidio-analyzer/analyzer/template_pb2.py b/presidio-analyzer/analyzer/template_pb2.py index 2bf704b0b..4d726d78b 100644 --- a/presidio-analyzer/analyzer/template_pb2.py +++ b/presidio-analyzer/analyzer/template_pb2.py @@ -7,6 +7,7 @@ from google.protobuf import message as _message from google.protobuf import reflection as _reflection from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() @@ -19,8 +20,7 @@ name='template.proto', package='types', syntax='proto3', - serialized_options=None, - serialized_pb=_b('\n\x0etemplate.proto\x12\x05types\x1a\x0c\x63ommon.proto\"\x98\x01\n\x0f\x41nalyzeTemplate\x12!\n\x06\x66ields\x18\x01 \x03(\x0b\x32\x11.types.FieldTypes\x12\x11\n\tallFields\x18\x02 \x01(\x08\x12\x13\n\x0b\x64\x65scription\x18\x03 \x01(\t\x12\x12\n\ncreateTime\x18\x04 \x01(\t\x12\x14\n\x0cmodifiedTime\x18\x05 \x01(\t\x12\x10\n\x08language\x18\x06 \x01(\t\"\xca\x01\n\x11\x41nonymizeTemplate\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12\x12\n\ncreateTime\x18\x02 \x01(\t\x12\x14\n\x0cmodifiedTime\x18\x03 \x01(\t\x12@\n\x18\x66ieldTypeTransformations\x18\x04 \x03(\x0b\x32\x1e.types.FieldTypeTransformation\x12\x34\n\x15\x64\x65\x66\x61ultTransformation\x18\x05 
\x01(\x0b\x32\x15.types.Transformation\"g\n\x12JsonSchemaTemplate\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12\x12\n\ncreateTime\x18\x02 \x01(\t\x12\x14\n\x0cmodifiedTime\x18\x03 \x01(\t\x12\x12\n\njsonSchema\x18\x04 \x01(\t\"k\n\x17\x46ieldTypeTransformation\x12!\n\x06\x66ields\x18\x01 \x03(\x0b\x32\x11.types.FieldTypes\x12-\n\x0etransformation\x18\x02 \x01(\x0b\x32\x15.types.Transformation\"\xd1\x01\n\x0eTransformation\x12)\n\x0creplaceValue\x18\x02 \x01(\x0b\x32\x13.types.ReplaceValue\x12\'\n\x0bredactValue\x18\x03 \x01(\x0b\x32\x12.types.RedactValue\x12#\n\thashValue\x18\x04 \x01(\x0b\x32\x10.types.HashValue\x12#\n\tmaskValue\x18\x05 \x01(\x0b\x32\x10.types.MaskValue\x12!\n\x08\x66PEValue\x18\x06 \x01(\x0b\x32\x0f.types.FPEValue\" \n\x0cReplaceValue\x12\x10\n\x08newValue\x18\x01 \x01(\t\"\r\n\x0bRedactValue\"\x0b\n\tHashValue\"K\n\tMaskValue\x12\x18\n\x10maskingCharacter\x18\x01 \x01(\t\x12\x13\n\x0b\x63harsToMask\x18\x02 \x01(\x05\x12\x0f\n\x07\x66romEnd\x18\x03 \x01(\x08\"7\n\x08\x46PEValue\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05tweak\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65\x63rypt\x18\x03 \x01(\x08\"E\n\x08\x44\x42\x43onfig\x12\x18\n\x10\x63onnectionString\x18\x01 \x01(\t\x12\x11\n\ttableName\x18\x02 \x01(\t\x12\x0c\n\x04type\x18\x03 \x01(\t\"\x8f\x01\n\x08\x44\x61tasink\x12!\n\x08\x64\x62\x43onfig\x18\x01 \x01(\x0b\x32\x0f.types.DBConfig\x12\x35\n\x12\x63loudStorageConfig\x18\x02 \x01(\x0b\x32\x19.types.CloudStorageConfig\x12)\n\x0cstreamConfig\x18\x03 \x01(\x0b\x32\x13.types.StreamConfig\"}\n\x10\x44\x61tasinkTemplate\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12(\n\x0f\x61nalyzeDatasink\x18\x02 \x03(\x0b\x32\x0f.types.Datasink\x12*\n\x11\x61nonymizeDatasink\x18\x03 \x03(\x0b\x32\x0f.types.Datasink\"S\n\x11\x42lobStorageConfig\x12\x13\n\x0b\x61\x63\x63ountName\x18\x01 \x01(\t\x12\x12\n\naccountKey\x18\x02 \x01(\t\x12\x15\n\rcontainerName\x18\x03 \x01(\t\"e\n\x08S3Config\x12\x10\n\x08\x61\x63\x63\x65ssId\x18\x01 
\x01(\t\x12\x11\n\taccessKey\x18\x02 \x01(\t\x12\x0e\n\x06region\x18\x03 \x01(\t\x12\x12\n\nbucketName\x18\x04 \x01(\t\x12\x10\n\x08\x65ndpoint\x18\x05 \x01(\t\"Z\n\x13GoogleStorageConfig\x12\x0c\n\x04json\x18\x01 \x01(\t\x12\x11\n\tprojectId\x18\x02 \x01(\t\x12\x0e\n\x06scopes\x18\x03 \x01(\t\x12\x12\n\nbucketName\x18\x04 \x01(\t\"\xa5\x01\n\x12\x43loudStorageConfig\x12\x33\n\x11\x62lobStorageConfig\x18\x01 \x01(\x0b\x32\x18.types.BlobStorageConfig\x12!\n\x08s3Config\x18\x02 \x01(\x0b\x32\x0f.types.S3Config\x12\x37\n\x13GoogleStorageConfig\x18\x03 \x01(\x0b\x32\x1a.types.GoogleStorageConfig\"r\n\x0cStreamConfig\x12\'\n\x0bkafkaConfig\x18\x01 \x01(\x0b\x32\x12.types.KafkaConfig\x12!\n\x08\x65hConfig\x18\x02 \x01(\x0b\x32\x0f.types.EHConfig\x12\x16\n\x0epartitionCount\x18\x03 \x01(\x05\"Y\n\x0bKafkaConfig\x12\x0f\n\x07\x61\x64\x64ress\x18\x01 \x01(\t\x12\r\n\x05topic\x18\x02 \x01(\t\x12\x14\n\x0csaslUsername\x18\x03 \x01(\t\x12\x14\n\x0csaslPassword\x18\x04 \x01(\t\"\xcb\x01\n\x08\x45HConfig\x12\x13\n\x0b\x65hNamespace\x18\x01 \x01(\t\x12\x0e\n\x06\x65hName\x18\x02 \x01(\t\x12\x1a\n\x12\x65hConnectionString\x18\x03 \x01(\t\x12\x11\n\tehKeyName\x18\x04 \x01(\t\x12\x12\n\nehKeyValue\x18\x05 \x01(\t\x12\x1f\n\x17storageAccountNameValue\x18\x06 \x01(\t\x12\x1e\n\x16storageAccountKeyValue\x18\x07 \x01(\t\x12\x16\n\x0e\x63ontainerValue\x18\x08 \x01(\t\"\xb2\x01\n\x0eStreamTemplate\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12)\n\x0cstreamConfig\x18\x03 \x01(\x0b\x32\x13.types.StreamConfig\x12\x19\n\x11\x61nalyzeTemplateId\x18\x04 \x01(\t\x12\x1b\n\x13\x61nonymizeTemplateId\x18\x05 \x01(\t\x12\x1a\n\x12\x64\x61tasinkTemplateId\x18\x06 \x01(\t\"Z\n\x0cScanTemplate\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12\x35\n\x12\x63loudStorageConfig\x18\x02 \x01(\x0b\x32\x19.types.CloudStorageConfig\"\xc8\x01\n\x16ScannerCronJobTemplate\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 
\x01(\t\x12\x1f\n\x07trigger\x18\x03 \x01(\x0b\x32\x0e.types.Trigger\x12\x16\n\x0escanTemplateId\x18\x04 \x01(\t\x12\x19\n\x11\x61nalyzeTemplateId\x18\x05 \x01(\t\x12\x1b\n\x13\x61nonymizeTemplateId\x18\x06 \x01(\t\x12\x1a\n\x12\x64\x61tasinkTemplateId\x18\x07 \x01(\t\"\xa6\x01\n\x12StreamsJobTemplate\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12\x19\n\x11streamsTemplateId\x18\x03 \x01(\t\x12\x19\n\x11\x61nalyzeTemplateId\x18\x04 \x01(\t\x12\x1b\n\x13\x61nonymizeTemplateId\x18\x05 \x01(\t\x12\x1a\n\x12\x64\x61tasinkTemplateId\x18\x06 \x01(\t\",\n\x07Trigger\x12!\n\x08schedule\x18\x01 \x01(\x0b\x32\x0f.types.Schedule\"$\n\x08Schedule\x12\x18\n\x10recurrencePeriod\x18\x01 \x01(\t\"\x8b\x01\n\x16\x41nonymizeImageTemplate\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12\x12\n\ncreateTime\x18\x02 \x01(\t\x12\x14\n\x0cmodifiedTime\x18\x03 \x01(\t\x12\x32\n\x11\x66ieldTypeGraphics\x18\x04 \x03(\x0b\x32\x17.types.FieldTypeGraphic\"V\n\x10\x46ieldTypeGraphic\x12!\n\x06\x66ields\x18\x01 \x03(\x0b\x32\x11.types.FieldTypes\x12\x1f\n\x07graphic\x18\x02 \x01(\x0b\x32\x0e.types.Graphic\"8\n\x07Graphic\x12-\n\x0e\x66illColorValue\x18\x01 \x01(\x0b\x32\x15.types.FillColorValue\":\n\x0e\x46illColorValue\x12\x0b\n\x03red\x18\x01 \x01(\x01\x12\r\n\x05green\x18\x02 \x01(\x01\x12\x0c\n\x04\x62lue\x18\x03 \x01(\x01\x62\x06proto3') + serialized_pb=_b('\n\x0etemplate.proto\x12\x05types\x1a\x0c\x63ommon.proto\"\xb7\x01\n\x0f\x41nalyzeTemplate\x12!\n\x06\x66ields\x18\x01 \x03(\x0b\x32\x11.types.FieldTypes\x12\x11\n\tallFields\x18\x02 \x01(\x08\x12\x13\n\x0b\x64\x65scription\x18\x03 \x01(\t\x12\x12\n\ncreateTime\x18\x04 \x01(\t\x12\x14\n\x0cmodifiedTime\x18\x05 \x01(\t\x12\x10\n\x08language\x18\x06 \x01(\t\x12\x1d\n\x15resultsScoreThreshold\x18\x07 \x01(\x02\"\xca\x01\n\x11\x41nonymizeTemplate\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12\x12\n\ncreateTime\x18\x02 \x01(\t\x12\x14\n\x0cmodifiedTime\x18\x03 
\x01(\t\x12@\n\x18\x66ieldTypeTransformations\x18\x04 \x03(\x0b\x32\x1e.types.FieldTypeTransformation\x12\x34\n\x15\x64\x65\x66\x61ultTransformation\x18\x05 \x01(\x0b\x32\x15.types.Transformation\"g\n\x12JsonSchemaTemplate\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12\x12\n\ncreateTime\x18\x02 \x01(\t\x12\x14\n\x0cmodifiedTime\x18\x03 \x01(\t\x12\x12\n\njsonSchema\x18\x04 \x01(\t\"k\n\x17\x46ieldTypeTransformation\x12!\n\x06\x66ields\x18\x01 \x03(\x0b\x32\x11.types.FieldTypes\x12-\n\x0etransformation\x18\x02 \x01(\x0b\x32\x15.types.Transformation\"\xd1\x01\n\x0eTransformation\x12)\n\x0creplaceValue\x18\x02 \x01(\x0b\x32\x13.types.ReplaceValue\x12\'\n\x0bredactValue\x18\x03 \x01(\x0b\x32\x12.types.RedactValue\x12#\n\thashValue\x18\x04 \x01(\x0b\x32\x10.types.HashValue\x12#\n\tmaskValue\x18\x05 \x01(\x0b\x32\x10.types.MaskValue\x12!\n\x08\x66PEValue\x18\x06 \x01(\x0b\x32\x0f.types.FPEValue\" \n\x0cReplaceValue\x12\x10\n\x08newValue\x18\x01 \x01(\t\"\r\n\x0bRedactValue\"\x0b\n\tHashValue\"K\n\tMaskValue\x12\x18\n\x10maskingCharacter\x18\x01 \x01(\t\x12\x13\n\x0b\x63harsToMask\x18\x02 \x01(\x05\x12\x0f\n\x07\x66romEnd\x18\x03 \x01(\x08\"7\n\x08\x46PEValue\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05tweak\x18\x02 \x01(\t\x12\x0f\n\x07\x64\x65\x63rypt\x18\x03 \x01(\x08\"E\n\x08\x44\x42\x43onfig\x12\x18\n\x10\x63onnectionString\x18\x01 \x01(\t\x12\x11\n\ttableName\x18\x02 \x01(\t\x12\x0c\n\x04type\x18\x03 \x01(\t\"\x8f\x01\n\x08\x44\x61tasink\x12!\n\x08\x64\x62\x43onfig\x18\x01 \x01(\x0b\x32\x0f.types.DBConfig\x12\x35\n\x12\x63loudStorageConfig\x18\x02 \x01(\x0b\x32\x19.types.CloudStorageConfig\x12)\n\x0cstreamConfig\x18\x03 \x01(\x0b\x32\x13.types.StreamConfig\"}\n\x10\x44\x61tasinkTemplate\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12(\n\x0f\x61nalyzeDatasink\x18\x02 \x03(\x0b\x32\x0f.types.Datasink\x12*\n\x11\x61nonymizeDatasink\x18\x03 \x03(\x0b\x32\x0f.types.Datasink\"S\n\x11\x42lobStorageConfig\x12\x13\n\x0b\x61\x63\x63ountName\x18\x01 
\x01(\t\x12\x12\n\naccountKey\x18\x02 \x01(\t\x12\x15\n\rcontainerName\x18\x03 \x01(\t\"e\n\x08S3Config\x12\x10\n\x08\x61\x63\x63\x65ssId\x18\x01 \x01(\t\x12\x11\n\taccessKey\x18\x02 \x01(\t\x12\x0e\n\x06region\x18\x03 \x01(\t\x12\x12\n\nbucketName\x18\x04 \x01(\t\x12\x10\n\x08\x65ndpoint\x18\x05 \x01(\t\"Z\n\x13GoogleStorageConfig\x12\x0c\n\x04json\x18\x01 \x01(\t\x12\x11\n\tprojectId\x18\x02 \x01(\t\x12\x0e\n\x06scopes\x18\x03 \x01(\t\x12\x12\n\nbucketName\x18\x04 \x01(\t\"\xa5\x01\n\x12\x43loudStorageConfig\x12\x33\n\x11\x62lobStorageConfig\x18\x01 \x01(\x0b\x32\x18.types.BlobStorageConfig\x12!\n\x08s3Config\x18\x02 \x01(\x0b\x32\x0f.types.S3Config\x12\x37\n\x13GoogleStorageConfig\x18\x03 \x01(\x0b\x32\x1a.types.GoogleStorageConfig\"r\n\x0cStreamConfig\x12\'\n\x0bkafkaConfig\x18\x01 \x01(\x0b\x32\x12.types.KafkaConfig\x12!\n\x08\x65hConfig\x18\x02 \x01(\x0b\x32\x0f.types.EHConfig\x12\x16\n\x0epartitionCount\x18\x03 \x01(\x05\"Y\n\x0bKafkaConfig\x12\x0f\n\x07\x61\x64\x64ress\x18\x01 \x01(\t\x12\r\n\x05topic\x18\x02 \x01(\t\x12\x14\n\x0csaslUsername\x18\x03 \x01(\t\x12\x14\n\x0csaslPassword\x18\x04 \x01(\t\"\xcb\x01\n\x08\x45HConfig\x12\x13\n\x0b\x65hNamespace\x18\x01 \x01(\t\x12\x0e\n\x06\x65hName\x18\x02 \x01(\t\x12\x1a\n\x12\x65hConnectionString\x18\x03 \x01(\t\x12\x11\n\tehKeyName\x18\x04 \x01(\t\x12\x12\n\nehKeyValue\x18\x05 \x01(\t\x12\x1f\n\x17storageAccountNameValue\x18\x06 \x01(\t\x12\x1e\n\x16storageAccountKeyValue\x18\x07 \x01(\t\x12\x16\n\x0e\x63ontainerValue\x18\x08 \x01(\t\"\xb2\x01\n\x0eStreamTemplate\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12)\n\x0cstreamConfig\x18\x03 \x01(\x0b\x32\x13.types.StreamConfig\x12\x19\n\x11\x61nalyzeTemplateId\x18\x04 \x01(\t\x12\x1b\n\x13\x61nonymizeTemplateId\x18\x05 \x01(\t\x12\x1a\n\x12\x64\x61tasinkTemplateId\x18\x06 \x01(\t\"Z\n\x0cScanTemplate\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12\x35\n\x12\x63loudStorageConfig\x18\x02 
\x01(\x0b\x32\x19.types.CloudStorageConfig\"\xc8\x01\n\x16ScannerCronJobTemplate\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12\x1f\n\x07trigger\x18\x03 \x01(\x0b\x32\x0e.types.Trigger\x12\x16\n\x0escanTemplateId\x18\x04 \x01(\t\x12\x19\n\x11\x61nalyzeTemplateId\x18\x05 \x01(\t\x12\x1b\n\x13\x61nonymizeTemplateId\x18\x06 \x01(\t\x12\x1a\n\x12\x64\x61tasinkTemplateId\x18\x07 \x01(\t\"\xa6\x01\n\x12StreamsJobTemplate\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x02 \x01(\t\x12\x19\n\x11streamsTemplateId\x18\x03 \x01(\t\x12\x19\n\x11\x61nalyzeTemplateId\x18\x04 \x01(\t\x12\x1b\n\x13\x61nonymizeTemplateId\x18\x05 \x01(\t\x12\x1a\n\x12\x64\x61tasinkTemplateId\x18\x06 \x01(\t\",\n\x07Trigger\x12!\n\x08schedule\x18\x01 \x01(\x0b\x32\x0f.types.Schedule\"$\n\x08Schedule\x12\x18\n\x10recurrencePeriod\x18\x01 \x01(\t\"\x8b\x01\n\x16\x41nonymizeImageTemplate\x12\x13\n\x0b\x64\x65scription\x18\x01 \x01(\t\x12\x12\n\ncreateTime\x18\x02 \x01(\t\x12\x14\n\x0cmodifiedTime\x18\x03 \x01(\t\x12\x32\n\x11\x66ieldTypeGraphics\x18\x04 \x03(\x0b\x32\x17.types.FieldTypeGraphic\"V\n\x10\x46ieldTypeGraphic\x12!\n\x06\x66ields\x18\x01 \x03(\x0b\x32\x11.types.FieldTypes\x12\x1f\n\x07graphic\x18\x02 \x01(\x0b\x32\x0e.types.Graphic\"8\n\x07Graphic\x12-\n\x0e\x66illColorValue\x18\x01 \x01(\x0b\x32\x15.types.FillColorValue\":\n\x0e\x46illColorValue\x12\x0b\n\x03red\x18\x01 \x01(\x01\x12\r\n\x05green\x18\x02 \x01(\x01\x12\x0c\n\x04\x62lue\x18\x03 \x01(\x01\x62\x06proto3') , dependencies=[common__pb2.DESCRIPTOR,]) @@ -40,56 +40,63 @@ has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='allFields', full_name='types.AnalyzeTemplate.allFields', index=1, number=2, type=8, cpp_type=7, label=1, has_default_value=False, default_value=False, 
message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='description', full_name='types.AnalyzeTemplate.description', index=2, number=3, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='createTime', full_name='types.AnalyzeTemplate.createTime', index=3, number=4, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='modifiedTime', full_name='types.AnalyzeTemplate.modifiedTime', index=4, number=5, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='language', full_name='types.AnalyzeTemplate.language', index=5, number=6, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='resultsScoreThreshold', full_name='types.AnalyzeTemplate.resultsScoreThreshold', index=6, + number=7, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + 
is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], serialized_start=40, - serialized_end=192, + serialized_end=223, ) @@ -106,49 +113,49 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='createTime', full_name='types.AnonymizeTemplate.createTime', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='modifiedTime', full_name='types.AnonymizeTemplate.modifiedTime', index=2, number=3, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='fieldTypeTransformations', full_name='types.AnonymizeTemplate.fieldTypeTransformations', index=3, number=4, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='defaultTransformation', full_name='types.AnonymizeTemplate.defaultTransformation', index=4, number=5, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, 
containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=195, - serialized_end=397, + serialized_start=226, + serialized_end=428, ) @@ -165,42 +172,42 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='createTime', full_name='types.JsonSchemaTemplate.createTime', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='modifiedTime', full_name='types.JsonSchemaTemplate.modifiedTime', index=2, number=3, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='jsonSchema', full_name='types.JsonSchemaTemplate.jsonSchema', index=3, number=4, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - 
serialized_start=399, - serialized_end=502, + serialized_start=430, + serialized_end=533, ) @@ -217,28 +224,28 @@ has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='transformation', full_name='types.FieldTypeTransformation.transformation', index=1, number=2, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=504, - serialized_end=611, + serialized_start=535, + serialized_end=642, ) @@ -255,49 +262,49 @@ has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='redactValue', full_name='types.Transformation.redactValue', index=1, number=3, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='hashValue', full_name='types.Transformation.hashValue', index=2, number=4, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( 
name='maskValue', full_name='types.Transformation.maskValue', index=3, number=5, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='fPEValue', full_name='types.Transformation.fPEValue', index=4, number=6, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=614, - serialized_end=823, + serialized_start=645, + serialized_end=854, ) @@ -314,21 +321,21 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=825, - serialized_end=857, + serialized_start=856, + serialized_end=888, ) @@ -345,14 +352,14 @@ nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=859, - serialized_end=872, + serialized_start=890, + serialized_end=903, ) @@ -369,14 +376,14 @@ nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=874, - serialized_end=885, + serialized_start=905, + 
serialized_end=916, ) @@ -393,35 +400,35 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='charsToMask', full_name='types.MaskValue.charsToMask', index=1, number=2, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='fromEnd', full_name='types.MaskValue.fromEnd', index=2, number=3, type=8, cpp_type=7, label=1, has_default_value=False, default_value=False, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=887, - serialized_end=962, + serialized_start=918, + serialized_end=993, ) @@ -438,35 +445,35 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='tweak', full_name='types.FPEValue.tweak', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='decrypt', full_name='types.FPEValue.decrypt', index=2, number=3, type=8, 
cpp_type=7, label=1, has_default_value=False, default_value=False, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=964, - serialized_end=1019, + serialized_start=995, + serialized_end=1050, ) @@ -483,35 +490,35 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='tableName', full_name='types.DBConfig.tableName', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='type', full_name='types.DBConfig.type', index=2, number=3, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=1021, - serialized_end=1090, + serialized_start=1052, + serialized_end=1121, ) @@ -528,35 +535,35 @@ has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, 
file=DESCRIPTOR), _descriptor.FieldDescriptor( name='cloudStorageConfig', full_name='types.Datasink.cloudStorageConfig', index=1, number=2, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='streamConfig', full_name='types.Datasink.streamConfig', index=2, number=3, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=1093, - serialized_end=1236, + serialized_start=1124, + serialized_end=1267, ) @@ -573,35 +580,35 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='analyzeDatasink', full_name='types.DatasinkTemplate.analyzeDatasink', index=1, number=2, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='anonymizeDatasink', full_name='types.DatasinkTemplate.anonymizeDatasink', index=2, number=3, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, 
file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=1238, - serialized_end=1363, + serialized_start=1269, + serialized_end=1394, ) @@ -618,35 +625,35 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='accountKey', full_name='types.BlobStorageConfig.accountKey', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='containerName', full_name='types.BlobStorageConfig.containerName', index=2, number=3, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=1365, - serialized_end=1448, + serialized_start=1396, + serialized_end=1479, ) @@ -663,49 +670,49 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='accessKey', full_name='types.S3Config.accessKey', index=1, number=2, type=9, 
cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='region', full_name='types.S3Config.region', index=2, number=3, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='bucketName', full_name='types.S3Config.bucketName', index=3, number=4, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='endpoint', full_name='types.S3Config.endpoint', index=4, number=5, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=1450, - serialized_end=1551, + serialized_start=1481, + serialized_end=1582, ) @@ -722,42 +729,42 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='projectId', 
full_name='types.GoogleStorageConfig.projectId', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='scopes', full_name='types.GoogleStorageConfig.scopes', index=2, number=3, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='bucketName', full_name='types.GoogleStorageConfig.bucketName', index=3, number=4, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=1553, - serialized_end=1643, + serialized_start=1584, + serialized_end=1674, ) @@ -774,35 +781,35 @@ has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='s3Config', full_name='types.CloudStorageConfig.s3Config', index=1, number=2, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), 
_descriptor.FieldDescriptor( name='GoogleStorageConfig', full_name='types.CloudStorageConfig.GoogleStorageConfig', index=2, number=3, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=1646, - serialized_end=1811, + serialized_start=1677, + serialized_end=1842, ) @@ -819,35 +826,35 @@ has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='ehConfig', full_name='types.StreamConfig.ehConfig', index=1, number=2, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='partitionCount', full_name='types.StreamConfig.partitionCount', index=2, number=3, type=5, cpp_type=1, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=1813, - serialized_end=1927, + serialized_start=1844, + serialized_end=1958, ) @@ -864,42 +871,42 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, 
enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='topic', full_name='types.KafkaConfig.topic', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='saslUsername', full_name='types.KafkaConfig.saslUsername', index=2, number=3, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='saslPassword', full_name='types.KafkaConfig.saslPassword', index=3, number=4, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=1929, - serialized_end=2018, + serialized_start=1960, + serialized_end=2049, ) @@ -916,70 +923,70 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='ehName', full_name='types.EHConfig.ehName', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, 
default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='ehConnectionString', full_name='types.EHConfig.ehConnectionString', index=2, number=3, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='ehKeyName', full_name='types.EHConfig.ehKeyName', index=3, number=4, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='ehKeyValue', full_name='types.EHConfig.ehKeyValue', index=4, number=5, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='storageAccountNameValue', full_name='types.EHConfig.storageAccountNameValue', index=5, number=6, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='storageAccountKeyValue', full_name='types.EHConfig.storageAccountKeyValue', index=6, number=7, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), 
message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='containerValue', full_name='types.EHConfig.containerValue', index=7, number=8, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=2021, - serialized_end=2224, + serialized_start=2052, + serialized_end=2255, ) @@ -996,56 +1003,56 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='description', full_name='types.StreamTemplate.description', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='streamConfig', full_name='types.StreamTemplate.streamConfig', index=2, number=3, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='analyzeTemplateId', full_name='types.StreamTemplate.analyzeTemplateId', index=3, number=4, type=9, cpp_type=9, label=1, 
has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='anonymizeTemplateId', full_name='types.StreamTemplate.anonymizeTemplateId', index=4, number=5, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='datasinkTemplateId', full_name='types.StreamTemplate.datasinkTemplateId', index=5, number=6, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=2227, - serialized_end=2405, + serialized_start=2258, + serialized_end=2436, ) @@ -1062,28 +1069,28 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='cloudStorageConfig', full_name='types.ScanTemplate.cloudStorageConfig', index=1, number=2, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], 
- serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=2407, - serialized_end=2497, + serialized_start=2438, + serialized_end=2528, ) @@ -1100,63 +1107,63 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='description', full_name='types.ScannerCronJobTemplate.description', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='trigger', full_name='types.ScannerCronJobTemplate.trigger', index=2, number=3, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='scanTemplateId', full_name='types.ScannerCronJobTemplate.scanTemplateId', index=3, number=4, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='analyzeTemplateId', full_name='types.ScannerCronJobTemplate.analyzeTemplateId', index=4, number=5, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, 
file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='anonymizeTemplateId', full_name='types.ScannerCronJobTemplate.anonymizeTemplateId', index=5, number=6, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='datasinkTemplateId', full_name='types.ScannerCronJobTemplate.datasinkTemplateId', index=6, number=7, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=2500, - serialized_end=2700, + serialized_start=2531, + serialized_end=2731, ) @@ -1173,56 +1180,56 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='description', full_name='types.StreamsJobTemplate.description', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='streamsTemplateId', full_name='types.StreamsJobTemplate.streamsTemplateId', index=2, number=3, type=9, cpp_type=9, label=1, has_default_value=False, 
default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='analyzeTemplateId', full_name='types.StreamsJobTemplate.analyzeTemplateId', index=3, number=4, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='anonymizeTemplateId', full_name='types.StreamsJobTemplate.anonymizeTemplateId', index=4, number=5, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='datasinkTemplateId', full_name='types.StreamsJobTemplate.datasinkTemplateId', index=5, number=6, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=2703, - serialized_end=2869, + serialized_start=2734, + serialized_end=2900, ) @@ -1239,21 +1246,21 @@ has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - 
serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=2871, - serialized_end=2915, + serialized_start=2902, + serialized_end=2946, ) @@ -1270,21 +1277,21 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=2917, - serialized_end=2953, + serialized_start=2948, + serialized_end=2984, ) @@ -1301,42 +1308,42 @@ has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='createTime', full_name='types.AnonymizeImageTemplate.createTime', index=1, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='modifiedTime', full_name='types.AnonymizeImageTemplate.modifiedTime', index=2, number=3, type=9, cpp_type=9, label=1, has_default_value=False, default_value=_b("").decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='fieldTypeGraphics', full_name='types.AnonymizeImageTemplate.fieldTypeGraphics', index=3, number=4, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], 
message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=2956, - serialized_end=3095, + serialized_start=2987, + serialized_end=3126, ) @@ -1353,28 +1360,28 @@ has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='graphic', full_name='types.FieldTypeGraphic.graphic', index=1, number=2, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=3097, - serialized_end=3183, + serialized_start=3128, + serialized_end=3214, ) @@ -1391,21 +1398,21 @@ has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=3185, - serialized_end=3241, + serialized_start=3216, + serialized_end=3272, ) @@ -1422,35 +1429,35 @@ has_default_value=False, default_value=float(0), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - 
serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='green', full_name='types.FillColorValue.green', index=1, number=2, type=1, cpp_type=5, label=1, has_default_value=False, default_value=float(0), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), _descriptor.FieldDescriptor( name='blue', full_name='types.FillColorValue.blue', index=2, number=3, type=1, cpp_type=5, label=1, has_default_value=False, default_value=float(0), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), + options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], - serialized_options=None, + options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], - serialized_start=3243, - serialized_end=3301, + serialized_start=3274, + serialized_end=3332, ) _ANALYZETEMPLATE.fields_by_name['fields'].message_type = common__pb2._FIELDTYPES diff --git a/presidio-analyzer/setup.py b/presidio-analyzer/setup.py index b4a917232..670d4d66b 100644 --- a/presidio-analyzer/setup.py +++ b/presidio-analyzer/setup.py @@ -1,22 +1,33 @@ import setuptools import os.path +from os import path + +__version__ = "" +this_directory = path.abspath(path.dirname(__file__)) +with open(os.path.join(this_directory, 'VERSION')) as version_file: + __version__ = version_file.read().strip() setuptools.setup( name="presidio_analyzer", - version="0.1.0", - author="Presidio team", - author_email="torosent@microsoft.com", + version=__version__, description="Presidio analyzer package", # long_description=long_description, # long_description_content_type="text/markdown", url="https://github.com/Microsoft/presidio", packages=[ - 'analyzer', 'analyzer.predefined_recognizers' + 'analyzer', 'analyzer.predefined_recognizers', 
'analyzer.nlp_engine', + 'analyzer.recognizer_registry' ], + trusted_host=['pypi.org'], + tests_require=['pytest', 'flake8', 'pylint==2.3.1'], install_requires=[ - 'grpcio>=1.13.0', 'cython>=0.28.5', 'protobuf>=3.6.0', - 'tldextract>=2.2.0', 'knack>=0.4.2', 'spacy>=2.1.3' - ], + 'cython==0.29.10', + 'spacy==2.1.4', + 'regex==2019.6.8', + 'grpcio==1.21.1', + 'protobuf==3.8.0', + 'tldextract==2.2.1', + 'knack==0.6.2'], include_package_data=True, license='MIT', scripts=[ diff --git a/presidio-analyzer/tests/data/demo.txt b/presidio-analyzer/tests/data/demo.txt index 87e3a1657..1f10cafde 100644 --- a/presidio-analyzer/tests/data/demo.txt +++ b/presidio-analyzer/tests/data/demo.txt @@ -5,7 +5,7 @@ Here are a few examples of entities we currently support: DateTime: September 18 Domain: microsoft.com Email address: test@presidio.site - IBAN: IL150120690000003111111 + IBAN code: IL150120690000003111111 IP: 192.168.0.1 Person name: David Johnson @@ -23,4 +23,4 @@ PR appropriately (e.g., label, comment). Simply follow the instructions provided epos using our CLA. This project has adopted the Microsoft Open Source Code of Conduct. For more information see the Code of Conduct FAQ or contact -opencode@microsoft.com with any additional questions or comments. \ No newline at end of file +opencode@microsoft.com with any additional questions or comments. 
diff --git a/presidio-analyzer/tests/mocks/__init__.py b/presidio-analyzer/tests/mocks/__init__.py index 47866a4f4..8b7ed5662 100644 --- a/presidio-analyzer/tests/mocks/__init__.py +++ b/presidio-analyzer/tests/mocks/__init__.py @@ -1 +1,2 @@ -from .nlp_engine_mock import MockNlpEngine \ No newline at end of file +from .nlp_engine_mock import MockNlpEngine +from tests.mocks import app_tracer_mock \ No newline at end of file diff --git a/presidio-analyzer/tests/mocks/app_tracer_mock.py b/presidio-analyzer/tests/mocks/app_tracer_mock.py new file mode 100644 index 000000000..0e9dd4db7 --- /dev/null +++ b/presidio-analyzer/tests/mocks/app_tracer_mock.py @@ -0,0 +1,39 @@ +import logging + + +class AppTracerMock: + + def __init__(self, enable_interpretability=True): + + logger = logging.getLogger('InterpretabilityMock') + if not logger.handlers: + ch = logging.StreamHandler() + formatter = logging.Formatter( + '[%(asctime)s][%(name)s][%(levelname)s]%(message)s') + ch.setFormatter(formatter) + logger.addHandler(ch) + logger.setLevel(logging.INFO) + logger.propagate = False + + self.logger = logger + self.last_trace = None + self.enable_interpretability = enable_interpretability + self.msg_counter = 0 + + def trace(self, request_id, trace_data): + """ + Writes interpretability trace + :param request_id: A unique ID, to correlate across calls. + :param trace_data: A string to write. 
+ :return: + """ + if self.enable_interpretability: + self.last_trace = "[{}][{}]".format(request_id, trace_data) + self.logger.info("[%s][%s]", request_id, trace_data) + self.msg_counter = self.msg_counter + 1 + + def get_last_trace(self): + return self.last_trace + + def get_msg_counter(self): + return self.msg_counter diff --git a/presidio-analyzer/tests/mocks/nlp_engine_mock.py b/presidio-analyzer/tests/mocks/nlp_engine_mock.py index 28fcb6679..6b87534e4 100644 --- a/presidio-analyzer/tests/mocks/nlp_engine_mock.py +++ b/presidio-analyzer/tests/mocks/nlp_engine_mock.py @@ -1,12 +1,15 @@ -from analyzer.nlp_engine import NlpEngine +from analyzer.nlp_engine import NlpEngine, NlpArtifacts class MockNlpEngine(NlpEngine): - def __init__(self, stopwords, punct_words, nlp_artifacts): + def __init__(self, stopwords=[], punct_words=[], nlp_artifacts=None): self.stopwords = stopwords self.punct_words = punct_words - self.nlp_artifacts = nlp_artifacts + if nlp_artifacts is None: + self.nlp_artifacts = NlpArtifacts([], [], [], [], None, "en") + else: + self.nlp_artifacts = nlp_artifacts def is_stopword(self, word, language): return word in self.stopwords diff --git a/presidio-analyzer/tests/test_analyzer_engine.py b/presidio-analyzer/tests/test_analyzer_engine.py index d0249d2b2..5e79a1265 100644 --- a/presidio-analyzer/tests/test_analyzer_engine.py +++ b/presidio-analyzer/tests/test_analyzer_engine.py @@ -9,7 +9,7 @@ from analyzer.analyze_pb2 import AnalyzeRequest from analyzer import AnalyzerEngine, PatternRecognizer, Pattern, \ - RecognizerResult, RecognizerRegistry + RecognizerResult, RecognizerRegistry, AnalysisExplanation from analyzer.predefined_recognizers import CreditCardRecognizer, \ UsPhoneRecognizer, DomainRecognizer, UsItinRecognizer, \ UsLicenseRecognizer, UsBankRecognizer, UsPassportRecognizer @@ -18,6 +18,8 @@ from analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts from analyzer.predefined_recognizers import IpRecognizer, UsSsnRecognizer from 
tests.mocks import MockNlpEngine +from tests.mocks.app_tracer_mock import AppTracerMock + class RecognizerStoreApiMock(RecognizerStoreApi): """ @@ -34,11 +36,15 @@ def get_latest_hash(self): def get_all_recognizers(self): return self.recognizers - def add_custom_pattern_recognizer(self, new_recognizer, skip_hash_update=False): + def add_custom_pattern_recognizer(self, new_recognizer, + skip_hash_update=False): patterns = [] for pat in new_recognizer.patterns: patterns.extend([Pattern(pat.name, pat.regex, pat.score)]) - new_custom_recognizer = PatternRecognizer(name=new_recognizer.name, supported_entity=new_recognizer.supported_entities[0], + new_custom_recognizer = PatternRecognizer(name=new_recognizer.name, + supported_entity= + new_recognizer.supported_entities[ + 0], supported_language=new_recognizer.supported_language, black_list=new_recognizer.black_list, context=new_recognizer.context, @@ -63,6 +69,7 @@ def remove_recognizer(self, name): m.update(recognizer.name.encode('utf-8')) self.latest_hash = m.digest() + class MockRecognizerRegistry(RecognizerRegistry): """ A mock that acts as a recognizers registry @@ -77,13 +84,8 @@ def load_recognizers(self, path): DomainRecognizer()]) -ip_recognizer = IpRecognizer() -us_ssn_recognizer = UsSsnRecognizer() -phone_recognizer = UsPhoneRecognizer() -us_itin_recognizer = UsItinRecognizer() -us_license_recognizer = UsLicenseRecognizer() -us_bank_recognizer = UsBankRecognizer() -us_passport_recognizer = UsPassportRecognizer() +loaded_spacy_nlp_engine = SpacyNlpEngine() + class TestAnalyzerEngine(TestCase): @@ -91,13 +93,21 @@ def __init__(self, *args, **kwargs): super(TestAnalyzerEngine, self).__init__(*args, **kwargs) self.loaded_registry = MockRecognizerRegistry(RecognizerStoreApiMock()) mock_nlp_artifacts = NlpArtifacts([], [], [], [], None, "en") - self.loaded_analyzer_engine = AnalyzerEngine(self.loaded_registry, MockNlpEngine(stopwords=[], punct_words=[], nlp_artifacts=mock_nlp_artifacts)) + self.app_tracer = 
AppTracerMock(enable_interpretability=True) + self.loaded_analyzer_engine = AnalyzerEngine(self.loaded_registry, + MockNlpEngine(stopwords=[], + punct_words=[], + nlp_artifacts=mock_nlp_artifacts), + app_tracer=self.app_tracer, + enable_trace_pii=True) + self.unit_test_guid = "00000000-0000-0000-0000-000000000000" def test_analyze_with_predefined_recognizers_return_results(self): text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090" language = "en" entities = ["CREDIT_CARD"] results = self.loaded_analyzer_engine.analyze( + self.unit_test_guid, text, entities, language, all_fields=False) assert len(results) == 1 @@ -110,15 +120,19 @@ def test_analyze_with_multiple_predefined_recognizers(self): entities = ["CREDIT_CARD", "PHONE_NUMBER"] # This analyzer engine is different from the global one, as this one - # also loads SpaCy so it can detect the phone number entity - analyzer_engine_with_spacy = AnalyzerEngine(self.loaded_registry) - results = analyzer_engine_with_spacy.analyze(text, entities, language, all_fields=False) + # also loads SpaCy so it can use the context words + + analyzer_engine_with_spacy = AnalyzerEngine( + registry=self.loaded_registry, nlp_engine=loaded_spacy_nlp_engine) + results = analyzer_engine_with_spacy.analyze(self.unit_test_guid, text, + entities, language, + all_fields=False) assert len(results) == 2 assert_result(results[0], "CREDIT_CARD", 14, 33, EntityRecognizer.MAX_SCORE) expected_score = UsPhoneRecognizer.MEDIUM_REGEX_SCORE + \ - PatternRecognizer.CONTEXT_SIMILARITY_FACTOR # 0.5 + 0.35 = 0.85 + PatternRecognizer.CONTEXT_SIMILARITY_FACTOR # 0.5 + 0.35 = 0.85 assert_result(results[1], "PHONE_NUMBER", 48, 59, expected_score) def test_analyze_without_entities(self): @@ -126,15 +140,17 @@ def test_analyze_without_entities(self): language = "en" text = " Credit card: 4095-2609-9393-4932, my name is John Oliver, DateTime: September 18 Domain: microsoft.com" entities = [] - self.loaded_analyzer_engine.analyze( - text, entities, 
language, all_fields=False) + self.loaded_analyzer_engine.analyze(self.unit_test_guid, + text, entities, language, + all_fields=False) def test_analyze_with_empty_text(self): language = "en" text = "" entities = ["CREDIT_CARD", "PHONE_NUMBER"] - results = self.loaded_analyzer_engine.analyze( - text, entities, language, all_fields=False) + results = self.loaded_analyzer_engine.analyze(self.unit_test_guid, + text, entities, language, + all_fields=False) assert len(results) == 0 @@ -143,19 +159,51 @@ def test_analyze_with_unsupported_language(self): language = "de" text = "" entities = ["CREDIT_CARD", "PHONE_NUMBER"] - self.loaded_analyzer_engine.analyze( - text, entities, language, all_fields=False) + self.loaded_analyzer_engine.analyze(self.unit_test_guid, + text, entities, language, + all_fields=False) def test_remove_duplicates(self): # test same result with different score will return only the highest - arr = [RecognizerResult(start=0, end=5, score=0.1, entity_type="x"), - RecognizerResult(start=0, end=5, score=0.5, entity_type="x")] + arr = [RecognizerResult(start=0, end=5, score=0.1, entity_type="x", + analysis_explanation=AnalysisExplanation( + recognizer='test', + original_score=0, + pattern_name='test', + pattern='test', + validation_result=None)), + RecognizerResult(start=0, end=5, score=0.5, entity_type="x", + analysis_explanation=AnalysisExplanation( + recognizer='test', + original_score=0, + pattern_name='test', + pattern='test', + validation_result=None))] results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(arr) assert len(results) == 1 assert results[0].score == 0.5 # TODO: add more cases with bug: # bug# 597: Analyzer remove duplicates doesn't handle all cases of one result as a substring of the other + def test_remove_duplicates_different_entity_no_removal(self): + # test same result with different score will return only the highest + arr = [RecognizerResult(start=0, end=5, score=0.1, entity_type="x", + 
analysis_explanation=AnalysisExplanation( + recognizer='test', + original_score=0, + pattern_name='test', + pattern='test', + validation_result=None)), + RecognizerResult(start=0, end=5, score=0.5, entity_type="y", + analysis_explanation=AnalysisExplanation( + recognizer='test', + original_score=0, + pattern_name='test', + pattern='test', + validation_result=None))] + results = AnalyzerEngine._AnalyzerEngine__remove_duplicates(arr) + assert len(results) == 2 + def test_added_pattern_recognizer_works(self): pattern = Pattern("rocket pattern", r'\W*(rocket)\W*', 0.8) pattern_recognizer = PatternRecognizer("ROCKET", @@ -164,12 +212,15 @@ def test_added_pattern_recognizer_works(self): # Make sure the analyzer doesn't get this entity recognizers_store_api_mock = RecognizerStoreApiMock() - analyze_engine = AnalyzerEngine( - MockRecognizerRegistry(recognizers_store_api_mock)) + analyze_engine = AnalyzerEngine(registry= + MockRecognizerRegistry( + recognizers_store_api_mock), + nlp_engine=MockNlpEngine()) text = "rocket is my favorite transportation" entities = ["CREDIT_CARD", "ROCKET"] - results = analyze_engine.analyze(text=text, entities=entities, + results = analyze_engine.analyze(self.unit_test_guid, text=text, + entities=entities, language='en', all_fields=False) assert len(results) == 0 @@ -179,7 +230,8 @@ def test_added_pattern_recognizer_works(self): pattern_recognizer) # Check that the entity is recognized: - results = analyze_engine.analyze(text=text, entities=entities, + results = analyze_engine.analyze(self.unit_test_guid, text=text, + entities=entities, language='en', all_fields=False) assert len(results) == 1 @@ -193,12 +245,13 @@ def test_removed_pattern_recognizer_doesnt_work(self): # Make sure the analyzer doesn't get this entity recognizers_store_api_mock = RecognizerStoreApiMock() - analyze_engine = AnalyzerEngine(MockRecognizerRegistry( - recognizers_store_api_mock)) + analyze_engine = AnalyzerEngine(registry=MockRecognizerRegistry( + 
recognizers_store_api_mock), nlp_engine=MockNlpEngine()) text = "spaceship is my favorite transportation" entities = ["CREDIT_CARD", "SPACESHIP"] - results = analyze_engine.analyze(text=text, entities=entities, + results = analyze_engine.analyze(self.unit_test_guid, text=text, + entities=entities, language='en', all_fields=False) assert len(results) == 0 @@ -207,7 +260,8 @@ def test_removed_pattern_recognizer_doesnt_work(self): recognizers_store_api_mock.add_custom_pattern_recognizer( pattern_recognizer) # Check that the entity is recognized: - results = analyze_engine.analyze(text=text, entities=entities, + results = analyze_engine.analyze(self.unit_test_guid, text=text, + entities=entities, language='en', all_fields=False) assert len(results) == 1 assert_result(results[0], "SPACESHIP", 0, 10, 0.8) @@ -216,7 +270,8 @@ def test_removed_pattern_recognizer_doesnt_work(self): recognizers_store_api_mock.remove_recognizer( "Spaceship recognizer") # Test again to see we didn't get any results - results = analyze_engine.analyze(text=text, entities=entities, + results = analyze_engine.analyze(self.unit_test_guid, text=text, + entities=entities, language='en', all_fields=False) assert len(results) == 0 @@ -224,6 +279,7 @@ def test_removed_pattern_recognizer_doesnt_work(self): def test_apply_with_language_returns_correct_response(self): request = AnalyzeRequest() request.analyzeTemplate.language = 'en' + request.analyzeTemplate.resultsScoreThreshold = 0 new_field = request.analyzeTemplate.fields.add() new_field.name = 'CREDIT_CARD' new_field.minScore = '0.5' @@ -235,6 +291,7 @@ def test_apply_with_language_returns_correct_response(self): def test_apply_with_no_language_returns_default(self): request = AnalyzeRequest() request.analyzeTemplate.language = '' + request.analyzeTemplate.resultsScoreThreshold = 0 new_field = request.analyzeTemplate.fields.add() new_field.name = 'CREDIT_CARD' new_field.minScore = '0.5' @@ -243,11 +300,13 @@ def 
test_apply_with_no_language_returns_default(self): assert response.analyzeResults is not None def test_when_allFields_is_true_return_all_fields(self): - analyze_engine = AnalyzerEngine(MockRecognizerRegistry()) + analyze_engine = AnalyzerEngine(registry=MockRecognizerRegistry(), + nlp_engine=MockNlpEngine()) request = AnalyzeRequest() request.analyzeTemplate.allFields = True + request.analyzeTemplate.resultsScoreThreshold = 0 request.text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090 " \ - "Domain: microsoft.com" + "Domain: microsoft.com" response = analyze_engine.Apply(request, None) returned_entities = [ field.field.name for field in response.analyzeResults] @@ -257,12 +316,14 @@ def test_when_allFields_is_true_return_all_fields(self): assert "PHONE_NUMBER" in returned_entities assert "DOMAIN_NAME" in returned_entities - def test_when_allFields_is_true_full_recognizers_list_return_all_fields(self): - analyze_engine = AnalyzerEngine(RecognizerRegistry()) + def test_when_allFields_is_true_full_recognizers_list_return_all_fields( + self): + analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(), + nlp_engine=loaded_spacy_nlp_engine) request = AnalyzeRequest() request.analyzeTemplate.allFields = True request.text = "My name is David and I live in Seattle." \ - "Domain: microsoft.com " + "Domain: microsoft.com " response = analyze_engine.Apply(request, None) returned_entities = [ field.field.name for field in response.analyzeResults] @@ -272,7 +333,8 @@ def test_when_allFields_is_true_full_recognizers_list_return_all_fields(self): assert "DOMAIN_NAME" in returned_entities def test_when_allFields_is_true_and_entities_not_empty_exception(self): - analyze_engine = AnalyzerEngine(registry=RecognizerRegistry()) + analyze_engine = AnalyzerEngine(registry=RecognizerRegistry(), + nlp_engine=MockNlpEngine()) request = AnalyzeRequest() request.text = "My name is David and I live in Seattle." 
\ "Domain: microsoft.com " @@ -282,4 +344,122 @@ def test_when_allFields_is_true_and_entities_not_empty_exception(self): new_field.minScore = '0.5' with pytest.raises(ValueError): analyze_engine.Apply(request, None) - \ No newline at end of file + + def test_when_analyze_then_apptracer_has_value(self): + text = "My name is Bart Simpson, and Credit card: 4095-2609-9393-4932, my phone is 425 8829090" + language = "en" + entities = ["CREDIT_CARD", "PHONE_NUMBER", "PERSON"] + analyzer_engine_with_spacy = AnalyzerEngine(self.loaded_registry, + app_tracer=self.app_tracer, + enable_trace_pii=True) + results = analyzer_engine_with_spacy.analyze(correlation_id=self.unit_test_guid, + text=text, + entities=entities, + language=language, + all_fields=False, + trace=True) + assert len(results) == 3 + for result in results: + assert result.analysis_explanation is not None + assert self.app_tracer.get_msg_counter() == 2 + assert self.app_tracer.get_last_trace() is not None + + def test_when_threshold_is_zero_all_results_pass(self): + text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090" + language = "en" + entities = ["CREDIT_CARD", "PHONE_NUMBER"] + + # This analyzer engine is different from the global one, as this one + # also loads SpaCy so it can detect the phone number entity + + analyzer_engine = AnalyzerEngine( + registry=self.loaded_registry, nlp_engine=MockNlpEngine()) + results = analyzer_engine.analyze(self.unit_test_guid, text, + entities, language, + all_fields=False, + score_threshold=0) + + assert len(results) == 2 + + def test_when_threshold_is_more_than_half_only_credit_card_passes(self): + text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090" + language = "en" + entities = ["CREDIT_CARD", "PHONE_NUMBER"] + + # This analyzer engine is different from the global one, as this one + # also loads SpaCy so it can detect the phone number entity + + analyzer_engine = AnalyzerEngine( + registry=self.loaded_registry, nlp_engine=MockNlpEngine()) 
+ results = analyzer_engine.analyze(self.unit_test_guid, text, + entities, language, + all_fields=False, + score_threshold=0.51) + + assert len(results) == 1 + + def test_when_default_threshold_is_more_than_half_only_one_passes(self): + text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090" + language = "en" + entities = ["CREDIT_CARD", "PHONE_NUMBER"] + + # This analyzer engine is different from the global one, as this one + # also loads SpaCy so it can detect the phone number entity + + analyzer_engine = AnalyzerEngine( + registry=self.loaded_registry, nlp_engine=MockNlpEngine(), + default_score_threshold=0.7) + results = analyzer_engine.analyze(self.unit_test_guid, text, + entities, language, + all_fields=False) + + assert len(results) == 1 + + def test_when_default_threshold_is_zero_all_results_pass(self): + text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090" + language = "en" + entities = ["CREDIT_CARD", "PHONE_NUMBER"] + + # This analyzer engine is different from the global one, as this one + # also loads SpaCy so it can detect the phone number entity + + analyzer_engine = AnalyzerEngine( + registry=self.loaded_registry, nlp_engine=MockNlpEngine()) + results = analyzer_engine.analyze(self.unit_test_guid, text, + entities, language, + all_fields=False) + + assert len(results) == 2 + + def test_demo_text(self): + text = "Here are a few examples of entities we currently support: \n" \ + "Credit card: 4095-2609-9393-4932 \n" \ + "Crypto wallet id: 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ \n" \ + "DateTime: September 18 n" \ + "Domain: microsoft.com \n" \ + "Email address: test@presidio.site \n" \ + "IBAN code: IL150120690000003111111 \n" \ + "IP: 192.168.0.1 i\n" \ + "Person name: David Johnson\n" \ + "Bank account: 2854567876542\n" \ + "Driver license number: H12234567\n" \ + "Passport: 912803456\n" \ + "Phone number: (212) 555-1234.\n" \ + "Social security number: 078-05-1120\n" \ + "" \ + "This project welcomes contributions and 
suggestions. Most contributions require you to agree to a " \ + "Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us " \ + "the rights to use your contribution. For details, visit https://cla.microsoft.com.\n" \ + "When you submit a pull request, a CLA-bot will automatically determine whether you need to provide " \ + "a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions " \ + "provided by the bot. You will only need to do this once across all repos using our CLA.\n\n" \ + "This project has adopted the Microsoft Open Source Code of Conduct. For more information see the " \ + "Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments." + + language = "en" + + analyzer_engine = AnalyzerEngine(default_score_threshold=0.6) + results = analyzer_engine.analyze(correlation_id=self.unit_test_guid, text=text, entities=None, + language=language, all_fields=True) + + assert len(results) == 15 diff --git a/presidio-analyzer/tests/test_pattern_recognizer.py b/presidio-analyzer/tests/test_pattern_recognizer.py index 4bf051c2b..53421c835 100644 --- a/presidio-analyzer/tests/test_pattern_recognizer.py +++ b/presidio-analyzer/tests/test_pattern_recognizer.py @@ -10,8 +10,8 @@ class MockRecognizer(PatternRecognizer): - def validate_result(self, pattern_text, pattern_result): - return pattern_result + def validate_result(self, pattern_text): + return True def __init__(self, entity, patterns, black_list, name, context): super().__init__(supported_entity=entity, diff --git a/presidio-analyzer/tests/test_recognizer_registry.py b/presidio-analyzer/tests/test_recognizer_registry.py index 5a2e14288..578bdd1b5 100644 --- a/presidio-analyzer/tests/test_recognizer_registry.py +++ b/presidio-analyzer/tests/test_recognizer_registry.py @@ -49,7 +49,7 @@ def add_custom_pattern_recognizer(self, new_recognizer, self.latest_hash = m.digest() def remove_recognizer(self, 
name): - logging.info("removing recognizer " + name) + logging.info("removing recognizer %s", name) for i in self.recognizers: if i.name == name: self.recognizers.remove(i) @@ -60,6 +60,10 @@ def remove_recognizer(self, name): class TestRecognizerRegistry(TestCase): + def __init__(self, *args, **kwargs): + super(TestRecognizerRegistry, self).__init__(*args, **kwargs) + self.request_id = "UT" + def test_dummy(self): assert 1 == 1 @@ -111,7 +115,8 @@ def test_get_recognizers_one_language_one_entity(self): def test_get_recognizers_unsupported_language(self): with pytest.raises(ValueError): registry = self.get_mock_recognizer_registry() - registry.get_recognizers(language='brrrr', entities=["PERSON"]) + registry.get_recognizers( + language='brrrr', entities=["PERSON"]) def test_get_recognizers_specific_language_and_entity(self): registry = self.get_mock_recognizer_registry() diff --git a/presidio-analyzer/tests/test_us_driver_license_recognizer.py b/presidio-analyzer/tests/test_us_driver_license_recognizer.py index 5289c4d9e..4f49c0b59 100644 --- a/presidio-analyzer/tests/test_us_driver_license_recognizer.py +++ b/presidio-analyzer/tests/test_us_driver_license_recognizer.py @@ -58,26 +58,15 @@ def test_invalid_us_driver_license(self): # Driver License - Digits (very weak) - 0.05 # Regex: r'\b([0-9]{1,9}|[0-9]{4,10}|[0-9]{6,10}|[0-9]{1,12}|[0-9]{12,14}|[0-9]{16})\b' - + # Regex: r'\b([0-9]{6,14}|[0-9]{16})\b' def test_valid_us_driver_license_very_weak_digits(self): - num = '123456789 1234567890 12345679012 123456790123 1234567901234' + num = '123456789 1234567890 12345679012 123456790123 1234567901234 1234' results = us_license_recognizer.analyze(num, entities) assert len(results) == 5 for result in results: assert 0 < result.score < 0.02 - def test_load_from_file(self): - path = os.path.dirname(__file__) + '/data/demo.txt' - text_file = open(path, 'r') - text = text_file.read() - results = us_license_recognizer.analyze(text, entities) - - assert len(results) == 23 - - 
# Driver License - Letters (very weak) - 0.00 - # Regex: r'\b([A-Z]{7,9}\b' - def test_valid_us_driver_license_very_weak_letters(self): num = 'ABCDEFG ABCDEFGH ABCDEFGHI' results = us_license_recognizer.analyze(num, entities) diff --git a/presidio-api/cmd/presidio-api/api/analyze/analyze.go b/presidio-api/cmd/presidio-api/api/analyze/analyze.go index ed69a6e8b..faf6edc93 100644 --- a/presidio-api/cmd/presidio-api/api/analyze/analyze.go +++ b/presidio-api/cmd/presidio-api/api/analyze/analyze.go @@ -10,11 +10,13 @@ import ( ) //Analyze text -func Analyze(ctx context.Context, api *store.API, analyzeAPIRequest *types.AnalyzeApiRequest, project string) ([]*types.AnalyzeResult, error) { - - if analyzeAPIRequest.AnalyzeTemplateId == "" && analyzeAPIRequest.AnalyzeTemplate == nil { +func Analyze(ctx context.Context, api *store.API, analyzeAPIRequest *types.AnalyzeApiRequest, project string) (*types.AnalyzeResponse, error) { + switch { + case analyzeAPIRequest.AnalyzeTemplateId == "" && analyzeAPIRequest.AnalyzeTemplate == nil: return nil, fmt.Errorf("Analyze template is missing or empty") - } else if analyzeAPIRequest.AnalyzeTemplate == nil { + case analyzeAPIRequest.AnalyzeTemplateId != "" && analyzeAPIRequest.AnalyzeTemplate != nil: + return nil, fmt.Errorf("Analyze template and Analyze template ID are mutually exclusive") + case analyzeAPIRequest.AnalyzeTemplate == nil: analyzeAPIRequest.AnalyzeTemplate = &types.AnalyzeTemplate{} } @@ -28,7 +30,7 @@ func Analyze(ctx context.Context, api *store.API, analyzeAPIRequest *types.Analy return nil, err } if res == nil { - return nil, fmt.Errorf("No results") + return &types.AnalyzeResponse{}, err } return res, err diff --git a/presidio-api/cmd/presidio-api/api/analyze/analyze_test.go b/presidio-api/cmd/presidio-api/api/analyze/analyze_test.go index cb187eba8..d6fb2d940 100644 --- a/presidio-api/cmd/presidio-api/api/analyze/analyze_test.go +++ b/presidio-api/cmd/presidio-api/api/analyze/analyze_test.go @@ -4,6 +4,7 @@ import ( 
"context" "testing" + uuid "github.com/satori/go.uuid" "github.com/stretchr/testify/assert" types "github.com/Microsoft/presidio-genproto/golang" @@ -24,6 +25,18 @@ func setupMockServices() *store.API { return api } +func setupEmptyResponseMockServices() *store.API { + srv := &services.Services{ + AnalyzerService: mocks.GetAnalyzeServiceMock(mocks.GetAnalyzerMockEmptyResult()), + } + + api := &store.API{ + Services: srv, + Templates: mocks.GetTemplateMock(), + } + return api +} + func TestAnalyzeWithTemplateId(t *testing.T) { api := setupMockServices() @@ -32,11 +45,10 @@ func TestAnalyzeWithTemplateId(t *testing.T) { analyzeAPIRequest := &types.AnalyzeApiRequest{ Text: "My number is (555) 253-0000 and email johnsnow@foo.com", AnalyzeTemplateId: "test", - AnalyzeTemplate: &types.AnalyzeTemplate{}, } - results, err := Analyze(context.Background(), api, analyzeAPIRequest, project) + response, err := Analyze(context.Background(), api, analyzeAPIRequest, project) assert.NoError(t, err) - assert.Equal(t, 2, len(results)) + assert.Equal(t, 2, len(response.AnalyzeResults)) } func TestAnalyzeWithTemplateStruct(t *testing.T) { @@ -57,9 +69,9 @@ func TestAnalyzeWithTemplateStruct(t *testing.T) { }, }, } - results, err := Analyze(context.Background(), api, analyzeAPIRequest, project) + response, err := Analyze(context.Background(), api, analyzeAPIRequest, project) assert.NoError(t, err) - assert.Equal(t, 2, len(results)) + assert.Equal(t, 2, len(response.AnalyzeResults)) } func TestAnalyzeWithNoTemplate(t *testing.T) { @@ -83,7 +95,6 @@ func TestLanguageCode(t *testing.T) { analyzeAPIRequest := &types.AnalyzeApiRequest{ Text: "My number is (555) 253-0000 and email johnsnow@foo.com", AnalyzeTemplateId: "test", - AnalyzeTemplate: &types.AnalyzeTemplate{}, } Analyze(context.Background(), api, analyzeAPIRequest, project) assert.Equal(t, "langtest", analyzeAPIRequest.AnalyzeTemplate.Language) @@ -100,7 +111,41 @@ func TestAllFields(t *testing.T) { Language: "en", AllFields: true}, 
} - results, err := Analyze(context.Background(), api, analyzeAPIRequest, project) + response, err := Analyze(context.Background(), api, analyzeAPIRequest, project) + assert.NoError(t, err) + assert.Equal(t, 2, len(response.AnalyzeResults)) + assert.NotEqual(t, "", response.RequestId) + _, err = uuid.FromString(response.RequestId) + assert.NoError(t, err) +} + +func TestAnalyzeWhenNoEntitiesFoundThenExpectEmptyResponse(t *testing.T) { + + api := setupEmptyResponseMockServices() + + project := "tests" + noResultsanalyzeAPIRequest := &types.AnalyzeApiRequest{ + Text: "hello world", + AnalyzeTemplate: &types.AnalyzeTemplate{ + Language: "en", + AllFields: true}, + } + response, err := Analyze(context.Background(), api, noResultsanalyzeAPIRequest, project) assert.NoError(t, err) - assert.Equal(t, 2, len(results)) + assert.Equal(t, 0, len(response.AnalyzeResults)) +} + +func TestSettingTemplateAndTemplateIdReturnsError(t *testing.T) { + api := setupMockServices() + + project := "tests" + analyzeAPIRequest := &types.AnalyzeApiRequest{ + Text: "My number is (555) 253-0000 and email johnsnow@foo.com", + AnalyzeTemplate: &types.AnalyzeTemplate{ + Language: "en", + AllFields: true}, + AnalyzeTemplateId: "123", + } + _, err := Analyze(context.Background(), api, analyzeAPIRequest, project) + assert.Error(t, err) } diff --git a/presidio-api/cmd/presidio-api/api/anonymize-image/anonymize-image.go b/presidio-api/cmd/presidio-api/api/anonymize-image/anonymize-image.go index 5227f8ae9..76e9259c1 100644 --- a/presidio-api/cmd/presidio-api/api/anonymize-image/anonymize-image.go +++ b/presidio-api/cmd/presidio-api/api/anonymize-image/anonymize-image.go @@ -118,14 +118,14 @@ func applyPresidioOCR(ctx context.Context, services presidio.ServicesAPI, image image.Text = ocrRes.Image.Text image.Boundingboxes = ocrRes.Image.Boundingboxes - analyzeResults, err := services.AnalyzeItem(ctx, ocrRes.Image.Text, analyzeTemplate) + analyzeResponse, err := services.AnalyzeItem(ctx, 
ocrRes.Image.Text, analyzeTemplate) if err != nil { return nil, err } - if analyzeResults == nil { + if analyzeResponse == nil || analyzeResponse.AnalyzeResults == nil { return nil, fmt.Errorf("No PII content found in image") } - return analyzeResults, nil + return analyzeResponse.AnalyzeResults, nil } diff --git a/presidio-api/cmd/presidio-api/api/anonymize/anonymize.go b/presidio-api/cmd/presidio-api/api/anonymize/anonymize.go index 49af70d46..35b2c3f49 100644 --- a/presidio-api/cmd/presidio-api/api/anonymize/anonymize.go +++ b/presidio-api/cmd/presidio-api/api/anonymize/anonymize.go @@ -35,7 +35,7 @@ func Anonymize(ctx context.Context, api *store.API, anonymizeAPIRequest *types.A return nil, fmt.Errorf("No analyze results") } - anonymizeRes, err := api.Services.AnonymizeItem(ctx, analyzeRes, anonymizeAPIRequest.Text, anonymizeAPIRequest.AnonymizeTemplate) + anonymizeRes, err := api.Services.AnonymizeItem(ctx, analyzeRes.AnalyzeResults, anonymizeAPIRequest.Text, anonymizeAPIRequest.AnonymizeTemplate) if err != nil { return nil, err } else if anonymizeRes == nil { diff --git a/presidio-api/cmd/presidio-api/api/mocks/mocks.go b/presidio-api/cmd/presidio-api/api/mocks/mocks.go index 5e0db14e5..4619870f8 100644 --- a/presidio-api/cmd/presidio-api/api/mocks/mocks.go +++ b/presidio-api/cmd/presidio-api/api/mocks/mocks.go @@ -42,6 +42,13 @@ type TemplateMockedObject struct { mock.Mock } +//GetAnalyzerMockEmptyResult get analyzer mock empty response +func GetAnalyzerMockEmptyResult() *types.AnalyzeResponse { + return &types.AnalyzeResponse{ + AnalyzeResults: nil, + } +} + //GetAnalyzerMockResult get analyzer mock response func GetAnalyzerMockResult() *types.AnalyzeResponse { location := &types.Location{ @@ -62,6 +69,7 @@ func GetAnalyzerMockResult() *types.AnalyzeResponse { }, } return &types.AnalyzeResponse{ + RequestId: "21020352-c0bd-4af6-81e0-f1d53f34f2cb", AnalyzeResults: results, } } diff --git a/presidio-api/cmd/presidio-api/methods.go 
b/presidio-api/cmd/presidio-api/methods.go index 92cead4ce..23bbb67cf 100644 --- a/presidio-api/cmd/presidio-api/methods.go +++ b/presidio-api/cmd/presidio-api/methods.go @@ -95,7 +95,11 @@ func analyzeText(c *gin.Context) { server.AbortWithError(c, http.StatusBadRequest, err) return } - server.WriteResponse(c, http.StatusOK, result) + server.WriteResponseWithRequestID( + c, + http.StatusOK, + result.RequestId, + result.AnalyzeResults) } } diff --git a/presidio-collector/cmd/presidio-collector/processor/processor.go b/presidio-collector/cmd/presidio-collector/processor/processor.go index 4da71c128..ab6fa9636 100644 --- a/presidio-collector/cmd/presidio-collector/processor/processor.go +++ b/presidio-collector/cmd/presidio-collector/processor/processor.go @@ -18,26 +18,26 @@ import ( func ReceiveEventsFromStream(st stream.Stream, services presidio.ServicesAPI, streamRequest *types.StreamRequest) error { return st.Receive(func(ctx context.Context, partition string, sequence string, text string) error { - analyzerResult, err := services.AnalyzeItem(ctx, text, streamRequest.AnalyzeTemplate) + analyzerResponse, err := services.AnalyzeItem(ctx, text, streamRequest.AnalyzeTemplate) if err != nil { err = fmt.Errorf("error analyzing message: %s, error: %q", text, err.Error()) return err } - if len(analyzerResult) > 0 { - anonymizerResult, err := services.AnonymizeItem(ctx, analyzerResult, text, streamRequest.AnonymizeTemplate) + if len(analyzerResponse.AnalyzeResults) > 0 { + anonymizerResult, err := services.AnonymizeItem(ctx, analyzerResponse.AnalyzeResults, text, streamRequest.AnonymizeTemplate) if err != nil { err = fmt.Errorf("error anonymizing item: %s/%s, error: %q", partition, sequence, err.Error()) return err } - err = services.SendResultToDatasink(ctx, analyzerResult, anonymizerResult, fmt.Sprintf("%s/%s", partition, sequence)) + err = services.SendResultToDatasink(ctx, analyzerResponse.AnalyzeResults, anonymizerResult, fmt.Sprintf("%s/%s", partition, sequence)) 
if err != nil { err = fmt.Errorf("error sending message to datasink: %s/%s, error: %q", partition, sequence, err.Error()) return err } - log.Debug("%d results were sent to the datasink successfully", len(analyzerResult)) + log.Debug("%d results were sent to the datasink successfully", len(analyzerResponse.AnalyzeResults)) } return nil @@ -47,7 +47,7 @@ func ReceiveEventsFromStream(st stream.Stream, services presidio.ServicesAPI, st //ScanStorage .. func ScanStorage(ctx context.Context, scan scanner.Scanner, cache cache.Cache, services presidio.ServicesAPI, scanRequest *types.ScanRequest) error { return scan.Scan(func(item interface{}) error { - var analyzerResult []*types.AnalyzeResult + var analyzerResult *types.AnalyzeResponse //[]*types.AnalyzeResult scanItem := scanner.CreateItem(scanRequest, item) itemPath := scanItem.GetPath() @@ -78,19 +78,19 @@ func ScanStorage(ctx context.Context, scan scanner.Scanner, cache cache.Cache, s if err != nil { return err } - log.Debug("analyzed %d results", len(analyzerResult)) + log.Debug("analyzed %d results", len(analyzerResult.AnalyzeResults)) - if len(analyzerResult) > 0 { - anonymizerResult, err := services.AnonymizeItem(ctx, analyzerResult, content, scanRequest.AnonymizeTemplate) + if len(analyzerResult.AnalyzeResults) > 0 { + anonymizerResult, err := services.AnonymizeItem(ctx, analyzerResult.AnalyzeResults, content, scanRequest.AnonymizeTemplate) if err != nil { return err } - err = services.SendResultToDatasink(ctx, analyzerResult, anonymizerResult, itemPath) + err = services.SendResultToDatasink(ctx, analyzerResult.AnalyzeResults, anonymizerResult, itemPath) if err != nil { return err } - log.Info("%d results were sent to the datasink successfully", len(analyzerResult)) + log.Info("%d results were sent to the datasink successfully", len(analyzerResult.AnalyzeResults)) } writeItemToCache(uniqueID, itemPath, cache)