Skip to content

Commit

Permalink
feat: Add option to enable/disable regex matching for extraction rules.
Browse files Browse the repository at this point in the history
  • Loading branch information
Nico-AP committed Dec 3, 2023
1 parent f858a38 commit 76ddd81
Show file tree
Hide file tree
Showing 12 changed files with 116 additions and 76 deletions.
3 changes: 2 additions & 1 deletion ddm/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ class ProcessingRuleForm(forms.ModelForm):

class Meta:
model = ProcessingRule
fields = ['execution_order', 'name', 'field', 'comparison_operator', 'comparison_value', 'replacement_value']
fields = ['execution_order', 'name', 'field', 'regex_field',
'comparison_operator', 'comparison_value', 'replacement_value']
widgets = {
'field': TextInput(),
'comparison_value': Textarea(attrs={'cols': 60, 'rows': 1}),
Expand Down
23 changes: 23 additions & 0 deletions ddm/migrations/0044_auto_20231203_1248.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by Django 3.2.13 on 2023-12-03 11:48

from django.db import migrations, models


class Migration(migrations.Migration):
    """Schema migration 0044 for the ``ddm`` app.

    Adds the boolean ``regex_field`` column to ``processingrule`` and
    re-declares the ``display`` field of ``openquestion``.
    """

    dependencies = [('ddm', '0043_auto_20230525_1621')]

    operations = [
        # New flag on processing rules; defaults to False for existing rows.
        migrations.AddField(
            model_name='processingrule',
            name='regex_field',
            field=models.BooleanField(
                default=False,
                help_text='Select if you use a regex expression in the "Field" setting to match a variable.',
            ),
        ),
        # Re-declaration of the open question display field.
        migrations.AlterField(
            model_name='openquestion',
            name='display',
            field=models.CharField(
                choices=[('small', 'Small'), ('large', 'Large')],
                default='large',
                help_text='"Small" displays a one-line textfield, "Large" a multiline textfield as input.',
                max_length=20,
            ),
        ),
    ]
18 changes: 18 additions & 0 deletions ddm/migrations/0045_alter_processingrule_comparison_operator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 3.2.13 on 2023-12-03 13:08

from django.db import migrations, models


class Migration(migrations.Migration):
    """Schema migration 0045 for the ``ddm`` app.

    Re-declares ``processingrule.comparison_operator`` with the choice list
    below, which includes the empty-string 'Keep Field' option.
    """

    dependencies = [('ddm', '0044_auto_20231203_1248')]

    operations = [
        migrations.AlterField(
            model_name='processingrule',
            name='comparison_operator',
            field=models.CharField(
                blank=True,
                # One (stored value, label) pair per line for readability.
                choices=[
                    ('', 'Keep Field'),
                    ('==', 'Equal (==)'),
                    ('!=', 'Not Equal (!=)'),
                    ('>', 'Greater than (>)'),
                    ('<', 'Smaller than (<)'),
                    ('>=', 'Greater than or equal (>=)'),
                    ('<=', 'Smaller than or equal (<=)'),
                    ('regex-delete-match', 'Delete match (regex)'),
                    ('regex-replace-match', 'Replace match (regex)'),
                    ('regex-delete-row', 'Delete row when match (regex)'),
                ],
                default=None,
                max_length=24,
                null=True,
                verbose_name='Extraction Operator',
            ),
        ),
    ]
7 changes: 7 additions & 0 deletions ddm/models/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,11 +599,17 @@ class ProcessingRule(models.Model):
'If a field is mentioned in a rule, it will be kept in the data that are sent to the server.'
)
)
regex_field = models.BooleanField(
default=False,
null=False,
help_text='Select if you use a regex expression in the "Field" setting to match a variable.'
)
execution_order = models.IntegerField(
help_text='The order in which the extraction steps are executed.'
)

class ComparisonOperators(models.TextChoices):
EMPTY = '', 'Keep Field'
EQUAL = '==', 'Equal (==)'
NOT_EQUAL = '!=', 'Not Equal (!=)'
GREATER = '>', 'Greater than (>)'
Expand Down Expand Up @@ -646,6 +652,7 @@ def get_rule_config(self):
"""
return {
'field': self.field,
'regex_field': self.regex_field,
'comparison_operator': self.comparison_operator,
'comparison_value': self.comparison_value,
'replacement_value': self.replacement_value
Expand Down
10 changes: 5 additions & 5 deletions ddm/static/ddm/vue/js/chunk-vendors.js

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions ddm/static/ddm/vue/js/vue_questionnaire.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions ddm/static/ddm/vue/js/vue_uploader.js

Large diffs are not rendered by default.

4 changes: 1 addition & 3 deletions ddm/static/ddm/vue/webpack-stats.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,5 @@
"js/vue_questionnaire.js"
]
},
"publicPath": "/static/ddm/vue/",
"error": "ModuleBuildError",
"message": "Module build failed (from ./node_modules/babel-loader/lib/index.js):\nSyntaxError: C:\\Files\\Arbeit\\Projekte\\Data Donation Lab\\Code\\DDM\\ddm\\frontend\\src\\components\\FileUploader.vue: Unexpected token, expected \"]\" (274:52)\n\n 272 | case '==':\n 273 | if (entry[key] !== rule.comparison_value) {\n> 274 | result[rule.field = entry[key];\n | ^\n 275 | } else {\n 276 | throw `Field \"${key}\" matches filter value \"${rule.comparison_value}\" for entry.`\n 277 | }\n at instantiate (C:\\Files\\Arbeit\\Projekte\\Data Donation Lab\\Code\\DDM\\ddm\\frontend\\node_modules\\@babel\\parser\\lib\\index.js:72:32)\n at constructor (C:\\Files\\Arbeit\\Projekte\\Data Donation Lab\\Code\\DDM\\ddm\\frontend\\node_modules\\@babel\\parser\\lib\\index.js:358:12)\n at Object.raise (C:\\Files\\Arbeit\\Projekte\\Data Donation Lab\\Code\\DDM\\ddm\\frontend\\node_modules\\@babel\\parser\\lib\\index.js:3341:19)\n at Object.unexpected (C:\\Files\\Arbeit\\Projekte\\Data Donation Lab\\Code\\DDM\\ddm\\frontend\\node_modules\\@babel\\parser\\lib\\index.js:3379:16)\n at Object.expect (C:\\Files\\Arbeit\\Projekte\\Data Donation Lab\\Code\\DDM\\ddm\\frontend\\node_modules\\@babel\\parser\\lib\\index.js:4008:28)\n at Object.parseMember (C:\\Files\\Arbeit\\Projekte\\Data Donation Lab\\Code\\DDM\\ddm\\frontend\\node_modules\\@babel\\parser\\lib\\index.js:12711:12)\n at Object.parseSubscript (C:\\Files\\Arbeit\\Projekte\\Data Donation Lab\\Code\\DDM\\ddm\\frontend\\node_modules\\@babel\\parser\\lib\\index.js:12696:21)\n at Object.parseSubscripts (C:\\Files\\Arbeit\\Projekte\\Data Donation Lab\\Code\\DDM\\ddm\\frontend\\node_modules\\@babel\\parser\\lib\\index.js:12660:19)\n at Object.parseExprSubscripts (C:\\Files\\Arbeit\\Projekte\\Data Donation Lab\\Code\\DDM\\ddm\\frontend\\node_modules\\@babel\\parser\\lib\\index.js:12649:17)\n at Object.parseUpdate (C:\\Files\\Arbeit\\Projekte\\Data Donation Lab\\Code\\DDM\\ddm\\frontend\\node_modules\\@babel\\parser\\lib\\index.js:12622:21)"
"publicPath": "/static/ddm/vue/"
}
93 changes: 38 additions & 55 deletions ddm/templates/ddm/admin/data_donation/donation_blueprint/edit.html
Original file line number Diff line number Diff line change
Expand Up @@ -25,29 +25,24 @@
{% endfor %}
</div>

<div class="ddm-admin-form">
<h5>Data Extraction Settings</h5>
<div class="ddm-admin-form pt-5">
<h4>Data Extraction Settings</h4>
</div>

<div>
<p>Data Extraction is a two-step process:</p>

<h6><b>1. File Validation</b></h6>
<p>
First, it is checked whether the file that you expect is included in the download.
This means that if the associated File Uploader expects a ZIP Upload, it tries to find the correct
file according to the <i>file path</i> you defined (this is skipped for single file uploads).
</p>
<p>
Next, it is checked whether the uploaded file has the expected format defined in the <i>Expected File Format</i>
setting (and other settings, depending on the file format).
</p>
<p>
Lastly, it is checked whether the identified file contains the expected fields
defined in the <i>Expected Fields</i> setting.<br>
If any of these validation steps fail, the participant will be shown an
exception message explaining what went wrong and the file upload and extraction is aborted.
</p>
<p>Data Extraction is a two-step process: first the <b>file validation</b>, then the <b>data extraction</b>.</p>

<h5 class="pt-3"><b>File Validation</b></h5>
<ul>
<li>First, it is checked whether the expected file is included in the uploaded data.
If the associated File Uploader expects a ZIP upload, the correct file is identified
using the provided <code>file path</code> (this step is skipped for single file uploads).</li>
<li>Second, it is checked whether the uploaded file is in the <code>expected file format</code>.</li>
<li>Third, it is checked whether the identified file contains <b>all</b> <code>expected fields</code>.</li>
<li>
If any of these validation steps fail, the participant will be shown an
exception message explaining what went wrong and no data is extracted.</li>
</ul>
</div>

<div class="ddm-admin-form">
Expand All @@ -63,43 +58,31 @@ <h6><b>1. File Validation</b></h6>
{% endfor %}
</div>

<div>
<h6><b>2. Data Extraction</b></h6>
</div>
<div>
<p>
For the data extraction, the Data Donation Module follows the data sparsity paradigm.
This means that the base assumption is, that you do not want any data from your participants,
and you have to explicitly indicate which data fields you want to have included.
</p>
<p>
To keep data in the data donation, you must define <i>Extraction Rules</i>.<br>
An Extraction Rule is always related to one field/column in the uploaded data file
and a data field will only be kept in a participant's donation if it is explicitly
mentioned in at least one of the extraction rules.
</p>
<p>
An extraction rule can either indicate to just keep a field in the donation
(by mentioning the field/column in an extraction rule without defining any concrete comparison operator),
use data contained in a field to delete data entries (i.e., rows) from the donation
(e.g., to delete all entries where the date is < 01.01.2020) or
alter the data contained in a field (e.g., anonymize an e-mail address by replacing "name@mail.com" with "EMAIL").<br>
For this, there are several comparison and regex operations available. For the comparison operations, a match
means that a data entry will be deleted. The rules are applied to the uploaded file in the indicated order.
</p>
</div>

{{ formset.management_form }}
<div class="ddm-admin-form">
<h5 class="pt-4"><b>Data Extraction</b></h5>
<ul>
<li>The data extraction follows <i>extraction rules</i> which can be configured below. These rules are applied
consecutively in the defined order.
</li>
<li><b>Keep data:</b> For every field/column/variable that you want to
keep in the donated data, you first have to define an extraction rule with the "Keep Field" operator.
</li>
<li><b>Filter and alter data:</b> Next, you can add rules to filter (i.e., delete) or alter entries in the
uploaded data
(e.g., to delete all entries where the date is &lt; 01.01.2020, or to replace
e-mail-addresses with "ANONYMIZED EMAIL"). For this, there are several comparison and regex operations
available.
</li>
</ul>

{{ formset.management_form }}
<div class="ddm-admin-form">
<h6>Extraction Rules</h6>
<table id="inlineform-table" class="table table-borderless">
<tr class="border-bottom">
<th>Order</th>
{% for field in form.visible_fields %}
{% if field.name in 'name,field' %}
<th>{{ field.label }}</th>
{% endif %}
{% endfor %}
<th>Rule name</th>
<th>Field</th>
<th>Description</th>
<th></th>
<th>Delete</th>
Expand All @@ -121,7 +104,7 @@ <h6>Extraction Rules</h6>

<td><span id="step-description-{{ forloop.counter0 }}"></span></td>

<td><a href="" type="button" data-bs-toggle="modal" data-bs-target="#configuration-{{ forloop.counter0 }}">Configure Step</a></td>
<td><a href="" type="button" data-bs-toggle="modal" data-bs-target="#configuration-{{ forloop.counter0 }}">configure rule</a></td>

{% for field in form.visible_fields %}
{% if field.name in 'DELETE' %}
Expand Down Expand Up @@ -166,7 +149,7 @@ <h6>Extraction Rules</h6>

<td><span id="step-description-__prefix__"></span></td>

<td><a href="" type="button" data-bs-toggle="modal" data-bs-target="#configuration-__prefix__">Configure Step</a></td>
<td><a href="" type="button" data-bs-toggle="modal" data-bs-target="#configuration-__prefix__">configure rule</a></td>

{% for field in formset.empty_form.visible_fields %}
{% if field.name in 'DELETE' %}
Expand All @@ -186,7 +169,7 @@ <h6>Extraction Rules</h6>
<div id="empty-form" class="ddm-admin-form" style="display:none">
<div class="ddm-admin-form">
{% for field in formset.empty_form %}
{% if field.name in 'name,field,execution_order,input_type,comparison_operator,comparison_value,replacement_value' %}
{% if field.name in 'name,field,regex_field,execution_order,input_type,comparison_operator,comparison_value,replacement_value' %}
<p {% if field.name in 'replacement_value,comparison_value' %}style="display: none"{% endif %}>
{{ field.label }}
<span class="helptext"><br>{{ field.help_text }}</span>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<h4>Rule Configuration</h4>
<div class="ddm-admin-form">
{% for field in form %}
{% if field.name in 'name,field,execution_order,input_type,comparison_operator,comparison_value,replacement_value' %}
{% if field.name in 'name,field,regex_field,execution_order,input_type,comparison_operator,comparison_value,replacement_value' %}
<p>
{{ field.label_tag }}
<span class="helptext">{{ field.help_text }}</span>
Expand Down
7 changes: 5 additions & 2 deletions docs/modules/ROOT/pages/for_researchers.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ Expected File Format:: The file format of the expected data donation. Currently,
====== JSON specific settings

Extraction Root:: Indicates on which level of the files' data structure information
should be extractet. If you want to extract information contained on the first
should be extracted. If you want to extract information contained on the first
level (e.g., `{'field to be extracted': value}`), you can leave this field empty.
If you want to extract data located on a higher level, then you would provide
the path to the parent field of the data you want to extract (e.g., if your json
Expand All @@ -305,7 +305,10 @@ Execution Order:: The order in which the extraction rules are applied to a file.

Name:: The name of an extraction rule. For internal organisation only.

Field:: The field to which the rule will be applied.
Field:: The field to which the rule will be applied. This can be either a plain string or a
regular expression (regex). If it is a regular expression, you must also select `Regex field` (see below).

Regex field:: Select this if the `Field` setting of this rule contains a regular expression.

Extraction Operator:: Defines the main logic of the extraction step. If empty, this indicates
that you want to keep the field in the donated data. For all non-regex operations,
Expand Down
13 changes: 10 additions & 3 deletions frontend/src/components/FileUploader.vue
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ export default {
let nEntriesWithMissingFields = 0;
let nEntriesFilteredOut = 0;
// Limit the number of messages posted to the project logs.
let nMsgsPosted = 0;
let maxMsgs = 10;
Expand All @@ -513,12 +514,18 @@ export default {
return;
}
// Match (potential) regex variable names to the actual keys contained in an entry.
// Match variable names to the keys contained in an entry.
let rules = blueprint.filter_rules;
let keyMap = new Map();
rules.forEach(rule => {
let keyRegex = new RegExp(rule.field);
let keys = Object.keys(entry).filter(key => keyRegex.test(key));
let keys = Object.keys(entry);
if (rule.regex_field) {
let fieldRegex = new RegExp(rule.field);
keys = keys.filter(key => fieldRegex.test(key));
} else {
let field = rule.field;
keys = keys.filter(key => field === key);
}
if(keys.length > 1) {
if (nMsgsPosted < maxMsgs) {
Expand Down

0 comments on commit 76ddd81

Please sign in to comment.