Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[importer] Avoid creating scratchdir outside of encryption zone #3447

Merged
merged 1 commit into from
Aug 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions desktop/libs/hadoop/src/hadoop/fs/webhdfs_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(self, file_status, parent_path):
self.blockSize = file_status['blockSize']
self.replication = file_status['replication']
self.aclBit = file_status.get('aclBit')
self.encBit = file_status.get('encBit')
self.fileId = file_status.get('fileId')

self.mode = int(file_status['permission'], 8)
Expand Down
5 changes: 4 additions & 1 deletion desktop/libs/indexer/src/indexer/indexers/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,10 @@ def create_table_from_a_file(self, source, destination, start_time=-1, file_enco
split = urlparse(source_path)
# Only for HDFS, import data and non-external table
if split.scheme in ('', 'hdfs') and oct(stats["mode"])[-1] != '7':
user_scratch_dir = self.fs.get_home_dir() + '/.scratchdir/%s' % str(uuid.uuid4()) # Make sure it's unique.
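# If the source file is inside an HDFS encryption zone (its stats have encBit set), create the
# scratch dir next to the file instead of in the user's home dir, so that the rename below
# keeps the data inside the encryption zone.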
base_dir = parent_path if stats.encBit else self.fs.get_home_dir()
user_scratch_dir = base_dir + '/.scratchdir/%s' % str(uuid.uuid4()) # Make sure it's unique.
self.fs.do_as_user(self.user, self.fs.mkdir, user_scratch_dir, 0o0777)
self.fs.do_as_user(self.user, self.fs.rename, source['path'], user_scratch_dir)
if editor_type == 'impala' and impala_conf and impala_conf.USER_SCRATCH_DIR_PERMISSION.get():
Expand Down
120 changes: 120 additions & 0 deletions desktop/libs/indexer/src/indexer/indexers/sql_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
else:
from mock import patch, Mock, MagicMock

def mock_uuid():
    """Return a constant UUID string; used to patch ``uuid.uuid4`` in tests.

    A fixed value makes the generated scratch-directory path predictable, so
    the tests can assert on the exact SQL statements that embed it.
    """
    fixed_uuid = '52f840a8-3dde-434d-934a-2d6e06f3687e'
    return fixed_uuid

class TestSQLIndexer(object):

Expand Down Expand Up @@ -105,6 +107,124 @@ def destination_dict(key):
[statement.strip() for statement in notebook.get_data()['snippets'][0]['statement_raw'].split(';')]
)

@patch('uuid.uuid4', mock_uuid)
def test_create_table_from_a_file_to_csv_for_kms_encryption(self):
    """Check where the scratch dir is created depending on the source's encBit.

    Two scenarios are run against a mocked filesystem:

    * the source file's stats have ``encBit`` set: the generated
      ``LOAD DATA INPATH`` must point to a ``.scratchdir`` located in the
      parent directory of the source file;
    * ``encBit`` is not set: the ``.scratchdir`` must be located in the home
      directory returned by ``fs.get_home_dir()``.

    ``uuid.uuid4`` is patched with ``mock_uuid`` so the scratch dir name is
    deterministic.
    """
    def mock_parent_path(path):
        # Drop the last path component: '/a/b/c.csv' -> '/a/b'.
        return '/'.join(path.split('/')[:-1])

    class MockStat:
        """Minimal stand-in for file stats: attribute access plus ``stats['mode']``."""

        def __init__(self, encBit=True, mode=16877):
            self.encBit = encBit
            # 16877 == 0o40755: the last octal digit is not '7'.
            self.mode = mode

        def __getitem__(self, key):
            # Honour the mode given to the constructor rather than a hard-coded value.
            if key == 'mode':
                return self.mode
            return None

    def make_source(path):
        # Build a dict-like source whose unknown keys resolve to a fresh Mock.
        def lookup(key):
            return {
                'path': path,
                'format': {'quoteChar': '"', 'fieldSeparator': ','},
                'sampleCols': [{u'operations': [], u'comment': u'', u'name': u'customers.id'}],
                'sourceType': 'hive'
            }.get(key, Mock())
        source = MagicMock()
        source.__getitem__.side_effect = lookup
        return source

    def make_fs(enc_bit):
        # Mocked filesystem whose stats() reports the requested encBit.
        return Mock(
            stats=Mock(
                return_value=MockStat(encBit=enc_bit)
            ),
            parent_path=mock_parent_path,
            get_home_dir=Mock(return_value='/user/test'),
        )

    def destination_dict(key):
        return {
            'name': 'default.export_table',
            'tableFormat': 'csv',
            'importData': True,
            'isIceberg': False,
            'nonDefaultLocation': '/warehouse/tablespace/managed/hive/customer_stats.csv',
            'columns': [{'name': 'id', 'type': 'int'}],
            'partitionColumns': [{'name': 'day', 'type': 'date', 'partitionValue': '20200101'}],
            'description': 'No comment!',
            'sourceType': 'hive-1'
        }.get(key, Mock())
    destination = MagicMock()
    destination.__getitem__.side_effect = destination_dict

    def expected_statements(data_path):
        # Both scenarios expect the same SQL; only the LOAD DATA INPATH location differs.
        return [statement.strip() for statement in (u'''DROP TABLE IF EXISTS `default`.`hue__tmp_export_table`;
CREATE TABLE IF NOT EXISTS `default`.`hue__tmp_export_table`
(
`id` int ) COMMENT "No comment!"
PARTITIONED BY (
`day` date )
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar" = ",",
"quoteChar" = """,
"escapeChar" = "\\\\"
)
STORED AS TextFile TBLPROPERTIES('skip.header.line.count'='1', 'transactional'='false')
;
LOAD DATA INPATH '%s' INTO TABLE `default`.`hue__tmp_export_table` PARTITION (day='20200101');
CREATE TABLE `default`.`export_table` COMMENT "No comment!"
STORED AS csv
TBLPROPERTIES('transactional'='true', 'transactional_properties'='insert_only')
AS SELECT *
FROM `default`.`hue__tmp_export_table`;
DROP TABLE IF EXISTS `default`.`hue__tmp_export_table`;''' % data_path).split(';')]

    def generated_statements(source_path, enc_bit):
        # Run the import against a fresh mocked fs/source and return the SQL statements.
        notebook = SQLIndexer(user=self.user, fs=make_fs(enc_bit)).create_table_from_a_file(
            make_source(source_path), destination
        )
        return [
            statement.strip()
            for statement in notebook.get_data()['snippets'][0]['statement_raw'].split(';')
        ]

    # Source dir is in an encryption zone, so the scratch dir is in the same dir.
    assert_equal(
        expected_statements('/enc_zn/upload_dir/.scratchdir/%s/data.csv' % mock_uuid()),
        generated_statements('/enc_zn/upload_dir/data.csv', enc_bit=True)
    )

    # Source dir is not in an encryption zone, so the scratch dir is in the user's home dir.
    assert_equal(
        expected_statements('/user/test/.scratchdir/%s/data.csv' % mock_uuid()),
        generated_statements('/user/test/data.csv', enc_bit=False)
    )

class MockRequest(object):
def __init__(self, fs=None, user=None):
Expand Down
Loading