diff --git a/featuretools/mkfeat/columnspec.py b/featuretools/mkfeat/columnspec.py index 214b52725f..1c75dfcd3e 100644 --- a/featuretools/mkfeat/columnspec.py +++ b/featuretools/mkfeat/columnspec.py @@ -19,6 +19,7 @@ class ColumnSpec: """ def __init__(self, columns): self.columns = columns + self._auto_keyname = None def validate(self): """ @@ -44,8 +45,6 @@ def validate(self): if has_label: return Error.ERR_COLUMN_MULTI_LABEL has_label = True - if not has_key: - return Error.ERR_COLUMN_NO_KEY return Error.OK def get_colnames(self): @@ -60,14 +59,14 @@ def get_colnames(self): colnames.append(colinfo['name']) return colnames - def get_usecols(self, numeric_only: bool = False, label_only: bool = False, exclude_label: bool = False): + def get_usecols(self, numeric_only: bool = False, label_only: bool = False, exclude_skip: bool = False): """ 컬럼명 배열을 반환. pandas의 read_csv()의 usecols 파라미터 전달용 함수 Args: numeric_only (bool): True의 경우, numeric 형식으로 가능한 column명 만을 추출 label_only (bool): True의 경우 label에 대한 column명 만을 추출 - exclude_label (bool): True의 경우 label 컬럼을 제거하여 column명 목록 생성 + exclude_skip (bool): True의 경우 label, train, bypass 컬럼을 제거하여 column명 목록 생성 Returns: 컬럼명으로 구성된 배열 """ @@ -77,7 +76,9 @@ def get_usecols(self, numeric_only: bool = False, label_only: bool = False, excl continue if label_only and ('label' not in colinfo or not colinfo['label']): continue - if exclude_label and 'label' in colinfo and colinfo['label']: + if exclude_skip and (('label' in colinfo and colinfo['label']) or + ('train' in colinfo and colinfo['train']) or + ('bypass' in colinfo and colinfo['bypass'])): continue colnames.append(colinfo['name']) return colnames @@ -100,15 +101,19 @@ def get_converters(self): def get_key_colname(self): """ - 특징 추출시 id로 지정가능한 컬럼명 반환. key로 지정된 column명이 없는 경우 첫번째 컬럼명 반환 + Get key column name. If no key is specified, key is automatically generated. Returns: - id로 지정 가능한 column name. + key column name which can be used as row identifer. """ for colinfo in self.columns: if 'key' in colinfo and colinfo['key']: return colinfo['name'] - return self.columns[0]['name'] + self._setup_auto_keyname() + return self._auto_keyname + + def is_auto_keyname(self): + return True if self._auto_keyname else False def get_label_colname(self): for colinfo in self.columns: @@ -158,3 +163,16 @@ def _get_converter_from_strtype(typestr): @staticmethod def _is_numeric_type(self): return self in ('number', 'bool') + + def _setup_auto_keyname(self): + if self._auto_keyname: + return + colnames = self.get_colnames() + if 'id' not in colnames: + keyname = 'id' + else: + for i in range(1, 100000): + keyname = "id_{}".format(i) + if keyname not in colnames: + break + self._auto_keyname = keyname diff --git a/featuretools/mkfeat/error.py b/featuretools/mkfeat/error.py index fc0a54a5af..d360ae9362 100644 --- a/featuretools/mkfeat/error.py +++ b/featuretools/mkfeat/error.py @@ -15,6 +15,7 @@ class Error(str, Enum): ERR_COLUMN_HAS_NO_NAME_OR_TYPE = "ERR_COLUMN_HAS_NO_NAME_OR_TYPE" ERR_COLUMN_MULTI_KEY = "ERR_COLUMN_MULTI_KEY" ERR_COLUMN_MULTI_LABEL = "ERR_COLUMN_MULTI_LABEL" - ERR_COLUMN_NO_KEY = "ERR_COLUMN_NO_KEY" ERR_COLUMN_KEY_AND_LABEL = "ERR_COLUMN_KEY_AND_LABEL" ERR_COLUMN_TYPE = "ERR_COLUMN_TYPE" + ERR_COLUMN_BAD = "ERR_COLUMN_BAD" + ERR_DATA_LABEL_COUNT_MISMATCH = "ERR_DATA_LABEL_COUNT_MISMATCH" diff --git a/featuretools/mkfeat/feat_extractor.py b/featuretools/mkfeat/feat_extractor.py index bb1dfebf42..bb8e051548 100644 --- a/featuretools/mkfeat/feat_extractor.py +++ b/featuretools/mkfeat/feat_extractor.py @@ -141,7 +141,8 @@ def save(self, path): Returns: """ - self.feature_matrix.to_csv(path) + need_index = False if self.es.is_auto_key() else True + self.feature_matrix.to_csv(path, index=need_index) def get_feature_info(self): """ diff --git a/featuretools/mkfeat/feat_importance.py b/featuretools/mkfeat/feat_importance.py index 31c100b678..9e43e170a4 100644 --- a/featuretools/mkfeat/feat_importance.py +++ b/featuretools/mkfeat/feat_importance.py @@ -45,13 +45,15 @@ def _load(self) -> Error: return Error.ERR_LABEL_NOT_FOUND self._colspec_data = colspec_data = ColumnSpec(self._columns_data) + err = colspec_data.validate() + if err != Error.OK: + return err if self._path_label is None: if colspec_data.get_label_colname() is None: return Error.ERR_LABEL_NOT_FOUND csv_data = QufaCsv(self._path_data, colspec_data) - exclude_label = True if self._path_label is None else False - data = csv_data.load(self._progress_report, exclude_label=exclude_label, numeric_only=True) + data = csv_data.load(self._progress_report, exclude_skip=True, numeric_only=True) if isinstance(data, Error): return data self.data = data @@ -60,10 +62,17 @@ def _load(self) -> Error: label = csv_data.load(None, label_only=True) else: colspec_label = ColumnSpec(self._columns_label) + err = colspec_label.validate() + if err != Error.OK: + return err + if colspec_label.get_label_colname() is None: + return Error.ERR_LABEL_NOT_FOUND csv_label = QufaCsv(self._path_label, colspec_label) - label = csv_label.load(None) + label = csv_label.load(None, label_only=True) if isinstance(label, Error): return label + if len(data) != len(label): + return Error.ERR_DATA_LABEL_COUNT_MISMATCH self.label = label return Error.OK @@ -84,7 +93,7 @@ def analyze(self) -> Error: if err != Error.OK: return err - xtr, xv, ytr, yv = train_test_split(self.data.values, self.label, test_size=0.2, random_state=0) + xtr, xv, ytr, yv = train_test_split(self.data.values, self.label.values, test_size=0.2, random_state=0) dtrain = xgb.DMatrix(xtr, label=ytr) dvalid = xgb.DMatrix(xv, label=yv) diff --git a/featuretools/mkfeat/normalize.py b/featuretools/mkfeat/normalize.py index 67309180bc..6648cdd607 100644 --- a/featuretools/mkfeat/normalize.py +++ b/featuretools/mkfeat/normalize.py @@ -12,10 +12,11 @@ def normalize(df: DataFrame, key_colname): for _ in range(N_ITERS): df_samp = df.sample(n=n_samples) norminfos_new = _get_norminfos(df_samp, key_colname) - if norminfos is None: - norminfos = norminfos_new - else: - _merge_norminfos(norminfos, norminfos_new) + if norminfos_new: + if norminfos is None: + norminfos = norminfos_new + else: + _merge_norminfos(norminfos, norminfos_new) return norminfos else: return _get_norminfos(df, key_colname) @@ -50,29 +51,31 @@ def _clear_norminfos_upto_key(norminfos, key): def _get_norminfos(df: DataFrame, key_colname): - es = an.auto_entityset(df, index=key_colname, accuracy=accuracy) + try: + es = an.auto_entityset(df, index=key_colname, accuracy=accuracy) + except KeyError: + # Maybe autonormalize bug. It seems to have a problem in case of multi key normalization. + return None norminfos = [] # 첫번째 이외의 entity들에 대해서. 첫번째 entity가 main임을 가정 - entities = es.entities[1:] for et in entities: norminfo = [] for var in et.variables: norminfo.append(var.name) norminfos.append(norminfo) for norminfo in norminfos: - parent_ids = _get_parent_entity_ids(es, norminfo[0]) + parent_ids = _get_parent_entity_ids(relationships, norminfo[0]) for parent_id in parent_ids: vars = es[parent_id].variables for var in vars[1:]: norminfo.append(var.name) return norminfos - -def _get_parent_entity_ids(es, child_id): +def _get_parent_entity_ids(rels, child_id): parent_ids = [] - for rel in es.relationships: + for rel in rels: if child_id == rel.child_entity.id: parent_ids.append(rel.parent_entity.id) - parent_ids += _get_parent_entity_ids(es, rel.parent_entity.id) - return parent_ids + parent_ids += _get_parent_entity_ids(rels, rel.parent_entity.id) + return parent_ids \ No newline at end of file diff --git a/featuretools/mkfeat/qufa_ES.py b/featuretools/mkfeat/qufa_ES.py index b398891991..9e940e6c4c 100644 --- a/featuretools/mkfeat/qufa_ES.py +++ b/featuretools/mkfeat/qufa_ES.py @@ -1,4 +1,5 @@ from featuretools.entityset import EntitySet +import numpy as np from .columnspec import ColumnSpec from .error import Error @@ -10,6 +11,7 @@ class QufaES(EntitySet): def __init__(self): super().__init__() self.target_entity_name = None + self._is_auto_key = False self._df_label = None self._df_train = None self._df_bypass = None @@ -21,6 +23,9 @@ def load_from_csv(self, path, callback, colspec: ColumnSpec) -> Error: return data colname_key = colspec.get_key_colname() + if colspec.is_auto_keyname(): + data[colname_key] = np.arange(len(data)) + self._is_auto_key = True colname_label = colspec.get_label_colname() if colname_label: self._df_label = data[[colname_key, colname_label]] @@ -41,19 +46,28 @@ def load_from_csv(self, path, callback, colspec: ColumnSpec) -> Error: colnames_bypass.remove(colname_key) data = data.drop(columns=colnames_bypass) - norminfos = normalize(data, colname_key) + try: + norminfos = normalize(data, colname_key) + except AssertionError: + # There are many cases. One observed case is that key index is not unique. + return Error.ERR_COLUMN_BAD self.entity_from_dataframe("main", data, index=colname_key) - for norminfo in norminfos: - keyname = norminfo[0] - vars = norminfo[1:] - etname = self._search_owner_entity(keyname) - self.normalize_entity(etname, "tbl_{}".format(keyname), norminfo[0], additional_variables=vars) + if norminfos: + for norminfo in norminfos: + keyname = norminfo[0] + vars = norminfo[1:] + etname = self._search_owner_entity(keyname) + if etname: + self.normalize_entity(etname, "tbl_{}".format(keyname), norminfo[0], additional_variables=vars) self.target_entity_name = "main" return Error.OK + def is_auto_key(self): + return self._is_auto_key + def get_df_label(self): return self._df_label diff --git a/featuretools/mkfeat/qufa_csv.py b/featuretools/mkfeat/qufa_csv.py index b3a33d20b6..598bb017d9 100644 --- a/featuretools/mkfeat/qufa_csv.py +++ b/featuretools/mkfeat/qufa_csv.py @@ -22,11 +22,11 @@ def get_n_rows(self) -> int: n_rows += 1 return n_rows - def load(self, callback, label_only: bool = False, exclude_label: bool = False, numeric_only: bool = False): + def load(self, callback, label_only: bool = False, exclude_skip: bool = False, numeric_only: bool = False): colnames = self._colspec.get_colnames() if len(colnames) != self._guess_n_columns(): return Error.ERR_COLUMN_COUNT_MISMATCH - usecols = self._colspec.get_usecols(label_only=label_only, exclude_label=exclude_label, + usecols = self._colspec.get_usecols(label_only=label_only, exclude_skip=exclude_skip, numeric_only=numeric_only) n_total_rows = self.get_n_rows() diff --git a/featuretools/primitives/premium/aggregation_primitives.py b/featuretools/primitives/premium/aggregation_primitives.py index e6fe6d3b57..9c81e66cbd 100644 --- a/featuretools/primitives/premium/aggregation_primitives.py +++ b/featuretools/primitives/premium/aggregation_primitives.py @@ -953,7 +953,7 @@ class CountInsideRange(AggregationPrimitive): description_template = "count_inside_range" stack_on_self = False - def __init__(self, lower=0, upper=1, skipna=False): + def __init__(self, lower=0, upper=1, skipna=True): self.lower = lower self.upper = upper self.skipna = skipna @@ -966,8 +966,7 @@ def count_inside_range(array): continue elif not self.skipna and not val: return np.nan - - if val >= self.lower and val <= self.upper: + elif val >= self.lower and val <= self.upper: count += 1 return count @@ -1020,7 +1019,7 @@ class CountOutsideRange(AggregationPrimitive): description_template = "count_outside_range" stack_on_self = False - def __init__(self, lower=0, upper=1, skipna=False): + def __init__(self, lower=0, upper=1, skipna=True): self.lower = lower self.upper = upper self.skipna = skipna @@ -1033,8 +1032,7 @@ def count_outside_range(array): continue elif not self.skipna and not val: return np.nan - - if val < self.lower or val > self.upper: + elif val < self.lower or val > self.upper: count += 1 return count @@ -1294,31 +1292,23 @@ class MaxConsecutiveFalse(AggregationPrimitive): input_types = [Boolean] return_type = Numeric - def __init__(self, skipna = False): + def __init__(self, skipna = True): self.skipna = skipna def get_function(self): def max_consecutive_false(array): max_count = 0 - count = -1 - not_non_arr = [] + count = 0 for val in array: - if val: - not_non_arr.append(val) - elif self.skipna and not val: + if self.skipna and val == None: continue - elif not self.skipna and not val: - return np.nan - - for val in not_non_arr: - if not val and count == -1: - count = 1 - elif not val and count > 0: + elif not self.skipna and val == None: + count = 0 + elif val == False: count += 1 - elif val: + else: max_count = max(max_count, count) - count = -1 - max_count = max(max_count, count) + count = 0 return max_count return max_consecutive_false @@ -1336,31 +1326,23 @@ class MaxConsecutiveNegatives(AggregationPrimitive): input_types = [Numeric] return_type = Numeric - def __init__(self, skipna = False): + def __init__(self, skipna = True): self.skipna = skipna def get_function(self): def max_consecutive_negatives(array): max_count = 0 - count = -1 - not_non_arr = [] + count = 0 for val in array: - if val: - not_non_arr.append(val) - elif self.skipna and not val: + if self.skipna and val == None: continue - elif not self.skipna and not val: - return np.nan - - for val in not_non_arr: - if val < 0 and count == -1: - count = 1 - elif val < 0 and count > 0: + elif not self.skipna and val == None: + count = 0 + elif val < 0: count += 1 - elif val >= 0: + else: max_count = max(max_count, count) - count = -1 - max_count = max(max_count, count) + count = 0 return max_count return max_consecutive_negatives @@ -1378,31 +1360,23 @@ class MaxConsecutivePositives(AggregationPrimitive): input_types = [Numeric] return_type = Numeric - def __init__(self, skipna = False): + def __init__(self, skipna = True): self.skipna = skipna def get_function(self): def max_consecutive_positives(array): max_count = 0 - count = -1 - not_non_arr = [] + count = 0 for val in array: - if val: - not_non_arr.append(val) - elif self.skipna and not val: + if self.skipna and val == None: continue - elif not self.skipna and not val: - return np.nan - - for val in not_non_arr: - if val > 0 and count == -1: - count = 1 - elif val > 0 and count > 0: + elif not self.skipna and val == None: + count = 0 + elif val > 0: count += 1 - elif val <= 0: + else: max_count = max(max_count, count) - count = -1 - max_count = max(max_count, count) + count = 0 return max_count return max_consecutive_positives @@ -1420,31 +1394,23 @@ class MaxConsecutiveTrue(AggregationPrimitive): input_types = [Boolean] return_type = Numeric - def __init__(self, skipna = False): + def __init__(self, skipna = True): self.skipna = skipna def get_function(self): def max_consecutive_true(array): max_count = 0 - count = -1 - not_non_arr = [] + count = 0 for val in array: - if val: - not_non_arr.append(val) - elif self.skipna and not val: + if self.skipna and val == None: continue - elif not self.skipna and not val: - return np.nan - - for val in not_non_arr: - if val and count == -1: - count = 1 - elif val and count > 0: + elif not self.skipna and val == None: + count = 0 + elif val == True: count += 1 - elif not val: + else: max_count = max(max_count, count) - count = -1 - max_count = max(max_count, count) + count = 0 return max_count return max_consecutive_true @@ -1462,31 +1428,23 @@ class MaxConsecutiveZeros(AggregationPrimitive): input_types = [Numeric] return_type = Numeric - def __init__(self, skipna = False): + def __init__(self, skipna = True): self.skipna = skipna def get_function(self): def max_consecutive_zeros(array): max_count = 0 - count = -1 - not_non_arr = [] + count = 0 for val in array: - if val: - not_non_arr.append(val) - elif self.skipna and not val: + if self.skipna and val == None: continue - elif not self.skipna and not val: - return np.nan - - for val in not_non_arr: - if val == 0 and count == -1: - count = 1 - elif val == 0 and count > 0: + elif not self.skipna and val == None: + count = 0 + elif val == 0: count += 1 - elif val != 0: + else: max_count = max(max_count, count) - count = -1 - max_count = max(max_count, count) + count = 0 return max_count return max_consecutive_zeros diff --git a/featuretools/primitives/premium/transform_primitive.py b/featuretools/primitives/premium/transform_primitive.py index ab4dbcd6ff..e1796c9845 100644 --- a/featuretools/primitives/premium/transform_primitive.py +++ b/featuretools/primitives/premium/transform_primitive.py @@ -1216,12 +1216,15 @@ class GreaterThanPrevious(TransformPrimitive): input_types = [Numeric] return_type = Numeric - def __init__(self, fill_method = None, limit = None): + def __init__(self, fill_method = "pad", limit = None): self.fill_method = fill_method self.limit = limit def get_function(self): def greater_than_previous(numbers): + df = pandas.DataFrame(numbers) + df.fillna(method = self.fill_method, limit = self.limit) + numbers = df[0].tolist() results = [] prev = None for num in numbers: @@ -1453,12 +1456,15 @@ class LessThanPrevious(TransformPrimitive): input_types = [Numeric] return_type = Numeric - def __init__(self, fill_method = None, limit = None): + def __init__(self, fill_method = "pad", limit = None): self.fill_method = fill_method self.limit = limit def get_function(self): def less_than_previous(numbers): + df = pandas.DataFrame(numbers) + df.fillna(method = self.fill_method, limit = self.limit) + numbers = df[0].tolist() results = [] prev = None for num in numbers: