Skip to content

Commit

Permalink
#880: int -> bool type conversion updates
Browse files — browse the repository at this point in the history
  • Loading branch information
aschonfeld committed Sep 5, 2024
1 parent ea88024 commit 1b12e95
Show file tree
Hide file tree
Showing 7 changed files with 270 additions and 70 deletions.
213 changes: 148 additions & 65 deletions dtale/column_builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,22 @@ def int_to_hex(v):
return v if pd.isnull(v) else hex(v)

return pd.Series(apply(s, int_to_hex), name=self.name, index=s.index)
elif to_type == "bool":
bool_cfg = self.cfg.get("cfg") or {}
cond = None
if bool_cfg.get("equals", {}).get("active", False):
cond = s.isin(
[int(v) for v in bool_cfg["equals"]["value"].split(",")]
)
if bool_cfg.get("greaterThan", {}).get("active", False):
gt_cond = s > int(bool_cfg["greaterThan"]["value"])
cond = gt_cond if cond is None else cond | gt_cond
if bool_cfg.get("lessThan", {}).get("active", False):
lt_cond = s < int(bool_cfg["lessThan"]["value"])
cond = lt_cond if cond is None else cond | lt_cond
return pd.Series(
False if cond is None else cond, name=self.name, index=s.index
)
return pd.Series(s.astype(to_type), name=self.name, index=s.index)
elif classifier == "F": # str, int
if to_type == "hex":
Expand Down Expand Up @@ -607,7 +623,10 @@ def int_to_hex(v):
"data type conversion not supported for dtype: {}".format(from_type)
)

def build_inner_code(self):
def _wrap_code(self, code):
return "df.loc[:, '{name}'] = {code}".format(name=self.name, code=code)

def build_code(self):
col, from_type, to_type = (self.cfg.get(p) for p in ["col", "from", "to"])
s = "df['{col}']".format(col=col)
classifier = classify_type(from_type)
Expand All @@ -618,102 +637,166 @@ def build_inner_code(self):
else:
date_kwargs = "infer_datetime_format=True"
code = "pd.Series(pd.to_datetime({s}, {kwargs}), name='{name}', index={s}.index)"
return code.format(s=s, name=self.name, kwargs=date_kwargs)
return self._wrap_code(
code.format(s=s, name=self.name, kwargs=date_kwargs)
)
elif to_type == "int":
return (
"s = {s}"
"if s.str.startswith('0x').any():\n"
"\tdef str_hex_to_int(v):\n"
"\t\treturn v if pd.isnull(v) else int(v, base=16)\n"
"\tstr_data = s.apply(str_hex_to_int)\n"
"else:\n"
"\tstr_data = s.astype('float').astype('int')\n"
"pd.Series(str_data, name='{name}', index=s.index)"
).format(s=s, name=self.name)
return self._wrap_code(
(
"s = {s}"
"if s.str.startswith('0x').any():\n"
"\tdef str_hex_to_int(v):\n"
"\t\treturn v if pd.isnull(v) else int(v, base=16)\n"
"\tstr_data = s.apply(str_hex_to_int)\n"
"else:\n"
"\tstr_data = s.astype('float').astype('int')\n"
"pd.Series(str_data, name='{name}', index=s.index)"
).format(s=s, name=self.name)
)
elif to_type == "float":
return (
"s = {s}"
"if s.str.startswith('0x').any():\n"
"\tstr_data = s.apply(float.fromhex)\n"
"else:\n"
"\tstr_data = pd.to_numeric(s, errors='coerce')\n"
"pd.Series(str_data, name='{name}', index=s.index)"
).format(s=s, name=self.name)
return self._wrap_code(
(
"s = {s}"
"if s.str.startswith('0x').any():\n"
"\tstr_data = s.apply(float.fromhex)\n"
"else:\n"
"\tstr_data = pd.to_numeric(s, errors='coerce')\n"
"pd.Series(str_data, name='{name}', index=s.index)"
).format(s=s, name=self.name)
)
else:
if from_type.startswith("mixed"):
if to_type == "float":
return "pd.Series(pd.to_numeric({s}, errors='coerce'), name='{name}', index={s}.index)".format(
s=s, name=self.name
return self._wrap_code(
"pd.Series(pd.to_numeric({s}, errors='coerce'), name='{name}', index={s}.index)".format(
s=s, name=self.name
)
)
elif to_type == "bool":
return (
"def _process_mixed_bool(v):\n"
"from six import string_types\n\n"
"\tif isinstance(v, bool):\n"
"\t\treturn v\n"
"\tif isinstance(v, string_types):\n"
"\t\treturn dict(true=True, false=False).get(v.lower(), np.nan)\n"
"\treturn np.nan\n\n"
"pd.Series({s}.apply(_process_mixed_bool), name='{name}', index={s}.index)"
).format(s=s, name=self.name)
return "pd.Series({s}.astype({to_type}), name='{name}', index={s}.index)".format(
s=s, to_type=to_type, name=self.name
return self._wrap_code(
(
"def _process_mixed_bool(v):\n"
"from six import string_types\n\n"
"\tif isinstance(v, bool):\n"
"\t\treturn v\n"
"\tif isinstance(v, string_types):\n"
"\t\treturn dict(true=True, false=False).get(v.lower(), np.nan)\n"
"\treturn np.nan\n\n"
"pd.Series({s}.apply(_process_mixed_bool), name='{name}', index={s}.index)"
).format(s=s, name=self.name)
)
return self._wrap_code(
"pd.Series({s}.astype({to_type}), name='{name}', index={s}.index)".format(
s=s, to_type=to_type, name=self.name
)
)
elif classifier == "I": # date, float, category, str, bool, hex
if to_type == "date":
unit = self.cfg.get("unit") or "D"
if unit == "YYYYMMDD":
return "pd.Series({s}.astype(str).apply(pd.Timestamp), name='{name}', index={s}.index)".format(
s=s, name=self.name
return self._wrap_code(
"pd.Series({s}.astype(str).apply(pd.Timestamp), name='{name}', index={s}.index)".format(
s=s, name=self.name
)
)
return self._wrap_code(
"pd.Series(pd.to_datetime({s}, unit='{unit}'), name='{name}', index={s}.index)".format(
s=s, name=self.name, unit=unit
)
return "pd.Series(pd.to_datetime({s}, unit='{unit}'), name='{name}', index={s}.index)".format(
s=s, name=self.name, unit=unit
)
elif to_type == "hex":
return (
"pd.Series(\n"
"\t{s}.apply(lambda v: v if pd.isnull(v) else hex(v)), name='{name}', index={s}.index\n"
")"
).format(s=s, name=self.name)
return "pd.Series({s}.astype('{to_type}'), name='{name}', index={s}.index)".format(
s=s, to_type=to_type, name=self.name
return self._wrap_code(
(
"pd.Series(\n"
"\t{s}.apply(lambda v: v if pd.isnull(v) else hex(v)), name='{name}', index={s}.index\n"
")"
).format(s=s, name=self.name)
)
elif to_type == "bool":
bool_cfg = self.cfg.get("cfg") or {}
conds = []
if bool_cfg.get("equals", {}).get("active", False):
conds.append(
"cond"
+ str(len(conds) + 1)
+ " = {s}.isin(["
+ bool_cfg["equals"]["value"]
+ "])"
)
if bool_cfg.get("greaterThan", {}).get("active", False):
conds.append(
"cond"
+ str(len(conds) + 1)
+ " = {s} > "
+ bool_cfg["greaterThan"]["value"]
)
if bool_cfg.get("lessThan", {}).get("active", False):
conds.append(
"cond"
+ str(len(conds) + 1)
+ " = {s} < "
+ bool_cfg["lessThan"]["value"]
)
cond = (
" | ".join(["cond{}".format(i + 1) for i, _ in enumerate(conds)])
if len(conds)
else "False"
)
conds.append(
"df.loc[:, '{name}'] = pd.Series("
+ cond
+ ", name='{name}', index={s}.index)"
)
return "\n".join(conds).format(s=s, name=self.name)
return self._wrap_code(
"pd.Series({s}.astype('{to_type}'), name='{name}', index={s}.index)".format(
s=s, to_type=to_type, name=self.name
)
)
elif classifier == "F": # str, int, hex
if to_type == "hex":
return "pd.Series(s.apply(float.hex), name='{name}', index={s}.index)".format(
s=s, name=self.name
return self._wrap_code(
"pd.Series(s.apply(float.hex), name='{name}', index={s}.index)".format(
s=s, name=self.name
)
)
return self._wrap_code(
"pd.Series(s.astype('{to_type}'), name='{name}', index={s}.index)".format(
s=s, to_type=to_type, name=self.name
)
return "pd.Series(s.astype('{to_type}'), name='{name}', index={s}.index)".format(
s=s, to_type=to_type, name=self.name
)
elif classifier == "D": # str, int
if to_type == "int":
unit = self.cfg.get("unit") or "D"
if unit == "YYYYMMDD":
return "pd.Series({s}.dt.strftime('%Y%m%d').astype(int), name='{name}', index={s}.index)".format(
s=s, name=self.name
return self._wrap_code(
"pd.Series({s}.dt.strftime('%Y%m%d').astype(int), name='{name}', index={s}.index)".format(
s=s, name=self.name
)
)
return (
"pd.Series(\n"
"\t{s}.apply(lambda x: time.mktime(x.timetuple())).astype(int), \n"
"name='{name}', index={s}.index\n"
")"
).format(s=s, name=self.name)
return "pd.Series({s}.dt.strftime('{fmt}'), name='{name}', index={s}.index)".format(
fmt=self.cfg.get("fmt") or "%Y%m%d", s=s, name=self.name
return self._wrap_code(
(
"pd.Series(\n"
"\t{s}.apply(lambda x: time.mktime(x.timetuple())).astype(int), \n"
"name='{name}', index={s}.index\n"
")"
).format(s=s, name=self.name)
)
return self._wrap_code(
"pd.Series({s}.dt.strftime('{fmt}'), name='{name}', index={s}.index)".format(
fmt=self.cfg.get("fmt") or "%Y%m%d", s=s, name=self.name
)
)
elif classifier == "B":
return "pd.Series(s.astype('{to_type}'), name='{name}', index={s}.index)".format(
s=s, to_type=to_type, name=self.name
return self._wrap_code(
"pd.Series(s.astype('{to_type}'), name='{name}', index={s}.index)".format(
s=s, to_type=to_type, name=self.name
)
)
raise NotImplementedError(
"data type conversion not supported for dtype: {}".format(from_type)
)

def build_code(self):
code = self.build_inner_code()
return "df.loc[:, '{name}'] = {code}".format(name=self.name, code=code)


class TransformColumnBuilder(object):
def __init__(self, name, cfg):
Expand Down
27 changes: 27 additions & 0 deletions frontend/static/__tests__/dtale/create/type-conversion-test.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,33 @@ describe('CreateTypeConversion', () => {
name: 'conv_col',
type: CreateColumnType.TYPE_CONVERSION,
});
await act(async () => {
await fireEvent.click(screen.getByText('Bool'));
});
await act(async () => {
await fireEvent.click(screen.getByTestId('equals-checkbox'));
});
await act(async () => {
await act(async () => {
fireEvent.change(screen.getByTestId('equals-input'), { target: { value: '1' } });
});
});
await spies.validateCfg({
cfg: {
to: 'bool',
from: 'int64',
col: 'col1',
unit: TypeConversionUnit.DATE,
cfg: {
equals: { active: true, value: '1' },
greaterThan: { active: true, value: '0' },
lessThan: { active: false },
},
applyAllType: false,
},
name: 'conv_col',
type: CreateColumnType.TYPE_CONVERSION,
});
});

it('builds a float conversion column', async () => {
Expand Down
14 changes: 14 additions & 0 deletions frontend/static/popups/create/CreateColumnState.ts
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,27 @@ export enum TypeConversionUnit {
NANOSECOND = 'ns',
}

/**
 * Value holder for one mode (equals / greater-than / less-than) of converting an integer column to boolean.
 *
 * The Python builder in dtale/column_builders.py only applies a mode when `active` is truthy and
 * combines all active modes with a logical OR.
 */
export interface IntToBoolModeCfg {
// Whether this mode participates in the conversion; inactive modes are ignored by the backend.
active: boolean;
// Operand for the comparison, carried as a string. The backend parses it with `int(...)`; for the
// `equals` mode it is first split on "," so it may hold several integers. May be omitted when the
// mode is inactive (the test fixture sends `lessThan: { active: false }` with no value).
value?: string;
}

/**
 * Configuration for converting an integer column to boolean.
 *
 * Each property is an independent mode. In the Python builder the active modes are OR-ed together,
 * and when no mode is active the resulting column is filled with `False`.
 */
export interface IntToBoolCfg {
// True where the source value is one of the comma-separated integers in `value` (backend uses `isin`).
equals: IntToBoolModeCfg;
// True where the source value is strictly greater than `value`.
greaterThan: IntToBoolModeCfg;
// True where the source value is strictly less than `value`.
lessThan: IntToBoolModeCfg;
}

/**
 * Type conversion column creation configuration.
 *
 * This object is sent as the `cfg` payload of a TYPE_CONVERSION column build; the Python builder
 * reads `col`, `from` and `to` to pick the conversion and the remaining keys as per-conversion options.
 */
export interface TypeConversionConfig {
// Name of the source column being converted.
col?: string;
// Date format string; the backend uses it with `strftime` for date -> string and falls back to "%Y%m%d".
fmt?: string;
// Unit used for int -> date and date -> int conversions; the backend defaults to "D" when unset.
unit?: TypeConversionUnit;
// Target type of the conversion (e.g. 'bool', as used in the accompanying test).
to?: string;
// Data type of the source column (e.g. 'int64'); the backend classifies it to choose a conversion branch.
from?: string;
// Options for the int -> bool conversion; read by the backend when `to` is 'bool' for integer sources.
cfg?: IntToBoolCfg;
// NOTE(review): presumably applies the conversion to every column of the same type — not visible
// in this diff, confirm against the consumer of this flag.
applyAllType: boolean;
}

Expand Down
Loading

0 comments on commit 1b12e95

Please sign in to comment.