From 69c2add020ea1b27f7c7ebff4bb133bd1fb1c3f3 Mon Sep 17 00:00:00 2001 From: pwwang <1188067+pwwang@users.noreply.github.com> Date: Mon, 7 Jun 2021 23:19:00 -0700 Subject: [PATCH] 0.0.7 (#10) * Add dplyr rows verbs * Add tidyr.chop/unchop * Add tidyr.pack/unpack * Allow mixed numbering for tibble construction * Add tidyr.nest/unnest * Add tidyr.expand/expand_grid * Add tidyr.complete * Move tidyr.drop_na * Move tidyr.extract * Move tidyr.fill * Move and fix tidyr.pivot_longer * Move and fix tidyr.pivot_wider * Move and fix tidyr.separate/separate_rows/uncount * Move and fix tidyr.unite * Move and fix tidyr.replace_na * Update notebook for tidyr.full_seq * 0.0.7 * Update CHANGELOG --- README.md | 2 +- datar/__init__.py | 2 +- datar/base/__init__.py | 2 +- datar/base/constants.py | 2 +- datar/base/funcs.py | 13 +- datar/base/verbs.py | 102 +- datar/core/contexts.py | 4 + datar/core/middlewares.py | 32 - datar/core/types.py | 2 +- datar/core/utils.py | 80 +- datar/dplyr/__init__.py | 1 + datar/dplyr/bind.py | 17 +- datar/dplyr/lead_lag.py | 6 +- datar/dplyr/recode.py | 74 +- datar/dplyr/rows.py | 278 +++ datar/tibble/funcs.py | 45 +- datar/tidyr/__init__.py | 21 +- datar/tidyr/chop.py | 295 +++ datar/tidyr/complete.py | 56 + datar/tidyr/drop_na.py | 56 + datar/tidyr/expand.py | 398 ++++ datar/tidyr/extract.py | 109 + datar/tidyr/fill.py | 75 + datar/tidyr/funcs.py | 8 +- datar/tidyr/nest.py | 226 ++ datar/tidyr/pack.py | 167 ++ datar/tidyr/pivot_long.py | 277 +++ datar/tidyr/pivot_wide.py | 237 ++ datar/tidyr/replace_na.py | 73 + datar/tidyr/separate.py | 241 ++ datar/tidyr/uncount.py | 97 + datar/tidyr/unite.py | 69 + datar/tidyr/verbs.py | 777 ------- docs/CHANGELOG.md | 6 + docs/notebooks/chop.ipynb | 297 +++ docs/notebooks/complete.ipynb | 119 + docs/notebooks/drop_na.ipynb | 239 +- docs/notebooks/expand.ipynb | 1591 ++----------- docs/notebooks/expand_grid.ipynb | 348 +-- docs/notebooks/extract.ipynb | 256 +-- docs/notebooks/fill.ipynb | 545 +---- docs/notebooks/full_seq.ipynb | 61 +- docs/notebooks/nb_helpers.py | 10 + docs/notebooks/nest.ipynb | 574 +++++ docs/notebooks/pack.ipynb | 336 +++ docs/notebooks/pivot_longer.ipynb | 1710 +------------- docs/notebooks/pivot_wider.ipynb | 3534 ++++------------------------- docs/notebooks/replace_na.ipynb | 180 +- docs/notebooks/rows.ipynb | 404 ++++ docs/notebooks/separate.ipynb | 835 +------ docs/notebooks/uncount.ipynb | 269 +-- docs/notebooks/unite.ipynb | 300 +-- docs/porting_rules.md | 19 + docs/reference-maps/ALL.md | 2 + docs/reference-maps/dplyr.md | 14 +- docs/reference-maps/tidyr.md | 119 + mkdocs.yml | 5 + pyproject.toml | 2 +- setup.py | 2 +- tests/test_dplyr_rows.py | 96 + tests/test_tibble.py | 7 + tests/test_tidyr_chop.py | 160 ++ tests/test_tidyr_complete.py | 48 + tests/test_tidyr_drop_na.py | 62 + tests/test_tidyr_expand.py | 203 ++ tests/test_tidyr_extract.py | 65 + tests/test_tidyr_fill.py | 75 + tests/test_tidyr_nest.py | 318 +++ tests/test_tidyr_pack.py | 91 + tests/test_tidyr_pivot_long.py | 226 ++ tests/test_tidyr_pivot_wide.py | 201 ++ tests/test_tidyr_replace_na.py | 47 + tests/test_tidyr_separate.py | 217 ++ tests/test_tidyr_uncount.py | 53 + tests/test_tidyr_unite.py | 62 + 75 files changed, 7976 insertions(+), 9576 deletions(-) create mode 100644 datar/dplyr/rows.py create mode 100644 datar/tidyr/chop.py create mode 100644 datar/tidyr/complete.py create mode 100644 datar/tidyr/drop_na.py create mode 100644 datar/tidyr/expand.py create mode 100644 datar/tidyr/extract.py create mode 100644 datar/tidyr/fill.py create 
mode 100644 datar/tidyr/nest.py create mode 100644 datar/tidyr/pack.py create mode 100644 datar/tidyr/pivot_long.py create mode 100644 datar/tidyr/pivot_wide.py create mode 100644 datar/tidyr/replace_na.py create mode 100644 datar/tidyr/separate.py create mode 100644 datar/tidyr/uncount.py create mode 100644 datar/tidyr/unite.py delete mode 100644 datar/tidyr/verbs.py create mode 100644 docs/notebooks/chop.ipynb create mode 100644 docs/notebooks/complete.ipynb create mode 100644 docs/notebooks/nest.ipynb create mode 100644 docs/notebooks/pack.ipynb create mode 100644 docs/notebooks/rows.ipynb create mode 100644 docs/porting_rules.md create mode 100644 docs/reference-maps/tidyr.md create mode 100644 tests/test_dplyr_rows.py create mode 100644 tests/test_tidyr_chop.py create mode 100644 tests/test_tidyr_complete.py create mode 100644 tests/test_tidyr_drop_na.py create mode 100644 tests/test_tidyr_expand.py create mode 100644 tests/test_tidyr_extract.py create mode 100644 tests/test_tidyr_fill.py create mode 100644 tests/test_tidyr_nest.py create mode 100644 tests/test_tidyr_pack.py create mode 100644 tests/test_tidyr_pivot_long.py create mode 100644 tests/test_tidyr_pivot_wide.py create mode 100644 tests/test_tidyr_replace_na.py create mode 100644 tests/test_tidyr_separate.py create mode 100644 tests/test_tidyr_uncount.py create mode 100644 tests/test_tidyr_unite.py diff --git a/README.md b/README.md index 87bbe5dd..008b19e1 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Port of [dplyr][2] and other related R packages in python, using [pipda][3]. -Unlike other similar packages in python that just mimic the piping sign, `datar` follows the API designs from the original packages as possible. So that nearly no extra effort is needed for those who are familar with those R packages to transition to python. +Unlike other similar packages in python that just mimic the piping sign, `datar` follows the API designs from the original packages as much as possible, so that minimal effort is needed for those who are familiar with those R packages to transition to python. 
[![Pypi][6]][7] [![Github][8]][9] ![Building][10] [![Docs and API][11]][5] [![Codacy][12]][13] [![Codacy coverage][14]][13] diff --git a/datar/__init__.py b/datar/__init__.py index 501ecf05..25576dfc 100644 --- a/datar/__init__.py +++ b/datar/__init__.py @@ -3,4 +3,4 @@ from .core import operator as _datar_operator from .core.defaults import f -__version__ = '0.0.6' +__version__ = '0.0.7' diff --git a/datar/base/__init__.py b/datar/base/__init__.py index f9fe5a87..59fba630 100644 --- a/datar/base/__init__.py +++ b/datar/base/__init__.py @@ -10,7 +10,7 @@ from .options import options, getOption, options_context from .verbs import ( colnames, rownames, dim, nrow, ncol, diag, t, names, - intersect, union, setdiff, setequal + intersect, union, setdiff, setequal, duplicated ) from .funcs import ( as_date, as_character, as_double, as_factor, as_categorical, diff --git a/datar/base/constants.py b/datar/base/constants.py index 0e8396da..2c234f23 100644 --- a/datar/base/constants.py +++ b/datar/base/constants.py @@ -18,6 +18,6 @@ LETTERS = numpy.array(list(ascii_letters[26:])) NA_character_ = f"" -NA_integer_ = numpy.random.randint(numpy.iinfo(numpy.int64).max) +NA_integer_ = numpy.random.randint(numpy.iinfo(numpy.int32).max) NA_real_ = NA NA_compex_ = complex(NA, NA) diff --git a/datar/base/funcs.py b/datar/base/funcs.py index ca35b945..0d3bc431 100644 --- a/datar/base/funcs.py +++ b/datar/base/funcs.py @@ -21,7 +21,7 @@ from pipda import Context, register_func from .constants import NA -from ..core.utils import categorize, get_option, logger +from ..core.utils import categorized, get_option, logger from ..core.middlewares import WithDataEnv from ..core.collections import Collection from ..core.types import ( @@ -281,7 +281,7 @@ def as_int(x: Any) -> Union[int, Iterable[int]]: Otherwise, convert x to int. """ if is_categorical_dtype(x): - return categorize(x).codes + return categorized(x).codes return _as_type(x, int) @register_func(None, context=Context.EVAL) @@ -297,7 +297,7 @@ def as_integer(x: Any) -> Union[numpy.int64, Iterable[numpy.int64]]: Otherwise, convert x to numpy.int64. """ if is_categorical_dtype(x): - return categorize(x).codes + return categorized(x).codes return _as_type(x, numpy.int64) as_int64 = as_integer @@ -812,7 +812,7 @@ def droplevels(x: Categorical) -> Categorical: Returns: The categorical data with unused categories dropped. """ - return categorize(x).remove_unused_categories() + return categorized(x).remove_unused_categories() @register_func(None, context=Context.EVAL) def levels(x: CategoricalLikeType) -> Optional[List[Any]]: @@ -904,7 +904,7 @@ def lengths(x: Any) -> List[int]: # --------------------------------- def factor( - x: Iterable[Any], + x: Optional[Iterable[Any]] = None, # pylint: disable=redefined-outer-name levels: Optional[Iterable[Any]] = None, exclude: Any = NA, @@ -925,6 +925,9 @@ def factor( ordered: logical flag to determine if the levels should be regarded as ordered (in the order given). 
""" + if x is None: + x = [] + if is_categorical_dtype(x): x = x.to_numpy() ret = Categorical( diff --git a/datar/base/verbs.py b/datar/base/verbs.py index 759d58c4..c605ccd2 100644 --- a/datar/base/verbs.py +++ b/datar/base/verbs.py @@ -1,9 +1,11 @@ """Function from R-base that can be used as verbs""" # TODO: add tests -from typing import Any, Iterable, List, Optional, Tuple, Union +from typing import ( + Any, Iterable, List, Mapping, Optional, Tuple, Union +) import numpy -from pandas import DataFrame +from pandas import DataFrame, Series, Categorical from pipda import register_verb from ..core.types import IntType, is_scalar @@ -14,7 +16,8 @@ @register_verb(DataFrame, context=Context.EVAL) def colnames( df: DataFrame, - names: Optional[Iterable[str]] = None + names: Optional[Iterable[str]] = None, + stack: bool = True ) -> Union[List[Any], DataFrame]: """Get or set the column names of a dataframe @@ -28,10 +31,42 @@ def colnames( if the input dataframe is grouped, the structure is kept. """ from ..stats.verbs import setNames + if not stack: + if names is not None: + return setNames(df, names) + return df.columns.tolist() + if names is not None: - return setNames(df, names) + namei = 0 + newnames = [] + for colname in df.columns: + parts = colname.split('$', 1) + if not newnames: + if len(parts) < 2: + newnames.append(names[namei]) + namei += 1 + else: + newnames.append(f"{names[namei]}${parts[1]}") + elif len(parts) < 2: + newnames.append(names[namei]) + namei += 1 + elif newnames[-1].startswith(f"{parts[0]}$"): + newnames.append(f"{names[namei]}${parts[1]}") + else: + namei += 1 + newnames.append(f"{names[namei]}${parts[1]}") + return setNames(df, newnames) + + cols = [ + col.split('$', 1)[0] if isinstance(col, str) else col + for col in df.columns + ] + out = [] + for col in cols: + if col not in out: + out.append(col) + return out - return df.columns.tolist() @register_verb(DataFrame, context=Context.EVAL) def rownames( @@ -176,6 +211,14 @@ def names(x: DataFrame) -> List[str]: """Get the column names of a dataframe""" return x.columns.tolist() +@names.register(dict) +def _(x: Mapping[str, Any]) -> List[str]: + """Get the keys of a dict + + dict is like a list in R, mimic `names()` in R. + """ + return list(x) + @register_verb(context=Context.EVAL) def setdiff(x: Any, y: Any) -> List[Any]: """Diff of two iterables""" @@ -216,3 +259,52 @@ def setequal(x: Any, y: Any) -> List[Any]: x = sorted(x) y = sorted(y) return x == y + +@register_verb((list, tuple, numpy.ndarray, Series, Categorical)) +def duplicated( # pylint: disable=invalid-name + x: Iterable[Any], + incomparables: Optional[Iterable[Any]] = None, + fromLast: bool = False +) -> numpy.ndarray: + """Determine Duplicate Elements + + Args: + x: The iterable to detect duplicates + Currently, elements in `x` must be hashable. 
+ fromLast: Whether to start detecting duplicates from the last element + + Returns: + A bool array with the same length as `x` + """ + dups = set() + out = [] + out_append = out.append + if incomparables is None: + incomparables = [] + + if fromLast: + x = reversed(x) + for elem in x: + if elem in incomparables: + out_append(False) + elif elem in dups: + out_append(True) + else: + dups.add(elem) + out_append(False) + if fromLast: + out = list(reversed(out)) + return numpy.array(out, dtype=bool) + +@duplicated.register(DataFrame) +def _( # pylint: disable=invalid-name,unused-argument + x: DataFrame, + incomparables: Optional[Iterable[Any]] = None, + fromLast: bool = False +) -> numpy.ndarray: + """Check if rows in a data frame are duplicated + + `incomparables` is not supported here + """ + keep = 'first' if not fromLast else 'last' + return x.duplicated(keep=keep).values diff --git a/datar/core/contexts.py b/datar/core/contexts.py index 52dfe281..c1c828ca 100644 --- a/datar/core/contexts.py +++ b/datar/core/contexts.py @@ -20,6 +20,10 @@ def __init__(self): def getitem(self, parent, ref): """Interpret f[ref]""" + if isinstance(ref, slice): + from .collections import Collection + return Collection(ref) + self.used_refs[ref] += 1 if isinstance(parent, DataFrame) and ref not in parent: cols = [col for col in parent.columns if col.startswith(f'{ref}$')] diff --git a/datar/core/middlewares.py b/datar/core/middlewares.py index 50b5bd26..078afbfd 100644 --- a/datar/core/middlewares.py +++ b/datar/core/middlewares.py @@ -2,8 +2,6 @@ from typing import Any, Mapping, Tuple from pipda.utils import DataEnv -from .utils import logger - class CurColumn: """Current column in across""" @classmethod @@ -34,33 +32,3 @@ def __enter__(self) -> Any: def __exit__(self, *exc_info) -> None: self.data.delete() -class Nesting: - """Nesting objects for calls from tidyr.nesting""" - def __init__(self, *columns: Any, **kwargs: Any) -> None: - self.columns = [] - self.names = [] - - id_prefix = hex(id(self))[2:6] - for i, column in enumerate(columns): - self.columns.append(column) - if isinstance(column, str): - self.names.append(column) - continue - try: - # series - name = column.name - except AttributeError: - name = f'_tmp{id_prefix}_{i}' - logger.warning( - 'Temporary name used for a nesting column, use ' - 'keyword argument instead to specify the key as name.' 
- ) - self.names.append(name) - - for key, val in kwargs.items(): - self.columns.append(val) - self.names.append(key) - - def __len__(self): - return len(self.columns) diff --git a/datar/core/types.py b/datar/core/types.py index e60d84ff..56c9d5af 100644 --- a/datar/core/types.py +++ b/datar/core/types.py @@ -7,7 +7,6 @@ from pandas.core.frame import DataFrame from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy from pandas.core.series import Series -from pipda.function import Function # used for type annotations NumericType = Union[int, float, complex, numpy.number] @@ -23,6 +22,7 @@ BoolOrIter = Union[bool, Iterable[bool]] FloatOrIter = Union[FloatType, Iterable[FloatType]] NumericOrIter = Union[NumericType, Iterable[NumericType]] +DTypeType = Union[str, type, numpy.dtype] NoneType = type(None) # used for type checks diff --git a/datar/core/utils.py b/datar/core/utils.py index ed1a1b0b..e28e5b17 100644 --- a/datar/core/utils.py +++ b/datar/core/utils.py @@ -14,7 +14,7 @@ from varname import argname from .exceptions import ColumnNotExistingError, NameNonUniqueError -from .types import is_scalar +from .types import is_scalar, DTypeType from .defaults import DEFAULT_COLUMN_PREFIX # logger @@ -33,6 +33,7 @@ def vars_select( raise_nonexists: bool = True, base0: Optional[bool] = None ) -> List[int]: + # TODO: support selecting data-frame columns """Select columns Args: @@ -156,7 +157,7 @@ def df_assign_item( else: df.insert(df.shape[1], item, value, allow_duplicates=True) -def categorize(data: Any) -> Any: +def categorized(data: Any) -> Any: """Get the Categorical object""" if not is_categorical_dtype(data): return data @@ -381,3 +382,78 @@ def get_option(key: str, value: Any = None) -> Any: return value from ..base import getOption return getOption(key) + +def apply_dtypes( + df: DataFrame, + dtypes: Optional[Union[bool, DTypeType, Mapping[str, DTypeType]]] +) -> None: + """Apply dtypes to data frame""" + if dtypes is None or dtypes is False: + return + + if dtypes is True: + inferred = df.convert_dtypes() + for col in df: + df[col] = inferred[col] + return + + if not isinstance(dtypes, dict): + dtypes = dict(zip(df.columns, [dtypes]*df.shape[1])) + + for column, dtype in dtypes.items(): + if column in df: + df[column] = df[column].astype(dtype) + else: + for col in df: + if col.startswith(f"{column}$"): + df[col] = df[col].astype(dtype) + +def keep_column_order(df: DataFrame, order: Iterable[str]): + """Keep the order of columns as given `order` + + We cannot do `df[order]` directly, since `df` may have nested df columns. 
+ """ + out_columns = [] + for col in order: + if col in df: + out_columns.append(col) + else: + out_columns.extend( + (dfcol for dfcol in df.columns if dfcol.startswith(f"{col}$")) + ) + if set(out_columns) != set(df.columns): + raise ValueError("Given `order` does not select all columns.") + + return df[out_columns] + +def reconstruct_tibble( + input: DataFrame, # pylint: disable=redefined-builtin + output: DataFrame, + ungrouped_vars: Optional[List[str]] = None, + keep_rowwise: bool = False +) -> DataFrame: + """Reconstruct the output dataframe based on input""" + from ..base import setdiff, intersect + from ..dplyr import group_vars, group_by_drop_default + from .grouped import DataFrameGroupBy, DataFrameRowwise + + if ungrouped_vars is None: + ungrouped_vars = [] + old_groups = group_vars(input) + new_groups = intersect(setdiff(old_groups, ungrouped_vars), output.columns) + + if isinstance(input, DataFrameRowwise): + return DataFrameRowwise( + output, + _group_vars=new_groups, + _drop=group_by_drop_default(input) + ) if keep_rowwise else output + + if isinstance(input, DataFrameGroupBy): + return DataFrameGroupBy( + output, + _group_vars=new_groups, + _drop=group_by_drop_default(input) + ) + + return output diff --git a/datar/dplyr/__init__.py b/datar/dplyr/__init__.py index 7b0c1521..1b33ebe4 100644 --- a/datar/dplyr/__init__.py +++ b/datar/dplyr/__init__.py @@ -52,3 +52,4 @@ from .lead_lag import lead, lag from .recode import recode, recode_factor, recode_categorical from .order_by import order_by, with_order +from .rows import rows_insert, rows_update, rows_patch, rows_upsert, rows_delete diff --git a/datar/dplyr/bind.py b/datar/dplyr/bind.py index f3045ecf..118499c3 100644 --- a/datar/dplyr/bind.py +++ b/datar/dplyr/bind.py @@ -26,6 +26,7 @@ def bind_rows( *datas: Optional[Union[DataFrame, dict]], _id: Optional[str] = None, _base0: Optional[bool] = None, + _copy: bool = True, **kwargs: Union[DataFrame, dict] ) -> DataFrame: # pylint: disable=too-many-branches @@ -41,6 +42,9 @@ def bind_rows( _base0: Whether `_id` starts from 0 or not, if no keys are provided. If `_base0` is not provided, will use `datar.base.getOption('index.base.0')` + _copy: If `False`, do not copy data unnecessarily. + Original API does not support this. This argument will be + passed by to `pandas.concat()` as `copy` argument. **kwargs: A mapping of dataframe, keys will be used as _id col. Returns: @@ -107,10 +111,11 @@ def data_to_df(data): return pandas.concat( key_data.values(), keys=key_data.keys(), - names=[_id, None] + names=[_id, None], + copy=_copy ).reset_index(level=0).reset_index(drop=True) - return pandas.concat(key_data.values()).reset_index(drop=True) + return pandas.concat(key_data.values(), copy=_copy).reset_index(drop=True) @bind_rows.register(DataFrameGroupBy, context=Context.PENDING) def _( @@ -130,7 +135,8 @@ def bind_cols( _data: Optional[Union[DataFrame, dict]], *datas: Optional[Union[DataFrame, dict]], _name_repair: Union[str, Callable] = "unique", - _base0: Optional[bool] = None + _base0: Optional[bool] = None, + _copy: bool = True ) -> DataFrame: """Bind columns of give dataframes @@ -150,6 +156,9 @@ def bind_cols( - a function: apply custom name repair _base0: Whether the numeric suffix starts from 0 or not. If not specified, will use `datar.base.getOption('index.base.0')`. + _copy: If `False`, do not copy data unnecessarily. + Original API does not support this. This argument will be + passed by to `pandas.concat()` as `copy` argument. 
Returns: The combined dataframe @@ -166,7 +175,7 @@ def bind_cols( more_data.insert(0, _data) if not more_data: return DataFrame() - ret = pandas.concat(more_data, axis=1) + ret = pandas.concat(more_data, axis=1, copy=_copy) ret.columns = repair_names( ret.columns.tolist(), repair=_name_repair, diff --git a/datar/dplyr/lead_lag.py b/datar/dplyr/lead_lag.py index db883b8c..a83a6e4f 100644 --- a/datar/dplyr/lead_lag.py +++ b/datar/dplyr/lead_lag.py @@ -38,7 +38,7 @@ def lead( if order_by is not None: return with_order(order_by, lead, series, n=n, default=default) - series, cats, default = lead_lag_prepare(series, n, default) + series, cats, default = _lead_lag_prepare(series, n, default) index = series.index ret = default * len(series) @@ -65,7 +65,7 @@ def lag( if order_by is not None: return with_order(order_by, lag, series, n=n, default=default) - series, cats, default = lead_lag_prepare(series, n, default) + series, cats, default = _lead_lag_prepare(series, n, default) index = series.index ret = default * len(series) @@ -75,7 +75,7 @@ def lag( ret = Categorical(ret, categories=cats) return Series(ret, index=index) -def lead_lag_prepare( +def _lead_lag_prepare( data: Iterable[Any], n: int, default: Any diff --git a/datar/dplyr/recode.py b/datar/dplyr/recode.py index c51dbfa5..89c06e7e 100644 --- a/datar/dplyr/recode.py +++ b/datar/dplyr/recode.py @@ -15,7 +15,7 @@ from ..core.types import is_scalar from ..base import NA, unique, c, intersect, NA_integer_, NA_character_ -def get_first(x: Iterable[Any]) -> Any: +def _get_first(x: Iterable[Any]) -> Any: """Get first raw item from an iterable""" x = x[0] try: @@ -23,7 +23,7 @@ def get_first(x: Iterable[Any]) -> Any: except AttributeError: return x -def args_to_recodings( +def _args_to_recodings( *args: Any, _force_index: bool = False, **kwargs: Any @@ -44,7 +44,7 @@ def args_to_recodings( values[int(key)] = values.pop(key) return values -def check_length(val: numpy.ndarray, x: numpy.ndarray, name: str): +def _check_length(val: numpy.ndarray, x: numpy.ndarray, name: str): """Check the length of the values to recode""" length_x = len(val) n = len(x) @@ -59,20 +59,20 @@ def check_length(val: numpy.ndarray, x: numpy.ndarray, name: str): f"{name} must be length {n}, not {length_x}." ) -def check_type(val: numpy.ndarray, out_type: Optional[type], name: str): +def _check_type(val: numpy.ndarray, out_type: Optional[type], name: str): """Check the type of the values to recode""" if val.dtype is numpy.dtype(object): if out_type and not all(isinstance(elem, out_type) for elem in val): raise TypeError( f"{name} must be {out_type.__name__}, not {type(val[0])}." ) - elif out_type and not isinstance(get_first(val), out_type): + elif out_type and not isinstance(_get_first(val), out_type): raise TypeError( f"{name} must be {out_type.__name__}, not {val.dtype.name}." 
) -def replace_with( +def _replace_with( # pylint: disable=invalid-name x: numpy.ndarray, out_type: Optional[type], @@ -90,8 +90,8 @@ def replace_with( else: val = numpy.array(val) - check_length(val, x, name) - check_type(val, out_type, name) + _check_length(val, x, name) + _check_type(val, out_type, name) # check_class(val, x, name) i[pandas.isna(i)] = False @@ -103,7 +103,7 @@ def replace_with( return x -def validate_recode_default( +def _validate_recode_default( default: Any, x: numpy.ndarray, out: numpy.ndarray, @@ -111,7 +111,7 @@ def validate_recode_default( replaced: numpy.ndarray ) -> numpy.ndarray: """Validate default for recoding""" - default = recode_default(x, default, out_type) + default = _recode_default(x, default, out_type) if ( default is None and sum(replaced & ~pandas.isna(x)) < len(out[~pandas.isna(x)]) @@ -123,19 +123,19 @@ def validate_recode_default( return default -def recode_default( +def _recode_default( x: numpy.ndarray, default: Any, out_type: Optional[type] ) -> Any: """Get right default for recoding""" if default is None and ( - out_type is None or isinstance(get_first(x), out_type) + out_type is None or isinstance(_get_first(x), out_type) ): return x return default -def recode_numeric( +def _recode_numeric( _x: numpy.ndarray, *args: Any, _default: Any = None, @@ -144,11 +144,11 @@ def recode_numeric( ) -> numpy.ndarray: """Recode numeric vectors""" - values = args_to_recodings( + values = _args_to_recodings( *args, **kwargs, _force_index=True ) - check_args(values, _default, _missing) + _check_args(values, _default, _missing) if any(not isinstance(val, int) for val in values): raise ValueError( "All values must be unnamed (or named with integers)." @@ -162,16 +162,16 @@ def recode_numeric( for val in values: if out_type is None: out_type = type(values[val]) - out = replace_with( + out = _replace_with( out, out_type, _x == val, values[val], f"Element {val}" ) replaced[_x == val] = True - _default = validate_recode_default(_default, _x, out, out_type, replaced) - out = replace_with( + _default = _validate_recode_default(_default, _x, out, out_type, replaced) + out = _replace_with( out, out_type, ~replaced & ~pandas.isna(_x), _default, "`_default`" ) - out = replace_with( + out = _replace_with( out, out_type, pandas.isna(_x) | (_x == NA_integer_), _missing, @@ -181,7 +181,7 @@ def recode_numeric( out = out.astype(out_type) return out -def recode_character( +def _recode_character( _x: Iterable[Any], *args: Any, _default: Any = None, @@ -189,8 +189,8 @@ def recode_character( **kwargs: Any ) -> numpy.ndarray: """Recode character vectors""" - values = args_to_recodings(*args, **kwargs) - check_args(values, _default, _missing) + values = _args_to_recodings(*args, **kwargs) + _check_args(values, _default, _missing) if not all(isinstance(val, str) for val in values): raise ValueError("All values must be named.") @@ -202,14 +202,14 @@ def recode_character( for val in values: if out_type is None: out_type = type(values[val]) - out = replace_with(out, out_type, _x == val, values[val], f"`{val}`") + out = _replace_with(out, out_type, _x == val, values[val], f"`{val}`") replaced[_x == val] = True - _default = validate_recode_default(_default, _x, out, out_type, replaced) - out = replace_with( + _default = _validate_recode_default(_default, _x, out, out_type, replaced) + out = _replace_with( out, out_type, ~replaced & ~pandas.isna(_x), _default, "`_default`" ) - out = replace_with( + out = _replace_with( out, out_type, pandas.isna(_x) | (_x == NA_character_), _missing, @@ 
-219,7 +219,7 @@ def recode_character( out = out.astype(out_type) return out -def check_args( +def _check_args( values: Mapping[Any, Any], default: Any, missing: Any @@ -269,14 +269,14 @@ def recode( _x[pandas.isna(_x_obj)] = NA_integer_ if numpy.issubdtype(_x.dtype, numpy.number): - return recode_numeric( + return _recode_numeric( _x, *args, _default=_default, _missing=_missing, **kwargs ) - return recode_character( + return _recode_character( _x, *args, _default=_default, _missing=_missing, @@ -303,7 +303,7 @@ def _( if isinstance(_x, Series): _x = _x.values # get the Categorical object - values = args_to_recodings(*args, **kwargs) + values = _args_to_recodings(*args, **kwargs) if not values: raise ValueError("No replacements provided.") @@ -316,7 +316,7 @@ def _( raise ValueError("`_missing` is not supported for factors.") n = len(_x) - check_args(values, _default, _missing) + _check_args(values, _default, _missing) out = numpy.array([NA] * n, dtype=object) replaced = numpy.array([False] * n) out_type = None @@ -324,7 +324,7 @@ def _( for val in values: if out_type is None: out_type = type(values[val]) - out = replace_with( + out = _replace_with( out, out_type, _x.categories == val, @@ -333,8 +333,8 @@ def _( ) replaced[_x.categories == val] = True - _default = validate_recode_default(_default, _x, out, out_type, replaced) - out = replace_with( + _default = _validate_recode_default(_default, _x, out, out_type, replaced) + out = _replace_with( out, out_type, ~replaced, _default, "`_default`" ) @@ -355,15 +355,15 @@ def recode_factor( see recode(). """ - values = args_to_recodings(*args, **kwargs) + values = _args_to_recodings(*args, **kwargs) recoded = recode( _x, values, _default=_default, _missing=_missing ) - out_type = type(get_first(recoded)) - _default = recode_default(_x, _default, out_type) + out_type = type(_get_first(recoded)) + _default = _recode_default(_x, _default, out_type) all_levels = unique(c( list(values.values()), [] if _default is None else _default, diff --git a/datar/dplyr/rows.py b/datar/dplyr/rows.py new file mode 100644 index 00000000..077b0ecc --- /dev/null +++ b/datar/dplyr/rows.py @@ -0,0 +1,278 @@ +"""Provide functions to manipulate multiple rows + +https://github.com/tidyverse/dplyr/blob/master/R/rows.R +""" +from typing import List, Optional + +import numpy +import pandas +from pandas import DataFrame +from pipda import register_verb + +from ..base import setdiff +from ..core.types import StringOrIter, is_scalar +from ..core.utils import logger +from ..tibble import rownames_to_column + +from .bind import bind_rows +from .join import left_join +from .funs import coalesce + +@register_verb(DataFrame) +def rows_insert( + x: DataFrame, + y: DataFrame, + by: Optional[StringOrIter] = None, + copy: bool = True +) -> DataFrame: + """Adds new rows to a data frame + + Argument `in_place` not supported, as we always do data frames here. + + Args: + x: The seed data frame + y: The data frame with rows to be inserted into `x`. + - Key values in `y` must not occur in `x` + - `y` must have the same or a subset columns of `x` + by: A string or a list of strings giving the key columns. + The key values must uniquely identify each row + (i.e. each combination of key values occurs at most once), + and the key columns must exist in both x and y. + By default, we use the first column in y, since the first column + is a reasonable place to put an identifier variable. + copy: If `False`, do not copy data unnecessarily. + Original API does not support this. 
This argument will be + passed by to `pandas.concat()` as `copy` argument. + + Returns: + A data frame with `y` inserted into `x` + """ + key = _rows_check_key(by, x, y) + _rows_check_key_df(x, key, df_name='x') + _rows_check_key_df(y, key, df_name='y') + + idx = _rows_match(y[key], x[key]) + bad = ~pandas.isna(idx) + if any(bad): + raise ValueError("Attempting to insert duplicate rows.") + + return bind_rows(x, y, _copy=copy) + +@register_verb(DataFrame) +def rows_update( + x: DataFrame, + y: DataFrame, + by: Optional[StringOrIter] = None, + copy: bool = True +) -> DataFrame: + """Modifies existing rows in a data frame + + See Also: + [`rows_insert`](datar.dplyr.rows.rows_insert) + + Args: + x: The seed data frame + y: The data frame with rows to be inserted into `x`. + - Key values in `y` must not occur in `x` + - `y` must have the same or a subset columns of `x` + by: A string or a list of strings giving the key columns. + The key values must uniquely identify each row + (i.e. each combination of key values occurs at most once), + and the key columns must exist in both x and y. + By default, we use the first column in y, since the first column + is a reasonable place to put an identifier variable. + copy: Whether `x` should be copied and updated or updated directly + + Returns: + `x` with values of keys updated + """ + key = _rows_check_key(by, x, y) + _rows_check_key_df(x, key, df_name='x') + _rows_check_key_df(y, key, df_name='y') + + idx = _rows_match(y[key], x[key]) + bad = pandas.isna(idx) + if any(bad): + raise ValueError("Attempting to update missing rows.") + + idx = idx.astype(int) + if copy: + x = x.copy() + x.loc[idx, y.columns] = y.values + return x + +@register_verb(DataFrame) +def rows_patch( + x: DataFrame, + y: DataFrame, + by: Optional[StringOrIter] = None, + copy: bool = True +) -> DataFrame: + """Works like `rows_update()` but only overwrites `NA` values. + + See Also: + [`rows_insert`](datar.dplyr.rows.rows_insert) + + Args: + x: The seed data frame + y: The data frame with rows to be inserted into `x`. + - Key values in `y` must not occur in `x` + - `y` must have the same or a subset columns of `x` + by: A string or a list of strings giving the key columns. + The key values must uniquely identify each row + (i.e. each combination of key values occurs at most once), + and the key columns must exist in both x and y. + By default, we use the first column in y, since the first column + is a reasonable place to put an identifier variable. + copy: Whether `x` should be copied and updated or updated directly + + Returns: + `x` with values of keys updated + """ + key = _rows_check_key(by, x, y) + _rows_check_key_df(x, key, df_name='x') + _rows_check_key_df(y, key, df_name='y') + + idx = _rows_match(y[key], x[key]) + bad = pandas.isna(idx) + if any(bad): + raise ValueError("Attempting to patch missing rows.") + + new_data = [] + for col in y.columns: + new_data.append(coalesce(x.loc[idx, col].values, y[col])) + + if copy: + x = x.copy() + x.loc[idx, y.columns] = numpy.array(new_data).T + return x + +@register_verb(DataFrame) +def rows_upsert( + x: DataFrame, + y: DataFrame, + by: Optional[StringOrIter] = None, + copy: bool = True +) -> DataFrame: + """Inserts or updates depending on whether or not the + key value in `y` already exists in `x`. + + See Also: + [`rows_insert`](datar.dplyr.rows.rows_insert) + + Args: + x: The seed data frame + y: The data frame with rows to be inserted into `x`. 
+ - Key values in `y` must not occur in `x` + - `y` must have the same or a subset columns of `x` + by: A string or a list of strings giving the key columns. + The key values must uniquely identify each row + (i.e. each combination of key values occurs at most once), + and the key columns must exist in both x and y. + By default, we use the first column in y, since the first column + is a reasonable place to put an identifier variable. + copy: If `False`, do not copy data unnecessarily. + Original API does not support this. This argument will be + passed by to `pandas.concat()` as `copy` argument. + + Returns: + `x` with values of keys updated + """ + key = _rows_check_key(by, x, y) + _rows_check_key_df(x, key, df_name='x') + _rows_check_key_df(y, key, df_name='y') + + idx = _rows_match(y[key], x[key]) + new = pandas.isna(idx) + # idx of x + idx_existing = idx[~new] + + x.loc[idx_existing, y.columns] = y.loc[~new].values + return bind_rows(x, y.loc[new], _copy=copy) + +@register_verb(DataFrame) +def rows_delete( + x: DataFrame, + y: DataFrame, + by: Optional[StringOrIter] = None, + copy: bool = True +) -> DataFrame: + """Deletes rows; key values in `y` must exist in `x`. + + See Also: + [`rows_insert`](datar.dplyr.rows.rows_insert) + + Args: + x: The seed data frame + y: The data frame with rows to be inserted into `x`. + - Key values in `y` must not occur in `x` + - `y` must have the same or a subset columns of `x` + by: A string or a list of strings giving the key columns. + The key values must uniquely identify each row + (i.e. each combination of key values occurs at most once), + and the key columns must exist in both x and y. + By default, we use the first column in y, since the first column + is a reasonable place to put an identifier variable. 
+ copy: Whether `x` should be copied and deleted or deleted directly + + Returns: + `x` with values of keys deleted + """ + key = _rows_check_key(by, x, y) + _rows_check_key_df(x, key, df_name='x') + _rows_check_key_df(y, key, df_name='y') + + extra_cols = setdiff(y.columns, key) + if len(extra_cols) > 0: + logger.info("Ignoring extra columns: %s", extra_cols) + + idx = _rows_match(y[key], x[key]) + bad = pandas.isna(idx) + + if any(bad): + raise ValueError("Attempting to delete missing rows.") + + if copy: + x = x.copy() + + return x.loc[~x.index.isin(idx), :] + +# helpers ----------------------------------------------------------------- + +def _rows_check_key( + by: Optional[StringOrIter], + x: DataFrame, + y: DataFrame +) -> List[str]: + """Check the key and return the valid key""" + if by is None: + by = y.columns[0] + logger.info("Matching, by=%r", by) + + if is_scalar(by): + by = [by] + + for by_elem in by: + if not isinstance(by_elem, str): + raise ValueError("`by` must be a string or a list of strings.") + + bad = setdiff(y.columns, x.columns) + if len(bad) > 0: + raise ValueError("All columns in `y` must exist in `x`.") + + return by + +def _rows_check_key_df(df: DataFrame, by: List[str], df_name: str) -> None: + """Check key with the data frame""" + y_miss = setdiff(by, df.columns) + if len(y_miss) > 0: + raise ValueError(f"All `by` columns must exist in `{df_name}`.") + + if any(df.duplicated(by)): + raise ValueError(f"`{df_name}` key values are not unique.") + +def _rows_match(x: DataFrame, y: DataFrame) -> numpy.ndarray: + """Mimic vctrs::vec_match""" + id_col = '__id__' + y_with_id = rownames_to_column(y, var=id_col) + return left_join(x, y_with_id)[id_col].values diff --git a/datar/tibble/funcs.py b/datar/tibble/funcs.py index 1a40eda0..69fd5123 100644 --- a/datar/tibble/funcs.py +++ b/datar/tibble/funcs.py @@ -6,25 +6,31 @@ from pipda import Context, register_func, register_verb from pipda.utils import Expression from pipda.symbolic import DirectRefAttr, DirectRefItem +from pipda.function import Function from varname import argname, varname from varname.utils import VarnameRetrievingError from ..core.defaults import DEFAULT_COLUMN_PREFIX from ..core.utils import ( copy_attrs, df_assign_item, get_option, position_after, - position_at, to_df, logger + position_at, to_df, logger, apply_dtypes ) from ..core.names import repair_names from ..core.grouped import DataFrameGroupBy, DataFrameRowwise from ..core.types import is_scalar from ..core.exceptions import ColumnNotExistingError -from ..base import setdiff +from ..core.collections import Collection +from ..core.types import StringOrIter +from ..base import setdiff, c def tibble( *args: Any, _name_repair: Union[str, Callable] = 'check_unique', _rows: Optional[int] = None, _base0: Optional[bool] = None, + _dtypes: Optional[ + Mapping[str, Union[StringOrIter, type, Iterable[type]]] + ] = None, **kwargs: Any ) -> DataFrame: # pylint: disable=too-many-statements,too-many-branches @@ -78,8 +84,18 @@ def tibble( for name, arg in zip(names, values): if arg is None: continue + if isinstance(arg, Expression): - arg = arg(df, Context.EVAL.value) + # allow f[1:3] to work + if isinstance(arg, DirectRefItem) and isinstance(arg.ref, slice): + arg = Collection(arg.ref, base0=_base0) + elif ( + isinstance(arg, Function) and + arg.func.__qualname__ == c.__qualname__ + ): + arg = arg(df, Context.SELECT.value) + else: + arg = arg(df, Context.EVAL.value) if isinstance(arg, dict): arg = tibble(**arg) @@ -117,12 +133,18 @@ def tibble( if not kwargs 
and len(args) == 1 and isinstance(args[0], DataFrame): copy_attrs(df, args[0]) + + apply_dtypes(df, _dtypes) return df + def tibble_row( *args: Any, _name_repair: Union[str, Callable] = 'check_unique', _base0: Optional[bool] = None, + _dtypes: Optional[ + Mapping[str, Union[StringOrIter, type, Iterable[type]]] + ] = None, **kwargs: Any ) -> DataFrame: """Constructs a data frame that is guaranteed to occupy one row. @@ -156,6 +178,8 @@ def tibble_row( df.__dfname__ = varname(raise_exc=False) except VarnameRetrievingError: # pragma: no cover df.__dfname__ = None + + apply_dtypes(df, _dtypes) return df @register_func(None, context=Context.EVAL) @@ -164,6 +188,9 @@ def fibble( _name_repair: Union[str, Callable] = 'check_unique', _base0: Optional[bool] = None, _rows: Optional[int] = None, + _dtypes: Optional[ + Mapping[str, Union[StringOrIter, type, Iterable[type]]] + ] = None, **kwargs: Any ) -> DataFrame: """A function of tibble that can be used as an argument of verbs @@ -187,10 +214,16 @@ def fibble( *args, **kwargs, _name_repair=_name_repair, _rows=_rows, - _base0=_base0 + _base0=_base0, + _dtypes=_dtypes ) -def tribble(*dummies: Any) -> DataFrame: +def tribble( + *dummies: Any, + _dtypes: Optional[ + Mapping[str, Union[StringOrIter, type, Iterable[type]]] + ] = None +) -> DataFrame: """Create dataframe using an easier to read row-by-row layout Unlike original API that uses formula (`f.col`) to indicate the column @@ -237,6 +270,8 @@ def tribble(*dummies: Any) -> DataFrame: ret.__dfname__ = varname(raise_exc=False) except VarnameRetrievingError: # pragma: no cover ret.__dfname__ = None + + apply_dtypes(ret, _dtypes) return ret def enframe( diff --git a/datar/tidyr/__init__.py b/datar/tidyr/__init__.py index 8f5d5180..fc730165 100644 --- a/datar/tidyr/__init__.py +++ b/datar/tidyr/__init__.py @@ -1,8 +1,17 @@ """APIs ported from R-tidyr""" -from .funcs import full_seq, nesting -from .verbs import ( - pivot_longer, pivot_wider, uncount, replace_na, fill, - expand_grid, extract, separate, separate_rows, unite, - drop_na, expand -) +from .funcs import full_seq +from .chop import chop, unchop +from .nest import nest, unnest +from .pack import pack, unpack +from .expand import expand_grid, expand, nesting, crossing +from .complete import complete +from .drop_na import drop_na +from .extract import extract +from .fill import fill +from .pivot_long import pivot_longer +from .pivot_wide import pivot_wider +from .separate import separate, separate_rows +from .uncount import uncount +from .unite import unite +from .replace_na import replace_na diff --git a/datar/tidyr/chop.py b/datar/tidyr/chop.py new file mode 100644 index 00000000..26d822dc --- /dev/null +++ b/datar/tidyr/chop.py @@ -0,0 +1,295 @@ +"""Chop and unchop + +https://github.com/tidyverse/tidyr/blob/master/R/chop.R +""" +from collections import defaultdict +from typing import Iterable, List, Mapping, Optional, Tuple, Union + +import numpy +import pandas +from pandas import DataFrame, Series +from pipda import register_verb + +from ..core.types import IntOrIter, StringOrIter, DTypeType, is_scalar +from ..core.utils import ( + vars_select, copy_attrs, apply_dtypes, keep_column_order +) +from ..core.exceptions import ColumnNotExistingError +from ..core.contexts import Context +from ..core.grouped import DataFrameGroupBy + +from ..base import union, NA +from ..dplyr import ( + bind_cols, group_by, mutate, pull, arrange, + group_data, group_by_drop_default, group_vars +) + +from .drop_na import drop_na + +@register_verb(DataFrame, 
context=Context.SELECT) +def chop( + data: DataFrame, + cols: Optional[Union[IntOrIter, StringOrIter]] = None, + _base0: Optional[bool] = None +) -> DataFrame: + """Makes data frame shorter by converting rows within each group + into list-columns. + + Args: + data: A data frame + cols: Columns to chop + _base0: Whether `cols` are 0-based + if not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + Data frame with selected columns chopped + """ + if cols is None: + return data.copy() + + all_columns = data.columns + cols = vars_select(all_columns, cols, base0=_base0) + cols = all_columns[cols] + # when cols is empty + # order may change for all_columns.difference([]) + key_cols = all_columns.difference(cols) if len(cols) > 0 else all_columns + + vals = data[cols] + keys = data[key_cols] + + compacted = [] + if data.shape[0] == 0: + split_key = keys + else: + split = _vec_split(vals, keys) + try: + split_key = split >> pull('key', to='frame') + except ColumnNotExistingError: + split_key = None + split_val = split >> pull('val', to='list') + + for val in split_val: + compacted.append(_compact_df(val)) + + if not compacted: + vals = DataFrame(columns=cols) + else: + vals = pandas.concat(compacted, ignore_index=True) + + out = bind_cols(split_key, vals) + if isinstance(data, DataFrameGroupBy): + out = data.__class__( + out, + _group_vars=group_vars(data), + _drop=group_by_drop_default(data) + ) + + copy_attrs(out, data) + return out + +@register_verb(DataFrame, context=Context.SELECT) +def unchop( + data: DataFrame, + cols: Optional[Union[IntOrIter, StringOrIter]] = None, + keep_empty: bool = False, + dtypes: Optional[Union[DTypeType, Mapping[str, DTypeType]]] = None, + _base0: Optional[bool] = None +) -> DataFrame: + """Makes df longer by expanding list-columns so that each element + of the list-column gets its own row in the output. + + See https://tidyr.tidyverse.org/reference/chop.html + + Recycling size-1 elements might be different from `tidyr` + >>> df = tibble(x=[1, [2,3]], y=[[2,3], 1]) + >>> df >> unchop([f.x, f.y]) + >>> # tibble(x=[1,2,3], y=[2,3,1]) + >>> # instead of following in tidyr + >>> # tibble(x=[1,1,2,3], y=[2,3,1,1]) + + Args: + data: A data frame. + cols: Columns to unchop. + keep_empty: By default, you get one row of output for each element + of the list your unchopping/unnesting. + This means that if there's a size-0 element + (like NULL or an empty data frame), that entire row will be + dropped from the output. + If you want to preserve all rows, use `keep_empty` = `True` to + replace size-0 elements with a single row of missing values. + dtypes: NOT `ptype`. Providing the dtypes for the output columns. + Could be a single dtype, which will be applied to all columns, or + a dictionary of dtypes with keys for the columns and values the + dtypes. + For nested data frames, we need to specify `col$a` as key. If `col` + is used as key, all columns of the nested data frames will be casted + into that dtype. + _base0: Whether `cols` are 0-based + if not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + A data frame with selected columns unchopped. 
+ """ + all_columns = data.columns + cols = vars_select(all_columns, cols, base0=_base0) + + if len(cols) == 0 or data.shape[0] == 0: + return data.copy() + + cols = all_columns[cols] + key_cols = all_columns.difference(cols).tolist() + out = _unchopping(data, cols, key_cols, keep_empty) + + apply_dtypes(out, dtypes) + if isinstance(data, DataFrameGroupBy): + out = data.__class__( + out, + _group_vars=group_vars(data), + _drop=group_by_drop_default(data) + ) + + copy_attrs(out, data) + return out + +def _vec_split( + x: Union[DataFrame, Series], + by: Union[DataFrame, Series] +) -> DataFrame: + """Split a vector into groups + + Returns a data frame with columns `key` and `val`. `key` is a stacked column + with data from by. + """ + if isinstance(x, Series): + x = x.to_frame() + if isinstance(by, Series): + by = by.to_frame() + df = bind_cols(x, by) + if df.shape[0] == 0: + return DataFrame(columns=['key', 'val']) + df = df >> group_by(*by.columns) + gdata = group_data(df) + gdata = arrange(gdata, gdata._rows) + out = DataFrame(index=gdata.index) + return mutate( + out, + key=gdata[by.columns], + val=[x.iloc[rows, :] for rows in gdata._rows] + ) + +def _compact_df(data: DataFrame) -> DataFrame: + """Compact each series as list in a data frame""" + out = DataFrame(index=[0], columns=data.columns) + for col in data.columns: + out.loc[0, col] = data[col].values.tolist() + return out + +def _unchopping( + data: DataFrame, + data_cols: Iterable[str], + key_cols: Iterable[str], + keep_empty: bool +) -> DataFrame: + # pylint: disable=line-too-long + """Unchop the data frame + + See https://stackoverflow.com/questions/53218931/how-to-unnest-explode-a-column-in-a-pandas-dataframe + """ + # pylint: enable=line-too-long + # key_cols could be empty + rsize = None + val_data = {} + for dcol in data_cols: + # check dtype first so that we don't need to check + # other types of columns element by element + is_df_col = data_cols.dtype == object and all( + # it's either null or a dataframe + (is_scalar(val) and pandas.isnull(val)) + or isinstance(val, DataFrame) + for val in data[dcol] + ) + if is_df_col: + vdata, sizes, dtypes = _unchopping_df_column(data[dcol]) + else: + vdata, sizes, dtypes = _unchopping_nondf_column(data[dcol]) + val_data.update(vdata) + + if rsize is None: + rsize = sizes + else: + tmpsize = [] + for prevsize, cursize in zip(rsize, sizes): + if prevsize != cursize and 1 not in (prevsize, cursize): + raise ValueError( + f"Incompatible lengths: {prevsize}, {cursize}." 
+ ) + tmpsize.append(max(prevsize, cursize)) + rsize = tmpsize + + key_data = {key: numpy.repeat(data[key].values, rsize) for key in key_cols} + key_data.update(val_data) + # DataFrame(key_data) may have nested dfs + # say y$a, then ['y'] will not select it + out = keep_column_order(DataFrame(key_data), data.columns) + if not keep_empty: + out = drop_na(out, *val_data, how='all') + apply_dtypes(out, dtypes) + copy_attrs(out, data) + return out + +def _unchopping_df_column( + series: Series +) -> Tuple[Mapping[str, List], List[int], Mapping[str, DTypeType]]: + """Unchopping dataframe column""" + # Get union column names + union_cols = [] + # try to keep the same dtype + dtypes = None + for val in series: + if isinstance(val, DataFrame): + union_cols = union(union_cols, val.columns) + if dtypes is None: + dtypes = {col: val[col].dtype for col in val} + else: + for col in val: + # pylint: disable=unsupported-membership-test + # pylint: disable=unsupported-delete-operation + if col in dtypes and dtypes[col] != val[col].dtype: + del dtypes[col] + + sizes = [] + val_data = defaultdict(list) + # add missing columns to each df + for val in series: + if isinstance(val, DataFrame): + for col in union_cols: + val_data[f"{series.name}${col}"].extend( + val[col] if col in val + else [NA] * val.shape[0] + ) + sizes.append(val.shape[0]) + else: # null + for col in union_cols: + val_data[f"{series.name}${col}"].append(NA) + sizes.append(1) + + return val_data, sizes, dtypes + +def _unchopping_nondf_column( + series: Series +) -> Tuple[Mapping[str, List], List[int], Mapping[str, DTypeType]]: + """Unchopping non-dataframe column""" + val_data = {} + vals = [ + [val] if is_scalar(val) else val + for val in series + ] + val_data[series.name] = Series( + numpy.concatenate( + vals, + axis=None, + # casting="no" # only for numpy 1.20.0+ + ), + dtype=series.dtype + ) + return val_data, [len(val) for val in vals], {} diff --git a/datar/tidyr/complete.py b/datar/tidyr/complete.py new file mode 100644 index 00000000..9626ced2 --- /dev/null +++ b/datar/tidyr/complete.py @@ -0,0 +1,56 @@ +"""Complete a data frame with missing combinations of data + +https://github.com/tidyverse/tidyr/blob/HEAD/R/complete.R +""" + +from typing import Optional, Iterable, Mapping, Any + +from pandas import DataFrame +from pipda import register_verb + +from ..core.contexts import Context +from ..core.grouped import DataFrameGroupBy + +from ..dplyr import full_join, group_vars, group_by_drop_default +from .replace_na import replace_na +from .expand import expand + +@register_verb(DataFrame, context=Context.PENDING) +def complete( + data: DataFrame, + *args: Iterable[Any], + fill: Optional[Mapping[str, Any]] = None, + **kwargs: Iterable[Any] +) -> DataFrame: + """Turns implicit missing values into explicit missing values. + + Args: + data: A data frame + *args: columns to expand. Columns can be atomic lists. + - To find all unique combinations of x, y and z, including + those not present in the data, supply each variable as a + separate argument: `expand(df, x, y, z)`. + - To find only the combinations that occur in the data, use + nesting: `expand(df, nesting(x, y, z))`. + - You can combine the two forms. For example, + `expand(df, nesting(school_id, student_id), date)` would + produce a row for each present school-student combination + for all possible dates. 
+ + Returns: + Data frame with missing values completed + """ + full = expand(data, *args, **kwargs) + if full.shape[0] == 0: + return data.copy() + + full = full_join(full, data, by=full.columns.tolist()) + full = replace_na(full, fill) + + if isinstance(full, DataFrameGroupBy): + return DataFrameGroupBy( + full, + _group_vars=group_vars(data), + _drop=group_by_drop_default(data) + ) + return full diff --git a/datar/tidyr/drop_na.py b/datar/tidyr/drop_na.py new file mode 100644 index 00000000..77b6ad4b --- /dev/null +++ b/datar/tidyr/drop_na.py @@ -0,0 +1,56 @@ +"""Drop rows containing missing values + +https://github.com/tidyverse/tidyr/blob/HEAD/R/drop-na.R +""" + +from typing import Optional +from pandas import DataFrame +from pipda import register_verb + +from ..core.contexts import Context +from ..core.utils import arg_match, vars_select, copy_attrs +from ..core.grouped import DataFrameGroupBy + +from ..dplyr import group_vars, group_by_drop_default + +@register_verb(DataFrame, context=Context.SELECT) +def drop_na( + _data: DataFrame, + *columns: str, + how: str = 'any', + _base0: Optional[bool] = None +) -> DataFrame: + """Drop rows containing missing values + + See https://tidyr.tidyverse.org/reference/drop_na.html + + Args: + data: A data frame. + *columns: Columns to inspect for missing values. + how: How to select the rows to drop + - all: All columns of `columns` to be `NA`s + - any: Any columns of `columns` to be `NA`s + (tidyr doesn't support this argument) + _base0: Whether `*columns` are 0-based if given by indexes + If not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + Dataframe with rows with NAs dropped and indexes dropped + """ + arg_match(how, ['any', 'all']) + all_columns = _data.columns + if columns: + columns = vars_select(all_columns, *columns, base0=_base0) + columns = all_columns[columns] + out = _data.dropna(subset=columns, how=how).reset_index(drop=True) + else: + out = _data.dropna(how=how).reset_index(drop=True) + + if isinstance(_data, DataFrameGroupBy): + out = _data.__class__( + out, + _group_vars=group_vars(_data), + _drop=group_by_drop_default(_data) + ) + copy_attrs(out, _data) + return out diff --git a/datar/tidyr/expand.py b/datar/tidyr/expand.py new file mode 100644 index 00000000..c39b2bbc --- /dev/null +++ b/datar/tidyr/expand.py @@ -0,0 +1,398 @@ +"""Expand data frame to include all possible combinations of values + +https://github.com/tidyverse/tidyr/blob/HEAD/R/expand.R +""" + +from typing import Any, Callable, Iterable, Mapping, Optional, Union + +import numpy +import pandas +from numpy import product +from pandas import DataFrame, Series, Categorical +from pandas.core.dtypes.common import is_categorical_dtype +from pipda import register_func, register_verb + +from ..core.contexts import Context +from ..core.defaults import DEFAULT_COLUMN_PREFIX +from ..core.types import is_scalar +from ..core.utils import categorized, copy_attrs +from ..core.grouped import DataFrameGroupBy, DataFrameRowwise +from ..core.names import repair_names + +from ..base import NA, NULL, factor, levels +from ..tibble import tibble +from ..dplyr import ( + arrange, distinct, pull, group_by_drop_default, group_vars +) + +@register_func(None, context=Context.EVAL) +def expand_grid( + *args: Iterable[Any], + _name_repair: Union[str, Callable] = 'check_unique', + _base0: Optional[bool] = None, + **kwargs: Iterable[Any] +) -> DataFrame: + """Create a tibble from all combinations of inputs + + Args: + *args: and + **kwargs: name-value pairs. 
+ For `*args`, names will be inferred from the values and if failed, + `_Var0`, `_Var1`, etc will be used. + _name_repair: treatment of problematic column names: + - "minimal": No name repair or checks, beyond basic existence, + - "unique": Make sure names are unique and not empty, + - "check_unique": (default value), no name repair, + but check they are unique, + - "universal": Make the names unique and syntactic + - a function: apply custom name repair + _base0: Whether the suffixes of repaired names should be 0-based. + If not provided, will use `datar.base.getOption('index.base.0')`. + + Returns: + A data frame with one column for each input in `*args` and `**kwargs`. + The output will have one row for each combination of the inputs, + i.e. the size be equal to the product of the sizes of the inputs. + This implies that if any input has length 0, the output will have + zero rows. + """ + dots = _dots_cols(*args, **kwargs) + named = dots.pop('__named__') + ns = {key: len(val) for key, val in dots.items()} + n = product(list(ns.values())) + + if n == 0: + out = { + key: ( + val.iloc[[], :] + if isinstance(val, DataFrame) + else [] + ) + for key, val in dots.items() + } + else: + n = numpy.array([n], dtype=float) + ns_np = numpy.array(list(ns.values()), dtype=float) + + each = n / numpy.cumprod(ns_np) + times = n / each / ns_np + + each = dict(zip(dots, each.astype(int))) + times = dict(zip(dots, times.astype(int))) + out = { + key: _vec_repeat(val, each[key], times[key]) + for key, val in dots.items() + } + + ## tibble will somehow flatten the nested dataframes into fake nested df. + ## do it inside _flatten_nested + # out = tibble(out, _name_repair=_name_repair, _base0=_base0) + return _flatten_nested(out, named, _name_repair, _base0) + +@register_verb(DataFrame, context=Context.EVAL) +def expand( + data: DataFrame, + *args: Union[Series, DataFrame], + _name_repair: Union[str, Callable] = "check_unique", + _base0: Optional[bool] = None, + **kwargs: Union[Series, DataFrame] +) -> DataFrame: + """Generates all combination of variables found in a dataset. + + Args: + data: A data frame + *args: and, + **kwargs: columns to expand. Columns can be atomic lists. + - To find all unique combinations of x, y and z, including + those not present in the data, supply each variable as a + separate argument: `expand(df, x, y, z)`. + - To find only the combinations that occur in the data, use + nesting: `expand(df, nesting(x, y, z))`. + - You can combine the two forms. For example, + `expand(df, nesting(school_id, student_id), date)` would + produce a row for each present school-student combination + for all possible dates. + _name_repair: treatment of problematic column names: + - "minimal": No name repair or checks, beyond basic existence, + - "unique": Make sure names are unique and not empty, + - "check_unique": (default value), no name repair, + but check they are unique, + - "universal": Make the names unique and syntactic + - a function: apply custom name repair + _base0: Whether the suffixes of repaired names should be 0-based. + If not provided, will use `datar.base.getOption('index.base.0')`. + + Returns: + A data frame with all combination of variables. 
+ """ + cols = _dots_cols(*args, **kwargs) + named = cols.pop('__named__') + cols = { + key: _sorted_unique(val) for key, val in cols.items() + } + + out = expand_grid(**cols, _name_repair=_name_repair, _base0=_base0) + out = _flatten_nested(out, named, _name_repair, _base0) + + copy_attrs(out, data) + return out + +@expand.register(DataFrameGroupBy, context=Context.PENDING) +def _( + data: DataFrameGroupBy, + *args: Union[Series, DataFrame], + _name_repair: Union[str, Callable] = "check_unique", + _base0: Optional[bool] = None, + **kwargs: Union[Series, DataFrame] +) -> DataFrameGroupBy: + """Expand on grouped data frame""" + def apply_func(df): + return expand( + df, + *args, + _name_repair=_name_repair, + _base0=_base0, + **kwargs + ) + + out = data.group_apply(apply_func) + out = DataFrameGroupBy( + out, + _group_vars=group_vars(data), + _drop=group_by_drop_default(data) + ) + copy_attrs(out, data) + return out + +@expand.register(DataFrameRowwise, context=Context.EVAL) +def _( + data: DataFrameRowwise, + *args: Union[Series, DataFrame], + _name_repair: Union[str, Callable] = "check_unique", + _base0: Optional[bool] = None, + **kwargs: Union[Series, DataFrame] +) -> DataFrame: + """Expand on rowwise dataframe""" + return expand.dispatch(DataFrame)( + data, + *args, + _name_repair=_name_repair, + _base0=_base0, + **kwargs + ) + +@register_func(None, context=Context.EVAL) +def nesting( + *args: Any, + _name_repair: Union[str, Callable] = "check_unique", + _base0: Optional[bool] = None, + **kwargs: Any +) -> DataFrame: + """A helper that only finds combinations already present in the data. + + Args: + *args: and, + **kwargs: columns to expand. Columns can be atomic lists. + - To find all unique combinations of x, y and z, including + those not present in the data, supply each variable as a + separate argument: `expand(df, x, y, z)`. + - To find only the combinations that occur in the data, use + nesting: `expand(df, nesting(x, y, z))`. + - You can combine the two forms. For example, + `expand(df, nesting(school_id, student_id), date)` would + produce a row for each present school-student combination + for all possible dates. + _name_repair: treatment of problematic column names: + - "minimal": No name repair or checks, beyond basic existence, + - "unique": Make sure names are unique and not empty, + - "check_unique": (default value), no name repair, + but check they are unique, + - "universal": Make the names unique and syntactic + - a function: apply custom name repair + _base0: Whether the suffixes of repaired names should be 0-based. + If not provided, will use `datar.base.getOption('index.base.0')`. + + Returns: + A data frame with all combinations in data. + """ + cols = _dots_cols(*args, **kwargs) + named = cols.pop('__named__') + out = _sorted_unique( + tibble(**cols, _name_repair=_name_repair, _base0=_base0) + ) + return _flatten_nested(out, named, _name_repair, _base0) + + +@register_func(None, context=Context.EVAL) +def crossing( + *args: Any, + _name_repair: Union[str, Callable] = "check_unique", + _base0: Optional[bool] = None, + **kwargs: Any +) -> DataFrame: + """A wrapper around `expand_grid()` that de-duplicates and sorts its inputs + + When values are not specified by literal `list`, they will be sorted. + + Args: + *args: and, + **kwargs: columns to expand. Columns can be atomic lists. + - To find all unique combinations of x, y and z, including + those not present in the data, supply each variable as a + separate argument: `expand(df, x, y, z)`. 
+ - To find only the combinations that occur in the data, use + nesting: `expand(df, nesting(x, y, z))`. + - You can combine the two forms. For example, + `expand(df, nesting(school_id, student_id), date)` would + produce a row for each present school-student combination + for all possible dates. + _name_repair: treatment of problematic column names: + - "minimal": No name repair or checks, beyond basic existence, + - "unique": Make sure names are unique and not empty, + - "check_unique": (default value), no name repair, + but check they are unique, + - "universal": Make the names unique and syntactic + - a function: apply custom name repair + _base0: Whether the suffixes of repaired names should be 0-based. + If not provided, will use `datar.base.getOption('index.base.0')`. + + Returns: + A data frame with values deduplicated and sorted. + """ + cols = _dots_cols(*args, **kwargs) + named = cols.pop('__named__') + out = { + key: _sorted_unique(val) + for key, val in cols.items() + } + + out = expand_grid(**out, _name_repair=_name_repair, _base0=_base0) + return _flatten_nested(out, named, _name_repair, _base0) + + + +# Helpers -------------------------------- +def _dots_cols( + *args: Iterable[Any], + **kwargs: Iterable[Any] +) -> Mapping[str, Iterable[Any]]: + """Mimic tidyr:::dots_cols to clean up the dot (args, kwargs) arugments""" + out = {'__named__': {}} + for i, arg in enumerate(args): + if arg is None: + continue + + name = getattr( + arg, + '__dfname__', + getattr(arg, 'name', getattr(arg, '__name__', None)) + ) + name = name or f'{DEFAULT_COLUMN_PREFIX}{i}' + out['__named__'][name] = False + out[name] = [arg] if is_scalar(arg) else arg + + for name, arg in kwargs.items(): + if arg is None: + continue + + out[name] = [arg] if is_scalar(arg) else arg + out['__named__'][name] = True + + return out + +def _vec_repeat( + vec: Iterable[Any], + each: Iterable[int], + times: Iterable[int] +) -> Iterable[Any]: + """Repeat a vector or a dataframe by rows""" + if isinstance(vec, DataFrame): + indexes = _vec_repeat(vec.index, each=each, times=times) + return vec.loc[indexes, :].reset_index(drop=True) + + vec = categorized(vec) + # numpy.repeat() turn [numpy.nan, 'A'] to ['nan', 'A'] + vec_to_rep = vec + if ( + any(isinstance(elem, str) for elem in vec) and + any(pandas.isnull(elem) for elem in vec) + ): + vec_to_rep = numpy.array(vec, dtype=object) + out = numpy.tile(numpy.repeat(vec_to_rep, each), times) + if is_categorical_dtype(vec): + return factor(out, levels(vec), ordered=vec.ordered) + return out + +def _flatten_nested( + x: Union[DataFrame, Mapping[str, Iterable[Any]]], + named: Mapping[str, bool], + name_repair: Union[str, Callable], + base0: Optional[bool] = None +) -> DataFrame: + """Mimic `tidyr:::flatten_nested`""" + if isinstance(x, DataFrame): + names = repair_names(list(named), name_repair, base0) + named = dict(zip(names, named.values())) + x = {name: pull(x, name) for name in named} + + to_flatten = { + key: isinstance(val, DataFrame) and not named[key] + for key, val in x.items() + } + out = _flatten_at(x, to_flatten) + return tibble(**out, _name_repair=name_repair, _base0=base0) + +def _flatten_at( + x: Mapping[str, Iterable[Any]], + to_flatten: Mapping[str, bool] +) -> Mapping[str, Iterable[Any]]: + """Flatten data at `to_flatten`""" + if not any(to_flatten.values()): + return x + + out = {} + for name, val in x.items(): + if len(val) == 0: + continue + + if to_flatten[name]: + for col in val: + out[col] = val[col] + else: + out[name] = val + return out + +def 
_sorted_unique(x: Iterable[Any]) -> Union[Categorical, numpy.ndarray]: + """Sort and deduplicate the values""" + x = categorized(x) + if is_categorical_dtype(x): + lvls = levels(x) + return factor( + lvls, lvls, exclude=NULL, ordered=x.ordered + ) + + # don't sort on bare list? + # if isinstance(x, list): + # return pandas.unique(x) + + if isinstance(x, DataFrame): + return arrange(distinct(x)) + + # return numpy.sort(numpy.unique(x)) + # numpy.unique() will turn ['A', 'B', numpy.nan] to ['A', 'B', 'nan'] + try: + out = pandas.unique(x) + except TypeError: + # unhashable type: 'list' + # workaround for unhashable elements + # using its stringified form as key, which has side-effects + maps = {str(elem): elem for elem in x} + out = pandas.unique(list(maps.keys())) + out = numpy.array([maps[elem] for elem in out], dtype=object) + + has_na = any(pandas.isna(out)) + if has_na: + out = numpy.sort(out[~pandas.isna(out)]) + return numpy.concatenate([out, [NA]]) + # numpy.sort() cannot do comparisons between string and NA + return numpy.sort(out) diff --git a/datar/tidyr/extract.py b/datar/tidyr/extract.py new file mode 100644 index 00000000..12baf0ff --- /dev/null +++ b/datar/tidyr/extract.py @@ -0,0 +1,109 @@ +"""Extract a character column into multiple columns using regular +expression groups + +https://github.com/tidyverse/tidyr/blob/HEAD/R/extract.R +""" +import re +from typing import Optional, Union, Type, Mapping + +import pandas +from pandas import DataFrame +from pipda import register_verb + +from ..core.types import StringOrIter, DTypeType, is_scalar +from ..core.contexts import Context +from ..core.utils import vars_select, copy_attrs +from ..core.grouped import DataFrameGroupBy + +from ..dplyr import group_vars, group_by_drop_default + + +@register_verb(DataFrame, context=Context.SELECT) +def extract( + data: DataFrame, + col: Union[str, int], + into: StringOrIter, + regex: str = r'(\w+)', + remove: bool = True, + convert: Union[bool, DTypeType, Mapping[str, DTypeType]] = False, + _base0: Optional[bool] = None +) -> DataFrame: + """Given a regular expression with capturing groups, extract() turns each + group into a new column. If the groups don't match, or the input is NA, + the output will be NA. + + See https://tidyr.tidyverse.org/reference/extract.html + + Args: + data: The dataframe + col: Column name or position. + into: Names of new variables to create as character vector. + Use None to omit the variable in the output. + regex: a regular expression used to extract the desired values. + There should be one group (defined by ()) for each element of into. + remove: If TRUE, remove input column from output data frame. + convert: The universal type for the extracted columns or a dict for + individual ones + _base0: Whether `col` is 0-based when given by index + If not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + Dataframe with extracted columns. + """ + if is_scalar(into): + into = [into] + + all_columns = data.columns + col = vars_select(all_columns, col, base0=_base0) + col = all_columns[col[0]] + + outcols = {} + # merge columns with same name + # all columns are already strs + ## 'col' => i, j, k + ## i, j, k are indexes that have same name 'col' + mergedcols = {} + for i, outcol in enumerate(into): + if is_scalar(outcol) and pandas.isnull(outcol): + continue + if not isinstance(outcol, str): + raise ValueError( + "`into` must be a string or an iterable of strings." 
+ ) + outcols[i] = outcol + mergedcols.setdefault(outcol, []).append(i) + + regex = re.compile(regex) + if regex.groups != len(into): + raise ValueError( + f"`regex` should define {len(into)} groups; " + f"found {regex.groups}." + ) + out = data[col].str.extract(regex) + out = { + outcol: ( + out.iloc[:, indexes[0]] + if len(indexes) == 1 + else out.iloc[:, indexes].astype(str).agg(''.join, axis=1) + ) + for outcol, indexes in mergedcols.items() + } + out = DataFrame(out) + + if isinstance(convert, (str, Type)): + out = out.astype(convert) + elif isinstance(convert, dict): + for key, conv in convert.items(): + out[key] = out[key].astype(conv) + + base = data[all_columns.difference([col])] if remove else data + out = pandas.concat([base, out], axis=1) + if isinstance(data, DataFrameGroupBy): + out = data.__class__( + out, + _group_vars=group_vars(data), + _drop=group_by_drop_default(data) + ) + + copy_attrs(out, data) + return out diff --git a/datar/tidyr/fill.py b/datar/tidyr/fill.py new file mode 100644 index 00000000..34970c67 --- /dev/null +++ b/datar/tidyr/fill.py @@ -0,0 +1,75 @@ +"""Fill in missing values with previous or next value + +https://github.com/tidyverse/tidyr/blob/HEAD/R/fill.R +""" + +from typing import Optional, Union + +from pandas import DataFrame +from pipda import register_verb + +from ..core.contexts import Context +from ..core.utils import vars_select, copy_attrs +from ..core.grouped import DataFrameGroupBy + +from ..dplyr import group_vars, group_by_drop_default + +@register_verb( + DataFrame, + context=Context.SELECT +) +def fill( + _data: DataFrame, + *columns: Union[str, int], + _direction: str = "down", + _base0: Optional[bool] = None +) -> DataFrame: + """Fills missing values in selected columns using the next or + previous entry. + + See https://tidyr.tidyverse.org/reference/fill.html + + Args: + _data: A dataframe + *columns: Columns to fill + _direction: Direction in which to fill missing values. + Currently either "down" (the default), "up", + "downup" (i.e. first down and then up) or + "updown" (first up and then down). + _base0: Whether `*columns` are 0-based if given by indexes + If not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + The dataframe with NAs being replaced. 
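+
+    Examples:
+        A small sketch with made-up values (NA marks a missing value; the
+        shown results are the expected behaviour, not verified here):
+        >>> df = tibble(x=[1, NA, NA, 4])
+        >>> fill(df, f.x)                       # down: 1, 1, 1, 4
+        >>> fill(df, f.x, _direction='up')      # up:   1, 4, 4, 4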
+ """ + data = _data.copy() + if not columns: + data = data.fillna( + method='ffill' if _direction.startswith('down') else 'bfill', + ) + if _direction in ('updown', 'downup'): + data = data.fillna( + method='ffill' if _direction.endswith('down') else 'bfill', + ) + else: + colidx = vars_select(data.columns, *columns, base0=_base0) + data.iloc[:, colidx] = fill(data.iloc[:, colidx], _direction=_direction) + return data + +@fill.register(DataFrameGroupBy, context=Context.SELECT) +def _( + _data: DataFrameGroupBy, + *columns: str, + _direction: str = "down" +) -> DataFrameGroupBy: + # DataFrameGroupBy + out = _data.group_apply( + lambda df: fill(df, *columns, _direction=_direction) + ) + out = _data.__class__( + out, + _group_vars=group_vars(_data), + _drop=group_by_drop_default(_data) + ) + copy_attrs(out, _data) + return out diff --git a/datar/tidyr/funcs.py b/datar/tidyr/funcs.py index e538f278..1c052863 100644 --- a/datar/tidyr/funcs.py +++ b/datar/tidyr/funcs.py @@ -1,12 +1,11 @@ """Functions from tidyr""" -from typing import Any, Iterable +from typing import Iterable from pipda import register_func from ..core.types import NumericType from ..core.contexts import Context -from ..core.middlewares import Nesting from ..base import seq @register_func(None, context=Context.EVAL) @@ -40,8 +39,3 @@ def full_seq( maxx += tol return seq(minx, maxx, by=period) - -@register_func(None, context=None) -def nesting(*cols: Any, **kwargs: Any) -> Nesting: - """Nesting""" - return Nesting(*cols, **kwargs) diff --git a/datar/tidyr/nest.py b/datar/tidyr/nest.py new file mode 100644 index 00000000..7f464cc1 --- /dev/null +++ b/datar/tidyr/nest.py @@ -0,0 +1,226 @@ +"""Nest and unnest + +https://github.com/tidyverse/tidyr/blob/master/R/nest.R +""" +from typing import Callable, Mapping, Optional, Union, Iterable, List +import re + +import pandas +from pandas import DataFrame, Series +from pipda import register_verb + +from ..core.types import DTypeType, is_scalar +from ..core.utils import vars_select, align_value, to_df +from ..core.grouped import DataFrameGroupBy, DataFrameRowwise +from ..core.contexts import Context + +from ..base import setdiff, intersect, NA +from ..dplyr import distinct, bind_cols, group_vars, group_by_drop_default + +from .chop import unchop, _vec_split +from .pack import unpack + +@register_verb(DataFrame, context=Context.SELECT) +def nest( + _data: DataFrame, + _names_sep: Optional[str] = None, + _base0: Optional[bool] = None, + **cols: Union[str, int] +) -> DataFrame: + """Nesting creates a list-column of data frames + + Args: + _data: A data frame + **cols: Columns to nest + _names_sep: If `None`, the default, the names will be left as is. + Inner names will come from the former outer names + If a string, the inner and outer names will be used together. + The names of the new outer columns will be formed by pasting + together the outer and the inner column names, separated by + `_names_sep`. + _base0: Whether `**cols` are 0-based + if not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + Nested data frame. 
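+
+    Examples:
+        A rough sketch (hypothetical data); the nested column holds
+        data frames, mirroring tidyr's list-column of tibbles:
+        >>> df = tibble(g=[1, 1, 2], x=[1, 2, 3])
+        >>> nest(df, data=f.x)
+        >>> # 2 rows: g=1 paired with a 2-row frame, g=2 with a 1-row frame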
+ """ + if not cols: + raise ValueError("`**cols` must not be empty.") + + all_columns = _data.columns + colgroups = {} + usedcols = set() + for group, columns in cols.items(): + oldcols = all_columns[vars_select(all_columns, columns, base0=_base0)] + usedcols = usedcols.union(oldcols) + newcols = ( + oldcols if _names_sep is None else + _strip_names(oldcols, group, _names_sep) + ) + colgroups[group] = dict(zip(newcols, oldcols)) + + asis = setdiff(_data.columns, usedcols) + keys = _data[asis] + u_keys = distinct(keys) + + nested = [] + for group, columns in colgroups.items(): + if _names_sep is None: # names as is + # out <- map(cols, ~ vec_split(.data[.x], keys)$val) + val = _vec_split(_data[list(columns)], keys).val + else: + # out <- map( + # cols, + # ~ vec_split(set_names(.data[.x], names(.x)), keys)$val + # ) + to_split = _data[list(columns.values())] + to_split.columns = list(columns) + val = _vec_split(to_split, keys).val + + nested.append(val) + + out = pandas.concat(nested, ignore_index=True, axis=1) + out.columns = list(colgroups) + if u_keys.shape[1] == 0: + return out if isinstance(out, DataFrame) else out.to_frame() + return bind_cols(u_keys, align_value(out, u_keys)) + +@nest.register(DataFrameGroupBy, context=Context.SELECT) +def _( + _data: DataFrameGroupBy, + _names_sep: Optional[str] = None, + _base0: Optional[bool] = None, + **cols: Mapping[str, Union[str, int]] +) -> DataFrameGroupBy: + """Nesting grouped dataframe""" + if not cols: + cols = {'data': setdiff(_data.columns, group_vars(_data))} + out = nest.dispatch(DataFrame)( + _data, **cols, _names_sep=_names_sep, _base0=_base0 + ) + gvars = intersect(out.columns, group_vars(_data)) + return _data.__class__( + out, + _group_vars=gvars, + _drop=group_by_drop_default(_data) + ) + +@register_verb(DataFrame, context=Context.SELECT) +def unnest( + data: DataFrame, + *cols: Union[str, int], + keep_empty: bool = False, + dtypes: Optional[Union[DTypeType, Mapping[str, DTypeType]]] = None, + names_sep: Optional[str] = None, + names_repair: Union[str, Callable] = 'check_unique', + _base0: Optional[bool] = None +) -> DataFrame: + """Flattens list-column of data frames back out into regular columns. + + Args: + data: A data frame to flatten. + *cols: Columns to unnest. + keep_empty: By default, you get one row of output for each element + of the list your unchopping/unnesting. + This means that if there's a size-0 element + (like NULL or an empty data frame), that entire row will be + dropped from the output. + If you want to preserve all rows, use `keep_empty` = `True` to + replace size-0 elements with a single row of missing values. + dtypes: NOT `ptype`. Providing the dtypes for the output columns. + Could be a single dtype, which will be applied to all columns, or + a dictionary of dtypes with keys for the columns and values the + dtypes. + names_sep: If `None`, the default, the names will be left as is. + Inner names will come from the former outer names + If a string, the inner and outer names will be used together. + The names of the new outer columns will be formed by pasting + together the outer and the inner column names, separated by + `names_sep`. 
+ names_repair: treatment of problematic column names: + - "minimal": No name repair or checks, beyond basic existence, + - "unique": Make sure names are unique and not empty, + - "check_unique": (default value), no name repair, + but check they are unique, + - "universal": Make the names unique and syntactic + - a function: apply custom name repair + _base0: Whether `cols` are 0-based + if not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + Data frame with selected columns unnested. + """ + if not cols: + raise ValueError("`*cols` is required when using unnest().") + + all_columns = data.columns + cols = vars_select(all_columns, cols, base0=_base0) + cols = all_columns[cols] + + out = data.copy() + for col in cols: + out[col] = _as_df(data[col]) + + out = unchop( + out, cols, + keep_empty=keep_empty, dtypes=dtypes, _base0=_base0 + ) + return unpack( + out, cols, + names_sep=names_sep, names_repair=names_repair + ) + +@unnest.register(DataFrameRowwise, context=Context.SELECT) +def _( + data: DataFrameRowwise, + *cols: Union[str, int], + keep_empty: bool = False, + dtypes: Optional[Union[DTypeType, Mapping[str, DTypeType]]] = None, + names_sep: Optional[str] = None, + names_repair: Union[str, Callable] = 'check_unique', + _base0: Optional[bool] = None +) -> DataFrame: + """Unnest rowwise dataframe""" + out = unnest.dispatch(DataFrame)( + data, *cols, + keep_empty=keep_empty, + dtypes=dtypes, + names_sep=names_sep, + names_repair=names_repair, + _base0=_base0 + ) + return DataFrameGroupBy( + out, + _group_vars=group_vars(data), + _drop=group_by_drop_default(data) + ) + +def _strip_names(names: Iterable[str], base: str, sep: str) -> List[str]: + """Strip the base names with sep""" + out = [] + for name in names: + if not sep: + out.append(name[len(base):] if name.startswith(base) else name) + else: + parts = re.split(re.escape(sep), name, maxsplit=1) + out.append(parts[1] if parts[0] == base else name) + return out + +def _as_df(series: Series) -> List[Optional[DataFrame]]: + """Convert series to dataframe""" + out = [] + for val in series: + if isinstance(val, DataFrame): + if val.shape[1] == 0: # no columns + out.append(NA) + elif val.shape[0] == 0: + out.append( + DataFrame([[NA] * val.shape[1]], columns=val.columns) + ) + else: + out.append(val) + elif is_scalar(val) and pandas.isnull(val): + out.append(val) + else: + out.append(to_df(val, name=series.name)) + return out diff --git a/datar/tidyr/pack.py b/datar/tidyr/pack.py new file mode 100644 index 00000000..9382fca2 --- /dev/null +++ b/datar/tidyr/pack.py @@ -0,0 +1,167 @@ +"""Pack and unpack + +https://github.com/tidyverse/tidyr/blob/master/R/pack.R +""" +from typing import Iterable, Set, Optional, Union, Callable + +from pandas import DataFrame +from pipda import register_verb + +from ..core.utils import vars_select, copy_attrs +from ..core.grouped import DataFrameGroupBy +from ..core.contexts import Context +from ..core.types import StringOrIter, IntOrIter, is_scalar +from ..core.names import repair_names + +from ..base import setdiff, intersect +from ..dplyr import bind_cols, group_vars, group_by_drop_default + +@register_verb(DataFrame, context=Context.SELECT) +def pack( + _data: DataFrame, + _names_sep: Optional[str] = None, + _base0: Optional[bool] = None, + **cols: Union[str, int] +) -> DataFrame: + """Makes df narrow by collapsing a set of columns into a single df-column. + + Args: + _data: A data frame + **cols: Columns to pack + _names_sep: If `None`, the default, the names will be left as is. 
+ Inner names will come from the former outer names + If a string, the inner and outer names will be used together. + The names of the new outer columns will be formed by pasting + together the outer and the inner column names, separated by + `_names_sep`. + _base0: Whether `**cols` are 0-based + if not provided, will use `datar.base.getOption('index.base.0')` + """ + if not cols: + return _data.copy() + + from .nest import _strip_names + all_columns = _data.columns + colgroups = {} + usedcols = set() + for group, columns in cols.items(): + oldcols = all_columns[vars_select(all_columns, columns, base0=_base0)] + usedcols = usedcols.union(oldcols) + newcols = ( + oldcols if _names_sep is None else + _strip_names(oldcols, group, _names_sep) + ) + colgroups[group] = zip(newcols, oldcols) + + cols = {} + for group, columns in colgroups.items(): + for newcol, oldcol in columns: + cols[f'{group}${newcol}'] = _data[oldcol] + + asis = setdiff(_data.columns, usedcols) + out = bind_cols(_data[asis], DataFrame(cols)) + if isinstance(_data, DataFrameGroupBy): + out = _data.__class__( + out, + _group_vars=intersect(group_vars(_data), out.columns), + _drop=group_by_drop_default(_data) + ) + + copy_attrs(out, _data) + return out + +@register_verb(DataFrame, context=Context.SELECT) +def unpack( + data: DataFrame, + cols: Union[StringOrIter, IntOrIter], + names_sep: Optional[str] = None, + names_repair: Union[str, Callable] = "check_unique", + _base0: Optional[bool] = None +) -> DataFrame: + """Makes df wider by expanding df-columns back out into individual columns. + + For empty columns, the column is kept asis, instead of removing it. + + Args: + data: A data frame + cols: Columns to unpack + names_sep: If `None`, the default, the names will be left as is. + Inner names will come from the former outer names + If a string, the inner and outer names will be used together. + The names of the new outer columns will be formed by pasting + together the outer and the inner column names, separated by + `_names_sep`. + name_repair: treatment of problematic column names: + - "minimal": No name repair or checks, beyond basic existence, + - "unique": Make sure names are unique and not empty, + - "check_unique": (default value), no name repair, + but check they are unique, + - "universal": Make the names unique and syntactic + - a function: apply custom name repair + _base0: Whether `cols` are 0-based + if not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + Data frame with given columns unpacked. 
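+
+    Examples:
+        A minimal sketch pairing `pack()` and `unpack()` (made-up columns;
+        `$` is how this port names the inner columns of a df-column):
+        >>> df = tibble(x=[1, 2], y=[3, 4], z=[5, 6])
+        >>> packed = pack(df, data=[f.y, f.z])   # columns: x, data$y, data$z
+        >>> unpack(packed, f.data)               # back to columns x, y, z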
+ """ + if is_scalar(cols): + cols = [cols] + + all_columns = data.columns + cols = _check_present( + data, cols, all_columns, + base0=_base0, + ) + + out = data.copy() + new_cols = [] + for col in data.columns: + if '$' in col: + parts = col.split('$', 1) + if parts[0] not in cols: + new_cols.append(col) + else: + replace = "" if names_sep is None else f"{parts[0]}{names_sep}" + new_cols.append(f"{replace}{parts[1]}") + # elif col in cols: # empty list column + # # remove it from out + # out.drop(columns=col, inplace=True) + else: + new_cols.append(col) + + new_cols = repair_names(new_cols, names_repair, _base0) + out.columns = new_cols + + copy_attrs(out, data) + return out + + +def _check_present( + data: DataFrame, + cols: Iterable[Union[int, str]], + all_columns: Iterable[str], + base0: Optional[bool] = None +) -> Set[str]: + """Check if cols are packed columns""" + out = set() + for col in cols: + if not isinstance(col, str): + columns = vars_select(all_columns, col, base0=base0) + columns = all_columns[columns][0].split('$', 1)[0] + else: + columns = [col] + + for column in columns: + if not (column in data and len(data[column]) == 0) and not any( + allcol.startswith(f"{column}$") for allcol in all_columns + ): + raise ValueError(f"`{column}` must be a data frame column.") + + if column in out: + raise ValueError( + f"`{column}` has already been selected. " + "Number of packed columns also counts when " + "selecting using indexes." + ) + out.add(column) + return out diff --git a/datar/tidyr/pivot_long.py b/datar/tidyr/pivot_long.py new file mode 100644 index 00000000..429fb90c --- /dev/null +++ b/datar/tidyr/pivot_long.py @@ -0,0 +1,277 @@ +"""Pivot data from wide to long + +https://github.com/tidyverse/tidyr/blob/HEAD/R/pivot-long.R +""" +import re +from typing import Optional, Mapping, Callable, Union + +import pandas +from pandas import DataFrame +from pandas.core.dtypes.common import is_categorical_dtype +from pipda import register_verb + +from ..core.defaults import DEFAULT_COLUMN_PREFIX +from ..core.contexts import Context +from ..core.types import StringOrIter, DTypeType, is_scalar +from ..core.utils import vars_select, apply_dtypes +from ..core.grouped import DataFrameGroupBy, DataFrameRowwise +from ..core.names import repair_names + +from ..base import intersect, setdiff, union +from ..dplyr import group_vars, group_by_drop_default, relocate + +from .extract import extract +from .separate import separate + +# pylint: disable=too-many-branches +# pylint: disable=too-many-statements + +@register_verb(DataFrame, context=Context.SELECT) +def pivot_longer( + _data: DataFrame, + cols: StringOrIter, + names_to: StringOrIter = "name", + names_prefix: Optional[str] = None, + names_sep: Optional[str] = None, + names_pattern: Optional[str] = None, + names_ptypes: Optional[ + Union[DTypeType, Mapping[str, DTypeType]] + ] = None, + names_transform: Optional[ + Union[Callable, Mapping[str, Callable]] + ] = None, + names_repair="check_unique", + values_to: str = "value", + values_drop_na: bool = False, + values_ptypes: Optional[ + Union[DTypeType, Mapping[str, DTypeType]] + ] = None, + values_transform: Optional[ + Union[Callable, Mapping[str, Callable]] + ] = None, + _base0: Optional[bool] = None +): + """"lengthens" data, increasing the number of rows and + decreasing the number of columns. + + The row order is a bit different from `tidyr` and `pandas.DataFrame.melt`. 
+ >>> df = tibble(x=f[1:2], y=f[3:4]) + >>> pivot_longer(df, f[f.x:f.y]) + >>> # name value + >>> # 0 x 1 + >>> # 1 x 2 + >>> # 2 y 3 + >>> # 3 y 4 + But with `tidyr::pivot_longer`, the output will be: + >>> # # A tibble: 4 x 2 + >>> # name value + >>> # + >>> # 1 x 1 + >>> # 2 y 3 + >>> # 3 x 2 + >>> # 4 y 4 + + Args: + _data: A data frame to pivot. + cols: Columns to pivot into longer format. + names_to: A string specifying the name of the column to create from + the data stored in the column names of data. + Can be a character vector, creating multiple columns, if names_sep + or names_pattern is provided. In this case, there are two special + values you can take advantage of: + - `None`/`NA`/`NULL` will discard that component of the name. + - `.value`/`_value` indicates that component of the name defines + the name of the column containing the cell values, + overriding values_to. + - Different as `tidyr`: With `.value`/`_value`, if there are other + parts of the names to distinguish the groups, they must be + captured. For example, use `r'(\\w)_(\\d)'` to match `'a_1'` and + `['.value', NA]` to discard the suffix, instead of use + `r'(\\w)_\\d'` to match. + names_prefix: A regular expression used to remove matching text from + the start of each variable name. + names_sep: and + names_pattern: If names_to contains multiple values, + these arguments control how the column name is broken up. + names_sep takes the same specification as separate(), and + can either be a numeric vector (specifying positions to break on), + or a single string (specifying a regular expression to split on). + names_pattern: takes the same specification as extract(), + a regular expression containing matching groups (()). + names_ptypes: and + values_ptypes: A list of column name-prototype pairs. + A prototype (or ptype for short) is a zero-length vector + (like integer() or numeric()) that defines the type, class, and + attributes of a vector. Use these arguments if you want to confirm + that the created columns are the types that you expect. + Note that if you want to change (instead of confirm) the types + of specific columns, you should use names_transform or + values_transform instead. + names_transform: and + values_transform: A list of column name-function pairs. + Use these arguments if you need to change the types of + specific columns. For example, + names_transform = dict(week = as.integer) would convert a + character variable called week to an integer. + If not specified, the type of the columns generated from names_to + will be character, and the type of the variables generated from + values_to will be the common type of the input columns used to + generate them. + names_repair: Not supported yet. + values_to: A string specifying the name of the column to create from + the data stored in cell values. If names_to is a character + containing the special `.value`/`_value` sentinel, this value + will be ignored, and the name of the value column will be derived + from part of the existing column names. + values_drop_na: If TRUE, will drop rows that contain only NAs in + the value_to column. This effectively converts explicit missing + values to implicit missing values, and should generally be used + only when missing values in data were created by its structure. 
+ names_repair: treatment of problematic column names: + - "minimal": No name repair or checks, beyond basic existence, + - "unique": Make sure names are unique and not empty, + - "check_unique": (default value), no name repair, + but check they are unique, + - "universal": Make the names unique and syntactic + - a function: apply custom name repair + _base0: Whether `cols` are 0-based if given by indexes + If not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + The pivoted dataframe. + """ + rowid_column = '_PIVOT_ROWID_' + ret = _data.assign(**{rowid_column: range(_data.shape[0])}) + all_columns = ret.columns + columns = _data.columns[vars_select(_data.columns, cols, base0=_base0)] + id_columns = all_columns.difference(columns) + + if is_scalar(names_to): + names_to = [names_to] + + tmp_names_to = [] + # We need to NA/names to be kept for .value pivot + na_names_to = [] + for i, name in enumerate(names_to): + if pandas.isnull(name): + na_name = f'__{DEFAULT_COLUMN_PREFIX}_NA_{i}__' + na_names_to.append(na_name) + tmp_names_to.append(na_name) + elif name == '_value': + tmp_names_to.append('.value') + else: + tmp_names_to.append(name) + names_to = tmp_names_to + + if len(names_to) > 1 and not names_sep and not names_pattern: + raise ValueError( + "If you supply multiple names in `names_to` you must also " + "supply one of `names_sep` or `names_pattern`." + ) + + if names_sep and names_pattern: + raise ValueError( + "Only one of `names_sep` or `names_pattern` should be supplied." + ) + + var_name = '__tmp_names_to__' if names_pattern or names_sep else names_to[0] + ret = ret.melt( + id_vars=id_columns, + # Use the rest columns automatically. + # Don't specify so that duplicated column names can be used. + # value_vars=columns, + var_name=var_name, + value_name=values_to, + ) + if names_prefix: + names_prefix = re.compile(f'^{re.escape(names_prefix)}') + ret[var_name] = ret[var_name].str.replace(names_prefix, '') + + if all(is_categorical_dtype(_data[col]) for col in columns): + ret[values_to] = ret[values_to].astype('category') + + if names_pattern: + ret = extract( + ret, var_name, + into=names_to, + regex=names_pattern + ) + + if names_sep: + ret = separate( + ret, var_name, + into=names_to, + sep=names_sep + ) + # extract/separate puts `into` last + ret = relocate(ret, values_to, _after=-1, _base0=True) + + + if '.value' in names_to: + names_to = setdiff(names_to, ['.value']) + index_columns = union(id_columns, names_to) + names_to = setdiff(names_to, na_names_to) + + # keep the order + value_columns = pandas.unique(ret['.value'].values) + ret.set_index(index_columns, inplace=True) + ret.index = list(ret.index) + ret2 = ret.pivot(columns='.value', values=values_to).reset_index() + id_data = DataFrame(ret2['index'].tolist(), columns=index_columns) + ret = pandas.concat( + [ + id_data[ + id_data.columns. + difference(na_names_to). 
+ difference([rowid_column]) + ], + ret2[value_columns] + ], + axis=1 + ) + values_to = value_columns + else: + values_to = [values_to] + ret.drop(columns=[rowid_column], inplace=True) + + if values_drop_na: + ret.dropna(subset=values_to, inplace=True) + + names_data = ret[names_to] + apply_dtypes(names_data, names_ptypes) + ret[names_to] = names_data + + values_data = ret[values_to] + apply_dtypes(values_data, values_ptypes) + ret[values_to] = values_data + + if names_transform: + for name in names_to: + if callable(names_transform): + ret[name] = ret[name].apply(names_transform) + elif name in names_transform: + ret[name] = ret[name].apply(names_transform[name]) + + if values_transform: + for name in values_to: + if callable(values_transform): + ret[name] = ret[name].apply(values_transform) + elif name in values_transform: + ret[name] = ret[name].apply(values_transform[name]) + + names = repair_names(ret.columns.tolist(), names_repair, _base0=_base0) + ret.columns = names + + if ( + isinstance(_data, DataFrameGroupBy) and + not isinstance(_data, DataFrameRowwise) + ): + groupvars = intersect(group_vars(_data), ret.columns) + if len(groupvars) > 0: + return DataFrameGroupBy( + ret, + _group_vars=groupvars, + _drop=group_by_drop_default(_data) + ) + + return ret diff --git a/datar/tidyr/pivot_wide.py b/datar/tidyr/pivot_wide.py new file mode 100644 index 00000000..25105f97 --- /dev/null +++ b/datar/tidyr/pivot_wide.py @@ -0,0 +1,237 @@ +"""Pivot data from long to wide""" + +from typing import List, Optional, Any, Union, Callable, Mapping + +import pandas +from pandas import DataFrame, Index +from pipda import register_verb + +from ..core.contexts import Context +from ..core.types import StringOrIter, is_scalar +from ..core.utils import vars_select +from ..core.grouped import DataFrameGroupBy, DataFrameRowwise +from ..core.exceptions import ColumnNotExistingError + +from ..base import intersect, NA +from ..base.constants import NA_integer_ +from ..dplyr import group_vars, group_by_drop_default + +ROWID_COLUMN = '_PIVOT_ROWID_' + +# pylint: disable=too-many-branches + +@register_verb(DataFrame, context=Context.SELECT) +def pivot_wider( + _data: DataFrame, + id_cols: Optional[StringOrIter] = None, + names_from: str = "name", + names_prefix: str = "", + names_sep: str = "_", + names_glue: Optional[str] = None, + names_sort: bool = False, + # names_repair: str = "check_unique", # todo + values_from: StringOrIter = "value", + values_fill: Any = None, + values_fn: Optional[Union[Callable, Mapping[str, Callable]]] = None, + _base0: Optional[bool] = None +) -> DataFrame: + """"widens" data, increasing the number of columns and decreasing + the number of rows. + + Args: + _data: A data frame to pivot. + id_cols: A set of columns that uniquely identifies each observation. + Defaults to all columns in data except for the columns specified + in names_from and values_from. + names_from: and + values_from: A pair of arguments describing which column + (or columns) to get the name of the output column (names_from), + and which column (or columns) to get the cell values from + (values_from). + names_prefix: String added to the start of every variable name. + names_sep: If names_from or values_from contains multiple variables, + this will be used to join their values together into a single + string to use as a column name. + names_glue: Instead of names_sep and names_prefix, you can supply + a glue specification that uses the names_from columns + (and special _value) to create custom column names. 
+ names_sort: Should the column names be sorted? If FALSE, the default, + column names are ordered by first appearance. + names_repair: todo + values_fill: Optionally, a (scalar) value that specifies what + each value should be filled in with when missing. + values_fn: Optionally, a function applied to the value in each cell + in the output. You will typically use this when the combination + of `id_cols` and value column does not uniquely identify + an observation. + This can be a dict you want to apply different aggregations to + different value columns. + If not specified, will be `numpy.mean` + _base0: Whether `id_cols`, `names_from` and `values_from` + are 0-based if given by indexes. + If not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + The pivoted dataframe. + """ + if is_scalar(names_from): + names_from = [names_from] + if is_scalar(values_from): + values_from = [values_from] + if id_cols is not None and is_scalar(id_cols): + id_cols = [id_cols] + + if id_cols is None: + all_cols = _data.columns + names_from = all_cols[vars_select(all_cols, names_from, base0=_base0)] + # values_from could be a df-column + new_values_from = [] + for value_from in values_from: + if isinstance(value_from, str) and value_from not in all_cols: + df_cols = [ + col for col in all_cols if col.startswith(f'{value_from}$') + ] + if not df_cols: + raise ColumnNotExistingError(value_from) + new_values_from.extend(df_cols) + else: + new_values_from.append(value_from) + values_from = all_cols[ + vars_select(all_cols, *new_values_from, base0=_base0) + ] + id_cols = ( + all_cols + .difference(names_from) + .difference(values_from) + ) + + # build multiindex pivot table + id_cols = list(id_cols) + names_from = list(names_from) + values_from = list(values_from) + + # DF: + # id x y a b + # 0 10 X 1 1 1 + # 1 20 Y 2 2 2 + # + # to: + # id a b + # x X Y X Y + # y 1 2 1 2 + # 0 10 1.0 NaN 1.0 NaN + # 1 20 NaN 2.0 NaN 2.0 + # + # with: + # id_cols = ['id'] + # names_from = ['x', 'y'] + # values_from = ['a', 'b'] + # + # expected: + # id a_X_1 a_Y_2 b_X_1 b_Y_2 + # 0 10 1 NaN 1 NaN + # 1 20 NaN 2 NaN 2 + if len(id_cols) == 0 and len(values_from) > 1: + # need to add it to turn names_to to columns + ret = _data.assign(**{ROWID_COLUMN: 0}) + id_cols = [ROWID_COLUMN] + else: + ret = _data + + # hold NAs in values_from columns, so that they won't be filled + # by values_fill + for col in values_from: + ret[col].fillna(NA_integer_, inplace=True) + + ret = pandas.pivot_table( + ret, + index=id_cols, + columns=names_from, + fill_value=values_fill, + values=values_from[0] if len(values_from) == 1 else values_from, + aggfunc=values_fn or 'mean' + ) + + if len(id_cols) > 0: + ret.reset_index(inplace=True) + + if ROWID_COLUMN in ret: + ret.drop(columns=[ROWID_COLUMN], level=0, inplace=True) + + ret.columns = _flatten_column_names( + ret.columns, + names_prefix, + names_sep, + names_glue + ) + ret.reset_index(drop=True, inplace=True) + # Get the original NAs back + for col in ret.columns.difference(id_cols): + ret[col].replace({NA_integer_: NA}, inplace=True) + + if names_sort: + ret = ret.loc[:, sorted(ret.columns)] + + if ( + isinstance(_data, DataFrameGroupBy) and + not isinstance(_data, DataFrameRowwise) + ): + gvars = intersect(group_vars(_data), ret.columns) + if len(gvars) > 0: + return DataFrameGroupBy( + ret, + _group_vars=gvars, + _drop=group_by_drop_default(_data) + ) + + return ret + +def _flatten_column_names( + names: Index, + names_prefix: str, + names_sep: str, + names_glue: Optional[str] 
+) -> List[str]: + """Flatten the hierachical column names: + + For example, + >>> MultiIndex([('id', '', ''), + >>> ( 'a', 'X', 1), + >>> ( 'a', 'Y', 2), + >>> ( 'b', 'X', 1), + >>> ( 'b', 'Y', 2)], + >>> names=[None, 'x', 'y']) + To + >>> ['X1_a', 'Y2_a', 'X1_b', 'Y2_b'] + with `names_glue={x}{y}_{_value}` + """ + lvlnames = ['_value' if level is None else level for level in names.names] + out = [] + + for cols in names: + if is_scalar(cols): + out.append(f'{names_prefix}{cols}') + continue + # if len(cols) == 1: + # out.append(f'{names_prefix}{cols[0]}') + # continue + + cols = dict(zip(lvlnames, (str(col) for col in cols))) + # in case of ('id', '', '') + if all(name == '' for key, name in cols.items() if key != '_value'): + out.append(f'{names_prefix}{cols["_value"]}') + # in case of values_from is a dataframe column + # ('d$a', 'X', '1') + elif '$' in cols.get('_value', ''): + prefix = names_prefix + names_sep.join( + col for name, col in cols.items() if name != '_value' + ) + out.append(f'{prefix}${cols["_value"].split("$", 1)[1]}') + elif not names_glue: + out.append(f'{names_prefix}{names_sep.join(cols.values())}') + else: + if '_value' in cols: + cols['.value'] = cols['_value'] + out.append(names_glue.format(**cols)) + + return out diff --git a/datar/tidyr/replace_na.py b/datar/tidyr/replace_na.py new file mode 100644 index 00000000..cb628dd7 --- /dev/null +++ b/datar/tidyr/replace_na.py @@ -0,0 +1,73 @@ +"""Replace NAs with specified values""" +from functools import singledispatch +from typing import Any, Iterable, Optional + +import numpy +import pandas +from pandas import DataFrame +from pandas.core.series import Series +from pipda import register_verb + +from ..core.types import SeriesLikeType, is_scalar +from ..core.contexts import Context + +@singledispatch +def _replace_na(data: Iterable[Any], replace: Any) -> Iterable[Any]: + """Replace NA for any iterables""" + return type(data)( + replace if is_scalar(elem) and pandas.isnull(elem) else elem + for elem in data + ) + +@_replace_na.register(numpy.ndarray) +@_replace_na.register(Series) +def _(data: SeriesLikeType, replace: Any) -> SeriesLikeType: + """Replace NA for numpy.ndarray or Series""" + ret = data.copy() + ret[pandas.isnull(ret)] = replace + return ret + +@_replace_na.register(DataFrame) +def _(data: DataFrame, replace: Any) -> DataFrame: + """Replace NA for numpy.ndarray or DataFrame""" + # TODO: allow replace to be a list as an entire value to replace + return data.fillna(replace) + +@register_verb( + (DataFrame, Series, numpy.ndarray, list, tuple, set), + context=Context.EVAL +) +def replace_na( + data: Iterable[Any], + data_or_replace: Optional[Any] = None, + replace: Any = None +) -> Any: + """Replace NA with a value + + This function can be also used not as a verb. As a function called as + an argument in a verb, data is passed implicitly. Then one could + pass data_or_replace as the data to replace. + + Args: + data: The data piped in + data_or_replace: When called as argument of a verb, this is the + data to replace. Otherwise this is the replacement. + replace: The value to replace with + Can only be a scalar or dict for data frame. + So replace NA with a list is not supported yet. 
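+
+    For example (a sketch with made-up values, showing both calling forms):
+        >>> replace_na([1, NA, 3], 0)   # called directly: [1, 0, 3]
+        >>> df >> replace_na(0)         # as a verb: fill every NA in df with 0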
+ + Returns: + Corresponding data with NAs replaced + """ + if data_or_replace is None and replace is None: + return data.copy() + + if replace is None: + # no replace, then data_or_replace should be replace + replace = data_or_replace + else: + # replace specified, determine data + # If data_or_replace is specified, it's data + data = data if data_or_replace is None else data_or_replace + + return _replace_na(data, replace) diff --git a/datar/tidyr/separate.py b/datar/tidyr/separate.py new file mode 100644 index 00000000..91d4bd49 --- /dev/null +++ b/datar/tidyr/separate.py @@ -0,0 +1,241 @@ +"""Separate a character column into multiple columns with a regular +expression or numeric locations + +https://github.com/tidyverse/tidyr/blob/HEAD/R/separate.R +""" +import re +from typing import Any, List, Union, Mapping, Optional + +import pandas +from pandas import DataFrame +from pipda import register_verb + +from ..core.contexts import Context +from ..core.types import DTypeType, StringOrIter, is_scalar +from ..core.utils import logger, vars_select, apply_dtypes, position_at +from ..core.grouped import DataFrameGroupBy, DataFrameRowwise + +from ..base import NA, setdiff, intersect +from ..dplyr import group_vars, group_by_drop_default, ungroup, mutate + +from .chop import unchop + +@register_verb(DataFrame, context=Context.SELECT) +def separate( + data: DataFrame, + col: Union[str, int], + into: StringOrIter, + sep: Union[int, str] = r'[^0-9A-Za-z]+', + remove: bool = True, + convert: Union[bool, DTypeType, Mapping[str, DTypeType]] = False, + extra: str = "warn", + fill: str = "warn", + _base0: Optional[bool] = None +) -> DataFrame: + """Given either a regular expression or a vector of character positions, + turns a single character column into multiple columns. + + Args: + data: The dataframe + col: Column name or position. + into: Names of new variables to create as character vector. + Use `None`/`NA`/`NULL` to omit the variable in the output. + sep: Separator between columns. + If str, `sep` is interpreted as a regular expression. + The default value is a regular expression that matches + any sequence of non-alphanumeric values. + If int, `sep` is interpreted as character positions to split at. + remove: If TRUE, remove input column from output data frame. + convert: The universal type for the extracted columns or a dict for + individual ones + Note that when given `TRUE`, `DataFrame.convert_dtypes()` is called, + but it will not convert `str` to other types + (For example, `'1'` to `1`). You have to specify the dtype yourself. + extra: If sep is a character vector, this controls what happens when + there are too many pieces. There are three valid options: + - "warn" (the default): emit a warning and drop extra values. + - "drop": drop any extra values without a warning. + - "merge": only splits at most length(into) times + fill: If sep is a character vector, this controls what happens when + there are not enough pieces. There are three valid options: + - "warn" (the default): emit a warning and fill from the right + - "right": fill with missing values on the right + - "left": fill with missing values on the left + _base0: Whether `col` is 0-based when given by index and Whether `sep` + is 0-based if given by position + If not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + Dataframe with separated columns. 
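+
+    Examples:
+        A sketch with invented values (expected tidyr-like behaviour):
+        >>> df = tibble(x=['a_1', 'b_2', NA])
+        >>> separate(df, f.x, into=['key', 'val'])
+        >>> # key: a, b, NA; val: 1, 2, NA (split on the non-alphanumeric '_')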
+ """ + if is_scalar(into): + into = [into] + + if not all(isinstance(it, str) or pandas.isnull(it) for it in into): + raise ValueError("`into` must be a string or a list of strings.") + + all_columns = data.columns + col = vars_select(all_columns, col, base0=_base0) + col = all_columns[col[0]] + + colindex = [ + i for i, outcol in enumerate(into) + if not pandas.isnull(outcol) + ] + non_na_elems = lambda row: [row[i] for i in colindex] + # series.str.split can't do extra and fill + # extracted = data[col].str.split(sep, expand=True).iloc[:, colindex] + nout = len(into) + extra_warns = [] + missing_warns = [] + + separated = data[col].apply( + _separate_col, + nout=nout, + sep=sep, + extra=extra, + fill=fill, + base0=_base0, + extra_warns=extra_warns, + missing_warns=missing_warns + ) + + if extra_warns: + logger.warning( + 'Expected %s pieces. ' + 'Additional pieces discarded in %s rows %s.', + nout, + len(extra_warns), + extra_warns + ) + if missing_warns: + logger.warning( + 'Expected %s pieces. ' + 'Missing pieces filled with `NA` in %s rows %s.', + nout, + len(missing_warns), + missing_warns + ) + + separated = DataFrame(separated.values.tolist()).iloc[:, colindex] + separated.columns = non_na_elems(into) + apply_dtypes(separated, convert) + + out = data.drop(columns=[col]) if remove else data + out = mutate(out, separated) + + if ( + isinstance(data, DataFrameGroupBy) and + not isinstance(data, DataFrameRowwise) + ): + gvars = intersect(group_vars(data), out.columns) + + if len(gvars) > 0: + return DataFrameGroupBy( + out, + _group_vars=gvars, + _drop=group_by_drop_default(data) + ) + + return out + +@register_verb(DataFrame, context=Context.SELECT) +def separate_rows( + data: DataFrame, + *columns: str, + sep: str = r'[^0-9A-Za-z]+', + convert: Union[bool, DTypeType, Mapping[str, DTypeType]] = False, + _base0: Optional[bool] = None +) -> DataFrame: + """Separates the values and places each one in its own row. + + Args: + data: The dataframe + *columns: The columns to separate on + sep: Separator between columns. + convert: The universal type for the extracted columns or a dict for + individual ones + _base0: Whether `columns` is 0-based when given by index + If not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + Dataframe with rows separated and repeated. 
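+
+    Examples:
+        A sketch (made-up data); each separated value gets its own row:
+        >>> df = tibble(x=[1, 2], y=['a', 'b,c'])
+        >>> separate_rows(df, f.y)
+        >>> # 3 rows: (1, 'a'), (2, 'b'), (2, 'c')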
+ """ + all_columns = data.columns + selected = all_columns[vars_select(all_columns, *columns, base0=_base0)] + out = data.copy() + for sel in selected: + out[sel] = out[sel].apply( + _separate_col, + nout=0, + sep=sep, + extra="merge", + fill="right", + base0=_base0, + extra_warns=[], + missing_warns=[] + ) + + out = unchop(out, selected, keep_empty=True, dtypes=convert, _base0=_base0) + gvars_exclude = intersect(selected, group_vars(out)) + if len(gvars_exclude) > 0: + gvars = setdiff(group_vars(out), gvars_exclude) + if len(gvars) == 0: + return ungroup(out) + + return out.__class__( + out, + _group_vars=gvars, + _drop=group_by_drop_default(data) + ) + + return out + +def _separate_col( + elem: Any, + nout: int, + sep: Union[str, int], + extra: str, + fill: str, + base0: Optional[bool], + # pylint: disable=dangerous-default-value + extra_warns: List[str] = [], # mutatable to save warnings + missing_warns: List[str] = [] +) -> List[Optional[str]]: + """Separate the column""" + if (is_scalar(elem) and pandas.isnull(elem)) or ( + not is_scalar(elem) and any(pandas.isnull(elem)) + ): + return [NA] * nout if nout > 0 else NA + + elem = str(elem) + if isinstance(sep, int): + try: + tmp = position_at(sep, len(elem), base0) + except IndexError: + tmp = 0 if sep < 0 else len(elem) - 1 + tmp = sep - 1 if sep < 0 else tmp + row = [elem[:tmp+1], elem[tmp+1:]] + else: + row = re.split(sep, elem, 0 if nout == 0 else nout - 1) + if nout == 0: + return row + if len(row) < nout: + if fill == 'warn' and ( + not missing_warns or missing_warns[-1] != '...truncated' + ): + missing_warns.append(elem) + if fill in ('warn', 'right'): + row += [NA] * (nout - len(row)) + else: + row = [NA] * (nout - len(row)) + row + elif not isinstance(sep, int): + more_splits = re.split(sep, row[-1], 1) + if len(more_splits) > 1: + if extra == 'warn' and ( + not extra_warns or extra_warns[-1] != '...truncated' + ): + extra_warns.append(elem) + if extra in ('warn', 'drop'): + row[-1] = more_splits[0] + return row diff --git a/datar/tidyr/uncount.py b/datar/tidyr/uncount.py new file mode 100644 index 00000000..459829fc --- /dev/null +++ b/datar/tidyr/uncount.py @@ -0,0 +1,97 @@ +"""Uncount a data frame""" + +from typing import Any, Iterable, Optional + +from pandas import DataFrame +from pipda import register_verb + +from ..core.contexts import Context +from ..core.types import IntOrIter, is_scalar +from ..core.utils import get_option +from ..core.grouped import DataFrameGroupBy, DataFrameRowwise + +from ..base import intersect +from ..dplyr import ( + group_by, mutate, row_number, group_vars, group_by_drop_default, ungroup +) + +INDEX_COLUMN = '_UNCOUND_INDEX_' + +@register_verb(DataFrame, context=Context.EVAL) +def uncount( + data: DataFrame, + weights: IntOrIter, + _remove: bool = True, + _id: Optional[str] = None, + _base0: Optional[bool] = None +) -> DataFrame: + """Duplicating rows according to a weighting variable + + Args: + data: A data frame + weights: A vector of weights. Evaluated in the context of data + _remove: If TRUE, and weights is the name of a column in data, + then this column is removed. + _id: Supply a string to create a new variable which gives a + unique identifier for each created row (0-based). + _base0: Whether the generated `_id` columns are 0-based. + If not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + dataframe with rows repeated. 
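+
+    Examples:
+        A sketch (hypothetical weights column):
+        >>> df = tibble(x=['a', 'b'], n=[1, 2])
+        >>> uncount(df, f.n)
+        >>> # 3 rows: 'a' once, 'b' twice; the weight column `n` is dropped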
+ """ + if is_scalar(weights): + weights = [weights] * data.shape[0] + + _check_weights(weights) + + indexes = [ + idx for i, idx in enumerate(data.index) + for _ in range(int(weights[i])) + ] + + all_columns = data.columns + weight_name = getattr(weights, 'name', None) + if weight_name in all_columns and weights is data[weight_name]: + rest_columns = all_columns.difference([weight_name]) + else: + rest_columns = all_columns + + out = data.loc[indexes, rest_columns] if _remove else data.loc[indexes, :] + # need the indexes to get the right id column + out = out.assign(**{INDEX_COLUMN: indexes}) + out.reset_index(drop=True, inplace=True) + + if _id: + base = int(not get_option('index.base.0', _base0)) + # pylint: disable=no-value-for-parameter + out = ( + out >> + group_by(INDEX_COLUMN) >> + mutate(**{_id: row_number() + base - 1}) >> + ungroup() + ) + out.drop(columns=[INDEX_COLUMN], inplace=True) + + if ( + isinstance(data, DataFrameGroupBy) and + not isinstance(data, DataFrameRowwise) + ): + grpvars = intersect(group_vars(data), out.columns) + + if len(grpvars) > 0: + return DataFrameGroupBy( + out, + _group_vars=grpvars, + _drop=group_by_drop_default(data) + ) + + return out + +def _check_weights(weights: Iterable[Any]) -> None: + """Check if uncounting weights are valid""" + for weight in weights: + if not isinstance(weight, (int, float)): + raise ValueError("`weights` must evaluate to numerics.") + if weight < 0: + raise ValueError("All elements in `weights` must be >= 0.") diff --git a/datar/tidyr/unite.py b/datar/tidyr/unite.py new file mode 100644 index 00000000..a60d7ac3 --- /dev/null +++ b/datar/tidyr/unite.py @@ -0,0 +1,69 @@ +"""Unite multiple columns into one by pasting strings together""" + +from typing import Optional, Union + +import pandas +from pandas import DataFrame +from pipda import register_verb + +from ..core.contexts import Context +from ..core.utils import vars_select, reconstruct_tibble + +from ..base import setdiff + +@register_verb(DataFrame, context=Context.SELECT) +def unite( + data: DataFrame, + col: str, + *columns: Union[str, int], + sep: str = '_', + remove: bool = True, + na_rm: bool = False, + _base0: Optional[bool] = None +) -> DataFrame: + """Unite multiple columns into one by pasting strings together + + Args: + data: A data frame. + col: The name of the new column, as a string or symbol. + *columns: Columns to unite + sep: Separator to use between values. + remove: If True, remove input columns from output data frame. + na_rm: If True, missing values will be remove prior to uniting + each value. 
+ _base0: Whether `columns` is 0-based when given by index + If not provided, will use `datar.base.getOption('index.base.0')` + + Returns: + The dataframe with selected columns united + """ + all_columns = data.columns + if not columns: + columns = all_columns + else: + columns = all_columns[vars_select(all_columns, *columns, base0=_base0)] + + out = data.copy() + + def unite_cols(row): + if na_rm: + row = [elem for elem in row if not pandas.isnull(elem)] + return sep.join(str(elem) for elem in row) + + out[col] = out[columns].agg(unite_cols, axis=1) + # get indexes to relocate + insert_at = min(data.columns.get_indexer_for(columns)) + relocated_cols = ( + data.columns[:insert_at] + .difference([col]) + .union([col], sort=False) + .union(data.columns[insert_at:].difference([col]), sort=False) + ) + out = out[relocated_cols] + + if remove: + cols_to_remove = setdiff(columns, [col]) + if len(cols_to_remove) > 0: + out.drop(columns=cols_to_remove, inplace=True) + + return reconstruct_tibble(data, out) diff --git a/datar/tidyr/verbs.py b/datar/tidyr/verbs.py deleted file mode 100644 index 0de4ca7d..00000000 --- a/datar/tidyr/verbs.py +++ /dev/null @@ -1,777 +0,0 @@ -"""Verbs from R-tidyr""" -import re -import itertools -from functools import singledispatch -from typing import Any, Callable, Iterable, Mapping, Optional, Type, Union - -import numpy -import pandas -from pandas import DataFrame -from pandas.core.groupby.generic import SeriesGroupBy -from pandas.core.series import Series -from pipda import register_verb - -from ..core.utils import ( - copy_attrs, vars_select, logger -) -from ..core.types import ( - DataFrameType, IntOrIter, SeriesLikeType, StringOrIter, - is_scalar -) -from ..core.middlewares import Nesting -from ..core.contexts import Context -from ..core.names import repair_names -from ..core.grouped import DataFrameGroupBy -from ..base import NA, levels, setdiff -from ..dplyr.distinct import distinct -from ..dplyr.group_by import group_by_drop_default -from ..dplyr.group_data import group_vars - -@register_verb(DataFrame, context=Context.SELECT) -def pivot_longer( - _data: DataFrame, - cols: StringOrIter, - names_to: StringOrIter = "name", - names_prefix: Optional[str] = None, - names_sep: Optional[str] = None, - names_pattern: Optional[str] = None, - names_ptypes: Optional[Mapping[str, Type]] = None, - names_transform: Optional[Mapping[str, Callable]] = None, - # names_repair="check_unique", # todo - values_to: str = "value", - values_drop_na: bool = False, - values_ptypes: Optional[Mapping[str, Type]] = None, - values_transform: Optional[Mapping[str, Callable]] = None -): - """"lengthens" data, increasing the number of rows and - decreasing the number of columns. - - Args: - _data: A data frame to pivot. - cols: Columns to pivot into longer format. - names_to: A string specifying the name of the column to create from - the data stored in the column names of data. - Can be a character vector, creating multiple columns, if names_sep - or names_pattern is provided. In this case, there are two special - values you can take advantage of: - - None will discard that component of the name. - - .value indicates that component of the name defines the name of - the column containing the cell values, overriding values_to. - names_prefix: A regular expression used to remove matching text from - the start of each variable name. - names_sep: and - names_pattern: If names_to contains multiple values, - these arguments control how the column name is broken up. 
- names_sep takes the same specification as separate(), and - can either be a numeric vector (specifying positions to break on), - or a single string (specifying a regular expression to split on). - names_pattern: takes the same specification as extract(), - a regular expression containing matching groups (()). - names_ptypes: and - values_ptypes: A list of column name-prototype pairs. - A prototype (or ptype for short) is a zero-length vector - (like integer() or numeric()) that defines the type, class, and - attributes of a vector. Use these arguments if you want to confirm - that the created columns are the types that you expect. - Note that if you want to change (instead of confirm) the types - of specific columns, you should use names_transform or - values_transform instead. - names_transform: and - values_transform: A list of column name-function pairs. - Use these arguments if you need to change the types of - specific columns. For example, - names_transform = dict(week = as.integer) would convert a - character variable called week to an integer. - If not specified, the type of the columns generated from names_to - will be character, and the type of the variables generated from - values_to will be the common type of the input columns used to - generate them. - names_repair: Not supported yet. - values_to: A string specifying the name of the column to create from - the data stored in cell values. If names_to is a character - containing the special .value sentinel, this value will be ignored, - and the name of the value column will be derived from part of - the existing column names. - values_drop_na: If TRUE, will drop rows that contain only NAs in - the value_to column. This effectively converts explicit missing - values to implicit missing values, and should generally be used - only when missing values in data were created by its structure. - - Returns: - The pivoted dataframe. 
- """ - all_columns = _data.columns - columns = all_columns[vars_select(all_columns, cols)] - id_columns = setdiff(all_columns, columns) - var_name = '__tmp_names_to__' if names_pattern or names_sep else names_to - ret = _data.melt( - id_vars=id_columns, - value_vars=columns, - var_name=var_name, - value_name=values_to, - ) - - if names_pattern: - ret[names_to] = ret['__tmp_names_to__'].str.extract(names_pattern) - ret.drop(['__tmp_names_to__'], axis=1, inplace=True) - - if names_prefix: - ret[names_to] = ret[names_to].str.replace(names_prefix, '') - - if '.value' in names_to: - ret2 = ret.pivot(columns='.value', values=values_to) - rest_columns = setdiff(ret.columns, ['.value', values_to]) - ret2.loc[:, rest_columns] = ret.loc[:, rest_columns] - - ret2_1 = ret2.iloc[:(ret2.shape[0] // 2), ] - ret2_2 = ret2.iloc[(ret2.shape[0] // 2):, ].reset_index() - ret = ret2_1.assign(**{ - col: ret2_2[col] - for col in ret2_1.columns - if ret2_1[col].isna().all() - }) - - if values_drop_na: - ret.dropna(subset=[values_to], inplace=True) - if names_ptypes: - for key, ptype in names_ptypes.items(): - ret[key] = ret[key].astype(ptype) - if values_ptypes: - for key, ptype in values_ptypes.items(): - ret[key] = ret[key].astype(ptype) - if names_transform: - for key, tform in names_transform.items(): - ret[key] = ret[key].apply(tform) - if values_transform: - for key, tform in values_transform.items(): - ret[key] = ret[key].apply(tform) - - return ret - -@register_verb(DataFrame, context=Context.SELECT) -def pivot_wider( - _data: DataFrame, - id_cols: Optional[StringOrIter] = None, - names_from: str = "name", - names_prefix: str = "", - names_sep: str = "_", - names_glue: Optional[str] = None, - names_sort: bool = False, - # names_repair: str = "check_unique", # todo - values_from: StringOrIter = "value", - values_fill: Any = None, - values_fn: Optional[Union[Callable, Mapping[str, Callable]]] = None, -) -> DataFrame: - """"widens" data, increasing the number of columns and decreasing - the number of rows. - - Args: - _data: A data frame to pivot. - id_cols: A set of columns that uniquely identifies each observation. - Defaults to all columns in data except for the columns specified - in names_from and values_from. - names_from: and - values_from: A pair of arguments describing which column - (or columns) to get the name of the output column (names_from), - and which column (or columns) to get the cell values from - (values_from). - names_prefix: String added to the start of every variable name. - names_sep: If names_from or values_from contains multiple variables, - this will be used to join their values together into a single - string to use as a column name. - names_glue: Instead of names_sep and names_prefix, you can supply - a glue specification that uses the names_from columns - (and special _value) to create custom column names. - names_sort: Should the column names be sorted? If FALSE, the default, - column names are ordered by first appearance. - names_repair: todo - values_fill: Optionally, a (scalar) value that specifies what - each value should be filled in with when missing. - values_fn: Optionally, a function applied to the value in each cell - in the output. You will typically use this when the combination - of id_cols and value column does not uniquely identify - an observation. - This can be a dict you want to apply different aggregations to - different value columns. - - Returns: - The pivoted dataframe. 
- """ - if id_cols is None: - all_cols = _data.columns - selected_cols = all_cols[vars_select(all_cols, names_from, values_from)] - id_cols = setdiff(all_cols, selected_cols) - ret = pandas.pivot_table( - _data, - index=id_cols, - columns=names_from, - fill_value=values_fill, - values=values_from, - aggfunc=values_fn or numpy.mean - ) - - def get_new_colname(cols, names): - if is_scalar(cols): - cols = [cols] - if not names_glue: - return f'{names_prefix}{names_sep.join(cols)}' - names = ('_value' if name is None else name for name in names) - render_data = dict(zip(names, cols)) - return names_glue.format(**render_data) - - new_columns = [ - get_new_colname(col, ret.columns.names) - for col in ret.columns - ] - ret.columns = new_columns - if names_sort: - ret = ret.loc[:, sorted(new_columns)] - - return ret - -@register_verb((DataFrame, DataFrameGroupBy), context=Context.EVAL) -def uncount( - _data: DataFrameType, - weights: IntOrIter, - _remove: bool = True, - _id: Optional[str] = None, -) -> DataFrameType: - """Duplicating rows according to a weighting variable - - Args: - _data: A data frame - weights: A vector of weights. Evaluated in the context of data - _remove: If TRUE, and weights is the name of a column in data, - then this column is removed. - _id: Supply a string to create a new variable which gives a - unique identifier for each created row (0-based). - - Returns: - dataframe with rows repeated. - """ - gnames = ( - _data.grouper.names - if isinstance(_data, DataFrameGroupBy) else None - ) - if is_scalar(weights): - weights = [weights] * _data.shape[0] - - indexes = [ - idx for i, idx in enumerate(_data.index) - for _ in range(weights[i]) - ] - - all_columns = _data.columns.tolist() - weight_name = getattr(weights, 'name', None) - if weight_name in all_columns and weights is _data[weight_name]: - rest_columns = setdiff(all_columns, [weight_name]) - else: - rest_columns = all_columns - - ret = _data.loc[indexes, rest_columns] if _remove else _data.loc[indexes, :] - if _id: - ret = ret.groupby(rest_columns).apply( - lambda df: df.assign(**{_id: range(df.shape[0])}) - ).reset_index(drop=True, level=0) - if gnames: - return ret.groupby(gnames, dropna=False) - return ret - -@singledispatch -def _replace_na(data: Iterable[Any], replace: Any) -> Iterable[Any]: - """Replace NA for any iterables""" - return type(data)(replace if pandas.isnull(elem) else elem for elem in data) - -@_replace_na.register(numpy.ndarray) -@_replace_na.register(Series) -def _(data: SeriesLikeType, replace: Any) -> SeriesLikeType: - """Replace NA for numpy.ndarray or Series""" - ret = data.copy() - ret[pandas.isnull(ret)] = replace - return ret - -@_replace_na.register(DataFrame) -def _(data: DataFrame, replace: Any) -> DataFrame: - """Replace NA for numpy.ndarray or DataFrame""" - return data.fillna(replace) - -@_replace_na.register(DataFrameGroupBy) -@_replace_na.register(SeriesGroupBy) -def _( - data: Union[DataFrameGroupBy, SeriesGroupBy], - replace: Any -) -> Union[DataFrameGroupBy, SeriesGroupBy]: - """Replace NA for grouped data, keep the group structure""" - grouper = data.grouper - ret = _replace_na(data, replace) - return ret.groupby(grouper, dropna=False) - -@register_verb( - (DataFrame, DataFrameGroupBy, Series, numpy.ndarray, list, tuple, set), - context=Context.EVAL -) -def replace_na( - _data: Iterable[Any], - series_or_replace: Any, - replace: Any = None -) -> Any: - """Replace NA with a value - - This function can be also used not as a verb. 
As a function called as - an argument in a verb, _data is passed implicitly. Then one could - pass series_or_replace as the data to replace. - - Args: - _data: The data piped in - series_or_replace: When called as argument of a verb, this is the - data to replace. Otherwise this is the replacement. - replace: The value to replace with - - Returns: - Corresponding data with NAs replaced - """ - if replace is not None: - return _replace_na(series_or_replace, replace) - return _replace_na(_data, series_or_replace) - -@register_verb( - DataFrame, - context=Context.SELECT -) -def fill( - _data: DataFrame, - *columns: str, - _direction: str = "down" -) -> DataFrame: - """Fills missing values in selected columns using the next or - previous entry. - - See https://tidyr.tidyverse.org/reference/fill.html - - Args: - _data: A dataframe - *columns: Columns to fill - _direction: Direction in which to fill missing values. - Currently either "down" (the default), "up", - "downup" (i.e. first down and then up) or - "updown" (first up and then down). - - Returns: - The dataframe with NAs being replaced. - """ - data = _data.copy() - if not columns: - data = data.fillna( - method='ffill' if _direction.startswith('down') else 'bfill', - ) - if _direction in ('updown', 'downup'): - data = data.fillna( - method='ffill' if _direction.endswith('down') else 'bfill', - ) - else: - columns = data.columns[vars_select(data.columns, *columns)] - subset = fill(data[columns], _direction=_direction) - data[columns] = subset - return data - -@fill.register(DataFrameGroupBy, context=Context.SELECT) -def _( - _data: DataFrameGroupBy, - *columns: str, - _direction: str = "down" -) -> DataFrameGroupBy: - # DataFrameGroupBy - out = _data.group_apply( - lambda df: fill(df, *columns, _direction=_direction) - ) - out = _data.__class__( - out, - _group_vars=group_vars(_data), - _drop=group_by_drop_default(_data) - ) - copy_attrs(out, _data) - return out - -def expand_grid( - _data: Iterable[Any] = None, - _name_repair: str = "check_unique", - _base0: Optional[bool] = None, - **kwargs: Iterable[Any] -) -> DataFrame: - """Expand elements into a new dataframe - - See https://tidyr.tidyverse.org/reference/expand_grid.html - - Args: - _data, **kwargs: Name-value pairs. The name will become the column - name in the output. - For _data, will try to fetch name via `_data.__dfname__`. If failed - `_data` will be used. - _name_repair: treatment of problematic column names: - - "minimal": No name repair or checks, beyond basic existence, - - "unique": Make sure names are unique and not empty, - - "check_unique": (default value), no name repair, - but check they are unique, - - "universal": Make the names unique and syntactic - - a function: apply custom name repair - _base0: Whether the suffixes of repaired names should be 0-based. - If not provided, will use `datar.base.getOption('index.base.0')`. 
- - Returns: - The expanded dataframe - """ - product_args = [] - names = [] - if isinstance(_data, DataFrame): - dataname = getattr(_data, '__dfname__', '_data') - product_args = [(row[1] for row in _data.iterrows())] - names = [f'{dataname}_{col}' for col in _data.columns] - elif _data is not None: - raise ValueError('Positional argument must be a DataFrame or None.') - for key, val in kwargs.items(): - if isinstance(val, DataFrame): - product_args.append((row[1] for row in val.iterrows())) - names.extend(f'{key}_{col}' for col in val.columns) - else: - product_args.append(((value, ) for value in val)) - names.append(key) - - return DataFrame( - (itertools.chain.from_iterable(row) - for row in itertools.product(*product_args)), - columns=repair_names(names, _name_repair, _base0=_base0) - ) - -@register_verb((DataFrame, DataFrameGroupBy), context=Context.SELECT) -def extract( - _data: DataFrameType, - col: str, - into: StringOrIter, - regex: str = r'(\w+)', - remove: bool = True, - convert: Union[bool, str, Type, Mapping[str, Union[str, Type]]] = False -) -> DataFrameType: - """Given a regular expression with capturing groups, extract() turns each - group into a new column. If the groups don't match, or the input is NA, - the output will be NA. - - See https://tidyr.tidyverse.org/reference/extract.html - - Args: - _data: The dataframe - col: Column name or position. - into: Names of new variables to create as character vector. - Use None to omit the variable in the output. - regex: a regular expression used to extract the desired values. - There should be one group (defined by ()) for each element of into. - remove: If TRUE, remove input column from output data frame. - convert: The universal type for the extracted columns or a dict for - individual ones - - Returns: - Dataframe with extracted columns. - """ - if isinstance(_data, DataFrame): - if is_scalar(into): - into = [into] - colindex = [ - i for i, outcol in enumerate(into) - if outcol not in (None, NA) - ] - extracted = _data[col].str.extract(regex).iloc[:, colindex] - extracted.columns = [col for col in into if col not in (None, NA)] - - if isinstance(convert, (str, Type)): - extracted.astype(convert) - elif isinstance(convert, dict): - for key, conv in convert.items(): - extracted[key] = extracted[key].astype(conv) - if remove: - _data = _data[_data.columns.difference([col])] - - return pandas.concat([_data, extracted], axis=1) - - grouper = _data.grouper - return _data.apply( - lambda df: extract(df, col, into, regex, remove, convert) - ).groupby(grouper, dropna=False) - -@register_verb((DataFrame, DataFrameGroupBy), context=Context.SELECT) -def separate( # pylint: disable=too-many-branches - _data: DataFrameType, - col: str, - into: StringOrIter, - sep: Union[int, str] = r'[^0-9A-Za-z]+', - remove: bool = True, - convert: Union[bool, str, Type, Mapping[str, Union[str, Type]]] = False, - extra: str = "warn", - fill: str = "warn" # pylint: disable=redefined-outer-name -) -> DataFrameType: # pylint: disable=too-many-nested-blocks - """Given either a regular expression or a vector of character positions, - turns a single character column into multiple columns. - - Args: - _data: The dataframe - col: Column name or position. - into: Names of new variables to create as character vector. - Use None to omit the variable in the output. - sep: Separator between columns. - TODO: support index split (sep is an integer) - remove: If TRUE, remove input column from output data frame. 
- convert: The universal type for the extracted columns or a dict for - individual ones - extra: If sep is a character vector, this controls what happens when - there are too many pieces. There are three valid options: - - "warn" (the default): emit a warning and drop extra values. - - "drop": drop any extra values without a warning. - - "merge": only splits at most length(into) times - fill: If sep is a character vector, this controls what happens when - there are not enough pieces. There are three valid options: - - "warn" (the default): emit a warning and fill from the right - - "right": fill with missing values on the right - - "left": fill with missing values on the left - - Returns: - Dataframe with separated columns. - """ - if isinstance(_data, DataFrame): - if is_scalar(into): - into = [into] - colindex = [ - i for i, outcol in enumerate(into) - if outcol not in (None, NA) - ] - non_na_elems = lambda row: [row[i] for i in colindex] - # series.str.split can do extra and fill - # extracted = _data[col].str.split(sep, expand=True).iloc[:, colindex] - nout = len(into) - outdata = [] - extra_warns = [] - missing_warns = [] - for i, elem in enumerate(_data[col]): - if elem in (NA, None): - row = [NA] * nout - continue - - row = re.split(sep, str(elem), nout - 1) - if len(row) < nout: - if fill == 'warn': - missing_warns.append(i) - if fill in ('warn', 'right'): - row += [NA] * (nout - len(row)) - else: - row = [NA] * (nout - len(row)) + row - else: - more_splits = re.split(sep, row[-1], 1) - if len(more_splits) > 1: - if extra == 'warn': - extra_warns.append(i) - if extra in ('warn', 'drop'): - row[-1] = more_splits[0] - - outdata.append(non_na_elems(row)) - - if extra_warns: - logger.warning( - 'Expected %s pieces. ' - 'Additional pieces discarded in %s rows %s.', - nout, - len(extra_warns), - extra_warns - ) - if missing_warns: - logger.warning( - 'Expected %s pieces. ' - 'Missing pieces filled with `NA` in %s rows %s.', - nout, - len(missing_warns), - missing_warns - ) - separated = DataFrame(outdata, columns=non_na_elems(into)) - - if isinstance(convert, (str, Type)): - separated.astype(convert) - elif isinstance(convert, dict): - for key, conv in convert.items(): - separated[key] = separated[key].astype(conv) - if remove: - _data = _data[_data.columns.difference([col])] - - return pandas.concat([_data, separated], axis=1) - - grouper = _data.grouper - return _data.apply( - lambda df: separate(df, col, into, sep, remove, convert, extra, fill) - ).groupby(grouper, dropna=False) - - -@register_verb(DataFrame, context=Context.SELECT) -def separate_rows( - _data: DataFrame, - *columns: str, - sep: str = r'[^0-9A-Za-z]+', - convert: Union[bool, str, Type, Mapping[str, Union[str, Type]]] = False, -) -> DataFrame: - """Separates the values and places each one in its own row. - - Args: - _data: The dataframe - *columns: The columns to separate on - sep: Separator between columns. - convert: The universal type for the extracted columns or a dict for - individual ones - - Returns: - Dataframe with rows separated and repeated. - """ - all_columns = _data.columns - selected = all_columns[vars_select(all_columns, *columns)] - - weights = [] - repeated = [] - for row in _data[selected].iterrows(): - row = row[1] - weights.append(None) - rdata = [] - for col in selected: - splits = re.split(sep, row[col]) - if weights[-1] and weights[-1] != len(splits): - raise ValueError( - f'Error: Incompatible lengths: {weights[-1]}, ' - f'{len(splits)}.' 
- ) - weights[-1] = len(splits) - rdata.append(splits) - repeated.extend(zip(*rdata)) - - ret = uncount(_data, weights) - ret[selected] = repeated - - if isinstance(convert, (str, Type)): - ret.astype(convert) - elif isinstance(convert, dict): - for key, conv in convert.items(): - ret[key] = ret[key].astype(conv) - return ret - -@register_verb((DataFrame, DataFrameGroupBy), context=Context.SELECT) -def unite( - _data: DataFrameType, - col: str, - *columns: str, - sep: str = '_', - remove: bool = True, - na_rm: bool = False -) -> DataFrameType: - """Unite multiple columns into one by pasting strings together - - Args: - data: A data frame. - col: The name of the new column, as a string or symbol. - *columns: Columns to unite - sep: Separator to use between values. - remove: If True, remove input columns from output data frame. - na_rm: If True, missing values will be remove prior to uniting - each value. - - Returns: - The dataframe with selected columns united - """ - all_columns = _data.columns - columns = all_columns[vars_select(all_columns, *columns)] - - out = _data.copy() - - def unite_cols(row): - if na_rm: - row = [elem for elem in row if elem is not NA] - return sep.join(str(elem) for elem in row) - - out[col] = out[columns].agg(unite_cols, axis=1) - if remove: - out.drop(columns=columns, inplace=True) - - if isinstance(_data, DataFrameGroupBy): - out = _data.__class__( - out, - _group_vars=group_vars(_data), - _drop=group_by_drop_default(_data) - ) - copy_attrs(out, _data) - return out - -@register_verb(DataFrame, context=Context.SELECT) -def drop_na( - _data: DataFrame, - *columns: str -) -> DataFrame: - """Drop rows containing missing values - - See https://tidyr.tidyverse.org/reference/drop_na.html - - Args: - data: A data frame. - *columns: Columns to inspect for missing values. - - Returns: - Dataframe with rows with NAs dropped - """ - all_columns = _data.columns - columns = vars_select(all_columns, *columns) - columns = all_columns[columns] - out = _data.dropna(subset=columns) - - if isinstance(_data, DataFrameGroupBy): - out = _data.__class__( - out, - _group_vars=group_vars(_data), - _drop=group_by_drop_default(_data) - ) - copy_attrs(out, _data) - return out - -@register_verb(DataFrame, context=Context.EVAL) -def expand( - _data: DataFrame, # pylint: disable=no-value-for-parameter - *columns: Union[str, Nesting], - # _name_repair: Union[str, Callable] = None # todo - **kwargs: Iterable[Any] -) -> DataFrame: - """See https://tidyr.tidyverse.org/reference/expand.html""" - iterables = [] - names = [] - for i, column in enumerate(columns): - if isinstance(column, Nesting): - iterables.append(zip(*column.columns)) - names.extend(column.names) - else: - cats = levels(column) - iterables.append(zip( - column if cats is None else cats - )) - - try: - name = column.name - except AttributeError: - name = f'_tmp{hex(id(column))[2:6]}_{i}' - logger.warning( - 'Temporary name used. Use keyword argument to ' - 'specify the key as column name.' 
- ) - names.append(name) - - for key, val in kwargs.items(): - if isinstance(val, Nesting): - iterables.append(zip(*val.columns)) - names.extend(f'{key}_{name}' for name in val.names) - else: - cats = levels(val) - iterables.append(zip( - val if cats is None else cats - )) - names.append(key) - - return DataFrame(( - itertools.chain.from_iterable(row) - for row in itertools.product(*iterables) - ), columns=names) >> distinct() # pylint: disable=no-value-for-parameter diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 63eab502..43a7cd14 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.0.7 +- Add dplyr rows verbs +- Allow mixed numbering (with `c()` and `f[...]`) for tibble construction +- Allow slice (`f[a:b]`) to be expanded into sequence for `EVAL` context +- Finish tidyr porting. + ## 0.0.6 - Add `options`, `getOption` and `options_context` to `datar.base` to allow set/get global options - Add options: `dplyr.summarise.inform` diff --git a/docs/notebooks/chop.ipynb b/docs/notebooks/chop.ipynb new file mode 100644 index 00000000..6998e174 --- /dev/null +++ b/docs/notebooks/chop.ipynb @@ -0,0 +1,297 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python378jvsc74a57bd0c4cc73b080e063fcebb9afb794613be7caf4b26129562cba1382945a18cc49cc", + "display_name": "Python 3.7.8 64-bit ('base': conda)" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/html": "
Try this notebook on binder.
" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # chop " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Makes data frame shorter by converting rows within each group\ninto list-columns. \n\n##### Args:\n  `data`: A data frame \n  `cols`: Columns to chop \n  `_base0`: Whether `cols` are 0-based \n    if not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  Data frame with selected columns chopped \n" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # unchop " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Makes df longer by expanding list-columns so that each element\nof the list-column gets its own row in the output. \n\nSee https://tidyr.tidyverse.org/reference/chop.html \n\nRecycling size-1 elements might be different from `tidyr` \n  >>> df = tibble(x=[1, [2,3]], y=[[2,3], 1]) \n  >>> df >> unchop([f.x, f.y]) \n  >>> # tibble(x=[1,2,3], y=[2,3,1]) \n  >>> # instead of following in tidyr \n  >>> # tibble(x=[1,1,2,3], y=[2,3,1,1]) \n\n##### Args:\n  `data`: A data frame. \n  `cols`: Columns to unchop. \n  `keep_empty`: By default, you get one row of output for each element \n    of the list your unchopping/unnesting. \n    This means that if there's a size-0 element \n    (like NULL or an empty data frame), that entire row will be \n    dropped from the output. \n    If you want to preserve all rows, use `keep_empty` = `True` to \n    replace size-0 elements with a single row of missing values. \n\n  `dtypes`: NOT `ptype`. Providing the dtypes for the output columns. \n    Could be a single dtype, which will be applied to all columns, or \n    a dictionary of dtypes with keys for the columns and values the \n    dtypes. \n    For nested data frames, we need to specify `col$a` as key. If `col` \n    is used as key, all columns of the nested data frames will be casted \n    into that dtype. \n\n  `_base0`: Whether `cols` are 0-based \n    if not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  A data frame with selected columns unchopped. \n" + }, + "metadata": {} + } + ], + "source": [ + "from datar.datasets import iris, fish_encounters\n", + "from datar.all import *\n", + "\n", + "%run nb_helpers.py\n", + "nb_header(chop, unchop)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x data\n", + "0 1 y z\n", + "0 1 6\n", + "1 2 5\n", + "2 3 4\n", + "1 2 y z\n", + "3 4 3\n", + "4 5 2\n", + "2 3 y z\n", + "5 6 1" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 2 + } + ], + "source": [ + "df = tibble(x = c(1, 1, 1, 2, 2, 3), y = f[1:6], z = f[6:1])\n", + "df >> nest(data = c(f.y, f.z))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x y z\n", + "0 1 [1, 2, 3] [6, 5, 4]\n", + "1 2 [4, 5] [3, 2]\n", + "2 3 [6] [1]" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "df >> chop(c(f.y, f.z))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x y\n", + "0 2 1.0\n", + "1 3 1.0\n", + "2 3 2.0\n", + "3 4 1.0\n", + "4 4 2.0\n", + "5 4 3.0" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "# Unchop\n", + "df = tibble(x = f[1:4], y = [[], [1], [1,2], [1,2,3]])\n", + "df >> unchop(f.y)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x y\n", + "0 2 1\n", + "1 3 1\n", + "2 3 2\n", + "3 4 1\n", + "4 4 2\n", + "5 4 3" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "df >> unchop(f.y, keep_empty=True, dtypes=int)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x y\n", + "0 1 a\n", + "1 2 1\n", + "2 2 2\n", + "3 2 3" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 6 + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "x int64\n", + "y object\n", + "dtype: object" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "df = tibble(x = f[1:2], y = [\"a\", [1,2,3]])\n", + "df >> unchop(f.y)\n", + "_.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[ValueError] invalid literal for int() with base 10: 'a'\n" + ] + } + ], + "source": [ + "with try_catch():\n", + " df >> unchop(f.y, dtypes=int)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x y$x y$y\n", + "0 2 1.0 NaN\n", + "1 3 NaN 1.0\n", + "2 3 NaN 2.0" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "df = tibble(x = f[1:3], y = [NULL, tibble(x = 1), tibble(y = f[1:2])])\n", + "df >> unchop(f.y)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x y$x y$y\n", + "0 1 NaN NaN\n", + "1 2 1.0 NaN\n", + "2 3 NaN 1.0\n", + "3 3 NaN 2.0" + ], + "text/html": "
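To summarize the chop/unchop cells above in one place, here is a minimal round-trip sketch. It assumes the same `from datar.all import *` setup used throughout these notebooks; the data values and variable names are illustrative only, and (as the `unchop` docstring notes) recycling of size-1 elements may differ slightly from tidyr.

from datar.all import *

# illustrative data, not taken from the notebook
df = tibble(x=c(1, 1, 2), y=[1, 2, 3])
chopped = df >> chop(f.y)                      # y becomes a list-column: [1, 2] for x=1, [3] for x=2
restored = chopped >> unchop(f.y, dtypes=int)  # back to one row per element of y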
" + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "df >> unchop(f.y, keep_empty=True)" + ] + } + ] +} \ No newline at end of file diff --git a/docs/notebooks/complete.ipynb b/docs/notebooks/complete.ipynb new file mode 100644 index 00000000..a296c182 --- /dev/null +++ b/docs/notebooks/complete.ipynb @@ -0,0 +1,119 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python378jvsc74a57bd0c4cc73b080e063fcebb9afb794613be7caf4b26129562cba1382945a18cc49cc", + "display_name": "Python 3.7.8 64-bit ('base': conda)" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/html": "
Try this notebook on binder.
" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # complete " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Turns implicit missing values into explicit missing values.\n\n##### Args:\n  `data`: A data frame \n  `*args`: columns to expand. Columns can be atomic lists. \n    - To find all unique combinations of x, y and z, including\n      those not present in the data, supply each variable as a \n      separate argument: `expand(df, x, y, z)`. \n\n    - To find only the combinations that occur in the data, use\n      `nesting`: `expand(df, nesting(x, y, z))`. \n\n    - You can combine the two forms. For example,\n      `expand(df, nesting(school_id, student_id), date)` would \n      produce a row for each present school-student combination \n      for all possible dates. \n\n##### Returns:\n  Data frame with missing values completed \n" + }, + "metadata": {} + } + ], + "source": [ + "from datar.all import *\n", + "\n", + "%run nb_helpers.py\n", + "nb_header(complete)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " group item_id item_name value1 value2\n", + "0 1 1 a 1.0 4.0\n", + "1 1 2 b 3.0 6.0\n", + "2 2 1 a NaN NaN\n", + "3 2 2 b 2.0 5.0" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 2 + } + ], + "source": [ + "df = tibble(\n", + " group = c(f[1:2], 1),\n", + " item_id = c(f[1:2], 2),\n", + " item_name = c(\"a\", \"b\", \"b\"),\n", + " value1 = f[1:3],\n", + " value2 = f[4:6]\n", + ")\n", + "df >> complete(f.group, nesting(f.item_id, f.item_name))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " group item_id item_name value1 value2\n", + "0 1 1 a 1.0 4.0\n", + "1 1 2 b 3.0 6.0\n", + "2 2 1 a 0.0 NaN\n", + "3 2 2 b 2.0 5.0" + ], + "text/html": "
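As a smaller sketch of the same idea (assuming the usual `datar.all` imports; the column names and values here are made up for illustration): `complete` expands the given columns to all combinations and turns the combinations that are missing from the data into explicit rows.

from datar.all import *

df = tibble(x=c(1, 1, 2), y=c("a", "b", "b"), value=c(1, 2, 3))
# the (2, "a") combination does not occur in the data; complete() adds it,
# filling `value` with 0 instead of the default NA
df >> complete(f.x, f.y, fill=dict(value=0))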
" + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "df >> complete(f.group, nesting(f.item_id, f.item_name), fill=dict(value1=0))" + ] + } + ] +} \ No newline at end of file diff --git a/docs/notebooks/drop_na.ipynb b/docs/notebooks/drop_na.ipynb index 9d91a4be..5aedd6e3 100644 --- a/docs/notebooks/drop_na.ipynb +++ b/docs/notebooks/drop_na.ipynb @@ -14,49 +14,28 @@ }, "outputs": [ { + "output_type": "display_data", "data": { - "text/html": [ - "
Try this notebook on binder.
" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/html": "
Try this notebook on binder.
" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "### # drop_na " - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "### # drop_na " }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "##### Drop rows containing missing values\n", - "\n", - "See https://tidyr.tidyverse.org/reference/drop_na.html \n", - "\n", - "##### Args:\n", - "  `data`: A data frame. \n", - "  `*columns`: Columns to inspect for missing values. \n", - "\n", - "##### Returns:\n", - "  Dataframe with rows with NAs dropped \n" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "##### Drop rows containing missing values\n\nSee https://tidyr.tidyverse.org/reference/drop_na.html \n\n##### Args:\n  `data`: A data frame. \n  `*columns`: Columns to inspect for missing values. \n  `how`: How to select the rows to drop \n    - all: All columns of `columns` to be `NA`s\n\n    - any: Any columns of `columns` to be `NA`s\n\n    (tidyr doesn't support this argument) \n\n  `_base0`: Whether `*columns` are 0-based if given by indexes \n    If not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  Dataframe with rows with NAs dropped and indexes dropped \n" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], "source": [ @@ -82,60 +61,16 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ - " x y\n", - "0 1.0 a\n", - "1 2.0 NaN\n", - "2 NaN b" - ] + " x y\n", + "0 1.0 a" + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], "source": [ @@ -157,59 +92,21 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " x y\n", "0 1.0 a\n", "1 2.0 NaN" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], "source": [ - "df >> drop_na(f.x)\n", - "# FIXME" + "df >> drop_na(f.x)" ] }, { @@ -226,54 +123,70 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " x y\n", "0 1.0 a" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], "source": [ "vars = [\"y\"]\n", "df >> drop_na(f.x, any_of(vars))" ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x y\n", + "0 1.0 a\n", + "1 2.0 NaN\n", + "2 NaN b" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "# how='any' or how='all'\n", + "# not supported by tidyr\n", + "df >> drop_na(how='all')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x y\n", + "0 1.0 a" + ], + "text/html": "
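Condensing the drop_na cells above into one sketch (the `how=` keyword is a datar extension, as the docstring notes; tidyr itself has no such argument). The data frame mirrors the one used in this notebook:

from datar.all import *

df = tibble(x=c(1, 2, NA), y=c("a", NA, "b"))
df >> drop_na()            # default: drop rows with an NA in any column
df >> drop_na(f.x)         # only inspect column x
df >> drop_na(how='all')   # only drop rows where every inspected column is NA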
" + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "df >> drop_na(how='any')" + ] } ], "metadata": { @@ -297,4 +210,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/notebooks/expand.ipynb b/docs/notebooks/expand.ipynb index 6ff228cc..154c8401 100644 --- a/docs/notebooks/expand.ipynb +++ b/docs/notebooks/expand.ipynb @@ -14,40 +14,60 @@ }, "outputs": [ { + "output_type": "display_data", "data": { - "text/html": [ - "
Try this notebook on binder.
" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/html": "
Try this notebook on binder.
" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "### # expand " - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "### # expand " }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "##### See https://tidyr.tidyverse.org/reference/expand.html\n" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "##### Generates all combination of variables found in a dataset.\n\n##### Args:\n  `data`: A data frame \n  `*args`: and, \n  `**kwargs`: columns to expand. Columns can be atomic lists. \n    - To find all unique combinations of x, y and z, including\n      those not present in the data, supply each variable as a \n      separate argument: `expand(df, x, y, z)`. \n\n    - To find only the combinations that occur in the data, use\n      `nesting`: `expand(df, nesting(x, y, z))`. \n\n    - You can combine the two forms. For example,\n      `expand(df, nesting(school_id, student_id), date)` would \n      produce a row for each present school-student combination \n      for all possible dates. \n\n  `_name_repair`: treatment of problematic column names: \n    - \"minimal\": No name repair or checks, beyond basic existence,\n\n    - \"unique\": Make sure names are unique and not empty,\n\n    - \"check_unique\": (default value), no name repair,\n      but check they are unique, \n\n    - \"universal\": Make the names unique and syntactic\n\n    - a function: apply custom name repair\n\n  `_base0`: Whether the suffixes of repaired names should be 0-based. \n    If not provided, will use `datar.base.getOption('index.base.0')`. \n\n##### Returns:\n  A data frame with all combination of variables. \n" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # nesting " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### A helper that only finds combinations already present in the data.\n\n##### Args:\n  `*args`: and, \n  `**kwargs`: columns to expand. Columns can be atomic lists. \n    - To find all unique combinations of x, y and z, including\n      those not present in the data, supply each variable as a \n      separate argument: `expand(df, x, y, z)`. \n\n    - To find only the combinations that occur in the data, use\n      `nesting`: `expand(df, nesting(x, y, z))`. \n\n    - You can combine the two forms. For example,\n      `expand(df, nesting(school_id, student_id), date)` would \n      produce a row for each present school-student combination \n      for all possible dates. \n\n  `_name_repair`: treatment of problematic column names: \n    - \"minimal\": No name repair or checks, beyond basic existence,\n\n    - \"unique\": Make sure names are unique and not empty,\n\n    - \"check_unique\": (default value), no name repair,\n      but check they are unique, \n\n    - \"universal\": Make the names unique and syntactic\n\n    - a function: apply custom name repair\n\n  `_base0`: Whether the suffixes of repaired names should be 0-based. \n    If not provided, will use `datar.base.getOption('index.base.0')`. \n\n##### Returns:\n  A data frame with all combinations in data. 
\n" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # crossing " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### A wrapper around `expand_grid()` that de-duplicates and sorts its inputs\n\nWhen values are not specified by literal `list`, they will be sorted. \n\n##### Args:\n  `*args`: and, \n  `**kwargs`: columns to expand. Columns can be atomic lists. \n    - To find all unique combinations of x, y and z, including\n      those not present in the data, supply each variable as a \n      separate argument: `expand(df, x, y, z)`. \n\n    - To find only the combinations that occur in the data, use\n      `nesting`: `expand(df, nesting(x, y, z))`. \n\n    - You can combine the two forms. For example,\n      `expand(df, nesting(school_id, student_id), date)` would \n      produce a row for each present school-student combination \n      for all possible dates. \n\n  `_name_repair`: treatment of problematic column names: \n    - \"minimal\": No name repair or checks, beyond basic existence,\n\n    - \"unique\": Make sure names are unique and not empty,\n\n    - \"check_unique\": (default value), no name repair,\n      but check they are unique, \n\n    - \"universal\": Make the names unique and syntactic\n\n    - a function: apply custom name repair\n\n  `_base0`: Whether the suffixes of repaired names should be 0-based. \n    If not provided, will use `datar.base.getOption('index.base.0')`. \n\n##### Returns:\n  A data frame with values deduplicated and sorted. \n" + }, + "metadata": {} } ], "source": [ @@ -56,7 +76,7 @@ "from datar.all import *\n", "\n", "%run nb_helpers.py\n", - "nb_header(expand)" + "nb_header(expand, nesting, crossing)" ] }, { @@ -73,92 +93,21 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " type year size weights\n", - "0 apple 2010 XS -0.197515\n", - "1 orange 2010 S 1.366764\n", - "2 apple 2012 M 0.585883\n", - "3 orange 2010 S -1.781964\n", - "4 orange 2010 S 0.648979\n", - "5 orange 2012 M 0.303533" - ] + "0 apple 2010 XS 0.705952\n", + "1 orange 2010 S -1.150749\n", + "2 apple 2012 M -1.558679\n", + "3 orange 2010 S 0.171499\n", + "4 orange 2010 S -1.965301\n", + "5 orange 2012 M 0.212120" + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], "source": [ @@ -189,51 +138,17 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " type\n", "0 apple\n", "1 orange" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], "source": [ @@ -254,75 +169,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " type size\n", "0 apple XS\n", @@ -333,15 +181,15 @@ "5 orange S\n", "6 orange M\n", "7 orange L" - ] + ], + "text/html": "
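A similar grid can be built without a data frame via `crossing()`, which the docstring above describes as a wrapper around `expand_grid()` that de-duplicates and sorts its inputs. This is a sketch with hand-written values rather than columns taken from `fruits`:

from datar.all import *

# roughly the type/size grid shown in this notebook, built from literal values
# (row order may differ, since crossing() sorts inputs that are not literal lists)
crossing(type=c("apple", "orange"), size=c("XS", "S", "M", "L"))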
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], "source": [ - "fruits >> expand(f.type, f['size']) # size is a DataFrame attribute" + "fruits >> expand(f.type, f.size) " ] }, { @@ -358,132 +206,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " type size year\n", "0 apple XS 2010\n", @@ -502,15 +226,15 @@ "13 orange M 2012\n", "14 orange L 2010\n", "15 orange L 2012" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], "source": [ - "fruits >> expand(f.type, f['size'], f.year)" + "fruits >> expand(f.type, f.size, f.year)" ] }, { @@ -527,51 +251,17 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " type\n", "0 apple\n", "1 orange" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 6 } ], "source": [ @@ -592,70 +282,23 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " type size\n", "0 apple XS\n", "1 orange S\n", "2 apple M\n", "3 orange M" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 7 } ], "source": [ - "fruits >> expand(nesting(f.type, f['size']))" + "fruits >> expand(nesting(f.type, f.size))" ] }, { @@ -672,75 +315,23 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " type size year\n", "0 apple XS 2010\n", "1 orange S 2010\n", "2 apple M 2012\n", "3 orange M 2012" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 8 } ], "source": [ - "fruits >> expand(nesting(f.type, f['size'], f.year))" + "fruits >> expand(nesting(f.type, f.size, f.year))" ] }, { @@ -757,159 +348,43 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ - " type size year\n", - "0 apple XS 2010\n", - "1 apple XS 2012\n", - "2 apple S 2010\n", - "3 apple S 2012\n", - "4 apple M 2010\n", - "5 apple M 2012\n", - "6 apple L 2010\n", - "7 apple L 2012\n", - "8 orange XS 2010\n", - "9 orange XS 2012\n", - "10 orange S 2010\n", - "11 orange S 2012\n", - "12 orange M 2010\n", - "13 orange M 2012\n", - "14 orange L 2010\n", - "15 orange L 2012" - ] + " type size _Var2\n", + "0 apple XS 2010\n", + "1 apple XS 2011\n", + "2 apple XS 2012\n", + "3 apple S 2010\n", + "4 apple S 2011\n", + "5 apple S 2012\n", + "6 apple M 2010\n", + "7 apple M 2011\n", + "8 apple M 2012\n", + "9 apple L 2010\n", + "10 apple L 2011\n", + "11 apple L 2012\n", + "12 orange XS 2010\n", + "13 orange XS 2011\n", + "14 orange XS 2012\n", + "15 orange S 2010\n", + "16 orange S 2011\n", + "17 orange S 2012\n", + "18 orange M 2010\n", + "19 orange M 2011\n", + "20 orange M 2012\n", + "21 orange L 2010\n", + "22 orange L 2011\n", + "23 orange L 2012" + ], + "text/html": "
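The 2011 rows in the grid above come from `full_seq`, which fills in the complete sequence from the smallest to the largest observed value at the given period. A quick sketch, assuming the `full_seq(x, period)` signature used elsewhere in this notebook:

from datar.all import *

full_seq(c(2010, 2012), 1)   # expected to yield 2010, 2011, 2012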
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 9 } ], "source": [ - "fruits >> expand(f.type, f['size'], full_seq(f.year, 1))" + "fruits >> expand(f.type, f.size, full_seq(f.year, 1))" ] }, { @@ -926,222 +401,43 @@ }, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2021-04-16 17:52:26][datar][WARNING] Temporary name used. Use keyword argument to specify the key as column name.\n" - ] - }, - { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ - " type size _tmp7f7a_2\n", - "0 apple XS 2010\n", - "1 apple XS 2011\n", - "2 apple XS 2012\n", - "3 apple S 2010\n", - "4 apple S 2011\n", - "5 apple S 2012\n", - "6 apple M 2010\n", - "7 apple M 2011\n", - "8 apple M 2012\n", - "9 apple L 2010\n", - "10 apple L 2011\n", - "11 apple L 2012\n", - "12 orange XS 2010\n", - "13 orange XS 2011\n", - "14 orange XS 2012\n", - "15 orange S 2010\n", - "16 orange S 2011\n", - "17 orange S 2012\n", - "18 orange M 2010\n", - "19 orange M 2011\n", - "20 orange M 2012\n", - "21 orange L 2010\n", - "22 orange L 2011\n", - "23 orange L 2012" - ] + " type size _Var2\n", + "0 apple XS 2010\n", + "1 apple XS 2011\n", + "2 apple XS 2012\n", + "3 apple S 2010\n", + "4 apple S 2011\n", + "5 apple S 2012\n", + "6 apple M 2010\n", + "7 apple M 2011\n", + "8 apple M 2012\n", + "9 apple L 2010\n", + "10 apple L 2011\n", + "11 apple L 2012\n", + "12 orange XS 2010\n", + "13 orange XS 2011\n", + "14 orange XS 2012\n", + "15 orange S 2010\n", + "16 orange S 2011\n", + "17 orange S 2012\n", + "18 orange M 2010\n", + "19 orange M 2011\n", + "20 orange M 2012\n", + "21 orange L 2010\n", + "22 orange L 2011\n", + "23 orange L 2012" + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 10 } ], "source": [ - "fruits >> expand(f.type, f['size'], range(2010, 2013))" + "fruits >> expand(f.type, f.size, seq(2010, 2012))" ] }, { @@ -1158,180 +454,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " type size year\n", "0 apple XS 2010\n", @@ -1358,15 +482,15 @@ "21 orange L 2010\n", "22 orange L 2011\n", "23 orange L 2012" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 11 } ], "source": [ - "fruits >> expand(f.type, f['size'], year=range(2010, 2013))" + "fruits >> expand(f.type, f.size, year=seq(2010, 2012))" ] }, { @@ -1383,132 +507,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " type size year\n", "0 apple XS 2010\n", @@ -1527,15 +527,15 @@ "13 orange M 2012\n", "14 orange L 2010\n", "15 orange L 2012" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 12 } ], "source": [ - "all = fruits >> expand(f.type, f['size'], f.year)\n", + "all = fruits >> expand(f.type, f.size, f.year)\n", "all" ] }, @@ -1553,108 +553,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " type size year\n", "1 apple XS 2012\n", @@ -1669,11 +569,11 @@ "14 orange M 2010\n", "16 orange L 2010\n", "17 orange L 2012" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 13 } ], "source": [ @@ -1694,188 +594,33 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " type year size weights\n", - "0 apple 2010 XS -0.197515\n", + "0 apple 2010 XS 0.705952\n", "1 apple 2012 XS NaN\n", "2 apple 2010 S NaN\n", "3 apple 2012 S NaN\n", "4 apple 2010 M NaN\n", - "5 apple 2012 M 0.585883\n", + "5 apple 2012 M -1.558679\n", "6 apple 2010 L NaN\n", "7 apple 2012 L NaN\n", "8 orange 2010 XS NaN\n", "9 orange 2012 XS NaN\n", - "10 orange 2010 S 1.366764\n", - "11 orange 2010 S -1.781964\n", - "12 orange 2010 S 0.648979\n", + "10 orange 2010 S -1.150749\n", + "11 orange 2010 S 0.171499\n", + "12 orange 2010 S -1.965301\n", "13 orange 2012 S NaN\n", "14 orange 2010 M NaN\n", - "15 orange 2012 M 0.303533\n", + "15 orange 2012 M 0.212120\n", "16 orange 2010 L NaN\n", "17 orange 2012 L NaN" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 14 } ], "source": [ @@ -1904,4 +649,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/notebooks/expand_grid.ipynb b/docs/notebooks/expand_grid.ipynb index 64dee24e..5f699a45 100644 --- a/docs/notebooks/expand_grid.ipynb +++ b/docs/notebooks/expand_grid.ipynb @@ -14,51 +14,28 @@ }, "outputs": [ { + "output_type": "display_data", "data": { - "text/html": [ - "
Try this notebook on binder.
" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/html": "
Try this notebook on binder.
" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "### # expand_grid " - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "### # expand_grid " }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "##### Expand elements into a new dataframe\n", - "\n", - "See https://tidyr.tidyverse.org/reference/expand_grid.html \n", - "\n", - "##### Args:\n", - "  _data, **kwargs: Name-value pairs. The name will become the column \n", - "    name in the output. \n", - "    For _data, will try to fetch name via `_data.__dfname__`. If failed \n", - "    `_data` will be used. \n", - "\n", - "##### Returns:\n", - "  The expanded dataframe \n" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "##### Create a tibble from all combinations of inputs\n\n##### Args:\n  `*args`: and \n  `**kwargs`: name-value pairs. \n    For `*args`, names will be inferred from the values and if failed, \n    `_Var0`, `_Var1`, etc will be used. \n\n  `_name_repair`: treatment of problematic column names: \n    - \"minimal\": No name repair or checks, beyond basic existence,\n\n    - \"unique\": Make sure names are unique and not empty,\n\n    - \"check_unique\": (default value), no name repair,\n      but check they are unique, \n\n    - \"universal\": Make the names unique and syntactic\n\n    - a function: apply custom name repair\n\n  `_base0`: Whether the suffixes of repaired names should be 0-based. \n    If not provided, will use `datar.base.getOption('index.base.0')`. \n\n##### Returns:\n  A data frame with one column for each input in `*args` and `**kwargs`. \n  The output will have one row for each combination of the inputs, \n  i.e. the size be equal to the product of the sizes of the inputs. \n  This implies that if any input has length 0, the output will have \n  zero rows. \n" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], "source": [ @@ -84,65 +61,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " x y\n", "0 1 1\n", @@ -151,15 +71,15 @@ "3 2 2\n", "4 3 1\n", "5 3 2" - ] + ], + "text/html": "
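The `expand_grid` docstring above states that the output has one row per combination, so its size is the product of the input sizes (and any zero-length input yields zero rows). A small hedged sketch of that rule; the names and values here are illustrative, not from the notebook:

```python
from datar.all import expand_grid, seq

# 3 * 2 * 2 == 12 rows: one row per combination of x, y and z
grid = expand_grid(x=seq(1, 3), y=seq(1, 2), z=["a", "b"])
grid
```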
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], "source": [ - "expand_grid(x=range(1,4), y=range(1,3))" + "expand_grid(x=seq(1,3), y=seq(1,2))" ] }, { @@ -176,91 +96,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " l1 l2\n", "0 a A\n", @@ -276,11 +113,11 @@ "675 z Z\n", "\n", "[676 rows x 2 columns]" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], "source": [ @@ -301,85 +138,21 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ - " df_x df_y z\n", + " df$x df$y z\n", "0 1 2 1\n", "1 1 2 2\n", "2 1 2 3\n", "3 2 1 1\n", "4 2 1 2\n", "5 2 1 3" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], "source": [ @@ -400,76 +173,19 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ - " x1_a x1_b x2_a x2_b\n", + " x1$a x1$b x2$a x2$b\n", "0 1 3 5 7\n", "1 1 3 6 8\n", "2 2 4 5 7\n", "3 2 4 6 8" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], "source": [ @@ -506,4 +222,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/notebooks/extract.ipynb b/docs/notebooks/extract.ipynb index 548e8ce4..3b75287d 100644 --- a/docs/notebooks/extract.ipynb +++ b/docs/notebooks/extract.ipynb @@ -14,60 +14,28 @@ }, "outputs": [ { + "output_type": "display_data", "data": { - "text/html": [ - "
Try this notebook on binder.
" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/html": "
Try this notebook on binder.
" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "### # extract " - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "### # extract " }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "##### Given a regular expression with capturing groups, extract() turns each\n", - "group into a new column. If the groups don't match, or the input is NA, \n", - "the output will be NA. \n", - "\n", - "See https://tidyr.tidyverse.org/reference/extract.html \n", - "\n", - "##### Args:\n", - "  `_data`: The dataframe \n", - "  `col`: Column name or position. \n", - "  `into`: Names of new variables to create as character vector. \n", - "    Use None to omit the variable in the output. \n", - "\n", - "  `regex`: a regular expression used to extract the desired values. \n", - "    There should be one group (defined by ()) for each element of into. \n", - "\n", - "  `remove`: If TRUE, remove input column from output data frame. \n", - "  `convert`: The universal type for the extracted columns or a dict for \n", - "    individual ones \n", - "\n", - "##### Returns:\n", - "  Dataframe with extracted columns. \n" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "##### Given a regular expression with capturing groups, extract() turns each\ngroup into a new column. If the groups don't match, or the input is NA, \nthe output will be NA. \n\nSee https://tidyr.tidyverse.org/reference/extract.html \n\n##### Args:\n  `data`: The dataframe \n  `col`: Column name or position. \n  `into`: Names of new variables to create as character vector. \n    Use None to omit the variable in the output. \n\n  `regex`: a regular expression used to extract the desired values. \n    There should be one group (defined by ()) for each element of into. \n\n  `remove`: If TRUE, remove input column from output data frame. \n  `convert`: The universal type for the extracted columns or a dict for \n    individual ones \n\n  `_base0`: Whether `col` is 0-based when given by index \n    If not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  Dataframe with extracted columns. \n" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], "source": [ @@ -93,54 +61,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " A\n", "0 NaN\n", @@ -148,11 +70,11 @@ "2 a\n", "3 b\n", "4 d" - ] + ], + "text/html": "
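The `extract()` docstring above also mentions a `convert` argument: a single type for all extracted columns, or a dict of per-column types. A hedged sketch of how that could look, assuming the dict form behaves as the docstring describes; the frame and column names are illustrative only:

```python
from datar.all import tibble, extract, f

df = tibble(x=["a1", "b2", "c3"])

# Two capture groups -> two new columns; `convert` coerces `number` to int.
df >> extract(
    f.x,
    ["letter", "number"],
    r"([a-z])(\d)",
    convert={"number": int},
)
```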
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], "source": [ @@ -174,60 +96,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " A B\n", "0 NaN NaN\n", @@ -235,11 +105,11 @@ "2 a d\n", "3 b c\n", "4 d e" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], "source": [ @@ -260,60 +130,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " A B\n", "0 NaN NaN\n", @@ -321,16 +139,40 @@ "2 a d\n", "3 b c\n", "4 NaN NaN" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], "source": [ "df >> extract(f.x, c(\"A\", \"B\"), r\"([a-d]+)-([a-d]+)\")" ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " a b\n", + "0 ac bd" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "# combine multiple columns\n", + "df = tibble(x='abcd')\n", + "df >> extract(f.x, ['a', 'b', 'a', 'b'], r'(.)(.)(.)(.)')" + ] } ], "metadata": { @@ -354,4 +196,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/notebooks/fill.ipynb b/docs/notebooks/fill.ipynb index 83dc4292..19b24386 100644 --- a/docs/notebooks/fill.ipynb +++ b/docs/notebooks/fill.ipynb @@ -14,54 +14,28 @@ }, "outputs": [ { + "output_type": "display_data", "data": { - "text/html": [ - "
Try this notebook on binder.
" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/html": "
Try this notebook on binder.
" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "### # fill " - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "### # fill " }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "##### Fills missing values in selected columns using the next or\n", - "previous entry. \n", - "\n", - "See https://tidyr.tidyverse.org/reference/fill.html \n", - "\n", - "##### Args:\n", - "  `_data`: A dataframe \n", - "  `*columns`: Columns to fill \n", - "  `_direction`: Direction in which to fill missing values. \n", - "    Currently either \"down\" (the default), \"up\", \n", - "    \"downup\" (i.e. first down and then up) or \n", - "    \"updown\" (first up and then down). \n", - "\n", - "##### Returns:\n", - "  The dataframe with NAs being replaced. \n" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "##### Fills missing values in selected columns using the next or\nprevious entry. \n\nSee https://tidyr.tidyverse.org/reference/fill.html \n\n##### Args:\n  `_data`: A dataframe \n  `*columns`: Columns to fill \n  `_direction`: Direction in which to fill missing values. \n    Currently either \"down\" (the default), \"up\", \n    \"downup\" (i.e. first down and then up) or \n    \"updown\" (first up and then down). \n\n  `_base0`: Whether `*columns` are 0-based if given by indexes \n    If not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  The dataframe with NAs being replaced. \n" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], "source": [ @@ -87,132 +61,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " quarter year sales\n", "0 Q1 2000.0 66013\n", @@ -231,11 +81,11 @@ "13 Q2 NaN 67686\n", "14 Q3 NaN 31768\n", "15 Q4 NaN 49094" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], "source": [ @@ -275,132 +125,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " quarter year sales\n", "0 Q1 2000.0 66013\n", @@ -419,11 +145,11 @@ "13 Q2 2004.0 67686\n", "14 Q3 2004.0 31768\n", "15 Q4 2004.0 49094" - ] + ], + "text/html": "
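The `fill()` docstring above lists four `_direction` values. A compact hedged sketch of what each does on a toy column; the data is illustrative and not from the notebook:

```python
from datar.all import tibble, fill, f, NA

df = tibble(x=[NA, 1, NA, NA, 2, NA])

df >> fill(f.x)                        # "down" (default): carry 1 and 2 forward
df >> fill(f.x, _direction="up")       # carry the next valid value backward
df >> fill(f.x, _direction="downup")   # down first, then up for leading NAs
df >> fill(f.x, _direction="updown")   # up first, then down for trailing NAs
```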
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], "source": [ @@ -444,108 +170,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
" - ], "text/plain": [ " rank pet_type breed\n", "0 1 Dog Boston Terrier\n", @@ -560,11 +186,11 @@ "9 4 Cat Exotic\n", "10 5 Cat Siamese\n", "11 6 Cat American Short" - ] + ], + "text/html": "
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], "source": [ @@ -600,121 +226,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
[Groups: ['group'] (n=3)]" - ], "text/plain": [ " group name role n_squirrels\n", "0 1 Sam Observer 8.0\n", @@ -730,11 +243,11 @@ "10 3 Emily Observer 9.0\n", "11 3 Danielle Observer 9.0\n", "[Groups: ['group'] (n=3)]" - ] + ], + "text/html": "
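The squirrels result above relies on `fill()` respecting the grouping set by `group_by()`, so a value observed in one group never leaks into another. A minimal hedged sketch of that behaviour on toy data:

```python
from datar.all import tibble, group_by, fill, f, NA

df = tibble(
    g=[1, 1, 2, 2],
    n=[NA, 8, 14, NA],
)

# "downup" fills within each group only: group 1 gets 8, group 2 gets 14.
df >> group_by(f.g) >> fill(f.n, _direction="downup")
```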
[Groups: ['group'] (n=3)]" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], "source": [ @@ -789,4 +302,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/notebooks/full_seq.ipynb b/docs/notebooks/full_seq.ipynb index 6bb1b165..7a18f397 100644 --- a/docs/notebooks/full_seq.ipynb +++ b/docs/notebooks/full_seq.ipynb @@ -14,50 +14,28 @@ }, "outputs": [ { + "output_type": "display_data", "data": { - "text/html": [ - "
Try this notebook on binder.
" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/html": "
Try this notebook on binder.
" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "### # full_seq " - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "### # full_seq " }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "##### Create the full sequence of values in a vector\n", - "\n", - "##### Args:\n", - "  `x`: A numeric vector. \n", - "  `period`: Gap between each observation. The existing data will be \n", - "    checked to ensure that it is actually of this periodicity. \n", - "\n", - "  `tol`: Numerical tolerance for checking periodicity. \n", - "\n", - "##### Returns:\n", - "  The full sequence \n" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "##### Create the full sequence of values in a vector\n\n##### Args:\n  `x`: A numeric vector. \n  `period`: Gap between each observation. The existing data will be \n    checked to ensure that it is actually of this periodicity. \n\n  `tol`: Numerical tolerance for checking periodicity. \n\n##### Returns:\n  The full sequence \n" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], "source": [ @@ -83,19 +61,26 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ - "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]" + "array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])" ] }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], "source": [ "full_seq(c(1, 2, 4, 5, 10), 1)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -119,4 +104,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/notebooks/nb_helpers.py b/docs/notebooks/nb_helpers.py index baaa640e..08739357 100644 --- a/docs/notebooks/nb_helpers.py +++ b/docs/notebooks/nb_helpers.py @@ -1,4 +1,6 @@ """helpers for notebooks""" +from contextlib import contextmanager + from IPython.display import display, Markdown, HTML from IPython.core.interactiveshell import InteractiveShell import pardoc @@ -29,3 +31,11 @@ def nb_header(*funcs, book=None): ) display(Markdown(f'{"#"*3} # {func.__name__} ')) display(Markdown(formatted)) + +@contextmanager +def try_catch(): + """Catch the error and print it out""" + try: + yield + except Exception as exc: + print(f"[{type(exc).__name__}] {exc}") diff --git a/docs/notebooks/nest.ipynb b/docs/notebooks/nest.ipynb new file mode 100644 index 00000000..e4bac161 --- /dev/null +++ b/docs/notebooks/nest.ipynb @@ -0,0 +1,574 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python378jvsc74a57bd0c4cc73b080e063fcebb9afb794613be7caf4b26129562cba1382945a18cc49cc", + "display_name": "Python 3.7.8 64-bit ('base': conda)" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/html": "
Try this notebook on binder.
" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # nest " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Nesting creates a list-column of data frames\n\n##### Args:\n  `_data`: A data frame \n  `**cols`: Columns to nest \n  `_names_sep`: If `None`, the default, the names will be left as is. \n    Inner names will come from the former outer names \n    If a string, the inner and outer names will be used together. \n    The names of the new outer columns will be formed by pasting \n    together the outer and the inner column names, separated by \n    `_names_sep`. \n\n  `_base0`: Whether `**cols` are 0-based \n    if not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  Nested data frame. \n" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # unnest " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Flattens list-column of data frames back out into regular columns.\n\n##### Args:\n  `data`: A data frame to flatten. \n  `*cols`: Columns to unnest. \n  `keep_empty`: By default, you get one row of output for each element \n    of the list your unchopping/unnesting. \n    This means that if there's a size-0 element \n    (like NULL or an empty data frame), that entire row will be \n    dropped from the output. \n    If you want to preserve all rows, use `keep_empty` = `True` to \n    replace size-0 elements with a single row of missing values. \n\n  `dtypes`: NOT `ptype`. Providing the dtypes for the output columns. \n    Could be a single dtype, which will be applied to all columns, or \n    a dictionary of dtypes with keys for the columns and values the \n    dtypes. \n\n  `names_sep`: If `None`, the default, the names will be left as is. \n    Inner names will come from the former outer names \n    If a string, the inner and outer names will be used together. \n    The names of the new outer columns will be formed by pasting \n    together the outer and the inner column names, separated by \n    `names_sep`. \n\n  `names_repair`: treatment of problematic column names: \n    - \"minimal\": No name repair or checks, beyond basic existence,\n\n    - \"unique\": Make sure names are unique and not empty,\n\n    - \"check_unique\": (default value), no name repair,\n      but check they are unique, \n\n    - \"universal\": Make the names unique and syntactic\n\n    - a function: apply custom name repair\n\n  `_base0`: Whether `cols` are 0-based \n    if not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  Data frame with selected columns unnested. \n" + }, + "metadata": {} + } + ], + "source": [ + "from datar.datasets import iris, fish_encounters, mtcars\n", + "from datar.all import *\n", + "\n", + "%run nb_helpers.py\n", + "nb_header(nest, unnest)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x data\n", + "0 1 y z\n", + "0 1 6\n", + "1 2 5\n", + "2 3 4\n", + "1 2 y z\n", + "3 4 3\n", + "4 5 2\n", + "2 3 y z\n", + "5 6 1" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 2 + } + ], + "source": [ + "df = tibble(x = c(1, 1, 1, 2, 2, 3), y = f[1:6], z = f[6:1])\n", + "df >> nest(data=c(f.y, f.z))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x y z\n", + "0 1 [1, 2, 3] [6, 5, 4]\n", + "1 2 [4, 5] [3, 2]\n", + "2 3 [6] [1]" + ], + "text/html": "
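The two cells here contrast `nest()` and `chop()` on the same frame. A condensed sketch of that contrast, with the notebook's slice syntax replaced by plain lists for self-containment:

```python
from datar.all import tibble, nest, chop, c, f

df = tibble(
    x=[1, 1, 1, 2, 2, 3],
    y=[1, 2, 3, 4, 5, 6],
    z=[6, 5, 4, 3, 2, 1],
)

# nest: collapse y and z into a single df-column, one sub-frame per x value
df >> nest(data=c(f.y, f.z))

# chop: keep y and z as separate columns whose cells hold lists
df >> chop(c(f.y, f.z))
```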
" + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "df >> chop(c(f.y, f.z))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x data\n", + "0 1 y z\n", + "0 1 6\n", + "1 2 5\n", + "2 3 4\n", + "1 2 y z\n", + "3 4 3\n", + "4 5 2\n", + "2 3 y z\n", + "5 6 1" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "df >> nest(data=any_of(c(f.y, f.z)))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Species data\n", + "0 setosa Sepal_Length Sepal_Width Petal_Length P...\n", + "1 versicolor Sepal_Length Sepal_Width Petal_Length P...\n", + "2 virginica Sepal_Length Sepal_Width Petal_Length ..." + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 5 + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Sepal_Length Sepal_Width Petal_Length Petal_Width\n", + "0 5.1 3.5 1.4 0.2\n", + "1 4.9 3.0 1.4 0.2\n", + "2 4.7 3.2 1.3 0.2\n", + "3 4.6 3.1 1.5 0.2\n", + "4 5.0 3.6 1.4 0.2\n", + "5 5.4 3.9 1.7 0.4\n", + "6 4.6 3.4 1.4 0.3\n", + "7 5.0 3.4 1.5 0.2\n", + "8 4.4 2.9 1.4 0.2\n", + "9 4.9 3.1 1.5 0.1\n", + "10 5.4 3.7 1.5 0.2\n", + "11 4.8 3.4 1.6 0.2\n", + "12 4.8 3.0 1.4 0.1\n", + "13 4.3 3.0 1.1 0.1\n", + "14 5.8 4.0 1.2 0.2\n", + "15 5.7 4.4 1.5 0.4\n", + "16 5.4 3.9 1.3 0.4\n", + "17 5.1 3.5 1.4 0.3\n", + "18 5.7 3.8 1.7 0.3\n", + "19 5.1 3.8 1.5 0.3\n", + "20 5.4 3.4 1.7 0.2\n", + "21 5.1 3.7 1.5 0.4\n", + "22 4.6 3.6 1.0 0.2\n", + "23 5.1 3.3 1.7 0.5\n", + "24 4.8 3.4 1.9 0.2\n", + "25 5.0 3.0 1.6 0.2\n", + "26 5.0 3.4 1.6 0.4\n", + "27 5.2 3.5 1.5 0.2\n", + "28 5.2 3.4 1.4 0.2\n", + "29 4.7 3.2 1.6 0.2\n", + "30 4.8 3.1 1.6 0.2\n", + "31 5.4 3.4 1.5 0.4\n", + "32 5.2 4.1 1.5 0.1\n", + "33 5.5 4.2 1.4 0.2\n", + "34 4.9 3.1 1.5 0.2\n", + "35 5.0 3.2 1.2 0.2\n", + "36 5.5 3.5 1.3 0.2\n", + "37 4.9 3.6 1.4 0.1\n", + "38 4.4 3.0 1.3 0.2\n", + "39 5.1 3.4 1.5 0.2\n", + "40 5.0 3.5 1.3 0.3\n", + "41 4.5 2.3 1.3 0.3\n", + "42 4.4 3.2 1.3 0.2\n", + "43 5.0 3.5 1.6 0.6\n", + "44 5.1 3.8 1.9 0.4\n", + "45 4.8 3.0 1.4 0.3\n", + "46 5.1 3.8 1.6 0.2\n", + "47 4.6 3.2 1.4 0.2\n", + "48 5.3 3.7 1.5 0.2\n", + "49 5.0 3.3 1.4 0.2" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "iris >> nest(data=~f.Species)\n", + "_.data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Species data\n", + "0 setosa Sepal_Length Sepal_Width Petal_Length P...\n", + "1 versicolor Sepal_Length Sepal_Width Petal_Length P...\n", + "2 virginica Sepal_Length Sepal_Width Petal_Length ..." + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "nest_vars = names(iris)[:4]\n", + "iris >> nest(data = any_of(nest_vars))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Species petal \\\n", + "0 setosa Petal_Length Petal_Width\n", + "0 1.4... \n", + "1 versicolor Petal_Length Petal_Width\n", + "50 4.7... \n", + "2 virginica Petal_Length Petal_Width\n", + "100 6... \n", + "\n", + " sepal \n", + "0 Sepal_Length Sepal_Width\n", + "0 5.1... \n", + "1 Sepal_Length Sepal_Width\n", + "50 7.0... \n", + "2 Sepal_Length Sepal_Width\n", + "100 6... " + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "iris >> nest(petal = starts_with(\"Petal\"), sepal = starts_with(\"Sepal\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Species width \\\n", + "0 setosa Sepal_Width Petal_Width\n", + "0 3.5 ... \n", + "1 versicolor Sepal_Width Petal_Width\n", + "50 3.2 ... \n", + "2 virginica Sepal_Width Petal_Width\n", + "100 3.3... \n", + "\n", + " length \n", + "0 Sepal_Length Petal_Length\n", + "0 5.... \n", + "1 Sepal_Length Petal_Length\n", + "50 7.... \n", + "2 Sepal_Length Petal_Length\n", + "100 ... " + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "iris >> nest(width = contains(\"Width\"), length = contains(\"Length\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " fish data\n", + "0 4842 station seen\n", + "0 Release 1\n", + "1 I80_...\n", + "1 4843 station seen\n", + "11 Release 1\n", + "12 I80_...\n", + "2 4844 station seen\n", + "22 Release 1\n", + "23 I80_...\n", + "3 4845 station seen\n", + "33 Release 1\n", + "34 I80_...\n", + "4 4847 station seen\n", + "38 Release 1\n", + "39 I80_...\n", + "5 4848 station seen\n", + "41 Release 1\n", + "42 I80_...\n", + "6 4849 station seen\n", + "45 Release 1\n", + "46 I80_...\n", + "7 4850 station seen\n", + "47 Release 1\n", + "48 I80_...\n", + "8 4851 station seen\n", + "53 Release 1\n", + "54 I80_...\n", + "9 4854 station seen\n", + "55 Release 1\n", + "56 I80_...\n", + "10 4855 station seen\n", + "57 Release 1\n", + "58 I80_...\n", + "11 4857 station seen\n", + "62 Release 1\n", + "63 I80_...\n", + "12 4858 station seen\n", + "71 Release 1\n", + "72 I80_...\n", + "13 4859 station seen\n", + "82 Release 1\n", + "83 I80_...\n", + "14 4861 station seen\n", + "87 Release 1\n", + "88 I80_...\n", + "15 4862 station seen\n", + "98 Release 1\n", + "99 I...\n", + "16 4863 station seen\n", + "107 Release 1\n", + "108 I...\n", + "17 4864 station seen\n", + "109 Release 1\n", + "110 I...\n", + "18 4865 station seen\n", + "111 Release 1\n", + "112 I...\n", + "[Groups: ['fish'] (n=19)]" + ], + "text/html": "
[Groups: ['fish'] (n=19)]" + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "fish_encounters >> group_by(f.fish) >> nest()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " cyl data models\n", + "0 6 mpg disp hp drat wt qsec vs ... \n", + "1 4 mpg disp hp drat wt qsec vs ... \n", + "2 8 mpg disp hp drat wt qsec vs ... \n", + "[Groups: ['cyl'] (n=3)]" + ], + "text/html": "
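The `get_models` cell in this part of the notebook nests `mtcars` by `cyl` and then maps a custom function over the resulting `data` list-column via `pipda.register_func`; note the body of its f-string was lost to tag stripping in this patch (it presumably formatted each sub-frame's dimensions, matching the `<df 7x10>` style values in the output). A hedged sketch of the same pattern with a simpler per-group summary:

```python
from pipda import register_func
from datar.all import f, group_by, nest, mutate
from datar.datasets import mtcars

@register_func(None)
def n_rows(dfs):
    # `f.data` is a list-column of pandas sub-frames, one per cyl value
    return [df.shape[0] for df in dfs]

mtcars >> group_by(f.cyl) >> nest() >> mutate(n=n_rows(f.data))
```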
[Groups: ['cyl'] (n=3)]" + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "from pipda import register_func\n", + "@register_func(None)\n", + "def get_models(dfs):\n", + " # do whatever with the dfs\n", + " return [\n", + " f\"\"\n", + " for df in dfs\n", + " ]\n", + "\n", + "mtcars >> group_by(f.cyl) >> nest() >> mutate(\n", + " models=get_models(f.data)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x a b\n", + "0 2 1 2\n", + "1 3 1 3\n", + "2 3 2 2\n", + "3 3 3 1" + ], + "text/html": "
" + }, + "metadata": {}, + "execution_count": 11 + } + ], + "source": [ + "df = tibble(\n", + " x = f[1:3],\n", + " y = [\n", + " NULL,\n", + " tibble(a = 1, b = 2),\n", + " tibble(a = f[1:3], b = f[3:1])\n", + " ]\n", + ")\n", + "df >> unnest(f.y, dtypes=int)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x a b\n", + "0 1 NaN NaN\n", + "1 2 1.0 2.0\n", + "2 3 1.0 3.0\n", + "3 3 2.0 2.0\n", + "4 3 3.0 1.0" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xab
01NaNNaN
121.02.0
231.03.0
332.02.0
433.01.0
\n
" + }, + "metadata": {}, + "execution_count": 12 + } + ], + "source": [ + "df >> unnest(f.y, keep_empty=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " a b c\n", + "0 a 1 11\n", + "1 b 2 11\n", + "2 c 3 22" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
abc
0a111
1b211
2c322
\n
" + }, + "metadata": {}, + "execution_count": 13 + } + ], + "source": [ + "df = tibble(\n", + " a = [c(\"a\", \"b\"), \"c\"],\n", + " b = [[1,2], 3],\n", + " c = c(11, 22)\n", + ")\n", + "df >> unnest(c(f.a, f.b))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " a b c\n", + "0 a 1 11\n", + "1 a 2 11\n", + "2 b 1 11\n", + "3 b 2 11\n", + "4 c 3 22" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
abc
0a111
1a211
2b111
3b211
4c322
\n
" + }, + "metadata": {}, + "execution_count": 14 + } + ], + "source": [ + "df >> unnest(f.a) >> unnest(f.b)" + ] + } + ] +} \ No newline at end of file diff --git a/docs/notebooks/pack.ipynb b/docs/notebooks/pack.ipynb new file mode 100644 index 00000000..335eb1d0 --- /dev/null +++ b/docs/notebooks/pack.ipynb @@ -0,0 +1,336 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python378jvsc74a57bd0c4cc73b080e063fcebb9afb794613be7caf4b26129562cba1382945a18cc49cc", + "display_name": "Python 3.7.8 64-bit ('base': conda)" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/html": "
Try this notebook on binder.
" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # pack " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Makes df narrow by collapsing a set of columns into a single df-column.\n\n##### Args:\n  `_data`: A data frame \n  `**cols`: Columns to pack \n  `_names_sep`: If `None`, the default, the names will be left as is. \n    Inner names will come from the former outer names \n    If a string, the inner and outer names will be used together. \n    The names of the new outer columns will be formed by pasting \n    together the outer and the inner column names, separated by \n    `_names_sep`. \n\n  `_base0`: Whether `**cols` are 0-based \n    if not provided, will use `datar.base.getOption('index.base.0')` \n" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # unpack " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Makes df wider by expanding df-columns back out into individual columns.\n\n##### Args:\n  `data`: A data frame \n  `cols`: Columns to unpack \n  `names_sep`: If `None`, the default, the names will be left as is. \n    Inner names will come from the former outer names \n    If a string, the inner and outer names will be used together. \n    The names of the new outer columns will be formed by pasting \n    together the outer and the inner column names, separated by \n    `_names_sep`. \n\n  `name_repair`: treatment of problematic column names: \n    - \"minimal\": No name repair or checks, beyond basic existence,\n\n    - \"unique\": Make sure names are unique and not empty,\n\n    - \"check_unique\": (default value), no name repair,\n      but check they are unique, \n\n    - \"universal\": Make the names unique and syntactic\n\n    - a function: apply custom name repair\n\n  `_base0`: Whether `cols` are 0-based \n    if not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  Data frame with given columns unpacked. \n" + }, + "metadata": {} + } + ], + "source": [ + "from datar.datasets import iris\n", + "from datar.all import *\n", + "\n", + "%run nb_helpers.py\n", + "nb_header(pack, unpack)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x1 x2 x3 y\n", + "0 1 4 7 1\n", + "1 2 5 8 2\n", + "2 3 6 9 3" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
x1x2x3y
01471
12582
23693
\n
" + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "df = tibble(x1 = f[1:3], x2 = f[4:6], x3 = f[7:9], y = f[1:3])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " y x$x1 x$x2 x$x3\n", + "0 1 1 4 7\n", + "1 2 2 5 8\n", + "2 3 3 6 9" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
yx$x1x$x2x$x3
01147
12258
23369
\n
" + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "df >> pack(x=starts_with('x'))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x$x1 x$x2 x$x3 y$y\n", + "0 1 4 7 1\n", + "1 2 5 8 2\n", + "2 3 6 9 3" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
x$x1x$x2x$x3y$y
01471
12582
23693
\n
" + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "df >> pack(x=c(f.x1, f.x2, f.x3), y=f.y)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Species Sepal$Length Sepal$Width Petal$Length Petal$Width\n", + "0 setosa 5.1 3.5 1.4 0.2\n", + "1 setosa 4.9 3.0 1.4 0.2\n", + "2 setosa 4.7 3.2 1.3 0.2\n", + "3 setosa 4.6 3.1 1.5 0.2\n", + "4 setosa 5.0 3.6 1.4 0.2\n", + ".. ... ... ... ... ...\n", + "145 virginica 6.7 3.0 5.2 2.3\n", + "146 virginica 6.3 2.5 5.0 1.9\n", + "147 virginica 6.5 3.0 5.2 2.0\n", + "148 virginica 6.2 3.4 5.4 2.3\n", + "149 virginica 5.9 3.0 5.1 1.8\n", + "\n", + "[150 rows x 5 columns]" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
SpeciesSepal$LengthSepal$WidthPetal$LengthPetal$Width
0setosa5.13.51.40.2
1setosa4.93.01.40.2
2setosa4.73.21.30.2
3setosa4.63.11.50.2
4setosa5.03.61.40.2
..................
145virginica6.73.05.22.3
146virginica6.32.55.01.9
147virginica6.53.05.22.0
148virginica6.23.45.42.3
149virginica5.93.05.11.8
\n

150 rows × 5 columns

\n
" + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "iris >> pack(\n", + " Sepal=starts_with(\"Sepal\"),\n", + " Petal=starts_with(\"Petal\"),\n", + " _names_sep=\"_\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x y$a y$b z$X z$Y z$Z\n", + "0 1 1 3 a 0.410709 True\n", + "1 2 2 2 b 0.841327 False\n", + "2 3 3 1 c 0.795641 NaN" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xy$ay$bz$Xz$Yz$Z
0113a0.410709True
1222b0.841327False
2331c0.795641NaN
\n
" + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "# Unpacking ===========================================================\n", + "\n", + "df = tibble(\n", + " x = f[1:3],\n", + " y = tibble(a = f[1:3], b = f[3:1]),\n", + " z = tibble(X = c(\"a\", \"b\", \"c\"), Y = runif(3), Z = c(TRUE, FALSE, NA))\n", + ")\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x a b z$X z$Y z$Z\n", + "0 1 1 3 a 0.410709 True\n", + "1 2 2 2 b 0.841327 False\n", + "2 3 3 1 c 0.795641 NaN" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xabz$Xz$Yz$Z
0113a0.410709True
1222b0.841327False
2331c0.795641NaN
\n
" + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "df >> unpack(f.y)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x a b X Y Z\n", + "0 1 1 3 a 0.410709 True\n", + "1 2 2 2 b 0.841327 False\n", + "2 3 3 1 c 0.795641 NaN" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xabXYZ
0113a0.410709True
1222b0.841327False
2331c0.795641NaN
\n
" + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "df >> unpack(c(f.y, f.z))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x y_a y_b z_X z_Y z_Z\n", + "0 1 1 3 a 0.410709 True\n", + "1 2 2 2 b 0.841327 False\n", + "2 3 3 1 c 0.795641 NaN" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xy_ay_bz_Xz_Yz_Z
0113a0.410709True
1222b0.841327False
2331c0.795641NaN
\n
" + }, + "metadata": {}, + "execution_count": 11 + } + ], + "source": [ + "df >> unpack(c(f.y, f.z), names_sep=\"_\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[ValueError] `y` has already been selected. Number of packed columns also counts when selecting using indexes.\n" + ] + } + ], + "source": [ + "with try_catch():\n", + " # indexes from inner data frame counts\n", + " df >> unpack(c(2,3))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " x a b X Y Z\n", + "0 1 1 3 a 0.410709 True\n", + "1 2 2 2 b 0.841327 False\n", + "2 3 3 1 c 0.795641 NaN" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xabXYZ
0113a0.410709True
1222b0.841327False
2331c0.795641NaN
\n
" + }, + "metadata": {}, + "execution_count": 13 + } + ], + "source": [ + "df >> unpack(c(2,4))" + ] + } + ] +} \ No newline at end of file diff --git a/docs/notebooks/pivot_longer.ipynb b/docs/notebooks/pivot_longer.ipynb index afa9700f..57c17e15 100644 --- a/docs/notebooks/pivot_longer.ipynb +++ b/docs/notebooks/pivot_longer.ipynb @@ -14,104 +14,28 @@ }, "outputs": [ { + "output_type": "display_data", "data": { - "text/html": [ - "
Try this notebook on binder.
" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/html": "
Try this notebook on binder.
" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "### # pivot_longer " - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "### # pivot_longer " }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "##### \"lengthens\" data, increasing the number of rows and\n", - "decreasing the number of columns. \n", - "\n", - "##### Args:\n", - "  `_data`: A data frame to pivot. \n", - "  `cols`: Columns to pivot into longer format. \n", - "  `names_to`: A string specifying the name of the column to create from \n", - "    the data stored in the column names of data. \n", - "    Can be a character vector, creating multiple columns, if names_sep \n", - "    or names_pattern is provided. In this case, there are two special \n", - "    values you can take advantage of: \n", - "\n", - "    - None will discard that component of the name. \n", - "\n", - "    - .value indicates that component of the name defines the name of \n", - "      the column containing the cell values, overriding values_to. \n", - "\n", - "  `names_prefix`: A regular expression used to remove matching text from \n", - "    the start of each variable name. \n", - "\n", - "  `names_sep`: and \n", - "  `names_pattern`: If names_to contains multiple values, \n", - "    these arguments control how the column name is broken up. \n", - "    names_sep takes the same specification as separate(), and \n", - "    can either be a numeric vector (specifying positions to break on), \n", - "    or a single string (specifying a regular expression to split on). \n", - "\n", - "  `names_pattern`: takes the same specification as extract(), \n", - "    a regular expression containing matching groups (()). \n", - "\n", - "  `names_ptypes`: and \n", - "  `values_ptypes`: A list of column name-prototype pairs. \n", - "    A prototype (or ptype for short) is a zero-length vector \n", - "    (like integer() or numeric()) that defines the type, class, and \n", - "    attributes of a vector. Use these arguments if you want to confirm \n", - "    that the created columns are the types that you expect. \n", - "    Note that if you want to change (instead of confirm) the types \n", - "    of specific columns, you should use names_transform or \n", - "    values_transform instead. \n", - "\n", - "  `names_transform`: and \n", - "  `values_transform`: A list of column name-function pairs. \n", - "    Use these arguments if you need to change the types of \n", - "    specific columns. For example, \n", - "    names_transform = dict(week = as.integer) would convert a \n", - "    character variable called week to an integer. \n", - "    If not specified, the type of the columns generated from names_to \n", - "    will be character, and the type of the variables generated from \n", - "    values_to will be the common type of the input columns used to \n", - "    generate them. \n", - "\n", - "  `names_repair`: Not supported yet. \n", - "  `values_to`: A string specifying the name of the column to create from \n", - "    the data stored in cell values. If names_to is a character \n", - "    containing the special .value sentinel, this value will be ignored, \n", - "    and the name of the value column will be derived from part of \n", - "    the existing column names. 
\n", - "\n", - "  `values_drop_na`: If TRUE, will drop rows that contain only NAs in \n", - "    the value_to column. This effectively converts explicit missing \n", - "    values to implicit missing values, and should generally be used \n", - "    only when missing values in data were created by its structure. \n", - "\n", - "##### Returns:\n", - "  The pivoted dataframe. \n" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "##### \"lengthens\" data, increasing the number of rows and\ndecreasing the number of columns. \n\nThe row order is a bit different from `tidyr` and `pandas.DataFrame.melt`. \n  >>> df = tibble(x=f[1:2], y=f[3:4]) \n  >>> pivot_longer(df, f[f.x:f.y]) \n  >>> # name value \n  >>> # 0 x 1 \n  >>> # 1 x 2 \n  >>> # 2 y 3 \n  >>> # 3 y 4 \n\nBut with `tidyr::pivot_longer`, the output will be: \n  >>> # # A tibble: 4 x 2 \n  >>> # name value \n  >>> # \n  >>> # 1 x 1 \n  >>> # 2 y 3 \n  >>> # 3 x 2 \n  >>> # 4 y 4 \n\n##### Args:\n  `_data`: A data frame to pivot. \n  `cols`: Columns to pivot into longer format. \n  `names_to`: A string specifying the name of the column to create from \n    the data stored in the column names of data. \n    Can be a character vector, creating multiple columns, if names_sep \n    or names_pattern is provided. In this case, there are two special \n    values you can take advantage of: \n\n    - `None`/`NA`/`NULL` will discard that component of the name.\n\n    - `.value`/`_value` indicates that component of the name defines\n      the name of the column containing the cell values, \n      overriding values_to. \n\n    - Different as `tidyr`: With `.value`/`_value`, if there are other\n      parts of the names to distinguish the groups, they must be \n      captured. For example, use `r'(\\w)_(\\d)'` to match `'a_1'` and \n      `['.value', NA]` to discard the suffix, instead of use \n      `r'(\\w)_\\d'` to match. \n\n  `names_prefix`: A regular expression used to remove matching text from \n    the start of each variable name. \n\n  `names_sep`: and \n  `names_pattern`: If names_to contains multiple values, \n    these arguments control how the column name is broken up. \n    names_sep takes the same specification as separate(), and \n    can either be a numeric vector (specifying positions to break on), \n    or a single string (specifying a regular expression to split on). \n\n  `names_pattern`: takes the same specification as extract(), \n    a regular expression containing matching groups (()). \n\n  `names_ptypes`: and \n  `values_ptypes`: A list of column name-prototype pairs. \n    A prototype (or ptype for short) is a zero-length vector \n    (like integer() or numeric()) that defines the type, class, and \n    attributes of a vector. Use these arguments if you want to confirm \n    that the created columns are the types that you expect. \n    Note that if you want to change (instead of confirm) the types \n    of specific columns, you should use names_transform or \n    values_transform instead. \n\n  `names_transform`: and \n  `values_transform`: A list of column name-function pairs. \n    Use these arguments if you need to change the types of \n    specific columns. For example, \n    names_transform = dict(week = as.integer) would convert a \n    character variable called week to an integer. 
\n    If not specified, the type of the columns generated from names_to \n    will be character, and the type of the variables generated from \n    values_to will be the common type of the input columns used to \n    generate them. \n\n  `names_repair`: Not supported yet. \n  `values_to`: A string specifying the name of the column to create from \n    the data stored in cell values. If names_to is a character \n    containing the special `.value`/`_value` sentinel, this value \n    will be ignored, and the name of the value column will be derived \n    from part of the existing column names. \n\n  `values_drop_na`: If TRUE, will drop rows that contain only NAs in \n    the value_to column. This effectively converts explicit missing \n    values to implicit missing values, and should generally be used \n    only when missing values in data were created by its structure. \n\n  `names_repair`: treatment of problematic column names: \n    - \"minimal\": No name repair or checks, beyond basic existence,\n\n    - \"unique\": Make sure names are unique and not empty,\n\n    - \"check_unique\": (default value), no name repair,\n      but check they are unique, \n\n    - \"universal\": Make the names unique and syntactic\n\n    - a function: apply custom name repair\n\n  `_base0`: Whether `cols` are 0-based if given by indexes \n    If not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  The pivoted dataframe. \n" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], "source": [ @@ -138,296 +62,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
religion<$10k$10-20k$20-30k$30-40k$40-50k$50-75k$75-100k$100-150k>150kDon't know/refused
0Agnostic27346081761371221098496
1Atheist12273752357073597476
2Buddhist27213034335862395354
3Catholic41861773267063811169497926331489
4Don’t know/refused151415111035211718116
5Evangelical Prot575869106498288114869497234141529
6Hindu1979113447485437
7Historically Black Prot2282442362381972231318178339
8Jehovah's Witness2027242421301511637
9Jewish1919252530956987151162
10Mainline Prot28949561965565111079397536341328
11Mormon294048515611285494269
12Muslim67910923168622
13Orthodox13172332324738424673
14Other Christian971113131418141218
15Other Faiths20334046496346404171
16Other World Religions5234273448
17Unaffiliated217299374365341528407321258597
\n", - "
" - ], "text/plain": [ " religion <$10k $10-20k $20-30k $30-40k $40-50k \\\n", "0 Agnostic 27 34 60 81 76 \n", @@ -468,11 +104,11 @@ "15 63 46 40 41 71 \n", "16 7 3 4 4 8 \n", "17 528 407 321 258 597 " - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
religion<$10k$10-20k$20-30k$30-40k$40-50k$50-75k$75-100k$100-150k>150kDon't know/refused
0Agnostic27346081761371221098496
1Atheist12273752357073597476
2Buddhist27213034335862395354
3Catholic41861773267063811169497926331489
4Don’t know/refused151415111035211718116
5Evangelical Prot575869106498288114869497234141529
6Hindu1979113447485437
7Historically Black Prot2282442362381972231318178339
8Jehovah's Witness2027242421301511637
9Jewish1919252530956987151162
10Mainline Prot28949561965565111079397536341328
11Mormon294048515611285494269
12Muslim67910923168622
13Orthodox13172332324738424673
14Other Christian971113131418141218
15Other Faiths20334046496346404171
16Other World Religions5234273448
17Unaffiliated217299374365341528407321258597
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], "source": [ @@ -493,103 +129,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
religionincomecount
0Agnostic<$10k27
1Atheist<$10k12
2Buddhist<$10k27
3Catholic<$10k418
4Don’t know/refused<$10k15
............
175OrthodoxDon't know/refused73
176Other ChristianDon't know/refused18
177Other FaithsDon't know/refused71
178Other World ReligionsDon't know/refused8
179UnaffiliatedDon't know/refused597
\n", - "

180 rows × 3 columns

\n", - "
" - ], "text/plain": [ " religion income count\n", "0 Agnostic <$10k 27\n", @@ -605,11 +146,11 @@ "179 Unaffiliated Don't know/refused 597\n", "\n", "[180 rows x 3 columns]" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
religionincomecount
0Agnostic<$10k27
1Atheist<$10k12
2Buddhist<$10k27
3Catholic<$10k418
4Don’t know/refused<$10k15
............
175OrthodoxDon't know/refused73
176Other ChristianDon't know/refused18
177Other FaithsDon't know/refused71
178Other World ReligionsDon't know/refused8
179UnaffiliatedDon't know/refused597
\n

180 rows × 3 columns

\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], "source": [ @@ -631,319 +172,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
artisttrackdate.enteredwk1wk2wk3wk4wk5wk6wk7...wk67wk68wk69wk70wk71wk72wk73wk74wk75wk76
02 PacBaby Don't Cry (Keep...2000-02-268782.072.077.087.094.099.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
12Ge+herThe Hardest Part Of ...2000-09-029187.092.0NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
23 Doors DownKryptonite2000-04-088170.068.067.066.057.054.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
33 Doors DownLoser2000-10-217676.072.069.067.065.055.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4504 BoyzWobble Wobble2000-04-155734.025.017.017.031.036.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
312Yankee GreyAnother Nine Minutes2000-04-298683.077.074.083.079.088.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
313Yearwood, TrishaReal Live Woman2000-04-018583.083.082.081.091.0NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
314Ying Yang TwinsWhistle While You Tw...2000-03-189594.091.085.084.078.074.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
315Zombie NationKernkraft 4002000-09-029999.0NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
316matchbox twentyBent2000-04-296037.029.024.022.021.018.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", - "

317 rows × 79 columns

\n", - "
" - ], "text/plain": [ " artist track date.entered wk1 wk2 wk3 \\\n", "0 2 Pac Baby Don't Cry (Keep... 2000-02-26 87 82.0 72.0 \n", @@ -985,11 +215,11 @@ "316 NaN NaN NaN \n", "\n", "[317 rows x 79 columns]" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
artisttrackdate.enteredwk1wk2wk3wk4wk5wk6wk7...wk67wk68wk69wk70wk71wk72wk73wk74wk75wk76
02 PacBaby Don't Cry (Keep...2000-02-268782.072.077.087.094.099.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
12Ge+herThe Hardest Part Of ...2000-09-029187.092.0NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
23 Doors DownKryptonite2000-04-088170.068.067.066.057.054.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
33 Doors DownLoser2000-10-217676.072.069.067.065.055.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4504 BoyzWobble Wobble2000-04-155734.025.017.017.031.036.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
..................................................................
312Yankee GreyAnother Nine Minutes2000-04-298683.077.074.083.079.088.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
313Yearwood, TrishaReal Live Woman2000-04-018583.083.082.081.091.0NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
314Ying Yang TwinsWhistle While You Tw...2000-03-189594.091.085.084.078.074.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
315Zombie NationKernkraft 4002000-09-029999.0NaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
316matchbox twentyBent2000-04-296037.029.024.022.021.018.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n

317 rows × 79 columns

\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], "source": [ @@ -1010,147 +240,28 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
artisttrackdate.enteredweekrank
02 PacBaby Don't Cry (Keep...2000-02-26187.0
12Ge+herThe Hardest Part Of ...2000-09-02191.0
23 Doors DownKryptonite2000-04-08181.0
33 Doors DownLoser2000-10-21176.0
4504 BoyzWobble Wobble2000-04-15157.0
..................
19716CreedHigher1999-09-116350.0
19833LonestarAmazed1999-06-056345.0
20033CreedHigher1999-09-116450.0
20150LonestarAmazed1999-06-056450.0
20350CreedHigher1999-09-116549.0
\n", - "

5307 rows × 5 columns

\n", - "
" - ], "text/plain": [ - " artist track date.entered week rank\n", - "0 2 Pac Baby Don't Cry (Keep... 2000-02-26 1 87.0\n", - "1 2Ge+her The Hardest Part Of ... 2000-09-02 1 91.0\n", - "2 3 Doors Down Kryptonite 2000-04-08 1 81.0\n", - "3 3 Doors Down Loser 2000-10-21 1 76.0\n", - "4 504 Boyz Wobble Wobble 2000-04-15 1 57.0\n", - "... ... ... ... ... ...\n", - "19716 Creed Higher 1999-09-11 63 50.0\n", - "19833 Lonestar Amazed 1999-06-05 63 45.0\n", - "20033 Creed Higher 1999-09-11 64 50.0\n", - "20150 Lonestar Amazed 1999-06-05 64 50.0\n", - "20350 Creed Higher 1999-09-11 65 49.0\n", + " artist date.entered track week rank\n", + "0 2 Pac 2000-02-26 Baby Don't Cry (Keep... 1 87.0\n", + "1 2Ge+her 2000-09-02 The Hardest Part Of ... 1 91.0\n", + "2 3 Doors Down 2000-04-08 Kryptonite 1 81.0\n", + "3 3 Doors Down 2000-10-21 Loser 1 76.0\n", + "4 504 Boyz 2000-04-15 Wobble Wobble 1 57.0\n", + "... ... ... ... ... ...\n", + "19716 Creed 1999-09-11 Higher 63 50.0\n", + "19833 Lonestar 1999-06-05 Amazed 63 45.0\n", + "20033 Creed 1999-09-11 Higher 64 50.0\n", + "20150 Lonestar 1999-06-05 Amazed 64 50.0\n", + "20350 Creed 1999-09-11 Higher 65 49.0\n", "\n", "[5307 rows x 5 columns]" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
artistdate.enteredtrackweekrank
02 Pac2000-02-26Baby Don't Cry (Keep...187.0
12Ge+her2000-09-02The Hardest Part Of ...191.0
23 Doors Down2000-04-08Kryptonite181.0
33 Doors Down2000-10-21Loser176.0
4504 Boyz2000-04-15Wobble Wobble157.0
..................
19716Creed1999-09-11Higher6350.0
19833Lonestar1999-06-05Amazed6345.0
20033Creed1999-09-11Higher6450.0
20150Lonestar1999-06-05Amazed6450.0
20350Creed1999-09-11Higher6549.0
\n

5307 rows × 5 columns

\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], "source": [ @@ -1178,183 +289,28 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
countryiso2iso3yearcountdiagnosisgenderage
0AfghanistanAFAFG1980NaNspm014
1AfghanistanAFAFG1981NaNspm014
2AfghanistanAFAFG1982NaNspm014
3AfghanistanAFAFG1983NaNspm014
4AfghanistanAFAFG1984NaNspm014
...........................
405435ZimbabweZWZWE2009NaNrelf65
405436ZimbabweZWZWE2010NaNrelf65
405437ZimbabweZWZWE2011NaNrelf65
405438ZimbabweZWZWE2012NaNrelf65
405439ZimbabweZWZWE2013725.0relf65
\n", - "

405440 rows × 8 columns

\n", - "
" - ], "text/plain": [ - " country iso2 iso3 year count diagnosis gender age\n", - "0 Afghanistan AF AFG 1980 NaN sp m 014\n", - "1 Afghanistan AF AFG 1981 NaN sp m 014\n", - "2 Afghanistan AF AFG 1982 NaN sp m 014\n", - "3 Afghanistan AF AFG 1983 NaN sp m 014\n", - "4 Afghanistan AF AFG 1984 NaN sp m 014\n", - "... ... ... ... ... ... ... ... ...\n", - "405435 Zimbabwe ZW ZWE 2009 NaN rel f 65\n", - "405436 Zimbabwe ZW ZWE 2010 NaN rel f 65\n", - "405437 Zimbabwe ZW ZWE 2011 NaN rel f 65\n", - "405438 Zimbabwe ZW ZWE 2012 NaN rel f 65\n", - "405439 Zimbabwe ZW ZWE 2013 725.0 rel f 65\n", + " country iso2 iso3 year diagnosis gender age count\n", + "0 Afghanistan AF AFG 1980 sp m 014 NaN\n", + "1 Afghanistan AF AFG 1981 sp m 014 NaN\n", + "2 Afghanistan AF AFG 1982 sp m 014 NaN\n", + "3 Afghanistan AF AFG 1983 sp m 014 NaN\n", + "4 Afghanistan AF AFG 1984 sp m 014 NaN\n", + "... ... ... ... ... ... ... ... ...\n", + "405435 Zimbabwe ZW ZWE 2009 rel f 65 NaN\n", + "405436 Zimbabwe ZW ZWE 2010 rel f 65 NaN\n", + "405437 Zimbabwe ZW ZWE 2011 rel f 65 NaN\n", + "405438 Zimbabwe ZW ZWE 2012 rel f 65 NaN\n", + "405439 Zimbabwe ZW ZWE 2013 rel f 65 725.0\n", "\n", "[405440 rows x 8 columns]" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
countryiso2iso3yeardiagnosisgenderagecount
0AfghanistanAFAFG1980spm014NaN
1AfghanistanAFAFG1981spm014NaN
2AfghanistanAFAFG1982spm014NaN
3AfghanistanAFAFG1983spm014NaN
4AfghanistanAFAFG1984spm014NaN
...........................
405435ZimbabweZWZWE2009relf65NaN
405436ZimbabweZWZWE2010relf65NaN
405437ZimbabweZWZWE2011relf65NaN
405438ZimbabweZWZWE2012relf65NaN
405439ZimbabweZWZWE2013relf65725.0
\n

405440 rows × 8 columns

\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 6 } ], "source": [ @@ -1380,162 +336,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
x1x2x3x4y1y2y3y4
010101088.049.147.466.58
188886.958.146.775.76
213131387.588.7412.747.71
399988.818.777.118.84
411111188.339.267.818.47
514141489.968.108.847.04
666687.246.136.085.25
7444194.263.105.3912.50
8121212810.849.138.155.56
977784.827.266.427.91
1055585.684.745.736.89
\n", - "
" - ], "text/plain": [ " x1 x2 x3 x4 y1 y2 y3 y4\n", "0 10 10 10 8 8.04 9.14 7.46 6.58\n", @@ -1549,11 +351,11 @@ "8 12 12 12 8 10.84 9.13 8.15 5.56\n", "9 7 7 7 8 4.82 7.26 6.42 7.91\n", "10 5 5 5 8 5.68 4.74 5.73 6.89" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
x1x2x3x4y1y2y3y4
010101088.049.147.466.58
188886.958.146.775.76
213131387.588.7412.747.71
399988.818.777.118.84
411111188.339.267.818.47
514141489.968.108.847.04
666687.246.136.085.25
7444194.263.105.3912.50
8121212810.849.138.155.56
977784.827.266.427.91
1055585.684.745.736.89
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 7 } ], "source": [ @@ -1574,351 +376,59 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
.valuexyset
010.08.041
18.06.951
213.07.581
39.08.811
411.08.331
514.09.961
66.07.241
74.04.261
812.010.841
97.04.821
105.05.681
1110.09.142
128.08.142
1313.08.742
149.08.772
1511.09.262
1614.08.102
176.06.132
184.03.102
1912.09.132
207.07.262
215.04.742
2210.07.463
238.06.773
2413.012.743
259.07.113
2611.07.813
2714.08.843
286.06.083
294.05.393
3012.08.153
317.06.423
325.05.733
338.06.584
348.05.764
358.07.714
368.08.844
378.08.474
388.07.044
398.05.254
4019.012.504
418.05.564
428.07.914
438.06.894
\n", - "
" - ], "text/plain": [ - ".value x y set\n", - "0 10.0 8.04 1\n", - "1 8.0 6.95 1\n", - "2 13.0 7.58 1\n", - "3 9.0 8.81 1\n", - "4 11.0 8.33 1\n", - "5 14.0 9.96 1\n", - "6 6.0 7.24 1\n", - "7 4.0 4.26 1\n", - "8 12.0 10.84 1\n", - "9 7.0 4.82 1\n", - "10 5.0 5.68 1\n", - "11 10.0 9.14 2\n", - "12 8.0 8.14 2\n", - "13 13.0 8.74 2\n", - "14 9.0 8.77 2\n", - "15 11.0 9.26 2\n", - "16 14.0 8.10 2\n", - "17 6.0 6.13 2\n", - "18 4.0 3.10 2\n", - "19 12.0 9.13 2\n", - "20 7.0 7.26 2\n", - "21 5.0 4.74 2\n", - "22 10.0 7.46 3\n", - "23 8.0 6.77 3\n", - "24 13.0 12.74 3\n", - "25 9.0 7.11 3\n", - "26 11.0 7.81 3\n", - "27 14.0 8.84 3\n", - "28 6.0 6.08 3\n", - "29 4.0 5.39 3\n", - "30 12.0 8.15 3\n", - "31 7.0 6.42 3\n", - "32 5.0 5.73 3\n", - "33 8.0 6.58 4\n", - "34 8.0 5.76 4\n", - "35 8.0 7.71 4\n", - "36 8.0 8.84 4\n", - "37 8.0 8.47 4\n", - "38 8.0 7.04 4\n", - "39 8.0 5.25 4\n", - "40 19.0 12.50 4\n", - "41 8.0 5.56 4\n", - "42 8.0 7.91 4\n", - "43 8.0 6.89 4" - ] + " set x y\n", + "0 1 10.0 8.04\n", + "1 2 10.0 9.14\n", + "2 3 10.0 7.46\n", + "3 4 8.0 6.58\n", + "4 1 8.0 6.95\n", + "5 2 8.0 8.14\n", + "6 3 8.0 6.77\n", + "7 4 8.0 5.76\n", + "8 1 13.0 7.58\n", + "9 2 13.0 8.74\n", + "10 3 13.0 12.74\n", + "11 4 8.0 7.71\n", + "12 1 9.0 8.81\n", + "13 2 9.0 8.77\n", + "14 3 9.0 7.11\n", + "15 4 8.0 8.84\n", + "16 1 11.0 8.33\n", + "17 2 11.0 9.26\n", + "18 3 11.0 7.81\n", + "19 4 8.0 8.47\n", + "20 1 14.0 9.96\n", + "21 2 14.0 8.10\n", + "22 3 14.0 8.84\n", + "23 4 8.0 7.04\n", + "24 1 6.0 7.24\n", + "25 2 6.0 6.13\n", + "26 3 6.0 6.08\n", + "27 4 8.0 5.25\n", + "28 1 4.0 4.26\n", + "29 2 4.0 3.10\n", + "30 3 4.0 5.39\n", + "31 4 19.0 12.50\n", + "32 1 12.0 10.84\n", + "33 2 12.0 9.13\n", + "34 3 12.0 8.15\n", + "35 4 8.0 5.56\n", + "36 1 7.0 4.82\n", + "37 2 7.0 7.26\n", + "38 3 7.0 6.42\n", + "39 4 8.0 7.91\n", + "40 1 5.0 5.68\n", + "41 2 5.0 4.74\n", + "42 3 5.0 5.73\n", + "43 4 8.0 6.89" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
setxy
0110.08.04
1210.09.14
2310.07.46
348.06.58
418.06.95
528.08.14
638.06.77
748.05.76
8113.07.58
9213.08.74
10313.012.74
1148.07.71
1219.08.81
1329.08.77
1439.07.11
1548.08.84
16111.08.33
17211.09.26
18311.07.81
1948.08.47
20114.09.96
21214.08.10
22314.08.84
2348.07.04
2416.07.24
2526.06.13
2636.06.08
2748.05.25
2814.04.26
2924.03.10
3034.05.39
31419.012.50
32112.010.84
33212.09.13
34312.08.15
3548.05.56
3617.04.82
3727.07.26
3837.06.42
3948.07.91
4015.05.68
4125.04.74
4235.05.73
4348.06.89
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 8 } ], "source": [ @@ -1959,4 +469,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/notebooks/pivot_wider.ipynb b/docs/notebooks/pivot_wider.ipynb index eb2fbafb..9f96fc4a 100644 --- a/docs/notebooks/pivot_wider.ipynb +++ b/docs/notebooks/pivot_wider.ipynb @@ -14,79 +14,28 @@ }, "outputs": [ { + "output_type": "display_data", "data": { - "text/html": [ - "
Try this notebook on binder.
" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/html": "
Try this notebook on binder.
" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "### # pivot_wider " - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "### # pivot_wider " }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "##### \"widens\" data, increasing the number of columns and decreasing\n", - "the number of rows. \n", - "\n", - "##### Args:\n", - "  `_data`: A data frame to pivot. \n", - "  `id_cols`: A set of columns that uniquely identifies each observation. \n", - "    Defaults to all columns in data except for the columns specified \n", - "    in names_from and values_from. \n", - "\n", - "  `names_from`: and \n", - "  `values_from`: A pair of arguments describing which column \n", - "    (or columns) to get the name of the output column (names_from), \n", - "    and which column (or columns) to get the cell values from \n", - "    (values_from). \n", - "\n", - "  `names_prefix`: String added to the start of every variable name. \n", - "  `names_sep`: If names_from or values_from contains multiple variables, \n", - "    this will be used to join their values together into a single \n", - "    string to use as a column name. \n", - "\n", - "  `names_glue`: Instead of names_sep and names_prefix, you can supply \n", - "    a glue specification that uses the names_from columns \n", - "    (and special _value) to create custom column names. \n", - "\n", - "  `names_sort`: Should the column names be sorted? If FALSE, the default, \n", - "    column names are ordered by first appearance. \n", - "\n", - "  `names_repair`: todo \n", - "  `values_fill`: Optionally, a (scalar) value that specifies what \n", - "    each value should be filled in with when missing. \n", - "\n", - "  `values_fn`: Optionally, a function applied to the value in each cell \n", - "    in the output. You will typically use this when the combination \n", - "    of id_cols and value column does not uniquely identify \n", - "    an observation. \n", - "    This can be a dict you want to apply different aggregations to \n", - "    different value columns. \n", - "\n", - "##### Returns:\n", - "  The pivoted dataframe. \n" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "##### \"widens\" data, increasing the number of columns and decreasing\nthe number of rows. \n\n##### Args:\n  `_data`: A data frame to pivot. \n  `id_cols`: A set of columns that uniquely identifies each observation. \n    Defaults to all columns in data except for the columns specified \n    in names_from and values_from. \n\n  `names_from`: and \n  `values_from`: A pair of arguments describing which column \n    (or columns) to get the name of the output column (names_from), \n    and which column (or columns) to get the cell values from \n    (values_from). \n\n  `names_prefix`: String added to the start of every variable name. \n  `names_sep`: If names_from or values_from contains multiple variables, \n    this will be used to join their values together into a single \n    string to use as a column name. \n\n  `names_glue`: Instead of names_sep and names_prefix, you can supply \n    a glue specification that uses the names_from columns \n    (and special _value) to create custom column names. \n\n  `names_sort`: Should the column names be sorted? If FALSE, the default, \n    column names are ordered by first appearance. 
\n\n  `names_repair`: todo \n  `values_fill`: Optionally, a (scalar) value that specifies what \n    each value should be filled in with when missing. \n\n  `values_fn`: Optionally, a function applied to the value in each cell \n    in the output. You will typically use this when the combination \n    of `id_cols` and value column does not uniquely identify \n    an observation. \n    This can be a dict you want to apply different aggregations to \n    different value columns. \n    If not specified, will be `numpy.mean` \n\n  `_base0`: Whether `id_cols`, `names_from` and `values_from` \n    are 0-based if given by indexes. \n    If not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  The pivoted dataframe. \n" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], "source": [ @@ -113,103 +62,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
fishstationseen
04842Release1
14842I80_11
24842Lisbon1
34842Rstr1
44842Base_TD1
............
1094864Release1
1104864I80_11
1114865Release1
1124865I80_11
1134865Lisbon1
\n", - "

114 rows × 3 columns

\n", - "
" - ], "text/plain": [ " fish station seen\n", "0 4842 Release 1\n", @@ -225,11 +79,11 @@ "113 4865 Lisbon 1\n", "\n", "[114 rows x 3 columns]" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
fishstationseen
04842Release1
14842I80_11
24842Lisbon1
34842Rstr1
44842Base_TD1
............
1094864Release1
1104864I80_11
1114865Release1
1124865I80_11
1134865Lisbon1
\n

114 rows × 3 columns

\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], "source": [ @@ -250,351 +104,55 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
BCEBCE2BCWBCW2Base_TDI80_1LisbonMAEMAWReleaseRstr
fish
48421.01.01.01.01.01.01.01.01.01.01.0
48431.01.01.01.01.01.01.01.01.01.01.0
48441.01.01.01.01.01.01.01.01.01.01.0
4845NaNNaNNaNNaN1.01.01.0NaNNaN1.01.0
4847NaNNaNNaNNaNNaN1.01.0NaNNaN1.0NaN
4848NaNNaNNaNNaNNaN1.01.0NaNNaN1.01.0
4849NaNNaNNaNNaNNaN1.0NaNNaNNaN1.0NaN
48501.0NaN1.0NaN1.01.0NaNNaNNaN1.01.0
4851NaNNaNNaNNaNNaN1.0NaNNaNNaN1.0NaN
4854NaNNaNNaNNaNNaN1.0NaNNaNNaN1.0NaN
4855NaNNaNNaNNaN1.01.01.0NaNNaN1.01.0
48571.01.01.01.01.01.01.0NaNNaN1.01.0
48581.01.01.01.01.01.01.01.01.01.01.0
4859NaNNaNNaNNaN1.01.01.0NaNNaN1.01.0
48611.01.01.01.01.01.01.01.01.01.01.0
48621.01.01.01.01.01.01.0NaNNaN1.01.0
4863NaNNaNNaNNaNNaN1.0NaNNaNNaN1.0NaN
4864NaNNaNNaNNaNNaN1.0NaNNaNNaN1.0NaN
4865NaNNaNNaNNaNNaN1.01.0NaNNaN1.0NaN
\n", - "
" + " Rstr \n", + "0 1.0 \n", + "1 1.0 \n", + "2 1.0 \n", + "3 1.0 \n", + "4 NaN \n", + "5 1.0 \n", + "6 NaN \n", + "7 1.0 \n", + "8 NaN \n", + "9 NaN \n", + "10 1.0 \n", + "11 1.0 \n", + "12 1.0 \n", + "13 1.0 \n", + "14 1.0 \n", + "15 1.0 \n", + "16 NaN \n", + "17 NaN \n", + "18 NaN " ], - "text/plain": [ - " BCE BCE2 BCW BCW2 Base_TD I80_1 Lisbon MAE MAW Release Rstr\n", - "fish \n", - "4842 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0\n", - "4843 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0\n", - "4844 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0\n", - "4845 NaN NaN NaN NaN 1.0 1.0 1.0 NaN NaN 1.0 1.0\n", - "4847 NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN 1.0 NaN\n", - "4848 NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN 1.0 1.0\n", - "4849 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN 1.0 NaN\n", - "4850 1.0 NaN 1.0 NaN 1.0 1.0 NaN NaN NaN 1.0 1.0\n", - "4851 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN 1.0 NaN\n", - "4854 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN 1.0 NaN\n", - "4855 NaN NaN NaN NaN 1.0 1.0 1.0 NaN NaN 1.0 1.0\n", - "4857 1.0 1.0 1.0 1.0 1.0 1.0 1.0 NaN NaN 1.0 1.0\n", - "4858 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0\n", - "4859 NaN NaN NaN NaN 1.0 1.0 1.0 NaN NaN 1.0 1.0\n", - "4861 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0\n", - "4862 1.0 1.0 1.0 1.0 1.0 1.0 1.0 NaN NaN 1.0 1.0\n", - "4863 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN 1.0 NaN\n", - "4864 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN 1.0 NaN\n", - "4865 NaN NaN NaN NaN NaN 1.0 1.0 NaN NaN 1.0 NaN" - ] + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
fishBCEBCE2BCWBCW2Base_TDI80_1LisbonMAEMAWReleaseRstr
048421.01.01.01.01.01.01.01.01.01.01.0
148431.01.01.01.01.01.01.01.01.01.01.0
248441.01.01.01.01.01.01.01.01.01.01.0
34845NaNNaNNaNNaN1.01.01.0NaNNaN1.01.0
44847NaNNaNNaNNaNNaN1.01.0NaNNaN1.0NaN
54848NaNNaNNaNNaNNaN1.01.0NaNNaN1.01.0
64849NaNNaNNaNNaNNaN1.0NaNNaNNaN1.0NaN
748501.0NaN1.0NaN1.01.0NaNNaNNaN1.01.0
84851NaNNaNNaNNaNNaN1.0NaNNaNNaN1.0NaN
94854NaNNaNNaNNaNNaN1.0NaNNaNNaN1.0NaN
104855NaNNaNNaNNaN1.01.01.0NaNNaN1.01.0
1148571.01.01.01.01.01.01.0NaNNaN1.01.0
1248581.01.01.01.01.01.01.01.01.01.01.0
134859NaNNaNNaNNaN1.01.01.0NaNNaN1.01.0
1448611.01.01.01.01.01.01.01.01.01.01.0
1548621.01.01.01.01.01.01.0NaNNaN1.01.0
164863NaNNaNNaNNaNNaN1.0NaNNaNNaN1.0NaN
174864NaNNaNNaNNaNNaN1.0NaNNaNNaN1.0NaN
184865NaNNaNNaNNaNNaN1.01.0NaNNaN1.0NaN
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], "source": [ @@ -616,351 +174,55 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
BCEBCE2BCWBCW2Base_TDI80_1LisbonMAEMAWReleaseRstr
fish
484211111111111
484311111111111
484411111111111
484500001110011
484700000110010
484800000110011
484900000100010
485010101100011
485100000100010
485400000100010
485500001110011
485711111110011
485811111111111
485900001110011
486111111111111
486211111110011
486300000100010
486400000100010
486500000110010
\n", - "
" + " Rstr \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 0 \n", + "5 1 \n", + "6 0 \n", + "7 1 \n", + "8 0 \n", + "9 0 \n", + "10 1 \n", + "11 1 \n", + "12 1 \n", + "13 1 \n", + "14 1 \n", + "15 1 \n", + "16 0 \n", + "17 0 \n", + "18 0 " ], - "text/plain": [ - " BCE BCE2 BCW BCW2 Base_TD I80_1 Lisbon MAE MAW Release Rstr\n", - "fish \n", - "4842 1 1 1 1 1 1 1 1 1 1 1\n", - "4843 1 1 1 1 1 1 1 1 1 1 1\n", - "4844 1 1 1 1 1 1 1 1 1 1 1\n", - "4845 0 0 0 0 1 1 1 0 0 1 1\n", - "4847 0 0 0 0 0 1 1 0 0 1 0\n", - "4848 0 0 0 0 0 1 1 0 0 1 1\n", - "4849 0 0 0 0 0 1 0 0 0 1 0\n", - "4850 1 0 1 0 1 1 0 0 0 1 1\n", - "4851 0 0 0 0 0 1 0 0 0 1 0\n", - "4854 0 0 0 0 0 1 0 0 0 1 0\n", - "4855 0 0 0 0 1 1 1 0 0 1 1\n", - "4857 1 1 1 1 1 1 1 0 0 1 1\n", - "4858 1 1 1 1 1 1 1 1 1 1 1\n", - "4859 0 0 0 0 1 1 1 0 0 1 1\n", - "4861 1 1 1 1 1 1 1 1 1 1 1\n", - "4862 1 1 1 1 1 1 1 0 0 1 1\n", - "4863 0 0 0 0 0 1 0 0 0 1 0\n", - "4864 0 0 0 0 0 1 0 0 0 1 0\n", - "4865 0 0 0 0 0 1 1 0 0 1 0" - ] + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
fishBCEBCE2BCWBCW2Base_TDI80_1LisbonMAEMAWReleaseRstr
0484211111111111
1484311111111111
2484411111111111
3484500001110011
4484700000110010
5484800000110011
6484900000100010
7485010101100011
8485100000100010
9485400000100010
10485500001110011
11485711111110011
12485811111111111
13485900001110011
14486111111111111
15486211111110011
16486300000100010
17486400000100010
18486500000110010
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], "source": [ @@ -982,127 +244,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
GEOIDNAMEvariableestimatemoe
01Alabamaincome24476.0136.0
11Alabamarent747.03.0
22Alaskaincome32940.0508.0
32Alaskarent1200.013.0
44Arizonaincome27517.0148.0
..................
9955Wisconsinrent813.03.0
10056Wyomingincome30854.0342.0
10156Wyomingrent828.011.0
10272Puerto RicoincomeNaNNaN
10372Puerto Ricorent464.06.0
\n", - "

104 rows × 5 columns

\n", - "
" - ], "text/plain": [ " GEOID NAME variable estimate moe\n", "0 1 Alabama income 24476.0 136.0\n", @@ -1118,11 +261,11 @@ "103 72 Puerto Rico rent 464.0 6.0\n", "\n", "[104 rows x 5 columns]" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
GEOIDNAMEvariableestimatemoe
01Alabamaincome24476.0136.0
11Alabamarent747.03.0
22Alaskaincome32940.0508.0
32Alaskarent1200.013.0
44Arizonaincome27517.0148.0
..................
9955Wisconsinrent813.03.0
10056Wyomingincome30854.0342.0
10156Wyomingrent828.011.0
10272Puerto RicoincomeNaNNaN
10372Puerto Ricorent464.06.0
\n

104 rows × 5 columns

\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], "source": [ @@ -1143,577 +286,121 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
estimate_incomeestimate_rentmoe_incomemoe_rent
GEOIDNAME
1Alabama24476.0747.0136.03.0
2Alaska32940.01200.0508.013.0
4Arizona27517.0972.0148.04.0
5Arkansas23789.0709.0165.05.0
6California29454.01358.0109.03.0
8Colorado32401.01125.0109.05.0
9Connecticut35326.01123.0195.05.0
10Delaware31560.01076.0247.010.0
11District of Columbia43198.01424.0681.017.0
12Florida25952.01077.070.03.0
13Georgia27024.0927.0106.03.0
15Hawaii32453.01507.0218.018.0
16Idaho25298.0792.0208.07.0
17Illinois30684.0952.083.03.0
18Indiana27247.0782.0117.03.0
19Iowa30002.0740.0143.04.0
20Kansas29126.0801.0208.05.0
21Kentucky24702.0713.0159.04.0
22Louisiana25086.0825.0155.04.0
23Maine26841.0808.0187.07.0
24Maryland37147.01311.0152.05.0
25Massachusetts34498.01173.0199.05.0
26Michigan26987.0824.082.03.0
27Minnesota32734.0906.0189.04.0
28Mississippi22766.0740.0194.05.0
29Missouri26999.0784.0113.04.0
30Montana26249.0751.0206.09.0
31Nebraska30020.0773.0146.04.0
32Nevada29019.01017.0213.06.0
33New Hampshire33172.01052.0387.09.0
34New Jersey35075.01249.0148.04.0
35New Mexico24457.0809.0214.06.0
36New York31057.01194.069.03.0
37North Carolina26482.0844.0111.03.0
38North Dakota32336.0775.0245.09.0
39Ohio27435.0764.094.02.0
40Oklahoma26207.0766.0101.03.0
41Oregon27389.0988.0146.04.0
42Pennsylvania28923.0885.0119.03.0
44Rhode Island30210.0957.0259.06.0
45South Carolina25454.0836.0123.04.0
46South Dakota28821.0696.0276.07.0
47Tennessee25453.0808.0102.04.0
48Texas28063.0952.0110.02.0
49Utah27928.0948.0239.06.0
50Vermont29351.0945.0361.011.0
51Virginia32545.01166.0202.05.0
53Washington32318.01120.0113.04.0
54West Virginia23707.0681.0203.06.0
55Wisconsin29868.0813.0135.03.0
56Wyoming30854.0828.0342.011.0
72Puerto RicoNaN464.0NaN6.0
\n", - "
" - ], "text/plain": [ - " estimate_income estimate_rent moe_income \\\n", - "GEOID NAME \n", - "1 Alabama 24476.0 747.0 136.0 \n", - "2 Alaska 32940.0 1200.0 508.0 \n", - "4 Arizona 27517.0 972.0 148.0 \n", - "5 Arkansas 23789.0 709.0 165.0 \n", - "6 California 29454.0 1358.0 109.0 \n", - "8 Colorado 32401.0 1125.0 109.0 \n", - "9 Connecticut 35326.0 1123.0 195.0 \n", - "10 Delaware 31560.0 1076.0 247.0 \n", - "11 District of Columbia 43198.0 1424.0 681.0 \n", - "12 Florida 25952.0 1077.0 70.0 \n", - "13 Georgia 27024.0 927.0 106.0 \n", - "15 Hawaii 32453.0 1507.0 218.0 \n", - "16 Idaho 25298.0 792.0 208.0 \n", - "17 Illinois 30684.0 952.0 83.0 \n", - "18 Indiana 27247.0 782.0 117.0 \n", - "19 Iowa 30002.0 740.0 143.0 \n", - "20 Kansas 29126.0 801.0 208.0 \n", - "21 Kentucky 24702.0 713.0 159.0 \n", - "22 Louisiana 25086.0 825.0 155.0 \n", - "23 Maine 26841.0 808.0 187.0 \n", - "24 Maryland 37147.0 1311.0 152.0 \n", - "25 Massachusetts 34498.0 1173.0 199.0 \n", - "26 Michigan 26987.0 824.0 82.0 \n", - "27 Minnesota 32734.0 906.0 189.0 \n", - "28 Mississippi 22766.0 740.0 194.0 \n", - "29 Missouri 26999.0 784.0 113.0 \n", - "30 Montana 26249.0 751.0 206.0 \n", - "31 Nebraska 30020.0 773.0 146.0 \n", - "32 Nevada 29019.0 1017.0 213.0 \n", - "33 New Hampshire 33172.0 1052.0 387.0 \n", - "34 New Jersey 35075.0 1249.0 148.0 \n", - "35 New Mexico 24457.0 809.0 214.0 \n", - "36 New York 31057.0 1194.0 69.0 \n", - "37 North Carolina 26482.0 844.0 111.0 \n", - "38 North Dakota 32336.0 775.0 245.0 \n", - "39 Ohio 27435.0 764.0 94.0 \n", - "40 Oklahoma 26207.0 766.0 101.0 \n", - "41 Oregon 27389.0 988.0 146.0 \n", - "42 Pennsylvania 28923.0 885.0 119.0 \n", - "44 Rhode Island 30210.0 957.0 259.0 \n", - "45 South Carolina 25454.0 836.0 123.0 \n", - "46 South Dakota 28821.0 696.0 276.0 \n", - "47 Tennessee 25453.0 808.0 102.0 \n", - "48 Texas 28063.0 952.0 110.0 \n", - "49 Utah 27928.0 948.0 239.0 \n", - "50 Vermont 29351.0 945.0 361.0 \n", - "51 Virginia 32545.0 1166.0 202.0 \n", - "53 Washington 32318.0 1120.0 113.0 \n", - "54 West Virginia 23707.0 681.0 203.0 \n", - "55 Wisconsin 29868.0 813.0 135.0 \n", - "56 Wyoming 30854.0 828.0 342.0 \n", - "72 Puerto Rico NaN 464.0 NaN \n", + " GEOID NAME estimate_income estimate_rent moe_income \\\n", + "0 1 Alabama 24476.0 747.0 136.0 \n", + "1 2 Alaska 32940.0 1200.0 508.0 \n", + "2 4 Arizona 27517.0 972.0 148.0 \n", + "3 5 Arkansas 23789.0 709.0 165.0 \n", + "4 6 California 29454.0 1358.0 109.0 \n", + "5 8 Colorado 32401.0 1125.0 109.0 \n", + "6 9 Connecticut 35326.0 1123.0 195.0 \n", + "7 10 Delaware 31560.0 1076.0 247.0 \n", + "8 11 District of Columbia 43198.0 1424.0 681.0 \n", + "9 12 Florida 25952.0 1077.0 70.0 \n", + "10 13 Georgia 27024.0 927.0 106.0 \n", + "11 15 Hawaii 32453.0 1507.0 218.0 \n", + "12 16 Idaho 25298.0 792.0 208.0 \n", + "13 17 Illinois 30684.0 952.0 83.0 \n", + "14 18 Indiana 27247.0 782.0 117.0 \n", + "15 19 Iowa 30002.0 740.0 143.0 \n", + "16 20 Kansas 29126.0 801.0 208.0 \n", + "17 21 Kentucky 24702.0 713.0 159.0 \n", + "18 22 Louisiana 25086.0 825.0 155.0 \n", + "19 23 Maine 26841.0 808.0 187.0 \n", + "20 24 Maryland 37147.0 1311.0 152.0 \n", + "21 25 Massachusetts 34498.0 1173.0 199.0 \n", + "22 26 Michigan 26987.0 824.0 82.0 \n", + "23 27 Minnesota 32734.0 906.0 189.0 \n", + "24 28 Mississippi 22766.0 740.0 194.0 \n", + "25 29 Missouri 26999.0 784.0 113.0 \n", + "26 30 Montana 26249.0 751.0 206.0 \n", + "27 31 Nebraska 30020.0 773.0 146.0 \n", + "28 32 Nevada 29019.0 1017.0 213.0 \n", + "29 33 New Hampshire 33172.0 1052.0 
387.0 \n", + "30 34 New Jersey 35075.0 1249.0 148.0 \n", + "31 35 New Mexico 24457.0 809.0 214.0 \n", + "32 36 New York 31057.0 1194.0 69.0 \n", + "33 37 North Carolina 26482.0 844.0 111.0 \n", + "34 38 North Dakota 32336.0 775.0 245.0 \n", + "35 39 Ohio 27435.0 764.0 94.0 \n", + "36 40 Oklahoma 26207.0 766.0 101.0 \n", + "37 41 Oregon 27389.0 988.0 146.0 \n", + "38 42 Pennsylvania 28923.0 885.0 119.0 \n", + "39 44 Rhode Island 30210.0 957.0 259.0 \n", + "40 45 South Carolina 25454.0 836.0 123.0 \n", + "41 46 South Dakota 28821.0 696.0 276.0 \n", + "42 47 Tennessee 25453.0 808.0 102.0 \n", + "43 48 Texas 28063.0 952.0 110.0 \n", + "44 49 Utah 27928.0 948.0 239.0 \n", + "45 50 Vermont 29351.0 945.0 361.0 \n", + "46 51 Virginia 32545.0 1166.0 202.0 \n", + "47 53 Washington 32318.0 1120.0 113.0 \n", + "48 54 West Virginia 23707.0 681.0 203.0 \n", + "49 55 Wisconsin 29868.0 813.0 135.0 \n", + "50 56 Wyoming 30854.0 828.0 342.0 \n", + "51 72 Puerto Rico NaN 464.0 NaN \n", "\n", - " moe_rent \n", - "GEOID NAME \n", - "1 Alabama 3.0 \n", - "2 Alaska 13.0 \n", - "4 Arizona 4.0 \n", - "5 Arkansas 5.0 \n", - "6 California 3.0 \n", - "8 Colorado 5.0 \n", - "9 Connecticut 5.0 \n", - "10 Delaware 10.0 \n", - "11 District of Columbia 17.0 \n", - "12 Florida 3.0 \n", - "13 Georgia 3.0 \n", - "15 Hawaii 18.0 \n", - "16 Idaho 7.0 \n", - "17 Illinois 3.0 \n", - "18 Indiana 3.0 \n", - "19 Iowa 4.0 \n", - "20 Kansas 5.0 \n", - "21 Kentucky 4.0 \n", - "22 Louisiana 4.0 \n", - "23 Maine 7.0 \n", - "24 Maryland 5.0 \n", - "25 Massachusetts 5.0 \n", - "26 Michigan 3.0 \n", - "27 Minnesota 4.0 \n", - "28 Mississippi 5.0 \n", - "29 Missouri 4.0 \n", - "30 Montana 9.0 \n", - "31 Nebraska 4.0 \n", - "32 Nevada 6.0 \n", - "33 New Hampshire 9.0 \n", - "34 New Jersey 4.0 \n", - "35 New Mexico 6.0 \n", - "36 New York 3.0 \n", - "37 North Carolina 3.0 \n", - "38 North Dakota 9.0 \n", - "39 Ohio 2.0 \n", - "40 Oklahoma 3.0 \n", - "41 Oregon 4.0 \n", - "42 Pennsylvania 3.0 \n", - "44 Rhode Island 6.0 \n", - "45 South Carolina 4.0 \n", - "46 South Dakota 7.0 \n", - "47 Tennessee 4.0 \n", - "48 Texas 2.0 \n", - "49 Utah 6.0 \n", - "50 Vermont 11.0 \n", - "51 Virginia 5.0 \n", - "53 Washington 4.0 \n", - "54 West Virginia 6.0 \n", - "55 Wisconsin 3.0 \n", - "56 Wyoming 11.0 \n", - "72 Puerto Rico 6.0 " - ] + " moe_rent \n", + "0 3.0 \n", + "1 13.0 \n", + "2 4.0 \n", + "3 5.0 \n", + "4 3.0 \n", + "5 5.0 \n", + "6 5.0 \n", + "7 10.0 \n", + "8 17.0 \n", + "9 3.0 \n", + "10 3.0 \n", + "11 18.0 \n", + "12 7.0 \n", + "13 3.0 \n", + "14 3.0 \n", + "15 4.0 \n", + "16 5.0 \n", + "17 4.0 \n", + "18 4.0 \n", + "19 7.0 \n", + "20 5.0 \n", + "21 5.0 \n", + "22 3.0 \n", + "23 4.0 \n", + "24 5.0 \n", + "25 4.0 \n", + "26 9.0 \n", + "27 4.0 \n", + "28 6.0 \n", + "29 9.0 \n", + "30 4.0 \n", + "31 6.0 \n", + "32 3.0 \n", + "33 3.0 \n", + "34 9.0 \n", + "35 2.0 \n", + "36 3.0 \n", + "37 4.0 \n", + "38 3.0 \n", + "39 6.0 \n", + "40 4.0 \n", + "41 7.0 \n", + "42 4.0 \n", + "43 2.0 \n", + "44 6.0 \n", + "45 11.0 \n", + "46 5.0 \n", + "47 4.0 \n", + "48 6.0 \n", + "49 3.0 \n", + "50 11.0 \n", + "51 6.0 " + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
GEOIDNAMEestimate_incomeestimate_rentmoe_incomemoe_rent
01Alabama24476.0747.0136.03.0
12Alaska32940.01200.0508.013.0
24Arizona27517.0972.0148.04.0
35Arkansas23789.0709.0165.05.0
46California29454.01358.0109.03.0
58Colorado32401.01125.0109.05.0
69Connecticut35326.01123.0195.05.0
710Delaware31560.01076.0247.010.0
811District of Columbia43198.01424.0681.017.0
912Florida25952.01077.070.03.0
1013Georgia27024.0927.0106.03.0
1115Hawaii32453.01507.0218.018.0
1216Idaho25298.0792.0208.07.0
1317Illinois30684.0952.083.03.0
1418Indiana27247.0782.0117.03.0
1519Iowa30002.0740.0143.04.0
1620Kansas29126.0801.0208.05.0
1721Kentucky24702.0713.0159.04.0
1822Louisiana25086.0825.0155.04.0
1923Maine26841.0808.0187.07.0
2024Maryland37147.01311.0152.05.0
2125Massachusetts34498.01173.0199.05.0
2226Michigan26987.0824.082.03.0
2327Minnesota32734.0906.0189.04.0
2428Mississippi22766.0740.0194.05.0
2529Missouri26999.0784.0113.04.0
2630Montana26249.0751.0206.09.0
2731Nebraska30020.0773.0146.04.0
2832Nevada29019.01017.0213.06.0
2933New Hampshire33172.01052.0387.09.0
3034New Jersey35075.01249.0148.04.0
3135New Mexico24457.0809.0214.06.0
3236New York31057.01194.069.03.0
3337North Carolina26482.0844.0111.03.0
3438North Dakota32336.0775.0245.09.0
3539Ohio27435.0764.094.02.0
3640Oklahoma26207.0766.0101.03.0
3741Oregon27389.0988.0146.04.0
3842Pennsylvania28923.0885.0119.03.0
3944Rhode Island30210.0957.0259.06.0
4045South Carolina25454.0836.0123.04.0
4146South Dakota28821.0696.0276.07.0
4247Tennessee25453.0808.0102.04.0
4348Texas28063.0952.0110.02.0
4449Utah27928.0948.0239.06.0
4550Vermont29351.0945.0361.011.0
4651Virginia32545.01166.0202.05.0
4753Washington32318.01120.0113.04.0
4854West Virginia23707.0681.0203.06.0
4955Wisconsin29868.0813.0135.03.0
5056Wyoming30854.0828.0342.011.0
5172Puerto RicoNaN464.0NaN6.0
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 6 } ], "source": [ @@ -1735,577 +422,121 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
estimate.incomeestimate.rentmoe.incomemoe.rent
GEOIDNAME
1Alabama24476.0747.0136.03.0
2Alaska32940.01200.0508.013.0
4Arizona27517.0972.0148.04.0
5Arkansas23789.0709.0165.05.0
6California29454.01358.0109.03.0
8Colorado32401.01125.0109.05.0
9Connecticut35326.01123.0195.05.0
10Delaware31560.01076.0247.010.0
11District of Columbia43198.01424.0681.017.0
12Florida25952.01077.070.03.0
13Georgia27024.0927.0106.03.0
15Hawaii32453.01507.0218.018.0
16Idaho25298.0792.0208.07.0
17Illinois30684.0952.083.03.0
18Indiana27247.0782.0117.03.0
19Iowa30002.0740.0143.04.0
20Kansas29126.0801.0208.05.0
21Kentucky24702.0713.0159.04.0
22Louisiana25086.0825.0155.04.0
23Maine26841.0808.0187.07.0
24Maryland37147.01311.0152.05.0
25Massachusetts34498.01173.0199.05.0
26Michigan26987.0824.082.03.0
27Minnesota32734.0906.0189.04.0
28Mississippi22766.0740.0194.05.0
29Missouri26999.0784.0113.04.0
30Montana26249.0751.0206.09.0
31Nebraska30020.0773.0146.04.0
32Nevada29019.01017.0213.06.0
33New Hampshire33172.01052.0387.09.0
34New Jersey35075.01249.0148.04.0
35New Mexico24457.0809.0214.06.0
36New York31057.01194.069.03.0
37North Carolina26482.0844.0111.03.0
38North Dakota32336.0775.0245.09.0
39Ohio27435.0764.094.02.0
40Oklahoma26207.0766.0101.03.0
41Oregon27389.0988.0146.04.0
42Pennsylvania28923.0885.0119.03.0
44Rhode Island30210.0957.0259.06.0
45South Carolina25454.0836.0123.04.0
46South Dakota28821.0696.0276.07.0
47Tennessee25453.0808.0102.04.0
48Texas28063.0952.0110.02.0
49Utah27928.0948.0239.06.0
50Vermont29351.0945.0361.011.0
51Virginia32545.01166.0202.05.0
53Washington32318.01120.0113.04.0
54West Virginia23707.0681.0203.06.0
55Wisconsin29868.0813.0135.03.0
56Wyoming30854.0828.0342.011.0
72Puerto RicoNaN464.0NaN6.0
\n", - "
" - ], "text/plain": [ - " estimate.income estimate.rent moe.income \\\n", - "GEOID NAME \n", - "1 Alabama 24476.0 747.0 136.0 \n", - "2 Alaska 32940.0 1200.0 508.0 \n", - "4 Arizona 27517.0 972.0 148.0 \n", - "5 Arkansas 23789.0 709.0 165.0 \n", - "6 California 29454.0 1358.0 109.0 \n", - "8 Colorado 32401.0 1125.0 109.0 \n", - "9 Connecticut 35326.0 1123.0 195.0 \n", - "10 Delaware 31560.0 1076.0 247.0 \n", - "11 District of Columbia 43198.0 1424.0 681.0 \n", - "12 Florida 25952.0 1077.0 70.0 \n", - "13 Georgia 27024.0 927.0 106.0 \n", - "15 Hawaii 32453.0 1507.0 218.0 \n", - "16 Idaho 25298.0 792.0 208.0 \n", - "17 Illinois 30684.0 952.0 83.0 \n", - "18 Indiana 27247.0 782.0 117.0 \n", - "19 Iowa 30002.0 740.0 143.0 \n", - "20 Kansas 29126.0 801.0 208.0 \n", - "21 Kentucky 24702.0 713.0 159.0 \n", - "22 Louisiana 25086.0 825.0 155.0 \n", - "23 Maine 26841.0 808.0 187.0 \n", - "24 Maryland 37147.0 1311.0 152.0 \n", - "25 Massachusetts 34498.0 1173.0 199.0 \n", - "26 Michigan 26987.0 824.0 82.0 \n", - "27 Minnesota 32734.0 906.0 189.0 \n", - "28 Mississippi 22766.0 740.0 194.0 \n", - "29 Missouri 26999.0 784.0 113.0 \n", - "30 Montana 26249.0 751.0 206.0 \n", - "31 Nebraska 30020.0 773.0 146.0 \n", - "32 Nevada 29019.0 1017.0 213.0 \n", - "33 New Hampshire 33172.0 1052.0 387.0 \n", - "34 New Jersey 35075.0 1249.0 148.0 \n", - "35 New Mexico 24457.0 809.0 214.0 \n", - "36 New York 31057.0 1194.0 69.0 \n", - "37 North Carolina 26482.0 844.0 111.0 \n", - "38 North Dakota 32336.0 775.0 245.0 \n", - "39 Ohio 27435.0 764.0 94.0 \n", - "40 Oklahoma 26207.0 766.0 101.0 \n", - "41 Oregon 27389.0 988.0 146.0 \n", - "42 Pennsylvania 28923.0 885.0 119.0 \n", - "44 Rhode Island 30210.0 957.0 259.0 \n", - "45 South Carolina 25454.0 836.0 123.0 \n", - "46 South Dakota 28821.0 696.0 276.0 \n", - "47 Tennessee 25453.0 808.0 102.0 \n", - "48 Texas 28063.0 952.0 110.0 \n", - "49 Utah 27928.0 948.0 239.0 \n", - "50 Vermont 29351.0 945.0 361.0 \n", - "51 Virginia 32545.0 1166.0 202.0 \n", - "53 Washington 32318.0 1120.0 113.0 \n", - "54 West Virginia 23707.0 681.0 203.0 \n", - "55 Wisconsin 29868.0 813.0 135.0 \n", - "56 Wyoming 30854.0 828.0 342.0 \n", - "72 Puerto Rico NaN 464.0 NaN \n", + " GEOID NAME estimate.income estimate.rent moe.income \\\n", + "0 1 Alabama 24476.0 747.0 136.0 \n", + "1 2 Alaska 32940.0 1200.0 508.0 \n", + "2 4 Arizona 27517.0 972.0 148.0 \n", + "3 5 Arkansas 23789.0 709.0 165.0 \n", + "4 6 California 29454.0 1358.0 109.0 \n", + "5 8 Colorado 32401.0 1125.0 109.0 \n", + "6 9 Connecticut 35326.0 1123.0 195.0 \n", + "7 10 Delaware 31560.0 1076.0 247.0 \n", + "8 11 District of Columbia 43198.0 1424.0 681.0 \n", + "9 12 Florida 25952.0 1077.0 70.0 \n", + "10 13 Georgia 27024.0 927.0 106.0 \n", + "11 15 Hawaii 32453.0 1507.0 218.0 \n", + "12 16 Idaho 25298.0 792.0 208.0 \n", + "13 17 Illinois 30684.0 952.0 83.0 \n", + "14 18 Indiana 27247.0 782.0 117.0 \n", + "15 19 Iowa 30002.0 740.0 143.0 \n", + "16 20 Kansas 29126.0 801.0 208.0 \n", + "17 21 Kentucky 24702.0 713.0 159.0 \n", + "18 22 Louisiana 25086.0 825.0 155.0 \n", + "19 23 Maine 26841.0 808.0 187.0 \n", + "20 24 Maryland 37147.0 1311.0 152.0 \n", + "21 25 Massachusetts 34498.0 1173.0 199.0 \n", + "22 26 Michigan 26987.0 824.0 82.0 \n", + "23 27 Minnesota 32734.0 906.0 189.0 \n", + "24 28 Mississippi 22766.0 740.0 194.0 \n", + "25 29 Missouri 26999.0 784.0 113.0 \n", + "26 30 Montana 26249.0 751.0 206.0 \n", + "27 31 Nebraska 30020.0 773.0 146.0 \n", + "28 32 Nevada 29019.0 1017.0 213.0 \n", + "29 33 New Hampshire 33172.0 1052.0 
387.0 \n", + "30 34 New Jersey 35075.0 1249.0 148.0 \n", + "31 35 New Mexico 24457.0 809.0 214.0 \n", + "32 36 New York 31057.0 1194.0 69.0 \n", + "33 37 North Carolina 26482.0 844.0 111.0 \n", + "34 38 North Dakota 32336.0 775.0 245.0 \n", + "35 39 Ohio 27435.0 764.0 94.0 \n", + "36 40 Oklahoma 26207.0 766.0 101.0 \n", + "37 41 Oregon 27389.0 988.0 146.0 \n", + "38 42 Pennsylvania 28923.0 885.0 119.0 \n", + "39 44 Rhode Island 30210.0 957.0 259.0 \n", + "40 45 South Carolina 25454.0 836.0 123.0 \n", + "41 46 South Dakota 28821.0 696.0 276.0 \n", + "42 47 Tennessee 25453.0 808.0 102.0 \n", + "43 48 Texas 28063.0 952.0 110.0 \n", + "44 49 Utah 27928.0 948.0 239.0 \n", + "45 50 Vermont 29351.0 945.0 361.0 \n", + "46 51 Virginia 32545.0 1166.0 202.0 \n", + "47 53 Washington 32318.0 1120.0 113.0 \n", + "48 54 West Virginia 23707.0 681.0 203.0 \n", + "49 55 Wisconsin 29868.0 813.0 135.0 \n", + "50 56 Wyoming 30854.0 828.0 342.0 \n", + "51 72 Puerto Rico NaN 464.0 NaN \n", "\n", - " moe.rent \n", - "GEOID NAME \n", - "1 Alabama 3.0 \n", - "2 Alaska 13.0 \n", - "4 Arizona 4.0 \n", - "5 Arkansas 5.0 \n", - "6 California 3.0 \n", - "8 Colorado 5.0 \n", - "9 Connecticut 5.0 \n", - "10 Delaware 10.0 \n", - "11 District of Columbia 17.0 \n", - "12 Florida 3.0 \n", - "13 Georgia 3.0 \n", - "15 Hawaii 18.0 \n", - "16 Idaho 7.0 \n", - "17 Illinois 3.0 \n", - "18 Indiana 3.0 \n", - "19 Iowa 4.0 \n", - "20 Kansas 5.0 \n", - "21 Kentucky 4.0 \n", - "22 Louisiana 4.0 \n", - "23 Maine 7.0 \n", - "24 Maryland 5.0 \n", - "25 Massachusetts 5.0 \n", - "26 Michigan 3.0 \n", - "27 Minnesota 4.0 \n", - "28 Mississippi 5.0 \n", - "29 Missouri 4.0 \n", - "30 Montana 9.0 \n", - "31 Nebraska 4.0 \n", - "32 Nevada 6.0 \n", - "33 New Hampshire 9.0 \n", - "34 New Jersey 4.0 \n", - "35 New Mexico 6.0 \n", - "36 New York 3.0 \n", - "37 North Carolina 3.0 \n", - "38 North Dakota 9.0 \n", - "39 Ohio 2.0 \n", - "40 Oklahoma 3.0 \n", - "41 Oregon 4.0 \n", - "42 Pennsylvania 3.0 \n", - "44 Rhode Island 6.0 \n", - "45 South Carolina 4.0 \n", - "46 South Dakota 7.0 \n", - "47 Tennessee 4.0 \n", - "48 Texas 2.0 \n", - "49 Utah 6.0 \n", - "50 Vermont 11.0 \n", - "51 Virginia 5.0 \n", - "53 Washington 4.0 \n", - "54 West Virginia 6.0 \n", - "55 Wisconsin 3.0 \n", - "56 Wyoming 11.0 \n", - "72 Puerto Rico 6.0 " - ] + " moe.rent \n", + "0 3.0 \n", + "1 13.0 \n", + "2 4.0 \n", + "3 5.0 \n", + "4 3.0 \n", + "5 5.0 \n", + "6 5.0 \n", + "7 10.0 \n", + "8 17.0 \n", + "9 3.0 \n", + "10 3.0 \n", + "11 18.0 \n", + "12 7.0 \n", + "13 3.0 \n", + "14 3.0 \n", + "15 4.0 \n", + "16 5.0 \n", + "17 4.0 \n", + "18 4.0 \n", + "19 7.0 \n", + "20 5.0 \n", + "21 5.0 \n", + "22 3.0 \n", + "23 4.0 \n", + "24 5.0 \n", + "25 4.0 \n", + "26 9.0 \n", + "27 4.0 \n", + "28 6.0 \n", + "29 9.0 \n", + "30 4.0 \n", + "31 6.0 \n", + "32 3.0 \n", + "33 3.0 \n", + "34 9.0 \n", + "35 2.0 \n", + "36 3.0 \n", + "37 4.0 \n", + "38 3.0 \n", + "39 6.0 \n", + "40 4.0 \n", + "41 7.0 \n", + "42 4.0 \n", + "43 2.0 \n", + "44 6.0 \n", + "45 11.0 \n", + "46 5.0 \n", + "47 4.0 \n", + "48 6.0 \n", + "49 3.0 \n", + "50 11.0 \n", + "51 6.0 " + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
GEOIDNAMEestimate.incomeestimate.rentmoe.incomemoe.rent
01Alabama24476.0747.0136.03.0
12Alaska32940.01200.0508.013.0
24Arizona27517.0972.0148.04.0
35Arkansas23789.0709.0165.05.0
46California29454.01358.0109.03.0
58Colorado32401.01125.0109.05.0
69Connecticut35326.01123.0195.05.0
710Delaware31560.01076.0247.010.0
811District of Columbia43198.01424.0681.017.0
912Florida25952.01077.070.03.0
1013Georgia27024.0927.0106.03.0
1115Hawaii32453.01507.0218.018.0
1216Idaho25298.0792.0208.07.0
1317Illinois30684.0952.083.03.0
1418Indiana27247.0782.0117.03.0
1519Iowa30002.0740.0143.04.0
1620Kansas29126.0801.0208.05.0
1721Kentucky24702.0713.0159.04.0
1822Louisiana25086.0825.0155.04.0
1923Maine26841.0808.0187.07.0
2024Maryland37147.01311.0152.05.0
2125Massachusetts34498.01173.0199.05.0
2226Michigan26987.0824.082.03.0
2327Minnesota32734.0906.0189.04.0
2428Mississippi22766.0740.0194.05.0
2529Missouri26999.0784.0113.04.0
2630Montana26249.0751.0206.09.0
2731Nebraska30020.0773.0146.04.0
2832Nevada29019.01017.0213.06.0
2933New Hampshire33172.01052.0387.09.0
3034New Jersey35075.01249.0148.04.0
3135New Mexico24457.0809.0214.06.0
3236New York31057.01194.069.03.0
3337North Carolina26482.0844.0111.03.0
3438North Dakota32336.0775.0245.09.0
3539Ohio27435.0764.094.02.0
3640Oklahoma26207.0766.0101.03.0
3741Oregon27389.0988.0146.04.0
3842Pennsylvania28923.0885.0119.03.0
3944Rhode Island30210.0957.0259.06.0
4045South Carolina25454.0836.0123.04.0
4146South Dakota28821.0696.0276.07.0
4247Tennessee25453.0808.0102.04.0
4348Texas28063.0952.0110.02.0
4449Utah27928.0948.0239.06.0
4550Vermont29351.0945.0361.011.0
4651Virginia32545.01166.0202.05.0
4753Washington32318.01120.0113.04.0
4854West Virginia23707.0681.0203.06.0
4955Wisconsin29868.0813.0135.03.0
5056Wyoming30854.0828.0342.011.0
5172Puerto RicoNaN464.0NaN6.0
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 7 } ], "source": [ @@ -2331,577 +562,121 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
income_estimaterent_estimateincome_moerent_moe
GEOIDNAME
1Alabama24476.0747.0136.03.0
2Alaska32940.01200.0508.013.0
4Arizona27517.0972.0148.04.0
5Arkansas23789.0709.0165.05.0
6California29454.01358.0109.03.0
8Colorado32401.01125.0109.05.0
9Connecticut35326.01123.0195.05.0
10Delaware31560.01076.0247.010.0
11District of Columbia43198.01424.0681.017.0
12Florida25952.01077.070.03.0
13Georgia27024.0927.0106.03.0
15Hawaii32453.01507.0218.018.0
16Idaho25298.0792.0208.07.0
17Illinois30684.0952.083.03.0
18Indiana27247.0782.0117.03.0
19Iowa30002.0740.0143.04.0
20Kansas29126.0801.0208.05.0
21Kentucky24702.0713.0159.04.0
22Louisiana25086.0825.0155.04.0
23Maine26841.0808.0187.07.0
24Maryland37147.01311.0152.05.0
25Massachusetts34498.01173.0199.05.0
26Michigan26987.0824.082.03.0
27Minnesota32734.0906.0189.04.0
28Mississippi22766.0740.0194.05.0
29Missouri26999.0784.0113.04.0
30Montana26249.0751.0206.09.0
31Nebraska30020.0773.0146.04.0
32Nevada29019.01017.0213.06.0
33New Hampshire33172.01052.0387.09.0
34New Jersey35075.01249.0148.04.0
35New Mexico24457.0809.0214.06.0
36New York31057.01194.069.03.0
37North Carolina26482.0844.0111.03.0
38North Dakota32336.0775.0245.09.0
39Ohio27435.0764.094.02.0
40Oklahoma26207.0766.0101.03.0
41Oregon27389.0988.0146.04.0
42Pennsylvania28923.0885.0119.03.0
44Rhode Island30210.0957.0259.06.0
45South Carolina25454.0836.0123.04.0
46South Dakota28821.0696.0276.07.0
47Tennessee25453.0808.0102.04.0
48Texas28063.0952.0110.02.0
49Utah27928.0948.0239.06.0
50Vermont29351.0945.0361.011.0
51Virginia32545.01166.0202.05.0
53Washington32318.01120.0113.04.0
54West Virginia23707.0681.0203.06.0
55Wisconsin29868.0813.0135.03.0
56Wyoming30854.0828.0342.011.0
72Puerto RicoNaN464.0NaN6.0
\n", - "
" - ], "text/plain": [ - " income_estimate rent_estimate income_moe \\\n", - "GEOID NAME \n", - "1 Alabama 24476.0 747.0 136.0 \n", - "2 Alaska 32940.0 1200.0 508.0 \n", - "4 Arizona 27517.0 972.0 148.0 \n", - "5 Arkansas 23789.0 709.0 165.0 \n", - "6 California 29454.0 1358.0 109.0 \n", - "8 Colorado 32401.0 1125.0 109.0 \n", - "9 Connecticut 35326.0 1123.0 195.0 \n", - "10 Delaware 31560.0 1076.0 247.0 \n", - "11 District of Columbia 43198.0 1424.0 681.0 \n", - "12 Florida 25952.0 1077.0 70.0 \n", - "13 Georgia 27024.0 927.0 106.0 \n", - "15 Hawaii 32453.0 1507.0 218.0 \n", - "16 Idaho 25298.0 792.0 208.0 \n", - "17 Illinois 30684.0 952.0 83.0 \n", - "18 Indiana 27247.0 782.0 117.0 \n", - "19 Iowa 30002.0 740.0 143.0 \n", - "20 Kansas 29126.0 801.0 208.0 \n", - "21 Kentucky 24702.0 713.0 159.0 \n", - "22 Louisiana 25086.0 825.0 155.0 \n", - "23 Maine 26841.0 808.0 187.0 \n", - "24 Maryland 37147.0 1311.0 152.0 \n", - "25 Massachusetts 34498.0 1173.0 199.0 \n", - "26 Michigan 26987.0 824.0 82.0 \n", - "27 Minnesota 32734.0 906.0 189.0 \n", - "28 Mississippi 22766.0 740.0 194.0 \n", - "29 Missouri 26999.0 784.0 113.0 \n", - "30 Montana 26249.0 751.0 206.0 \n", - "31 Nebraska 30020.0 773.0 146.0 \n", - "32 Nevada 29019.0 1017.0 213.0 \n", - "33 New Hampshire 33172.0 1052.0 387.0 \n", - "34 New Jersey 35075.0 1249.0 148.0 \n", - "35 New Mexico 24457.0 809.0 214.0 \n", - "36 New York 31057.0 1194.0 69.0 \n", - "37 North Carolina 26482.0 844.0 111.0 \n", - "38 North Dakota 32336.0 775.0 245.0 \n", - "39 Ohio 27435.0 764.0 94.0 \n", - "40 Oklahoma 26207.0 766.0 101.0 \n", - "41 Oregon 27389.0 988.0 146.0 \n", - "42 Pennsylvania 28923.0 885.0 119.0 \n", - "44 Rhode Island 30210.0 957.0 259.0 \n", - "45 South Carolina 25454.0 836.0 123.0 \n", - "46 South Dakota 28821.0 696.0 276.0 \n", - "47 Tennessee 25453.0 808.0 102.0 \n", - "48 Texas 28063.0 952.0 110.0 \n", - "49 Utah 27928.0 948.0 239.0 \n", - "50 Vermont 29351.0 945.0 361.0 \n", - "51 Virginia 32545.0 1166.0 202.0 \n", - "53 Washington 32318.0 1120.0 113.0 \n", - "54 West Virginia 23707.0 681.0 203.0 \n", - "55 Wisconsin 29868.0 813.0 135.0 \n", - "56 Wyoming 30854.0 828.0 342.0 \n", - "72 Puerto Rico NaN 464.0 NaN \n", + " GEOID NAME income_estimate rent_estimate income_moe \\\n", + "0 1 Alabama 24476.0 747.0 136.0 \n", + "1 2 Alaska 32940.0 1200.0 508.0 \n", + "2 4 Arizona 27517.0 972.0 148.0 \n", + "3 5 Arkansas 23789.0 709.0 165.0 \n", + "4 6 California 29454.0 1358.0 109.0 \n", + "5 8 Colorado 32401.0 1125.0 109.0 \n", + "6 9 Connecticut 35326.0 1123.0 195.0 \n", + "7 10 Delaware 31560.0 1076.0 247.0 \n", + "8 11 District of Columbia 43198.0 1424.0 681.0 \n", + "9 12 Florida 25952.0 1077.0 70.0 \n", + "10 13 Georgia 27024.0 927.0 106.0 \n", + "11 15 Hawaii 32453.0 1507.0 218.0 \n", + "12 16 Idaho 25298.0 792.0 208.0 \n", + "13 17 Illinois 30684.0 952.0 83.0 \n", + "14 18 Indiana 27247.0 782.0 117.0 \n", + "15 19 Iowa 30002.0 740.0 143.0 \n", + "16 20 Kansas 29126.0 801.0 208.0 \n", + "17 21 Kentucky 24702.0 713.0 159.0 \n", + "18 22 Louisiana 25086.0 825.0 155.0 \n", + "19 23 Maine 26841.0 808.0 187.0 \n", + "20 24 Maryland 37147.0 1311.0 152.0 \n", + "21 25 Massachusetts 34498.0 1173.0 199.0 \n", + "22 26 Michigan 26987.0 824.0 82.0 \n", + "23 27 Minnesota 32734.0 906.0 189.0 \n", + "24 28 Mississippi 22766.0 740.0 194.0 \n", + "25 29 Missouri 26999.0 784.0 113.0 \n", + "26 30 Montana 26249.0 751.0 206.0 \n", + "27 31 Nebraska 30020.0 773.0 146.0 \n", + "28 32 Nevada 29019.0 1017.0 213.0 \n", + "29 33 New Hampshire 33172.0 1052.0 
387.0 \n", + "30 34 New Jersey 35075.0 1249.0 148.0 \n", + "31 35 New Mexico 24457.0 809.0 214.0 \n", + "32 36 New York 31057.0 1194.0 69.0 \n", + "33 37 North Carolina 26482.0 844.0 111.0 \n", + "34 38 North Dakota 32336.0 775.0 245.0 \n", + "35 39 Ohio 27435.0 764.0 94.0 \n", + "36 40 Oklahoma 26207.0 766.0 101.0 \n", + "37 41 Oregon 27389.0 988.0 146.0 \n", + "38 42 Pennsylvania 28923.0 885.0 119.0 \n", + "39 44 Rhode Island 30210.0 957.0 259.0 \n", + "40 45 South Carolina 25454.0 836.0 123.0 \n", + "41 46 South Dakota 28821.0 696.0 276.0 \n", + "42 47 Tennessee 25453.0 808.0 102.0 \n", + "43 48 Texas 28063.0 952.0 110.0 \n", + "44 49 Utah 27928.0 948.0 239.0 \n", + "45 50 Vermont 29351.0 945.0 361.0 \n", + "46 51 Virginia 32545.0 1166.0 202.0 \n", + "47 53 Washington 32318.0 1120.0 113.0 \n", + "48 54 West Virginia 23707.0 681.0 203.0 \n", + "49 55 Wisconsin 29868.0 813.0 135.0 \n", + "50 56 Wyoming 30854.0 828.0 342.0 \n", + "51 72 Puerto Rico NaN 464.0 NaN \n", "\n", - " rent_moe \n", - "GEOID NAME \n", - "1 Alabama 3.0 \n", - "2 Alaska 13.0 \n", - "4 Arizona 4.0 \n", - "5 Arkansas 5.0 \n", - "6 California 3.0 \n", - "8 Colorado 5.0 \n", - "9 Connecticut 5.0 \n", - "10 Delaware 10.0 \n", - "11 District of Columbia 17.0 \n", - "12 Florida 3.0 \n", - "13 Georgia 3.0 \n", - "15 Hawaii 18.0 \n", - "16 Idaho 7.0 \n", - "17 Illinois 3.0 \n", - "18 Indiana 3.0 \n", - "19 Iowa 4.0 \n", - "20 Kansas 5.0 \n", - "21 Kentucky 4.0 \n", - "22 Louisiana 4.0 \n", - "23 Maine 7.0 \n", - "24 Maryland 5.0 \n", - "25 Massachusetts 5.0 \n", - "26 Michigan 3.0 \n", - "27 Minnesota 4.0 \n", - "28 Mississippi 5.0 \n", - "29 Missouri 4.0 \n", - "30 Montana 9.0 \n", - "31 Nebraska 4.0 \n", - "32 Nevada 6.0 \n", - "33 New Hampshire 9.0 \n", - "34 New Jersey 4.0 \n", - "35 New Mexico 6.0 \n", - "36 New York 3.0 \n", - "37 North Carolina 3.0 \n", - "38 North Dakota 9.0 \n", - "39 Ohio 2.0 \n", - "40 Oklahoma 3.0 \n", - "41 Oregon 4.0 \n", - "42 Pennsylvania 3.0 \n", - "44 Rhode Island 6.0 \n", - "45 South Carolina 4.0 \n", - "46 South Dakota 7.0 \n", - "47 Tennessee 4.0 \n", - "48 Texas 2.0 \n", - "49 Utah 6.0 \n", - "50 Vermont 11.0 \n", - "51 Virginia 5.0 \n", - "53 Washington 4.0 \n", - "54 West Virginia 6.0 \n", - "55 Wisconsin 3.0 \n", - "56 Wyoming 11.0 \n", - "72 Puerto Rico 6.0 " - ] + " rent_moe \n", + "0 3.0 \n", + "1 13.0 \n", + "2 4.0 \n", + "3 5.0 \n", + "4 3.0 \n", + "5 5.0 \n", + "6 5.0 \n", + "7 10.0 \n", + "8 17.0 \n", + "9 3.0 \n", + "10 3.0 \n", + "11 18.0 \n", + "12 7.0 \n", + "13 3.0 \n", + "14 3.0 \n", + "15 4.0 \n", + "16 5.0 \n", + "17 4.0 \n", + "18 4.0 \n", + "19 7.0 \n", + "20 5.0 \n", + "21 5.0 \n", + "22 3.0 \n", + "23 4.0 \n", + "24 5.0 \n", + "25 4.0 \n", + "26 9.0 \n", + "27 4.0 \n", + "28 6.0 \n", + "29 9.0 \n", + "30 4.0 \n", + "31 6.0 \n", + "32 3.0 \n", + "33 3.0 \n", + "34 9.0 \n", + "35 2.0 \n", + "36 3.0 \n", + "37 4.0 \n", + "38 3.0 \n", + "39 6.0 \n", + "40 4.0 \n", + "41 7.0 \n", + "42 4.0 \n", + "43 2.0 \n", + "44 6.0 \n", + "45 11.0 \n", + "46 5.0 \n", + "47 4.0 \n", + "48 6.0 \n", + "49 3.0 \n", + "50 11.0 \n", + "51 6.0 " + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
GEOIDNAMEincome_estimaterent_estimateincome_moerent_moe
01Alabama24476.0747.0136.03.0
12Alaska32940.01200.0508.013.0
24Arizona27517.0972.0148.04.0
35Arkansas23789.0709.0165.05.0
46California29454.01358.0109.03.0
58Colorado32401.01125.0109.05.0
69Connecticut35326.01123.0195.05.0
710Delaware31560.01076.0247.010.0
811District of Columbia43198.01424.0681.017.0
912Florida25952.01077.070.03.0
1013Georgia27024.0927.0106.03.0
1115Hawaii32453.01507.0218.018.0
1216Idaho25298.0792.0208.07.0
1317Illinois30684.0952.083.03.0
1418Indiana27247.0782.0117.03.0
1519Iowa30002.0740.0143.04.0
1620Kansas29126.0801.0208.05.0
1721Kentucky24702.0713.0159.04.0
1822Louisiana25086.0825.0155.04.0
1923Maine26841.0808.0187.07.0
2024Maryland37147.01311.0152.05.0
2125Massachusetts34498.01173.0199.05.0
2226Michigan26987.0824.082.03.0
2327Minnesota32734.0906.0189.04.0
2428Mississippi22766.0740.0194.05.0
2529Missouri26999.0784.0113.04.0
2630Montana26249.0751.0206.09.0
2731Nebraska30020.0773.0146.04.0
2832Nevada29019.01017.0213.06.0
2933New Hampshire33172.01052.0387.09.0
3034New Jersey35075.01249.0148.04.0
3135New Mexico24457.0809.0214.06.0
3236New York31057.01194.069.03.0
3337North Carolina26482.0844.0111.03.0
3438North Dakota32336.0775.0245.09.0
3539Ohio27435.0764.094.02.0
3640Oklahoma26207.0766.0101.03.0
3741Oregon27389.0988.0146.04.0
3842Pennsylvania28923.0885.0119.03.0
3944Rhode Island30210.0957.0259.06.0
4045South Carolina25454.0836.0123.04.0
4146South Dakota28821.0696.0276.07.0
4247Tennessee25453.0808.0102.04.0
4348Texas28063.0952.0110.02.0
4449Utah27928.0948.0239.06.0
4550Vermont29351.0945.0361.011.0
4651Virginia32545.01166.0202.05.0
4753Washington32318.01120.0113.04.0
4854West Virginia23707.0681.0203.06.0
4955Wisconsin29868.0813.0135.03.0
5056Wyoming30854.0828.0342.011.0
5172Puerto RicoNaN464.0NaN6.0
\n
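The wide table above pairs each value of `variable` with both `estimate` and `moe`. The source cell is collapsed in this diff, so the following is only a sketch of the kind of call that yields such a shape; the argument names follow the tidyr API that `datar` mirrors, and the dataset import path is an assumption:

```python
from datar.all import *
from datar.datasets import us_rent_income  # import path assumed

# one output column per (variable, value-column) pair:
# income_estimate, rent_estimate, income_moe, rent_moe
us_rent_income >> pivot_wider(
    names_from=f.variable,
    values_from=c(f.estimate, f.moe),
)
```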
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 8 } ], "source": [ @@ -2927,360 +702,8 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
breakswooltension
026AL
130AL
254AL
325AL
470AL
552AL
651AL
726AL
867AL
918AM
1021AM
1129AM
1217AM
1312AM
1418AM
1535AM
1630AM
1736AM
1836AH
1921AH
2024AH
2118AH
2210AH
2343AH
2428AH
2515AH
2626AH
2727BL
2814BL
2929BL
3019BL
3129BL
3231BL
3341BL
3420BL
3544BL
3642BM
3726BM
3819BM
3916BM
4039BM
4128BM
4221BM
4339BM
4429BM
4520BH
4621BH
4724BH
4817BH
4913BH
5015BH
5115BH
5216BH
5328BH
\n", - "
" - ], "text/plain": [ " breaks wool tension\n", "0 26 A L\n", @@ -3337,11 +760,11 @@ "51 15 B H\n", "52 16 B H\n", "53 28 B H" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
breakswooltension
026AL
130AL
254AL
325AL
470AL
552AL
651AL
726AL
867AL
918AM
1021AM
1129AM
1217AM
1312AM
1418AM
1535AM
1630AM
1736AM
1836AH
1921AH
2024AH
2118AH
2210AH
2343AH
2428AH
2515AH
2626AH
2727BL
2814BL
2929BL
3019BL
3129BL
3231BL
3341BL
3420BL
3544BL
3642BM
3726BM
3819BM
3916BM
4039BM
4128BM
4221BM
4339BM
4429BM
4520BH
4621BH
4724BH
4817BH
4913BH
5015BH
5115BH
5216BH
5328BH
\n
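`warpbreaks` has nine replicate measurements for every `(wool, tension)` pair, so pivoting it wider needs an aggregation; the next cell does exactly that with `values_fn = mean`, of which only a fragment of the source survives in this diff. A sketch of that kind of call (argument names assumed to follow the tidyr API):

```python
from datar.all import *
from datar.datasets import warpbreaks  # import path assumed

# spread `breaks` across wool types A/B, averaging the replicates per tension
warpbreaks >> pivot_wider(
    names_from=f.wool,
    values_from=f.breaks,
    values_fn=mean,
)
```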
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 9 } ], "source": [ @@ -3362,66 +785,18 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AB
tension
H24.55555618.777778
L44.55555628.222222
M24.00000028.777778
\n", - "
" - ], "text/plain": [ - " A B\n", - "tension \n", - "H 24.555556 18.777778\n", - "L 44.555556 28.222222\n", - "M 24.000000 28.777778" - ] + " tension A B\n", + "0 H 24.555556 18.777778\n", + "1 L 44.555556 28.222222\n", + "2 M 24.000000 28.777778" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
tensionAB
0H24.55555618.777778
1L44.55555628.222222
2M24.00000028.777778
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 10 } ], "source": [ @@ -3432,6 +807,13 @@ " values_fn = mean\n", " )" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -3455,4 +837,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/notebooks/replace_na.ipynb b/docs/notebooks/replace_na.ipynb index e1fdf2ce..17e8fe60 100644 --- a/docs/notebooks/replace_na.ipynb +++ b/docs/notebooks/replace_na.ipynb @@ -14,54 +14,28 @@ }, "outputs": [ { + "output_type": "display_data", "data": { - "text/html": [ - "
Try this notebook on binder.
" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/html": "
Try this notebook on binder.
" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "### # replace_na " - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "### # replace_na " }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "##### Replace NA with a value\n", - "\n", - "This function can be also used not as a verb. As a function called as \n", - "an argument in a verb, _data is passed implicitly. Then one could \n", - "pass series_or_replace as the data to replace. \n", - "\n", - "##### Args:\n", - "  `_data`: The data piped in \n", - "  `series_or_replace`: When called as argument of a verb, this is the \n", - "    data to replace. Otherwise this is the replacement. \n", - "\n", - "  `replace`: The value to replace with \n", - "\n", - "##### Returns:\n", - "  Corresponding data with NAs replaced \n" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "##### Replace NA with a value\n\nThis function can be also used not as a verb. As a function called as \nan argument in a verb, data is passed implicitly. Then one could \npass data_or_replace as the data to replace. \n\n##### Args:\n  `data`: The data piped in \n  `data_or_replace`: When called as argument of a verb, this is the \n    data to replace. Otherwise this is the replacement. \n\n  `replace`: The value to replace with \n    Can only be a scalar or dict for data frame. \n    So replace NA with a list is not supported yet. \n\n##### Returns:\n  Corresponding data with NAs replaced \n" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], "source": [ @@ -87,60 +61,18 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
xy
01.0a
12.0unknown
20.0b
\n", - "
" - ], "text/plain": [ " x y\n", "0 1.0 a\n", "1 2.0 unknown\n", "2 0.0 b" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xy
01.0a
12.0unknown
20.0b
\n
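The frame above has its `NA`s replaced per column. A minimal sketch that reproduces it, using the dict form described in the docstring (the construction of `df` is assumed, since the source cell is collapsed in this diff):

```python
from datar.all import *

df = tibble(x=c(1, 2, NA), y=c("a", NA, "b"))

# dict maps column name -> replacement value
df >> replace_na({'x': 0, 'y': 'unknown'})

# or, as documented above, called as an argument inside a verb,
# with the data frame passed implicitly
df >> mutate(y=replace_na(f.y, 'unknown'))
```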
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], "source": [ @@ -162,60 +94,18 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
xy
01.0a
12.0NaN
20.0b
\n", - "
" - ], "text/plain": [ " x y\n", "0 1.0 a\n", "1 2.0 NaN\n", "2 0.0 b" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xy
01.0a
12.0NaN
20.0b
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], "source": [ @@ -236,6 +126,7 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "0 1.0\n", @@ -244,9 +135,8 @@ "Name: x, dtype: float64" ] }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], "source": [ @@ -267,6 +157,7 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "0 a\n", @@ -275,9 +166,8 @@ "Name: y, dtype: object" ] }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], "source": [ @@ -296,9 +186,25 @@ "shell.execute_reply": "2021-04-17T00:54:52.861672Z" } }, - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " z\n", + "0 [1, 2, 3, 4, 5]\n", + "1 5\n", + "2 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
z
0[1, 2, 3, 4, 5]
15
2[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
\n
" + }, + "metadata": {}, + "execution_count": 6 + } + ], "source": [ - "# nested not supported yet" + "df_list = tibble(z = [seq(1,5), NULL, seq(10,20)])\n", + "df_list >> replace_na({'z': 5}) # replace with a list not supported yet" ] } ], @@ -323,4 +229,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/notebooks/rows.ipynb b/docs/notebooks/rows.ipynb new file mode 100644 index 00000000..a79ca216 --- /dev/null +++ b/docs/notebooks/rows.ipynb @@ -0,0 +1,404 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python378jvsc74a57bd0c4cc73b080e063fcebb9afb794613be7caf4b26129562cba1382945a18cc49cc", + "display_name": "Python 3.7.8 64-bit ('base': conda)" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/html": "
Try this notebook on binder.
" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # rows_insert " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Adds new rows to a data frame\n\nArgument `in_place` not supported, as we always do data frames here. \n\n##### Args:\n  `x`: The seed data frame \n  `y`: The data frame with rows to be inserted into `x`. \n    - Key values in `y` must not occur in `x`\n\n    - `y` must have the same or a subset columns of `x`\n\n  `by`: A string or a list of strings giving the key columns. \n    The key values must uniquely identify each row \n    (i.e. each combination of key values occurs at most once), \n    and the key columns must exist in both x and y. \n    By default, we use the first column in y, since the first column \n    is a reasonable place to put an identifier variable. \n\n  `copy`: If `False`, do not copy data unnecessarily. \n    Original API does not support this. This argument will be \n    passed by to `pandas.concat()` as `copy` argument. \n\n##### Returns:\n  A data frame with `y` inserted into `x` \n" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # rows_update " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Modifies existing rows in a data frame\n\nSee Also: \n  [`rows_insert`](datar.dplyr.rows.rows_insert) \n\n##### Args:\n  `x`: The seed data frame \n  `y`: The data frame with rows to be inserted into `x`. \n    - Key values in `y` must not occur in `x`\n\n    - `y` must have the same or a subset columns of `x`\n\n  `by`: A string or a list of strings giving the key columns. \n    The key values must uniquely identify each row \n    (i.e. each combination of key values occurs at most once), \n    and the key columns must exist in both x and y. \n    By default, we use the first column in y, since the first column \n    is a reasonable place to put an identifier variable. \n\n  `copy`: Whether `x` should be copied and updated or updated directly \n\n##### Returns:\n  `x` with values of keys updated \n" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # rows_patch " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Works like `rows_update()` but only overwrites `NA` values.\n\nSee Also: \n  [`rows_insert`](datar.dplyr.rows.rows_insert) \n\n##### Args:\n  `x`: The seed data frame \n  `y`: The data frame with rows to be inserted into `x`. \n    - Key values in `y` must not occur in `x`\n\n    - `y` must have the same or a subset columns of `x`\n\n  `by`: A string or a list of strings giving the key columns. \n    The key values must uniquely identify each row \n    (i.e. each combination of key values occurs at most once), \n    and the key columns must exist in both x and y. \n    By default, we use the first column in y, since the first column \n    is a reasonable place to put an identifier variable. 
\n\n  `copy`: Whether `x` should be copied and updated or updated directly \n\n##### Returns:\n  `x` with values of keys updated \n" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # rows_upsert " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Inserts or updates depending on whether or not the\nkey value in `y` already exists in `x`. \n\nSee Also: \n  [`rows_insert`](datar.dplyr.rows.rows_insert) \n\n##### Args:\n  `x`: The seed data frame \n  `y`: The data frame with rows to be inserted into `x`. \n    - Key values in `y` must not occur in `x`\n\n    - `y` must have the same or a subset columns of `x`\n\n  `by`: A string or a list of strings giving the key columns. \n    The key values must uniquely identify each row \n    (i.e. each combination of key values occurs at most once), \n    and the key columns must exist in both x and y. \n    By default, we use the first column in y, since the first column \n    is a reasonable place to put an identifier variable. \n\n  `copy`: If `False`, do not copy data unnecessarily. \n    Original API does not support this. This argument will be \n    passed by to `pandas.concat()` as `copy` argument. \n\n##### Returns:\n  `x` with values of keys updated \n" + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # rows_delete " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Deletes rows; key values in `y` must exist in `x`.\n\nSee Also: \n  [`rows_insert`](datar.dplyr.rows.rows_insert) \n\n##### Args:\n  `x`: The seed data frame \n  `y`: The data frame with rows to be inserted into `x`. \n    - Key values in `y` must not occur in `x`\n\n    - `y` must have the same or a subset columns of `x`\n\n  `by`: A string or a list of strings giving the key columns. \n    The key values must uniquely identify each row \n    (i.e. each combination of key values occurs at most once), \n    and the key columns must exist in both x and y. \n    By default, we use the first column in y, since the first column \n    is a reasonable place to put an identifier variable. \n\n  `copy`: Whether `x` should be copied and deleted or deleted directly \n\n##### Returns:\n  `x` with values of keys deleted \n" + }, + "metadata": {} + } + ], + "source": [ + "from datar.all import *\n", + "\n", + "%run nb_helpers.py\n", + "nb_header(\n", + " rows_insert, \n", + " rows_update, \n", + " rows_patch, \n", + " rows_upsert, \n", + " rows_delete, \n", + " book='rows'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " a b c\n", + "0 1 a 0.5\n", + "1 2 b 1.5\n", + "2 3 NaN 2.5" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
abc
01a0.5
12b1.5
23NaN2.5
\n
" + }, + "metadata": {}, + "execution_count": 2 + } + ], + "source": [ + "data = tibble(a = seq(1, 3), b = c(letters[[0, 1]], NA), c = [.5, 1.5, 2.5])\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[2021-05-27 14:35:38][datar][ INFO] Matching, by='a'\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " a b c\n", + "0 1 a 0.5\n", + "1 2 b 1.5\n", + "2 3 NaN 2.5\n", + "3 4 z NaN" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
abc
01a0.5
12b1.5
23NaN2.5
34zNaN
\n
" + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "rows_insert(data, tibble(a = 4, b = \"z\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[2021-05-27 14:35:38][datar][ INFO] Matching, by='a'\n", + "[ValueError] Attempting to insert duplicate rows.\n" + ] + } + ], + "source": [ + "with try_catch():\n", + " rows_insert(data, tibble(a = 3, b = \"z\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[2021-05-27 14:36:10][datar][ INFO] Matching, by='a'\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " a b c\n", + "0 1 a 0.5\n", + "1 2 z 1.5\n", + "2 3 z 2.5" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
abc
01a0.5
12z1.5
23z2.5
\n
" + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "rows_update(data, tibble(a = [2,3], b = \"z\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " a b c\n", + "0 1 a 0.5\n", + "1 2 z 1.5\n", + "2 3 z 2.5" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
abc
01a0.5
12z1.5
23z2.5
\n
" + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "rows_update(data, tibble(b = \"z\", a = [2,3]), by = \"a\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[2021-05-27 14:36:54][datar][ INFO] Matching, by='a'\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " a b c\n", + "0 1 a 0.5\n", + "1 2 b 1.5\n", + "2 3 z 2.5" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
abc
01a0.5
12b1.5
23z2.5
\n
" + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "rows_patch(data, tibble(a = [2,3], b = \"z\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[2021-05-27 14:37:15][datar][ INFO] Matching, by='a'\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " a b c\n", + "0 1 a 0.5\n", + "1 2 z 1.5\n", + "2 3 z 2.5\n", + "3 4 z NaN" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
abc
01a0.5
12z1.5
23z2.5
34zNaN
\n
" + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "rows_upsert(data, tibble(a = seq(2, 4), b = \"z\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[2021-05-27 14:37:31][datar][ INFO] Matching, by='a'\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " a b c\n", + "0 1 a 0.5" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
abc
01a0.5
\n
" + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "rows_delete(data, tibble(a = [2, 3]))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[2021-05-27 14:37:48][datar][ INFO] Matching, by='a'\n", + "[2021-05-27 14:37:48][datar][ INFO] Ignoring extra columns: ['b']\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " a b c\n", + "0 1 a 0.5" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
abc
01a0.5
\n
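All of the `rows_*` calls above match rows on a key; when `by` is omitted, the first column of `y` is used, which is what the `Matching, by='a'` log lines show. A compact sketch combining the patterns demonstrated in this notebook:

```python
from datar.all import *

# the same seed frame as above
data = tibble(a=seq(1, 3), b=c(letters[[0, 1]], NA), c=[.5, 1.5, 2.5])

# update the row with a == 3 and insert a new row with a == 4 in one go
rows_upsert(data, tibble(a=[3, 4], b="z"), by="a")
```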
" + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "rows_delete(data, tibble(a = [2, 3], b = \"b\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[ValueError] Attempting to delete missing rows.\n" + ] + } + ], + "source": [ + "with try_catch():\n", + " rows_delete(data, tibble(a = [2,3], b = \"b\"), by = c(\"a\", \"b\"))" + ] + } + ] +} \ No newline at end of file diff --git a/docs/notebooks/separate.ipynb b/docs/notebooks/separate.ipynb index b8b7773d..d298a57e 100644 --- a/docs/notebooks/separate.ipynb +++ b/docs/notebooks/separate.ipynb @@ -14,75 +14,44 @@ }, "outputs": [ { + "output_type": "display_data", "data": { - "text/html": [ - "
Try this notebook on binder.
" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/html": "
Try this notebook on binder.
" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "### # separate " - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "### # separate " }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "##### Given either a regular expression or a vector of character positions,\n", - "turns a single character column into multiple columns. \n", - "\n", - "##### Args:\n", - "  `_data`: The dataframe \n", - "  `col`: Column name or position. \n", - "  `into`: Names of new variables to create as character vector. \n", - "    Use None to omit the variable in the output. \n", - "\n", - "  `sep`: Separator between columns. \n", - "    `TODO`: support index split (sep is an integer) \n", - "\n", - "  `remove`: If TRUE, remove input column from output data frame. \n", - "  `convert`: The universal type for the extracted columns or a dict for \n", - "    individual ones \n", - "\n", - "  `extra`: If sep is a character vector, this controls what happens when \n", - "    there are too many pieces. There are three valid options: \n", - "\n", - "    - \"warn\" (the default): emit a warning and drop extra values. \n", - "\n", - "    - \"drop\": drop any extra values without a warning. \n", - "\n", - "    - \"merge\": only splits at most length(into) times \n", - "\n", - "  `fill`: If sep is a character vector, this controls what happens when \n", - "    there are not enough pieces. There are three valid options: \n", - "\n", - "    - \"warn\" (the default): emit a warning and fill from the right \n", - "\n", - "    - \"right\": fill with missing values on the right \n", - "\n", - "    - \"left\": fill with missing values on the left \n", - "\n", - "##### Returns:\n", - "  Dataframe with separated columns. \n" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "##### Given either a regular expression or a vector of character positions,\nturns a single character column into multiple columns. \n\n##### Args:\n  `data`: The dataframe \n  `col`: Column name or position. \n  `into`: Names of new variables to create as character vector. \n    Use `None`/`NA`/`NULL` to omit the variable in the output. \n\n  `sep`: Separator between columns. \n    If str, `sep` is interpreted as a regular expression. \n    The default value is a regular expression that matches \n    any sequence of non-alphanumeric values. \n    If int, `sep` is interpreted as character positions to split at. \n\n  `remove`: If TRUE, remove input column from output data frame. \n  `convert`: The universal type for the extracted columns or a dict for \n    individual ones \n    Note that when given `TRUE`, `DataFrame.convert_dtypes()` is called, \n    but it will not convert `str` to other types \n    (For example, `'1'` to `1`). You have to specify the dtype yourself. \n\n  `extra`: If sep is a character vector, this controls what happens when \n    there are too many pieces. There are three valid options: \n\n    - \"warn\" (the default): emit a warning and drop extra values.\n\n    - \"drop\": drop any extra values without a warning.\n\n    - \"merge\": only splits at most length(into) times\n\n  `fill`: If sep is a character vector, this controls what happens when \n    there are not enough pieces. 
There are three valid options: \n\n    - \"warn\" (the default): emit a warning and fill from the right\n\n    - \"right\": fill with missing values on the right\n\n    - \"left\": fill with missing values on the left\n\n  `_base0`: Whether `col` is 0-based when given by index and Whether `sep` \n    is 0-based if given by position \n    If not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  Dataframe with separated columns. \n" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "### # separate_rows " + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": "", + "text/markdown": "##### Separates the values and places each one in its own row.\n\n##### Args:\n  `data`: The dataframe \n  `*columns`: The columns to separate on \n  `sep`: Separator between columns. \n  `convert`: The universal type for the extracted columns or a dict for \n    individual ones \n\n  `_base0`: Whether `columns` is 0-based when given by index \n    If not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  Dataframe with rows separated and repeated. \n" + }, + "metadata": {} } ], "source": [ @@ -91,7 +60,7 @@ "from datar.all import *\n", "\n", "%run nb_helpers.py\n", - "nb_header(separate)" + "nb_header(separate, separate_rows)" ] }, { @@ -108,66 +77,19 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AB
0xy
1xz
2yz
3NaNNaN
\n", - "
" - ], "text/plain": [ " A B\n", - "0 x y\n", - "1 x z\n", - "2 y z\n", - "3 NaN NaN" - ] + "0 NaN NaN\n", + "1 x y\n", + "2 x z\n", + "3 y z" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
AB
0NaNNaN
1xy
2xz
3yz
\n
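The docstring above also accepts an integer `sep`, interpreted as a character position to split at rather than a regular expression. That variant is not exercised in this notebook; the following is an untested sketch, and the 1-based position is an assumption based on the `_base0` option described above:

```python
from datar.all import *

# split each value between the first and second character (position assumed 1-based)
tibble(x=c("ab", "cd", "ef")) >> separate(f.x, c("left", "right"), sep=1)
```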
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], "source": [ @@ -189,61 +111,19 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
B
0y
1z
2z
3NaN
\n", - "
" - ], "text/plain": [ " B\n", - "0 y\n", - "1 z\n", + "0 NaN\n", + "1 y\n", "2 z\n", - "3 NaN" - ] + "3 z" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
B
0NaN
1y
2z
3z
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], "source": [ @@ -264,74 +144,27 @@ }, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ - "[2021-04-16 17:55:06][datar][WARNING] Expected 2 pieces. Additional pieces discarded in 1 rows [2].\n", - "[2021-04-16 17:55:06][datar][WARNING] Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [0].\n" + "[2021-06-07 14:46:59][datar][WARNING] Expected 2 pieces. Additional pieces discarded in 1 rows ['x y z'].\n", + "[2021-06-07 14:46:59][datar][WARNING] Expected 2 pieces. Missing pieces filled with `NA` in 1 rows ['x'].\n" ] }, { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
0xNaN
1xy
2xy
3NaNNaN
\n", - "
" - ], "text/plain": [ " a b\n", "0 x NaN\n", "1 x y\n", "2 x y\n", "3 NaN NaN" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ab
0xNaN
1xy
2xy
3NaNNaN
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], "source": [ @@ -353,66 +186,19 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
0xNaN
1xy
2xy
3NaNNaN
\n", - "
" - ], "text/plain": [ " a b\n", "0 x NaN\n", "1 x y\n", "2 x y\n", "3 NaN NaN" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ab
0xNaN
1xy
2xy
3NaNNaN
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], "source": [ @@ -433,66 +219,19 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
0NaNx
1xy
2xy z
3NaNNaN
\n", - "
" - ], "text/plain": [ " a b\n", "0 NaN x\n", "1 x y\n", "2 x y z\n", "3 NaN NaN" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ab
0NaNx
1xy
2xy z
3NaNNaN
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 6 } ], "source": [ @@ -513,78 +252,26 @@ }, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ - "[2021-04-16 17:55:06][datar][WARNING] Expected 3 pieces. Missing pieces filled with `NA` in 2 rows [0, 1].\n" + "[2021-06-07 14:47:11][datar][WARNING] Expected 3 pieces. Missing pieces filled with `NA` in 2 rows ['x', 'x y'].\n" ] }, { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
0xNaNNaN
1xyNaN
2xyz
3NaNNaNNaN
\n", - "
" - ], "text/plain": [ " a b c\n", "0 x NaN NaN\n", "1 x y NaN\n", "2 x y z\n", "3 NaN NaN NaN" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
abc
0xNaNNaN
1xyNaN
2xyz
3NaNNaNNaN
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 7 } ], "source": [ @@ -605,54 +292,17 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
keyvalue
0x123
1yerror: 7
\n", - "
" - ], "text/plain": [ " key value\n", "0 x 123\n", "1 y error: 7" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
keyvalue
0x123
1yerror: 7
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 8 } ], "source": [ @@ -674,66 +324,19 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AB
0xy
1xz
2yz
3NaNNaN
\n", - "
" - ], "text/plain": [ " A B\n", - "0 x y\n", - "1 x z\n", - "2 y z\n", - "3 NaN NaN" - ] + "0 NaN NaN\n", + "1 x y\n", + "2 x z\n", + "3 y z" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
AB
0NaNNaN
1xy
2xz
3yz
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 9 } ], "source": [ @@ -755,67 +358,15 @@ }, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ - "[2021-04-16 17:55:06][datar][WARNING] Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [3].\n" + "[2021-06-07 14:47:37][datar][WARNING] Expected 2 pieces. Missing pieces filled with `NA` in 1 rows ['z'].\n" ] }, { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
keyvalue
0x1
1x2
2y4
3zNaN
4NaNNaN
\n", - "
" - ], "text/plain": [ " key value\n", "0 x 1\n", @@ -823,11 +374,11 @@ "2 y 4\n", "3 z NaN\n", "4 NaN NaN" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
keyvalue
0x1
1x2
2y4
3zNaN
4NaNNaN
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 10 } ], "source": [ @@ -849,67 +400,15 @@ }, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ - "[2021-04-16 17:55:06][datar][WARNING] Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [3].\n" + "[2021-06-07 14:47:42][datar][WARNING] Expected 2 pieces. Missing pieces filled with `NA` in 1 rows ['z'].\n" ] }, { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
keyvalue
0x1.0
1x2.0
2y4.0
3zNaN
4NaNNaN
\n", - "
" - ], "text/plain": [ " key value\n", "0 x 1.0\n", @@ -917,48 +416,33 @@ "2 y 4.0\n", "3 z NaN\n", "4 NaN NaN" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
keyvalue
0x1.0
1x2.0
2y4.0
3zNaN
4NaNNaN
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df >> separate(f.x, c(\"key\",\"value\"), \":\", convert={'value': float}) " - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "analyzed-special", - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-17T00:55:06.839453Z", - "iopub.status.busy": "2021-04-17T00:55:06.838703Z", - "iopub.status.idle": "2021-04-17T00:55:06.842061Z", - "shell.execute_reply": "2021-04-17T00:55:06.841545Z" - } - }, - "outputs": [ + "execution_count": 11 + }, { + "output_type": "execute_result", "data": { "text/plain": [ - "dtype('float64')" + "key object\n", + "value float64\n", + "dtype: object" ] }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 11 } ], "source": [ - "_.value.dtype" + "df >> separate(f.x, c(\"key\",\"value\"), \":\", convert={'value': float}) \n", + "_.dtypes" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "equivalent-there", "metadata": { "execution": { @@ -979,7 +463,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "aggressive-transport", "metadata": { "execution": { @@ -991,118 +475,47 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
xyz
01a1
12d2
12e3
12f4
23g5
23h6
\n", - "
" - ], "text/plain": [ " x y z\n", "0 1 a 1\n", "1 2 d 2\n", - "1 2 e 3\n", - "1 2 f 4\n", - "2 3 g 5\n", - "2 3 h 6" - ] + "2 2 e 3\n", + "3 2 f 4\n", + "4 3 g 5\n", + "5 3 h 6" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xyz
01a1
12d2
22e3
32f4
43g5
53h6
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df >> separate_rows(f.y, f.z, convert={'z': int})" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "fluid-humidity", - "metadata": { - "execution": { - "iopub.execute_input": "2021-04-17T00:55:06.907962Z", - "iopub.status.busy": "2021-04-17T00:55:06.907361Z", - "iopub.status.idle": "2021-04-17T00:55:06.910894Z", - "shell.execute_reply": "2021-04-17T00:55:06.911239Z" - } - }, - "outputs": [ + "execution_count": 13 + }, { + "output_type": "execute_result", "data": { "text/plain": [ - "dtype('int64')" + "x int64\n", + "y object\n", + "z int64\n", + "dtype: object" ] }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 13 } ], "source": [ - "_.z.dtype" + "df >> separate_rows(f.y, f.z, convert={'z': int})\n", + "_.dtypes" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1126,4 +539,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/notebooks/uncount.ipynb b/docs/notebooks/uncount.ipynb index 3d3dc08a..61274bc3 100644 --- a/docs/notebooks/uncount.ipynb +++ b/docs/notebooks/uncount.ipynb @@ -14,52 +14,28 @@ }, "outputs": [ { + "output_type": "display_data", "data": { - "text/html": [ - "
Try this notebook on binder.
" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/html": "
Try this notebook on binder.
" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "### # uncount " - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "### # uncount " }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "##### Duplicating rows according to a weighting variable\n", - "\n", - "##### Args:\n", - "  `_data`: A data frame \n", - "  `weights`: A vector of weights. Evaluated in the context of data \n", - "  `_remove`: If TRUE, and weights is the name of a column in data, \n", - "    then this column is removed. \n", - "\n", - "  `_id`: Supply a string to create a new variable which gives a \n", - "    unique identifier for each created row (0-based). \n", - "\n", - "##### Returns:\n", - "  dataframe with rows repeated. \n" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "##### Duplicating rows according to a weighting variable\n\n##### Args:\n  `data`: A data frame \n  `weights`: A vector of weights. Evaluated in the context of data \n  `_remove`: If TRUE, and weights is the name of a column in data, \n    then this column is removed. \n\n  `_id`: Supply a string to create a new variable which gives a \n    unique identifier for each created row (0-based). \n\n  `_base0`: Whether the generated `_id` columns are 0-based. \n    If not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  dataframe with rows repeated. \n" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], "source": [ @@ -85,56 +61,18 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
x
0a
1b
1b
\n", - "
" - ], "text/plain": [ " x\n", "0 a\n", "1 b\n", - "1 b" - ] + "2 b" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
x
0a
1b
2b
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], "source": [ @@ -156,60 +94,18 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
xid
0a0
1b0
1b1
\n", - "
" - ], "text/plain": [ " x id\n", - "0 a 0\n", - "1 b 0\n", - "1 b 1" - ] + "0 a 1\n", + "1 b 1\n", + "2 b 2" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xid
0a1
1b1
2b2
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], "source": [ @@ -230,66 +126,19 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
xn
0a1
0a1
1b2
1b2
\n", - "
" - ], "text/plain": [ " x n\n", "0 a 1\n", - "0 a 1\n", - "1 b 2\n", - "1 b 2" - ] + "1 a 1\n", + "2 b 2\n", + "3 b 2" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xn
0a1
1a1
2b2
3b2
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], "source": [ @@ -310,60 +159,18 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
xn
0a1
0a1
1b2
\n", - "
" - ], "text/plain": [ " x n\n", "0 a 1\n", - "0 a 1\n", - "1 b 2" - ] + "1 a 1\n", + "2 b 2" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xn
0a1
1a1
2b2
\n
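Per the docstring, `weights` is evaluated in the context of the data, so constants and expressions work as well as column references. The source cells for the last two outputs are collapsed in this diff; the calls below are a sketch that reproduces those outputs, mirroring the corresponding tidyr examples:

```python
from datar.all import *

df = tibble(x=c("a", "b"), n=c(1, 2))

df >> uncount(2)        # duplicate every row twice, keeping `n`
df >> uncount(2 / f.n)  # rows with a larger `n` get proportionally fewer copies
```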
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], "source": [ @@ -400,4 +207,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/notebooks/unite.ipynb b/docs/notebooks/unite.ipynb index 65d4bfb2..21fae3c4 100644 --- a/docs/notebooks/unite.ipynb +++ b/docs/notebooks/unite.ipynb @@ -14,52 +14,28 @@ }, "outputs": [ { + "output_type": "display_data", "data": { - "text/html": [ - "
Try this notebook on binder.
" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/html": "
Try this notebook on binder.
" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "### # unite " - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "### # unite " }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "display_data", "data": { - "text/markdown": [ - "##### Unite multiple columns into one by pasting strings together\n", - "\n", - "##### Args:\n", - "  `data`: A data frame. \n", - "  `col`: The name of the new column, as a string or symbol. \n", - "  `*columns`: Columns to unite \n", - "  `sep`: Separator to use between values. \n", - "  `remove`: If True, remove input columns from output data frame. \n", - "  `na_rm`: If True, missing values will be remove prior to uniting \n", - "    each value. \n", - "\n", - "##### Returns:\n", - "  The dataframe with selected columns united \n" - ], - "text/plain": [ - "" - ] + "text/plain": "", + "text/markdown": "##### Unite multiple columns into one by pasting strings together\n\n##### Args:\n  `data`: A data frame. \n  `col`: The name of the new column, as a string or symbol. \n  `*columns`: Columns to unite \n  `sep`: Separator to use between values. \n  `remove`: If True, remove input columns from output data frame. \n  `na_rm`: If True, missing values will be remove prior to uniting \n    each value. \n\n  `_base0`: Whether `columns` is 0-based when given by index \n    If not provided, will use `datar.base.getOption('index.base.0')` \n\n##### Returns:\n  The dataframe with selected columns united \n" }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], "source": [ @@ -85,66 +61,19 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
xy
0ab
1aNaN
2NaNb
3NaNNaN
\n", - "
" - ], "text/plain": [ " x y\n", "0 a b\n", "1 a NaN\n", "2 NaN b\n", "3 NaN NaN" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xy
0ab
1aNaN
2NaNb
3NaNNaN
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 2 } ], "source": [ @@ -166,71 +95,19 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
xyz
0aba_b
1aNaNa_nan
2NaNbnan_b
3NaNNaNnan_nan
\n", - "
" - ], "text/plain": [ - " x y z\n", - "0 a b a_b\n", - "1 a NaN a_nan\n", - "2 NaN b nan_b\n", - "3 NaN NaN nan_nan" - ] + " z x y\n", + "0 a_b a b\n", + "1 a_nan a NaN\n", + "2 nan_b NaN b\n", + "3 nan_nan NaN NaN" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
zxy
0a_bab
1a_nanaNaN
2nan_bNaNb
3nan_nanNaNNaN
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 3 } ], "source": [ @@ -251,71 +128,19 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
xyz
0aba_b
1aNaNa
2NaNbb
3NaNNaN
\n", - "
" - ], "text/plain": [ - " x y z\n", - "0 a b a_b\n", - "1 a NaN a\n", - "2 NaN b b\n", - "3 NaN NaN " - ] + " z x y\n", + "0 a_b a b\n", + "1 a a NaN\n", + "2 b NaN b\n", + "3 NaN NaN" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
zxy
0a_bab
1aaNaN
2bNaNb
3NaNNaN
\n
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } ], "source": [ @@ -336,66 +161,19 @@ }, "outputs": [ { + "output_type": "execute_result", "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
xy
0ab
1anan
2nanb
3nannan
\n", - "
" - ], "text/plain": [ " x y\n", "0 a b\n", "1 a nan\n", "2 nan b\n", "3 nan nan" - ] + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
xy
0ab
1anan
2nanb
3nannan
\n
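Condensing the behaviour shown above: by default missing values are pasted into the united column as `nan`, while `na_rm=True` drops them before joining. The construction of `df` is assumed here (the source cells are collapsed in this diff), chosen to match the combinations shown:

```python
from datar.all import *

df = expand_grid(x=c("a", NA), y=c("b", NA))

df >> unite("z", f.x, f.y, remove=False)              # z: a_b, a_nan, nan_b, nan_nan
df >> unite("z", f.x, f.y, na_rm=True, remove=False)  # z: a_b, a, b, ''
```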
" }, - "execution_count": 1, "metadata": {}, - "output_type": "execute_result" + "execution_count": 5 } ], "source": [ @@ -426,4 +204,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docs/porting_rules.md b/docs/porting_rules.md new file mode 100644 index 00000000..294f7e18 --- /dev/null +++ b/docs/porting_rules.md @@ -0,0 +1,19 @@ +## API lifecycle + +## Argument naming + +## Extra arguments + +For example, `how` for `drop_na`. + +## `tibble` vs `DataFrame` + +## Data frame indexes and column names + +## Nested data frames + +## `list` in `R` vs `list` in `python` + +## `ptypes` + +## Grouped/rowwise data frame diff --git a/docs/reference-maps/ALL.md b/docs/reference-maps/ALL.md index 9b5a4469..a5d47822 100644 --- a/docs/reference-maps/ALL.md +++ b/docs/reference-maps/ALL.md @@ -3,6 +3,7 @@ |-|-|-| |#|#|#| |`dplyr`|APIs ported from `tidyverse/dplyr`|[:octicons-cross-reference-16:][2]| +|`tidyr`|APIs ported from `tidyverse/tidyr`|[:octicons-cross-reference-16:][4]| |`tibble`|APIs ported from `tidyverse/tibble`|[:octicons-cross-reference-16:][1]| |#|#|#| |`datasets`|Datasets collected from `tidyverse` or other related packages|[:octicons-cross-reference-16:][3]| @@ -10,3 +11,4 @@ [1]: ../tibble [2]: ../dplyr [3]: ../datasets +[4]: ../tidyr diff --git a/docs/reference-maps/dplyr.md b/docs/reference-maps/dplyr.md index 1bb3401d..6bd8de2f 100644 --- a/docs/reference-maps/dplyr.md +++ b/docs/reference-maps/dplyr.md @@ -94,7 +94,7 @@ See [datasets][125] |[group_trim()][130]|Trim grouping structure|[:material-notebook:][131]| |[group_split()][132]|Split data frame by groups|[:material-notebook:][133]| |[with_groups()][134]|Perform an operation with temporary groups|[:material-notebook:][135]| -|[_rows_insert()_][136] [_rows_update()_][137] [_rows_patch()_][138] [_rows_upsert()_][139] [_rows_delete()_][140]|Manipulate individual rows|[:material-notebook:][141]| +|[rows_insert()][136] [rows_update()][137] [rows_patch()][138] [rows_upsert()][139] [rows_delete()][140]|Manipulate individual rows|[:material-notebook:][141]| ### Questioning @@ -236,9 +236,9 @@ See [datasets][125] [133]: ../../notebooks/group_split [134]: ../../api/datar.dplyr.group_iter/#datar.dplyr.group_iter.with_groups [135]: ../../notebooks/with_groups -[136]: # -[137]: # -[138]: # -[139]: # -[140]: # -[141]: # +[136]: ../../api/datar.dplyr.rows/#datar.dplyr.rows.rows_insert +[137]: ../../api/datar.dplyr.rows/#datar.dplyr.rows.rows_update +[138]: ../../api/datar.dplyr.rows/#datar.dplyr.rows.rows_patch +[139]: ../../api/datar.dplyr.rows/#datar.dplyr.rows.rows_upsert +[140]: ../../api/datar.dplyr.rows/#datar.dplyr.rows.rows_delete +[141]: ../../notebooks/rows diff --git a/docs/reference-maps/tidyr.md b/docs/reference-maps/tidyr.md new file mode 100644 index 00000000..8cab8911 --- /dev/null +++ b/docs/reference-maps/tidyr.md @@ -0,0 +1,119 @@ + + +## Reference of `datar.dplyr` + +Reference map of `r-tidyverse-tidyr` can be found [here][1]. 
+ +**Legend:** + +|Sample|Status| +|---|---| +|[normal]()|API that is regularly ported| +|[strike-through]()|API that is not ported, or not an API originally| +|[**bold**]()|API that is unique in `datar`| +|[_italic_]()|Work in progress| + +### Pivoting + +|API|Description|Notebook example| +|---|---|---:| +|[pivot_longer()][26]|Pivot data from wide to long|[:material-notebook:][27]| +|[pivot_wider()][28]|Pivot data from long to wide|[:material-notebook:][29]| + +### Rectangling + +|API|Description|Notebook example| +|---|---|---:| +|_`hoist()`_ _`unnest_longer()`_ _`unnest_wider()`_ _`unnest_auto()`_|Rectangle a nested list into a tidy tibble|| + +### Nesting + +|API|Description|Notebook example| +|---|---|---:| +|[`nest()`][9] [`unnest()`][10]|Nest and unnest|[:material-notebook:][11]| + +### Character vectors + +|API|Description|Notebook example| +|---|---|---:| +|[`extract()`][22]|Extract a character column into multiple columns using regular expression groups|[:material-notebook:][23]| +|[`separate()`][30]|Separate a character column into multiple columns with a regular expression or numeric locations|[:material-notebook:][31]| +|[`separate_rows()`][34]|Separate a collapsed column into multiple rows|[:material-notebook:][35]| +|[`unite()`][36]|Unite multiple columns into one by pasting strings together|[:material-notebook:][37]| + +### Missing values + +|API|Description|Notebook example| +|---|---|---:| +|[`complete()`][18]|Complete a data frame with missing combinations of data|[:material-notebook:][19]| +|[`drop_na()`][20]|Drop rows containing missing values|[:material-notebook:][21]| +|[`expand()`][12] [`crossing()`][13] [`nesting()`][14]|Expand data frame to include all possible combinations of values|[:material-notebook:][15]| +|[`expand_grid()`][16]|Create a tibble from all combinations of inputs|[:material-notebook:][17]| +|[`fill()`][24]|Fill in missing values with previous or next value|[:material-notebook:][25]| +|[`full_seq()`][40]|Create the full sequence of values in a vector|[:material-notebook:][41]| +|[`replace_na()`][38]|Replace NAs with specified values|[:material-notebook:][39]| + +### Miscellanea + +|API|Description|Notebook example| +|---|---|---:| +|[`chop()`][3] [`unchop()`][4]|Chop and unchop|[:material-notebook:][5]| +|[`pack()`][6] [`unpack()`][7]|Pack and unpack|[:material-notebook:][8]| +|[`uncount()`][32]|"Uncount" a data frame|[:material-notebook:][33]| + +### Data + +See [datasets][2] + +[1]: https://tidyr.tidyverse.org/reference/index.html +[2]: ../datasets +[3]: ../../api/datar.tidyr.chop/#datar.tidyr.chop.chop +[4]: ../../api/datar.tidyr.chop/#datar.tidyr.chop.unchop +[5]: ../../notebooks/chop +[6]: ../../api/datar.tidyr.pack/#datar.tidyr.pack.pack +[7]: ../../api/datar.tidyr.pack/#datar.tidyr.pack.unpack +[8]: ../../notebooks/pack +[9]: ../../api/datar.tidyr.nest/#datar.tidyr.nest.nest +[10]: ../../api/datar.tidyr.nest/#datar.tidyr.nest.unnest +[11]: ../../notebooks/nest +[12]: ../../api/datar.tidyr.expand/#datar.tidyr.expand.expand +[13]: ../../api/datar.tidyr.expand/#datar.tidyr.expand.crossing +[14]: ../../api/datar.tidyr.expand/#datar.tidyr.expand.nesting +[15]: ../../notebooks/expand +[16]: ../../api/datar.tidyr.expand/#datar.tidyr.expand.expand_grid +[17]: ../../notebooks/expand_grid +[18]: ../../api/datar.tidyr.complete/#datar.tidyr.complete.complete +[19]: ../../notebooks/complete +[20]: ../../api/datar.tidyr.drop_na/#datar.tidyr.drop_na.drop_na +[21]: ../../notebooks/drop_na +[22]: ../../api/datar.tidyr.extract/#datar.tidyr.extract.extract +[23]: ../../notebooks/extract +[24]:
../../api/datar.tidyr.fill/#datar.tidyr.fill.fill +[25]: ../../notebooks/fill +[26]: ../../api/datar.tidyr.pivot_long/#datar.tidyr.pivot_long.pivot_longer +[27]: ../../notebooks/pivot_longer +[28]: ../../api/datar.tidyr.pivot_wide/#datar.tidyr.pivot_wide.pivot_wider +[29]: ../../notebooks/pivot_wider +[30]: ../../api/datar.tidyr.separate/#datar.tidyr.separate.separate +[31]: ../../notebooks/separate +[32]: ../../api/datar.tidyr.uncount/#datar.tidyr.uncount.uncount +[33]: ../../notebooks/uncount +[34]: ../../api/datar.tidyr.separate/#datar.tidyr.separate.separate_rows +[35]: ../../notebooks/separate +[36]: ../../api/datar.tidyr.unite/#datar.tidyr.unite.unite +[37]: ../../notebooks/unite +[38]: ../../api/datar.tidyr.replace_na/#datar.tidyr.replace_na.replace_na +[39]: ../../notebooks/replace_na +[40]: ../../api/datar.tidyr.funcs/#datar.tidyr.funcs.full_seq +[41]: ../../notebooks/full_seq diff --git a/mkdocs.yml b/mkdocs.yml index 08e2f31b..b289446d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -34,6 +34,7 @@ nav: - 'Import': 'import.md' - 'The f': 'f.md' - 'Piping vs regular calling': 'piping_vs_regular.md' + - 'Porting rules': 'porting_rules.md' - 'Indexing/Selection': 'indexing.md' - 'Datasets': 'datasets.md' - 'API': 'mkapi/api/datar' @@ -46,7 +47,9 @@ nav: 'between': 'notebooks/between.ipynb' 'bind': 'notebooks/bind.ipynb' 'case_when': 'notebooks/case_when.ipynb' + 'chop': 'notebooks/chop.ipynb' 'coalesce': 'notebooks/coalesce.ipynb' + 'complete': 'notebooks/complete.ipynb' 'context': 'notebooks/context.ipynb' 'count': 'notebooks/count.ipynb' 'cumall': 'notebooks/cumall.ipynb' @@ -72,8 +75,10 @@ nav: 'n_distinct': 'notebooks/n_distinct.ipynb' 'na_if': 'notebooks/na_if.ipynb' 'near': 'notebooks/near.ipynb' + 'nest': 'notebooks/nest.ipynb' 'nest-join': 'notebooks/nest-join.ipynb' 'nth': 'notebooks/nth.ipynb' + 'pack': 'notebooks/pack.ipynb' 'pivot_longer': 'notebooks/pivot_longer.ipynb' 'pivot_wider': 'notebooks/pivot_wider.ipynb' 'pull': 'notebooks/pull.ipynb' diff --git a/pyproject.toml b/pyproject.toml index 251dccc8..4c1808b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "datar" -version = "0.0.6" +version = "0.0.7" description = "Port of dplyr and other related R packages in python, using pipda." 
authors = ["pwwang "] readme = "README.md" diff --git a/setup.py b/setup.py index 5c2c4988..1671981f 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ setup( long_description=readme, name='datar', - version='0.0.6', + version='0.0.7', description='Port of dplyr and other related R packages in python, using pipda.', python_requires='==3.*,>=3.7.1', project_urls={"homepage": "https://github.com/pwwang/datar", diff --git a/tests/test_dplyr_rows.py b/tests/test_dplyr_rows.py new file mode 100644 index 00000000..a6b18358 --- /dev/null +++ b/tests/test_dplyr_rows.py @@ -0,0 +1,96 @@ +# tests grabbed from: +# https://github1s.com/tidyverse/dplyr/blob/master/tests/testthat/test-rows.R +import pytest +from pandas.testing import assert_frame_equal +from datar.all import * + +@pytest.fixture +def data(): + return tibble(a = seq(1,3), b = c(letters[[0,1]], NA), c = [0.5, 1.5, 2.5]) + +def test_rows_insert(data): + out = rows_insert(data, tibble(a = 4, b = "z"), by = "a") + exp = tibble(a = seq(1,4), b = c("a", "b", NA, "z"), c = c(0.5, 1.5, 2.5, NA)) + assert_frame_equal(out, exp) + + with pytest.raises(ValueError, match="insert duplicate"): + rows_insert(data, tibble(a = 3, b = "z"), by = "a") + +def test_rows_update(data): + out = rows_update(data, tibble(a = [2,3], b = "z"), by = "a") + exp = tibble(a = seq(1,3), b = c("a", "z", "z"), c = data.c) + assert_frame_equal(out, exp) + + with pytest.raises(ValueError, match="update missing"): + rows_update(data, tibble(a = [2,3], b = "z"), by = c("a", "b")) + + out = rows_update(data, tibble(b = "z", a = [2,3]), by = "a") + exp = tibble(a = seq(1,3), b = c("a", "z", "z"), c = data.c) + assert_frame_equal(out, exp) + +def test_rows_patch(data): + out = rows_patch(data, tibble(a = [2,3], b = "z"), by = "a") + exp = tibble(a = seq(1,3), b = c("a", "b", "z"), c = data.c) + assert_frame_equal(out, exp) + + with pytest.raises(ValueError, match="patch missing"): + rows_patch(data, tibble(a = [2, 3], b = "z"), by = c("a", "b")) + + out = rows_patch(data, tibble(b = "z", a = [2,3]), by = "a") + exp = tibble(a = seq(1,3), b = c("a", "b", "z"), c = data.c) + assert_frame_equal(out, exp) + +def test_rows_upsert(data): + out = rows_upsert(data, tibble(a = [2,3,4], b = "z"), by = "a") + exp = tibble(a = seq(1,4), b = c("a", "z", "z", "z"), c = c(data.c.values, NA)) + assert_frame_equal(out, exp) + +def test_rows_delete(data): + out = rows_delete(data, tibble(a=[2,3]), by="a") + assert_frame_equal(out, data.iloc[[0], :]) + + with pytest.raises(ValueError, match="delete missing"): + rows_delete(data, tibble(a=[2,3,4]), by="a") + + out = rows_delete(data, tibble(a = [2,3], b = "b"), by = "a") + assert_frame_equal(out, data.iloc[[0], :]) + + with pytest.raises(ValueError, match="delete missing"): + rows_delete(data, tibble(a = [2,3], b = "b"), by = c("a", "b")) + +def test_rows_errors(data): + # by must be string or strings + with pytest.raises(ValueError, match="must be a string"): + rows_delete(data, tibble(a = [2,3]), by=1) + + # Insert + with pytest.raises(ValueError): + rows_insert(data, tibble(a = 3, b = "z")) + + with pytest.raises(ValueError): + rows_insert(data.iloc[[0,0], ], tibble(a = 3)) + + with pytest.raises(ValueError): + rows_insert(data, tibble(a = 4, b = "z"), by = "e") + + with pytest.raises(ValueError): + rows_insert(data, tibble(d = 4)) + + # Update + with pytest.raises(ValueError): + rows_update(data, tibble(a = [2,3], b = "z"), by = c("a", "b")) + + # Variants: patch + with pytest.raises(ValueError): + rows_patch(data, tibble(a = [2,3], b = 
"z"), by = c("a", "b")) + + # Delete and truncate + with pytest.raises(ValueError): + rows_delete(data, tibble(a = [2,3,4])) + + with pytest.raises(ValueError): + rows_delete(data, tibble(a = [2,3], b = "b"), by = c("a", "b")) + + # works + # rows_delete(data, tibble(a = [2,3])) + # rows_delete(data, tibble(a = [2,3], b = "b")) diff --git a/tests/test_tibble.py b/tests/test_tibble.py index ee6554bc..daff443a 100644 --- a/tests/test_tibble.py +++ b/tests/test_tibble.py @@ -16,6 +16,13 @@ from datar.datasets import iris, mtcars from .conftest import assert_iterable_equal + +def test_mixed_numbering(): + df = tibble(a=f[1:5], b=seq(5), c=c(1,2,3,[4,5]), d=c(f[1:3], c(4, 5))) + exp = tibble(a=seq(1,5), b=f.a, c=f.a, d=f.a) + assert_frame_equal(df, exp) + + def test_correct_rows(): out = tibble(value=range(1,11)) >> nrow() assert out == 10 diff --git a/tests/test_tidyr_chop.py b/tests/test_tidyr_chop.py new file mode 100644 index 00000000..4a584cf0 --- /dev/null +++ b/tests/test_tidyr_chop.py @@ -0,0 +1,160 @@ +# tests grabbed from: +# https://github.com/tidyverse/tidyr/blob/master/tests/testthat/test-chop.R +import pytest +from pandas.testing import assert_frame_equal +from datar.all import * + +# chop -------------------------------------------------------------------- +def test_chop_multiple_columns(): + df = tibble(x=[1,1,2], a=[1,2,3], b=[1,2,3]) + out = df >> chop([f.a, f.b]) + + assert_frame_equal( + out, + tibble(x=[1,2], a=[[1,2], [3]], b=[[1,2], [3]]) + ) + +def test_chop_no_columns_returns_input(): + df = tibble(a1 = 1, a2 = 2, b1 = 1, b2 = 2) + assert_frame_equal(chop(df), df) + +def test_chop_grouping_preserves(): + df = tibble(g = c(1, 1), x = [1,2]) + out = df >> group_by(f.g) >> chop(f.x) + assert group_vars(out) == ['g'] + +def test_can_chop_empty_frame(): + df = tibble(x=[], y=[]) + df.index = [] + df['x'] = df['x'].astype(object) + df['y'] = df['y'].astype(object) + assert_frame_equal(chop(df, f.y), df) + assert_frame_equal(chop(df, f.x), df[['y', 'x']]) + +def test_chop_with_all_column_vals(): + df = tibble(x=[1,1,2], a=[1,2,3], b=[1,2,3]) + out = chop(df, ['x', 'a', 'b']) + assert_frame_equal(out, tibble( + x=[[1,1,2]], a=[[1,2,3]], b=[[1,2,3]] + )) + +def test_chop_with_all_column_keys(): + df = tibble(x=[1,1,2], a=[1,2,3], b=[1,2,3]) + out = chop(df, []) + assert_frame_equal(out, df) + +# unchop ------------------------------------------------------------------ + +def test_unchop_extends_into_rows(): + df = tibble(x = [1, 2], y = [NULL, seq(1, 4)]) + out = df >> unchop(f.y, dtypes=int) + assert_frame_equal(out, tibble(x=[2,2,2,2], y=[1,2,3,4])) + +def test_can_unchop_multiple_cols(): + df = tibble(x=[1,2], y=[[1], [2,3]], z=[[4], [5,6]]) + out = df >> unchop(c(f.y, f.z), dtypes=int) + assert_frame_equal(out, tibble( + x=[1,2,2], + y=[1,2,3], + z=[4,5,6] + )) + +def test_unchopping_nothing_leaves_input_unchanged(): + df = tibble(x = f[1:3], y = f[4:6]) + assert_frame_equal(unchop(df, []), df) + +def test_unchopping_null_inputs_are_dropped(): + df = tibble( + x = f[1:4], + y = [NULL, [1,2], 4, NULL], + z = [NULL, [1,2], NULL, 5] + ) + out = df >> unchop(c(f.y, f.z), dtypes=float) + assert_frame_equal(out, tibble( + x=[2,2,3,4], + y=[1,2,4,NA], + z=[1,2,NA,5], + _dtypes=float + )) + +def test_unchop_optionally_keep_empty_rows(): + df = tibble( + x = [1,2], + y = [NULL, [1,2]], + # unchopping y meaning x, z will be keys and they have to be hashable + # z = [tibble(x=[]), tibble(x=[1,2])] + ) + out = df >> unchop(f.y, keep_empty=True) + assert_frame_equal(out, 
tibble(x=[1,2,2], y=[None, 1,2], _dtypes={'y': object})) + +# out <- df %>% unchop(z, keep_empty = TRUE) +# expect_equal(out$x, c(1, 2, 2)) +# expect_equal(out$z, tibble(x = c(NA, 1L, 2L))) +# }) + +def test_unchop_preserves_columns_of_empty_inputs(): + df = tibble(x=[], y=[], z=[], _dtypes={'x': int}) + assert unchop(df, f.y).columns.tolist() == ['x', 'y', 'z'] + assert unchop(df, [f.y, f.z]).columns.tolist() == ['x', 'y', 'z'] + +# test_that("respects list_of types", { +# df <- tibble(x = integer(), y = list_of(.ptype = integer())) +# expect_equal(df %>% unchop(y), tibble(x = integer(), y = integer())) +# }) + +def test_unchop_preserves_grouping(): + df = tibble(g=1, x=[[1,2]]) + out = df >> group_by(f.g) >> unchop(f.x) + assert group_vars(out) == ['g'] + +def test_unchop_empty_list(): + df = tibble(x=[], y=[]) + out = unchop(df, f.y).y.to_list() + assert out == [] + + df = tibble(x=[], y=tibble(z=[])) + # support nested df? + out = unchop(df, f['y$z']) >> pull(f.y) + assert_frame_equal(out, tibble(z=[])) + +def test_unchop_recycles_size_1_inputs(): + df = tibble(x=[[1], [2,3]], y=[[2,3], [1]]) + out = unchop(df, [f.x, f.y], dtypes=int) + exp = tibble(x=[1,2,3], y=[2,3,1]) + # exp = tibble(x=[1,1,2,3], y=[2,3,1,1]) + assert_frame_equal(out, exp) + +def test_unchop_can_specify_dtypes(): + df = tibble(x=1, y=[[1,2]]) + dtypes = {'y': int, 'z': int} + # No extra columns added + exp = tibble(x=[1,1], y=[1,2]) + # exp = tibble(x=[1,1], y=[1,2], z=[NA,NA]) + out = unchop(df, f.y, dtypes=dtypes) + assert_frame_equal(out, exp) + +# test_that("can specify a ptype with extra columns", { +# df <- tibble(x = 1, y = list(1, 2)) +# ptype <- tibble(y = numeric(), z = numeric()) + +# expect <- tibble(x = c(1, 1), y = c(1, 2), z = c(NA_real_, NA_real_)) + +# expect_identical(unchop(df, y, ptype = ptype), expect) +# }) + +def test_unchop_can_specify_dtypes_to_force_output_type(): + df = tibble(x=[[1,2]]) + out = unchop(df, f.x, dtypes=float) + exp = tibble(x=[1.0,2.0]) + assert_frame_equal(out, exp) + +def test_can_unchop_empty_data_frame(): + chopped = tibble(x=[], y=[[]]) + out = unchop(chopped, f.y) + assert out.shape == (0, 2) + +# test_that("unchop retrieves correct types with emptied chopped df", { +# chopped <- chop(tibble(x = 1:3, y = 4:6), y) +# empty <- vec_slice(chopped, 0L) +# expect_identical(unchop(empty, y), tibble(x = integer(), y = integer())) +# }) diff --git a/tests/test_tidyr_complete.py b/tests/test_tidyr_complete.py new file mode 100644 index 00000000..7b4fbfa5 --- /dev/null +++ b/tests/test_tidyr_complete.py @@ -0,0 +1,48 @@ +# tests grabbed from: +# https://github.com/tidyverse/tidyr/blob/HEAD/tests/testthat/test-complete.R +import pytest +from pandas.testing import assert_frame_equal +from datar.datasets import mtcars +from datar.all import * + +from .conftest import assert_iterable_equal + +def test_complete_with_no_vars_return_data_asis(): + assert_frame_equal(complete(mtcars), mtcars) + +def test_basic_invocation_works(): + df = tibble(x=f[1:2], y=f[1:2], z=f[3:4]) + out = complete(df, f.x, f.y) + assert nrow(out) == 4 + assert_iterable_equal(out.z, [3, NA, NA, 4]) + +def test_preserves_grouping(): + df = tibble(x=f[1:2], y=f[1:2], z=f[3:4]) >> group_by(f.x) + out = complete(df, f.x, f.y) + assert group_vars(out) == group_vars(df) + +def test_expands_empty_factors(): + ff = factor(levels=c("a", "b", "c")) + df = tibble(one=ff, two=ff) + assert nrow(complete(df, f.one, f.two)) == 9 + assert ncol(complete(df, f.one, f.two)) == 2 + +def test_empty_expansion_returns_original(): + 
df = tibble(x=[]) + rs = complete(df, y=NULL) + assert_frame_equal(rs, df) + + df = tibble(x=f[1:4]) + rs = complete(df, y=NULL) + assert_frame_equal(rs, df) + +def test_not_drop_unspecified_levels_in_complete(): + df = tibble( + x=f[1:3], + y=f[1:3], + z=c("a", "b", "c") + ) + df2 = df >> complete(z=c("a", "b")) + + exp = df[['z', 'x', 'y']] + assert_frame_equal(df2, exp) diff --git a/tests/test_tidyr_drop_na.py b/tests/test_tidyr_drop_na.py new file mode 100644 index 00000000..cd10739e --- /dev/null +++ b/tests/test_tidyr_drop_na.py @@ -0,0 +1,62 @@ +# tests grabbed from: +# https://github.com/tidyverse/tidyr/blob/HEAD/tests/testthat/test-drop-na.R +import pytest +from pandas.testing import assert_frame_equal +from datar.all import * +from datar.core.exceptions import ColumnNotExistingError + +def test_empty_call_drops_every_row(): + df = tibble(x=c(1,2,NA), y=c("a", NA, "b")) + # NA is a float + exp = tibble(x=1, y="a", _dtypes={'x': float}) + assert_frame_equal(drop_na(df), exp) + +def test_only_considers_specified_vars(): + df = tibble(x=c(1,2,NA), y=c("a", NA, "b")) + exp = tibble(x=[1,2], y=c("a", NA), _dtypes={'x': float}) + out = drop_na(df, f.x) + assert_frame_equal(out, exp) + + exp = tibble(x=[1], y=c("a"), _dtypes={'x': float}) + out = drop_na(df, f[f.x:f.y]) + assert_frame_equal(out, exp) + +def test_groups_are_preserved(): + df = tibble(g = c("A", "A", "B"), x = c(1, 2, NA), y = c("a", NA, "b")) + exp = tibble(g = c("A", "B"), x = c(1, NA), y = c("a", "b")) + + gdf = group_by(df, f.g) + gexp = group_by(exp, f.g) + + out = drop_na(gdf, f.y) + assert_frame_equal(out, gexp) + assert group_vars(out) == group_vars(gexp) + +def test_empty_call_drops_every_row(): + df = tibble(x=c(1,2,NA), y=c("a", NA, "b")) + out = drop_na(df) + assert_frame_equal(out, tibble(x=1., y="a")) + +def test_errors_are_raised(): + df = tibble(x=c(1,2,NA), y=c("a", NA, "b")) + with pytest.raises(ColumnNotExistingError): + drop_na(df, f.z) + +def test_single_variable_var_doesnot_lose_dimension(): + df = tibble(x=c(1,2,NA)) + out = drop_na(df, f.x) + exp = tibble(x=c(1.,2.)) + assert_frame_equal(out, exp) + +def test_works_with_list_cols(): + df = tibble(x=[[1], NULL, [3]], y=[1, 2, NA]) + rs = drop_na(df) + + assert_frame_equal(rs, tibble(x=[[1]], y=1.)) + +def test_preserves_attributes(): + df = tibble(x = c(1, NA)) + df.attrs['a'] = 10 + out = drop_na(df) + assert_frame_equal(out, tibble(x=1.)) + assert out.attrs['a'] == 10 diff --git a/tests/test_tidyr_expand.py b/tests/test_tidyr_expand.py new file mode 100644 index 00000000..f8099203 --- /dev/null +++ b/tests/test_tidyr_expand.py @@ -0,0 +1,203 @@ +# tests grabbed from: +# https://github.com/tidyverse/tidyr/blob/HEAD/tests/testthat/test-expand.R +import pytest +from datar.all import * +from pandas.testing import assert_frame_equal +from .conftest import assert_iterable_equal + +# expand ---------------------------------------------------------------- +def test_expand_completes_all_values(): + df = tibble(x=f[1:2], y=f[1:2]) + out = expand(df, f.x, f.y) + assert_frame_equal(out, tibble(x=[1,1,2,2,], y=[1,2,1,2])) + +def test_multiple_variables_in_one_arg_doesnot_expand(): + df = tibble(x=f[1:2], y=f[1:2]) + out = expand(df, c(f.x, f.y)) + assert nrow(out) == 2 + +def test_nesting_doesnot_expand_values(): + df = tibble(x=f[1:2], y=f[1:2]) + out = expand(df, nesting(f.x, f.y)) + assert_frame_equal(out, df) + +def test_unnamed_dfs_are_flattened(): + df = tibble(x=f[1:2], y=f[1:2]) + out = expand(df, nesting(f.x, f.y)) + assert_iterable_equal(out.x, df.x) 
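+    # crossing() should flatten an unnamed data frame argument the same way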
+ + out = crossing(df) + assert_iterable_equal(out.x, df.x) + +def test_named_dfs_are_not_flattened(): + df = tibble(x=f[1:2], y=f[1:2]) + out = expand(df, x=nesting(f.x, f.y)) >> pull(f.x) + assert_frame_equal(out, df) + + out = crossing(x=df) >> pull(f.x) + assert_frame_equal(out, df) + +def test_expand_works_with_non_standard_colnames(): + df = tribble( + f[' x '], f['/y'], + 1, 1, + 2, 2 + ) + out = expand(df, f[' x '], f['/y']) + assert nrow(out) == 4 + +def test_expand_accepts_expressions(): + df = expand(tibble(), x=[1,2,3], y=[3,2,1]) + out = crossing(x=[1,2,3], y=[3,2,1]) + assert_frame_equal(df, out) + +def test_expand_respects_groups(): + df = tibble( + a=[1,1,2], + b=[1,2,1], + c=[2,1,1] + ) + out = df >> group_by(f.a) >> expand(f.b, f.c) >> nest(data=c(f.b, f.c)) + assert_frame_equal(out.data.values[0], crossing(b=[1,2], c=[1,2])) + assert_frame_equal(out.data.values[1].reset_index(drop=True), tibble(b=1, c=1)) + +def test_presevers_ordered_factors(): + df = tibble(a=factor("a", ordered=True)) + out = expand(df, f.a) + assert out.a.values.ordered + +def test_preserves_nas(): + x = c("A", "B", NA) + out = crossing(x) + assert_iterable_equal(out.iloc[:, 0], x) + +def test_crossing_preserves_factor_levels(): + # NA can't be levels for pandas.Categorical object + x_na_lev_extra = factor(["a", NA], levels=["a", "b"], exclude=NULL) + out = crossing(x=x_na_lev_extra) + assert_iterable_equal(levels(out.x), ['a', 'b']) + +def test_null_inputs(): + tb = tibble(x=f[1:5]) + out = expand(tb, f.x, y=NULL) + assert_frame_equal(out, tb) + out = nesting(x=tb.x, y=NULL) + assert_frame_equal(out, tb) + out = crossing(NULL, x=tb.x, y=NULL) + assert_frame_equal(out, tb) + +def test_0len_input_gives_0len_output(): + tb = tibble(x=[]) + assert_frame_equal(expand(tb, f.x), tb) + assert_frame_equal(expand(tb, x=f.x), tb) + assert_frame_equal(expand(tb, y=NULL), tibble()) + + assert_frame_equal( + expand_grid(x=[], y=[1,2,3]), + tibble(x=[], y=[]) + ) + +def test_expand_crossing_expand_missing_factor_levels_nesting_doesnot(): + tb = tibble( + x=f[1:3], + f=factor("a", levels=c("a", "b")) + ) + assert nrow(expand(tb, f.x, f.f)) == 6 + assert nrow(crossing(x=tb.x, f=tb.f)) == 6 + assert nrow(nesting(x=tb.x, f=tb.f)) == 3 + +# test_that("expand() reconstructs input dots is empty", { +# expect_s3_class(expand(mtcars), "data.frame") +# expect_s3_class(expand(as_tibble(mtcars)), "tbl_df") +# }) + +# test_that("crossing checks for bad inputs", { +# expect_error( +# crossing(x = 1:10, y = quote(a)), +# class = "vctrs_error_scalar_type" +# ) +# }) + +def test_crossing_handles_list_columns(): + x = [1,2] + y = [[1], [1,2]] + out = crossing(x, y) + + assert nrow(out) == 4 + assert_iterable_equal(out.iloc[:, 0], rep(x, each=2)) + assert out.iloc[:, 1].to_list() == [[1], [1,2]] * 2 + +def test_expand_grid_can_control_name_repair(): + x = [1,2] + + out = expand_grid(**{'x.1': x, 'x.2': x}, _name_repair="universal") + assert out.columns.tolist() == ['x__1', 'x__2'] + +## vars with the same name will get overriden +# test_that("expand_grid can control name_repair", { +# x <- 1:2 + +# if (packageVersion("tibble") > "2.99") { +# expect_error(expand_grid(x, x), class = "rlang_error") +# } else { +# expect_error(expand_grid(x, x), "must not be duplicated") +# } + +# expect_message(out <- expand_grid(x, x, .name_repair = "unique"), "New names:") +# expect_named(out, c("x...1", "x...2")) + +# out <- expand_grid(x, x, .name_repair = "minimal") +# expect_named(out, c("x", "x")) +# }) + +def 
test_crossing_nesting_expand_respect_name_repair(): + x = [1,2] + out = crossing(**{'x.1': x, 'x.2': x}, _name_repair='universal') + assert out.columns.tolist() == ['x__1', 'x__2'] + + out = nesting(**{'x.1': x, 'x.2': x}, _name_repair='universal') + assert out.columns.tolist() == ['x__1', 'x__2'] + + df = tibble(x) + out = df >> expand(**{'x.1': x, 'x.2': x}, _name_repair='universal') + assert out.columns.tolist() == ['x__1', 'x__2'] + + +# # dots_cols supports lazy evaluation -------------------------------------- + +# test_that("dots_cols evaluates each expression in turn", { +# out <- dots_cols(x = seq(-2, 2), y = x) +# expect_equal(out$x, out$y) +# }) + +# expand_grid ---------------------------------- +def test_expand_grid(): + out = expand_grid(x=seq(1,3), y=[1,2]) + assert_frame_equal(out, tibble(x=[1,1,2,2,3,3], y=[1,2,1,2,1,2])) + + out = expand_grid(l1 = letters, l2 = LETTERS) + assert dim(out) == (676, 2) + + out = expand_grid(df=tibble(x=f[1:2], y=[2,1]), z=[1,2,3]) + assert_frame_equal(out, tibble(df=tibble(x=[1,1,1,2,2,2], + y=[2,2,2,1,1,1]), + z=[1,2,3,1,2,3])) + + out = expand_grid(x1=tibble(a=[1,2], b=[3,4]), x2=tibble(a=[5,6], b=[7,8])) + assert_frame_equal( + out, + tibble(x1=tibble(a=[1,1,2,2], b=[3,3,4,4]), + x2=tibble(a=[5,6,5,6], b=[7,8,7,8])) + ) + +def test_expand_rowwise_df_drops_rowwise(): + df = tibble(x=f[1:2], y=f[1:2]) + rf = rowwise(df) + out1 = df >> expand(f.x, f.y) + out2 = rf >> expand(f.x, f.y) + assert_frame_equal(out1, out2) + +def test_flatten_at_0len_val(): + from datar.tidyr.expand import _flatten_at + out = _flatten_at({'x': [], 'y': [1,2,3]}, {'x': True, 'y': False}) + assert out == {'y': [1,2,3]} diff --git a/tests/test_tidyr_extract.py b/tests/test_tidyr_extract.py new file mode 100644 index 00000000..9ec62d27 --- /dev/null +++ b/tests/test_tidyr_extract.py @@ -0,0 +1,65 @@ +# tests grabbed from: +# https://github.com/tidyverse/tidyr/blob/HEAD/tests/testthat/test-extract.R +import pytest +from pandas.testing import assert_frame_equal +from datar.all import * +from .conftest import assert_iterable_equal + + +def test_default_returns_first_alpha_group(): + df = tibble(x=c("a.b", "a.d", "b.c")) + out = df >> extract(f.x, "A") + assert_iterable_equal(out.A, ["a", "a", "b"]) + +def test_can_match_multiple_groups(): + df = tibble(x=c("a.b", "a.d", "b.c")) + out = df >> extract(f.x, ["A", "B"], r'(\w+)\.(\w+)') + assert_iterable_equal(out.A, ['a', 'a', 'b']) + assert_iterable_equal(out.B, ['b', 'd', 'c']) + +def test_can_drop_group(): + df = tibble(x = c("a.b.e", "a.d.f", "b.c.g")) + out = df >> extract(f.x, ["x", NA, "y"], r'([a-z])\.([a-z])\.([a-z])') + assert_iterable_equal(out.columns, ['x', 'y']) + assert_iterable_equal(out.y, ['e', 'f', 'g']) + +def test_match_failures_give_NAs(): + df = tibble(x=c("a.b", "a")) + out = df >> extract(f.x, "a", "(b)") + assert_iterable_equal(out.a, ["b", NA]) + +def test_extract_keeps_characters_as_character(): + df = tibble(x="X-1") + # cannot do convert=True, but specify the specific dtype + out = extract(df, f.x, c("x", "y"), r'(.)-(.)', convert={'y': int}) + assert_frame_equal(out, tibble(x="X", y=1)) + +def test_can_combine_into_multiple_columns(): + df = tibble(x="abcd") + out = extract(df, f.x, c('a', 'b', 'a', 'b'), r'(.)(.)(.)(.)') + assert_frame_equal(out, tibble(a = "ac", b = "bd")) + +def test_groups_are_preserved(): + df = tibble(g=1, x="X1") >> group_by(f.g) + rs = df >> extract(f.x, ['x', 'y'], '(.)(.)') + assert group_vars(rs) == group_vars(df) + +def 
test_informative_error_message_if_wrong_number_of_groups(): + df = tibble(x="a") + + with pytest.raises(ValueError, match="should define 1 groups"): + extract(df, f.x, "y", ".") + + with pytest.raises(ValueError, match="should define 2 groups"): + extract(df, f.x, ["y", "z"], ".") + +def test_invalid_into(): + df = tibble(x="a") + + with pytest.raises(ValueError, match="must be a string"): + extract(df, f.x, 1) + +def test_convert_to_single_type(): + df = tibble(x='1.2') + out = extract(df, f.x, ['a', 'b'], r'(\d)\.(\d)', convert=int) + assert_frame_equal(out, tibble(a=1, b=2)) diff --git a/tests/test_tidyr_fill.py b/tests/test_tidyr_fill.py new file mode 100644 index 00000000..495948a3 --- /dev/null +++ b/tests/test_tidyr_fill.py @@ -0,0 +1,75 @@ +# tests grabbed from: +# https://github.com/tidyverse/tidyr/blob/HEAD/tests/testthat/test-fill.R +import pytest +from datar.all import * + +from pandas.testing import assert_frame_equal +from .conftest import assert_iterable_equal + +def test_all_missing_left_unchanged(): + df = tibble( + a = c(NA, NA), + b = c(NULL, NA), + c = c(None, NA), + ) + down = fill(df, f.a, f.b, f.c) + up = fill(df, f.a, f.b, f.c, _direction="up") + + assert_frame_equal(down, df) + assert_frame_equal(up, df) + +def test_missings_are_filled_correctly(): + df = tibble(x=c(NA, 1, NA, 2, NA, NA)) + + out = fill(df, f.x) + assert_iterable_equal(out.x, c(NA, 1,1,2,2,2)) + + out = fill(df, f.x, _direction="up") + assert_iterable_equal(out.x, c(1,1,2,2,NA,NA)) + + out = fill(df, f.x, _direction="downup") + assert_iterable_equal(out.x, c(1,1,1,2,2,2)) + + out = fill(df, f.x, _direction="updown") + assert_iterable_equal(out.x, c(1,1,2,2,2,2)) + +def test_missings_filled_down_for_each_atomic_vector(): + df = tibble( + lgl = c(True, NA), + int = c(1, NA), + dbl = c(1.0, NA), + chr = c("a", NA), + lst = [seq(1,5), NULL] + ) + out = fill(df, everything()) + assert_iterable_equal(out.lgl, [True, True]) + assert_iterable_equal(out.int, [1, 1]) + assert_iterable_equal(out.dbl, [1.0, 1.0]) + assert_iterable_equal(out.chr, ["a", "a"]) + assert [x.tolist() for x in out.lst.tolist()] == [[1,2,3,4,5]] * 2 + +def test_missings_filled_up_for_each_atomic_vector(): + df = tibble( + lgl = c(NA, True), + int = c(NA, 1), + dbl = c(NA, 1.0), + chr = c(NA, "a"), + lst = [NULL, seq(1,5)] + ) + out = fill(df, everything(), _direction="up") + assert_iterable_equal(out.lgl, [True, True]) + assert_iterable_equal(out.int, [1, 1]) + assert_iterable_equal(out.dbl, [1.0, 1.0]) + assert_iterable_equal(out.chr, ["a", "a"]) + assert [x.tolist() for x in out.lst.tolist()] == [[1,2,3,4,5]] * 2 + +def test_fill_preserves_attributes(): + df = tibble(x=c(NA, 1)) + df.attrs['a'] = 10 + out = fill(df, f.x) + assert out.attrs['a'] == 10 + +def test_fill_respects_grouping(): + df = tibble(x = c(1, 1, 2), y = c(1, NA, NA)) + out = df >> group_by(f.x) >> fill(f.y) + assert_iterable_equal(out.y, [1,1,NA]) diff --git a/tests/test_tidyr_nest.py b/tests/test_tidyr_nest.py new file mode 100644 index 00000000..c6738a6f --- /dev/null +++ b/tests/test_tidyr_nest.py @@ -0,0 +1,318 @@ +# tests grabbed from: +# https://github.com/tidyverse/tidyr/blob/master/tests/testthat/test-nest.R +from pandas.core.dtypes.common import is_categorical_dtype +import pytest +from pandas.testing import assert_frame_equal +from datar.all import * +from datar.core.grouped import DataFrameGroupBy, DataFrameRowwise +from .conftest import assert_iterable_equal + +# nest -------------------------------------------------------------------- +def 
test_nest_turns_grouped_values_into_one_list_df(): + df = tibble(x=[1,1,1], y=f[1:3]) + out = nest(df, data=f.y) + assert len(out.x) == 1 + assert len(out.data) == 1 + assert_frame_equal(out.data.values[0], tibble(y=f[1:3])) + +def test_nest_uses_grouping_vars_if_present(): + df = tibble(x=[1,1,1], y=f[1:3]) + out = nest(group_by(df, f.x)) + assert group_vars(out) == ['x'] + assert_frame_equal(out.data.values[0], tibble(y=f[1:3])) + +def test_nest_provides_grouping_vars_override_grouped_defaults(): + df = tibble(x=1, y=2, z=3) >> group_by(f.x) + out = nest(df, data=f.y) + assert isinstance(out, DataFrameGroupBy) + assert out.columns.tolist() == ['x', 'z', 'data'] + assert out.data.values[0].columns.tolist() == ['y'] + +def test_nest_puts_data_into_correct_row(): + df = tibble(x = f[1:3], y = c("B", "A", "A")) + out = df >> nest(data = f.x) >> filter(f.y == "B") + assert len(out.data) == 1 + assert out.data.values[0].x.tolist() == [1] + +def test_nest_everyting_returns_a_simple_df(): + df = tibble(x=f[1:3], y=['B', 'A', 'A']) + out = nest(df, data=c(f.x, f.y)) + assert len(out.data) == 1 + assert_frame_equal(out.data.values[0], df) + +def test_nest_preserves_order_of_data(): + df = tibble(x=[1,3,2,3,2], y=f[1:5]) + out = nest(df, data=f.y) + assert out.x.tolist() == [1,3,2] + +def test_nest_can_strip_names(): + df = tibble(x = c(1, 1, 1), ya = f[1:3], yb = f[4:6]) + out = df >> nest(y = starts_with("y"), _names_sep = "") + assert out.y.values[0].columns.tolist() == ['a', 'b'] + +def test_nest_names_sep(): + df = tibble(x = c(1, 1, 1), y_a = f[1:3], y_b = f[4:6]) + out = df >> nest(y = starts_with("y"), _names_sep = "_") + assert out.y.values[0].columns.tolist() == ['a', 'b'] + +def test_empty_factor_levels_dont_affect_nest(): + df = tibble( + x = factor(c("z", "a"), levels=letters), + y = f[1:2] + ) + out = nest(df, data=f.y) + assert out.x.eq(df.x).all() + +def test_nest_works_with_empty_df(): + df = tibble(x=[], y=[]) + out = nest(df, data=f.x) + assert out.columns.tolist() == ['y', 'data'] + assert nrow(out) == 0 + + out = nest(df, data=c(f.x, f.y)) + assert out.columns.tolist() == ['data'] + assert nrow(out) == 0 + +# test_that("tibble conversion occurs in the `nest.data.frame()` method", { +# df <- data.frame(x = 1, y = 1:2) +# out <- df %>% nest(data = y) +# expect_s3_class(out, "tbl_df") +# expect_s3_class(out$data[[1L]], "tbl_df") +# }) + +def test_can_nest_multiple_columns(): + df = tibble(x = 1, a1 = 1, a2 = 2, b1 = 1, b2 = 2) + out = df >> nest(a=c(f.a1, f.a2), b=c(f.b1, f.b2)) + + assert out.columns.tolist() == ['x', 'a', 'b'] + assert_frame_equal(out.a.values[0], df[['a1', 'a2']]) + assert_frame_equal(out.b.values[0], df[['b1', 'b2']]) + +def test_nest_no_columns_error(): + # warning for no columns will be changed to error here. 
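+    # (R tidyr only warns "must not be empty" and nests all inputs; datar raises ValueError instead)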
+ df = tibble(x = 1, a1 = 1, a2 = 2, b1 = 1, b2 = 2) + with pytest.raises(ValueError, match="must not be empty"): + nest(df) + +# test_that("nesting no columns nests all inputs", { +# # included only for backward compatibility +# df <- tibble(a1 = 1, a2 = 2, b1 = 1, b2 = 2) +# expect_warning(out <- nest(df), "must not be empty") +# expect_named(out, "data") +# expect_equal(out$data[[1]], df) +# }) + +# unnest ------------------------------------------------------------------ + +def test_unnest_keep_empty_rows(): + df = tibble(x=f[1:3], y = [NULL, tibble(), tibble(a=1)]) + out1 = df >> unnest(f.y) + assert nrow(out1) == 1 + + out2 = df >> unnest(f.y, keep_empty=True) + assert nrow(out2) == 3 + assert_iterable_equal(out2.a, [NA, NA, 1]) + +## problem with NAs (numpy.nan), which is a float type +# test_that("empty rows still affect output type", { +# df <- tibble( +# x = 1:2, +# data = list( +# tibble(y = character(0)), +# tibble(z = integer(0)) +# ) +# ) +# out <- unnest(df, data) +# expect_equal(out, tibble(x = integer(), y = character(), z = integer())) +# }) + +def test_unnest_bad_inputs_error(): + df = tibble(x=1, y=[mean]) + out = unnest(df, f.y) + ## able to do it + assert nrow(out) == 1 + # with pytest.raises(ValueError): + # unnest(df, f.y) + +def test_unnest_combines_augmented_vectors(): + df = tibble(x=[factor(letters[:3])]) + out = unnest(df, f.x) + assert is_categorical_dtype(out.x) + assert_iterable_equal(out.x, letters[:3]) + +def test_unnest_vector_unnest_preserves_names(): + df = tibble(x=[1, [2,3]], y=["a", ["b", "c"]]) + out = unnest(df, f.x) + assert out.columns.tolist() == ['x', 'y'] + +def test_unnest_rows_and_cols_of_nested_dfs_are_expanded(): + df = tibble(x = f[1:2], y = [tibble(a = 1), tibble(b = f[1:2])]) + out = df >> unnest(f.y) + + assert out.columns.tolist() == ['x', 'a', 'b'] + assert nrow(out) == 3 + +def test_unnest_nested_lists(): + df = tibble(x=f[1:2], y=[[["a"]], [["b"]]]) + rs = unnest(df, f.y) + assert_frame_equal(rs, tibble(x=f[1:2], y=[["a"], ["b"]])) + +def test_can_unnest_mixture_of_named_and_unnamed(): + df = tibble( + x="a", + y=[tibble(y=f[1:2])], + z=[[1,2]] + ) + out = unnest(df, c(f.y, f.z)) + assert_frame_equal(out, tibble(x=["a","a"], y=f[1:2], z=f[1:2])) + +def test_can_unnest_lists(): + df = tibble(x=f[1:2], y=[seq(1,3), seq(4,9)]) + out = unnest(df, f.y) + assert_frame_equal(out, tibble(x=rep([1,2], [3,6]), y=f[1:9])) + +def test_unnest_can_combine_null_with_vectors_or_dfs(): + df1 = tibble(x=f[1:2], y=[NULL, tibble(z=1)]) + out = unnest(df1, f.y) + assert out.columns.tolist() == ['x', 'z'] + assert_iterable_equal(out.z, [1]) + + df2 = tibble(x=f[1:2], y=[NULL, 1]) + out = unnest(df2, f.y) + assert out.columns.tolist() == ['x', 'y'] + assert_iterable_equal(out.y, [1]) + +def test_unnest_vectors_become_columns(): + df = tibble(x=f[1:2], y=[1, [1,2]]) + out = unnest(df, f.y) + assert_iterable_equal(out.y, [1,1,2]) + +def test_unnest_multiple_columns_must_be_same_length(): + df = tibble(x=[[1,2]], y=[[1,2,3]]) + with pytest.raises(ValueError, match="Incompatible lengths: 2, 3"): + unnest(df, c(f.x, f.y)) + + df = tibble(x=[[1,2]], y=[tibble(y=f[1:3])]) + with pytest.raises(ValueError, match="Incompatible lengths: 2, 3"): + unnest(df, c(f.x, f.y)) + +def test_unnest_using_non_syntactic_names(): + out = tibble(foo_bar=[[1,2], 3]) + out.columns = ['foo bar'] + out = out >> unnest(f['foo bar']) + assert out.columns.to_list() == ['foo bar'] + +def test_unnest_no_cols_error(): + with pytest.raises(ValueError): + tibble(x=[]) >> unnest() + +def 
test_unnest_list_of_empty_dfs(): + df = tibble(x=[1,2], y=[tibble(a=[]), tibble(b=[])]) + out = df >> unnest(f.y) + assert dim(out) == (0, 3) + assert out.columns.tolist() == ['x', 'a', 'b'] + +# other methods ----------------------------------------------------------------- + +def test_unnest_rowwise_df_becomes_grouped_df(): + df = tibble(g=1, x=[[1,2,3]]) >> rowwise(f.g) + rs = df >> unnest(f.x) + assert isinstance(rs, DataFrameGroupBy) + assert not isinstance(rs, DataFrameRowwise) + assert group_vars(rs) == ['g'] + +def test_unnest_grouping_preserved(): + df = tibble(g=1, x=[[1,2,3]]) >> group_by(f.g) + rs = df >> unnest(f.x) + assert isinstance(rs, DataFrameGroupBy) + assert not isinstance(rs, DataFrameRowwise) + assert group_vars(rs) == ['g'] + +# Empty inputs ------------------------------------------------------------ + +def test_unnest_empty_data_frame(): + df = tibble(x=[], y=[], _dtypes={'x': int}) + out = unnest(df, f.y) + assert dim(out) == (0, 2) + + +## unable to do it due to NAs being float +# test_that("unnest() preserves ptype", { +# tbl <- tibble(x = integer(), y = list_of(ptype = tibble(a = integer()))) +# res <- unnest(tbl, y) +# expect_equal(res, tibble(x = integer(), a = integer())) +# }) + +## empty columns ([]) can be unnested +# test_that("errors on bad inputs", { +# df <- tibble(x = integer(), y = list()) +# expect_error(unnest(df, x), "list of vectors") +# }) + +def test_unnest_keeps_list_cols(): + df = tibble(x=f[1:2], y=[[3], [4]], z=[5, [6,7]]) + out = df >> unnest(f.y) + assert out.columns.tolist() == ['x', 'y', 'z'] + +# # Deprecated behaviours --------------------------------------------------- + +# test_that("warn about old style interface", { +# df <- tibble(x = c(1, 1, 1), y = 1:3) +# expect_warning(out <- nest(df, y), "data = c(y)", fixed = TRUE) +# expect_named(out, c("x", "data")) +# }) + +# test_that("can control output column name", { +# df <- tibble(x = c(1, 1, 1), y = 1:3) +# expect_warning(out <- nest(df, y, .key = "y"), "y = c(y)", fixed = TRUE) +# expect_named(out, c("x", "y")) +# }) + +# test_that("can control output column name when nested", { +# df <- dplyr::group_by(tibble(x = c(1, 1, 1), y = 1:3), x) +# expect_warning(out <- nest(df, .key = "y"), "`.key`", fixed = TRUE) +# expect_named(out, c("x", "y")) +# }) + +# test_that(".key gets warning with new interface", { +# df <- tibble(x = c(1, 1, 1), y = 1:3) +# expect_warning(out <- nest(df, y = y, .key = "y"), ".key", fixed = TRUE) +# expect_named(df, c("x", "y")) +# }) + +# test_that("cols must go in cols", { +# df <- tibble(x = list(3, 4), y = list("a", "b")) +# expect_warning(unnest(df, x, y), "c(x, y)", fixed = TRUE) +# }) + +# test_that("need supply column names", { +# df <- tibble(x = 1:2, y = list("a", "b")) +# expect_warning(unnest(df), "c(y)", fixed = TRUE) +# }) + +# test_that("sep combines column names", { +# df <- tibble(x = list(tibble(x = 1)), y = list(tibble(x = 1))) +# out <- expect_warning(df %>% unnest(c(x, y), .sep = "_"), "names_sep") +# expect_named(out, c("x_x", "y_x")) +# }) + +# test_that("unnest has mutate semantics", { +# df <- tibble(x = 1:3, y = list(1, 2:3, 4)) +# out <- expect_warning(df %>% unnest(z = map(y, `+`, 1)), "mutate") +# expect_equal(out$z, 2:5) +# }) + +# test_that(".drop and .preserve are deprecated", { +# df <- tibble(x = list(3, 4), y = list("a", "b")) +# expect_warning(df %>% unnest(x, .preserve = y), ".preserve") + +# df <- tibble(x = list(3, 4), y = list("a", "b")) +# expect_warning(df %>% unnest(x, .drop = FALSE), ".drop") +# }) + +# 
test_that(".id creates vector of names for vector unnest", { +# df <- tibble(x = 1:2, y = list(a = 1, b = 1:2)) +# out <- expect_warning(unnest(df, y, .id = "name"), "names") + +# expect_equal(out$name, c("a", "b", "b")) +# }) diff --git a/tests/test_tidyr_pack.py b/tests/test_tidyr_pack.py new file mode 100644 index 00000000..edd390e4 --- /dev/null +++ b/tests/test_tidyr_pack.py @@ -0,0 +1,91 @@ +# tests grabbed from: +# https://github.com/tidyverse/tidyr/blob/master/tests/testthat/test-pack.R +import pytest +from pandas.testing import assert_frame_equal +from datar.all import * + +# pack -------------------------------------------------------------------- + +def test_can_pack_multiple_columns(): + df = tibble(a1=1, a2=2, b1=1, b2=2) + out = df >> pack(a=c(f.a1, f.a2), b=c(f.b1, f.b2)) + + assert colnames(out) == ['a', 'b'] + assert_frame_equal(pull(out, f.a), df[['a1', 'a2']]) + assert_frame_equal(pull(out, f.b), df[['b1', 'b2']]) + +def test_pack_no_columns_returns_input(): + df = tibble(a1=1, a2=2, b1=1, b2=2) + assert_frame_equal(pack(df), df) + +def test_can_strip_outer_names_from_inner_names(): + df = tibble(ax=1, ay=2) + out = pack(df, a=c(f.ax, f.ay), _names_sep="") + out = out >> pull(f.a) >> colnames() + assert out == ['x', 'y'] + +def test_grouping_preserved(): + df = tibble(g1=1, g2=2, g3=3) + out = df >> group_by(f.g1, f.g2) >> pack(g=c(f.g2, f.g3)) + assert group_vars(out) == ['g1'] + + +# unpack ------------------------------------------------------------------ + +def test_unpack_preserves_grouping(): + df = tibble(g=1, x=tibble(y=1)) + out = df >> group_by(f.g) >> unpack(f.x) + assert group_vars(out) == ['g'] + assert out.columns.tolist() == ['g', 'y'] + +def test_unpack_error_on_atomic_columns(): + df = tibble(x=f[1:2]) + with pytest.raises(ValueError, match="must be a data frame column"): + df >> unpack(f.x) + +def test_df_cols_are_directly_unpacked(): + df = tibble(x=f[1:3], y=tibble(a=f[1:3], b=f[3:1])) + out = df >> unpack(f.y) + assert out.columns.tolist() == ['x', 'a', 'b'] + exp = df >> pull(f.y) + assert_frame_equal(out[['a', 'b']], exp) + +def test_cannot_unpack_0col_dfs(): + # Since we only have fake packed data frame columns, + # this gives nothing about the column, so it can't be unpacked + df = tibble(x=f[1:3], y=tibble(_rows=3)) + # `y` doesn't even exist + with pytest.raises(ValueError): + df >> unpack(f.y) + +# test_that("can unpack 0-col dataframe", { +# df <- tibble(x = 1:3, y = tibble(.rows = 3)) +# out <- df %>% unpack(y) +# expect_named(out, c("x")) +# }) + +def test_can_unpack_0row_dfs(): + df = tibble(x=[], y=tibble(a=[])) + out = df >> unpack(f.y) + assert out.columns.tolist() == ['x', 'a'] + +def test_unpack_0row_df(): + df = tibble(x=[], y=[]) + out = df >> unpack(f.y) + assert_frame_equal(out, df) + +def test_unpack_can_choose_separator(): + df = tibble(x = 1, y = tibble(a = 2), z = tibble(a = 3)) + out = df >> unpack([f.y, f.z], names_sep='_') + assert out.columns.tolist() == ['x', 'y_a', 'z_a'] + + out = df >> unpack([f.y], names_sep='_') + assert out.columns.tolist() == ['x', 'y_a', 'z$a'] + + out = df >> unpack([2], names_sep='_') + assert out.columns.tolist() == ['x', 'y_a', 'z$a'] + +def test_unpack_cannot_select_multiple_columns_of_packed_df_by_indexes(): + df = tibble(x = 1, y = tibble(a = 2, b=3)) + with pytest.raises(ValueError, match="already been selected"): + df >> unpack([2,3]) diff --git a/tests/test_tidyr_pivot_long.py b/tests/test_tidyr_pivot_long.py new file mode 100644 index 00000000..50a7724d --- /dev/null +++ 
b/tests/test_tidyr_pivot_long.py @@ -0,0 +1,226 @@ +# tests grabbed from: +# https://github.com/tidyverse/tidyr/blob/HEAD/tests/testthat/test-pivot-long.R +import pytest +from datar.all import * + +from pandas.testing import assert_frame_equal +from .conftest import assert_iterable_equal + +def test_can_pivot_all_cols_to_long(): + df = tibble(x=f[1:2], y=f[3:4]) + pv = pivot_longer(df, f[f.x:f.y]) + + assert pv.columns.tolist() == ['name', 'value'] + # assert_iterable_equal(pv.name, rep(df.columns, 2)) + assert_iterable_equal(pv.name, rep(df.columns, each=2)) + # assert_iterable_equal(pv.value, [1,3,2,4]) + assert_iterable_equal(pv.value, [1,2,3,4]) + + pv2 = pivot_longer(df, f[f.x:f.y], names_transform=str.upper) + assert_iterable_equal(pv2.name, ['X', 'X', 'Y', 'Y']) + pv3 = pivot_longer(df, f[f.x:f.y], names_transform={'name': str.upper}) + assert_iterable_equal(pv3.name, ['X', 'X', 'Y', 'Y']) + +def test_values_interleaved_correctly(): + df = tibble( + x=[1,2], + y=[10,20], + z=[100,200] + ) + pv = pivot_longer(df, f[1:3]) + # assert_iterable_equal(pv.value, [1,10,100,2,20,200]) + assert_iterable_equal(pv.value, [1,2, 10,20, 100,200]) + +# test_that("can add multiple columns from spec", { +# df <- tibble(x = 1:2, y = 3:4) +# sp <- tibble(.name = c("x", "y"), .value = "v", a = 1, b = 2) +# pv <- pivot_longer_spec(df, spec = sp) + +# expect_named(pv, c("a", "b", "v")) +# }) + +def test_preserves_original_keys(): + df = tibble(x=f[1:2], y=2, z=f[1:2]) + pv = pivot_longer(df, f[f.y:f.z]) + + assert pv.columns.tolist() == ['x', 'name', 'value'] + assert_iterable_equal(pv.x, rep(df.x, 2)) + +def test_can_drop_missing_values(): + df = tibble(x=c(1,NA), y=c(NA,2)) + pv = pivot_longer(df, f[f.x:f.y], values_drop_na=True) + + assert_iterable_equal(pv.name, ['x', 'y']) + assert_iterable_equal(pv.value, [1,2]) + +def test_can_handle_missing_combinations(): + df = tribble( + f.id, f.x_1, f.x_2, f.y_2, + "A", 1, 2, "a", + "B", 3, 4, "b", + ) + pv = pivot_longer(df, ~f.id, names_to = c(".value", "n"), names_sep = "_") + + assert_iterable_equal(pv.columns, ['id', 'n', 'x', 'y']) + assert_iterable_equal(pv.x, [1,2,3,4]) + assert_iterable_equal(pv.y, [NA, "a", NA, "b"]) + + df = tribble( + f.id, f.x_1, f.x_2, f.y_2, + "A", 1, 2, "a", + "A", 3, 4, "b", + ) + pv = pivot_longer(df, ~f.id, names_to = c(".value", "n"), names_sep = "_") + + assert_iterable_equal(pv.columns, ['id', 'n', 'x', 'y']) + assert_iterable_equal(pv.x, [1,2,3,4]) + assert_iterable_equal(pv.y, [NA, "a", NA, "b"]) + +def test_mixed_columns_are_automatically_coerced(): + df = tibble(x = factor("a"), y = factor("b")) + pv = pivot_longer(df, f[f.x:f.y]) + assert is_factor(pv.value) + assert_iterable_equal(pv.value, ['a', 'b']) + +def test_can_override_default_output_column_type(): + df = tibble(x="x", y=1) + pv = pivot_longer(df, f[f.x:f.y], values_transform={'value': lambda x: [x]}) + assert pv.value.tolist() == [['x'], [1]] + pv2 = pivot_longer(df, f[f.x:f.y], values_transform=lambda x: [x]) + assert pv2.value.tolist() == [['x'], [1]] + +# test_that("can pivot to multiple measure cols", { +# df <- tibble(x = "x", y = 1) +# sp <- tribble( +# ~.name, ~.value, ~row, +# "x", "X", 1, +# "y", "Y", 1, +# ) +# pv <- pivot_longer_spec(df, sp) + +# expect_named(pv, c("row", "X", "Y")) +# expect_equal(pv$X, "x") +# expect_equal(pv$Y, 1) +# }) + +def test_original_col_order_is_preserved(): + df = tribble( + f.id, f.z_1, f.y_1, f.x_1, f.z_2, f.y_2, f.x_2, + "A", 1, 2, 3, 4, 5, 6, + "B", 7, 8, 9, 10, 11, 12 + ) + pv = pivot_longer(df, ~f.id, names_to = 
c(".value", "n"), names_sep = "_") + assert pv.columns.tolist() == ['id', 'n', 'z', 'y', 'x'] + +def test_handles_duplicated_column_names(): + df = tibble(tibble(a=1), tibble(b=3), x=1, a=2, b=4, _name_repair="minimal") + pv = pivot_longer(df, ~f.x) + + assert pv.columns.tolist() == ['x', 'name', 'value'] + assert_iterable_equal(pv.name, list('abab')) + assert_iterable_equal(pv.value, [1,3,2,4]) + +def test_can_pivot_duplicated_names_to_dot_value(): + df = tibble(x = 1, a_1 = 1, a_2 = 2, b_1 = 3, b_2 = 4, _name_repair='minimal') + pv1 = pivot_longer(df, ~f.x, names_to = c(".value", NA), names_sep = "_") + pv2 = pivot_longer(df, ~f.x, names_to = c(".value", NA), names_pattern = "(.)_(.)") + # The suffices will be used to group the data, which needs to be captured explictly. + # pv3 = pivot_longer(df, ~f.x, names_to = ".value", names_pattern = "(.)_.") + + assert pv1.columns.tolist() == ['x', 'a', 'b'] + assert_iterable_equal(pv1.a, [1,2]) + assert_frame_equal(pv2, pv1) + # assert_frame_equal(pv3, pv1) + +def test_dot_value_can_be_any_position_in_names_to(): + samp = tibble( + i=f[1:4], + y_t1=rnorm(4), + y_t2=rnorm(4), + z_t1=rep(3,4), + z_t2=rep(-2,4) + ) + value_first = pivot_longer( + samp, ~f.i, + names_to=['.value', 'time'], + names_sep="_" + ) + + samp2 = rename( + samp, + t1_y='y_t1', + t2_y='y_t2', + t1_z='z_t1', + t2_z='z_t2' + ) + value_second = pivot_longer( + samp2, ~f.i, + names_to = c("time", "_value"), + names_sep = "_" + ) + + assert_frame_equal(value_first, value_second) + +def test_type_error_message_use_variable_names(): + df = tibble(abc=1, xyz="b") + # no error, dtype falls back to object + pv = pivot_longer(df, everything()) + assert pv.value.dtype == object + +def test_grouping_is_preserved(): + df = tibble(g=1, x1=1, x2=2) + out = df >> group_by(f.g) >> pivot_longer( + f[f.x1:f.x2], + names_to="x", + values_to="v" + ) + assert group_vars(out) == ['g'] + +def test_values_to_at_end_of_output(): + df = tibble( + country=['US', 'CN'], + new_sp_m014=[159, 22], + new_sp_m24=[1571,21], + new_ep_f88=[34, 24] + ) + pv = pivot_longer(df, ~f.country, + names_to=['diagnosis', 'gender', 'age'], + names_pattern=r"new_?(.*)_(.)(.*)", + values_to = "count") + assert pv.columns.tolist() == [ + 'country', 'diagnosis', 'gender', 'age', 'count' + ] + +def test_errors_names_sep_names_pattern(): + df = tribble( + f.id, f.x_1, f.x_2, f.y_2, + "A", 1, 2, "a", + "B", 3, 4, "b", + ) + with pytest.raises(ValueError): + pivot_longer( + df, ~f.id, + names_to = c(".value", "n"), + names_sep = "_", + names_pattern=r'(.)_(.)' + ) + with pytest.raises(ValueError): + pivot_longer( + df, ~f.id, + names_to = c(".value", "n") + ) + +def test_names_prefix(): + df = tribble( + f.id, f.x_x_1, f.x_x_2, f.x_y_2, + "A", 1, 2, "a", + "B", 3, 4, "b", + ) + pv = pivot_longer(df, ~f.id, + names_to = c(".value", "n"), + names_sep = "_", + names_prefix='x_') + + assert_iterable_equal(pv.columns, ['id', 'n', 'x', 'y']) + assert_iterable_equal(pv.x, [1,2,3,4]) + assert_iterable_equal(pv.y, [NA, "a", NA, "b"]) diff --git a/tests/test_tidyr_pivot_wide.py b/tests/test_tidyr_pivot_wide.py new file mode 100644 index 00000000..41b47f4b --- /dev/null +++ b/tests/test_tidyr_pivot_wide.py @@ -0,0 +1,201 @@ +# tests grabbed from: +# https://github.com/tidyverse/tidyr/blob/HEAD/tests/testthat/test-pivot-wide.R +import pytest +from datar.all import * +from datar.core.exceptions import ColumnNotExistingError +from pandas.testing import assert_frame_equal +from .conftest import assert_iterable_equal + +def 
test_can_pivot_all_cols_to_wide(): + df = tibble(key=list('xyz'), val=f[1:3]) + pv = pivot_wider(df, names_from=f.key, values_from=f.val) + assert pv.columns.tolist() == list('xyz') + assert nrow(pv) == 1 + +def test_non_pivoted_cols_are_preserved(): + df = tibble(a=1, key=list('xy'), val=f[1:2]) + pv = pivot_wider(df, names_from=f.key, values_from=f.val) + + assert pv.columns.tolist() == list('axy') + assert nrow(pv) == 1 + +def test_implicit_missings_turn_into_explicit_missings(): + df = tibble(a=[1,2], key=['x', 'y'], val=f.a) + pv = pivot_wider(df, names_from = f.key, values_from = f.val) + + assert_iterable_equal(pv.a, [1,2]) + assert_iterable_equal(pv.x, [1,NA]) + assert_iterable_equal(pv.y, [NA,2]) + +def test_error_when_overwriting_existing_column(): + df = tibble( + a=[1,1], + key=['a', 'b'], + val=[1,2] + ) + with pytest.raises(ValueError, match="already exists"): + pivot_wider(df, names_from=f.key, values_from=f.val) + +def test_grouping_is_preserved(): + df = tibble(g=1, k="x", v=2) + out = df >> group_by(f.g) >> pivot_wider(names_from=f.k, values_from=f.v) + assert group_vars(out) == ['g'] + +def test_double_underscore_j_can_be_used_as_names_from(): + df = tibble(__8=list('xyz'), val=f[1:3], _name_repair='minimal') + pv = pivot_wider(df, names_from=f.__8, values_from=f.val) + + assert pv.columns.tolist() == ['x', 'y', 'z'] + assert nrow(pv) == 1 + +def test_nested_df_pivot_correctly(): + df = tibble( + i=[1,2,1,2], + g=list('aabb'), + d=tibble(x=f[1:4], y=f[5:8]) + ) + out = pivot_wider(df, names_from=f.g, values_from=f.d) + assert_iterable_equal(out['a$x'], [1,2]) + assert_iterable_equal(out['b$y'], [7,8]) + + with pytest.raises(ColumnNotExistingError): + pivot_wider(df, names_from=f.g, values_from=f.e) + +def test_works_with_empty_key_vars(): + df = tibble(n="a", v=1) + pw = pivot_wider(df, names_from=f.n, values_from=f.v) + assert_frame_equal(pw, tibble(a=1)) + +# column names ------------------------------------------------------------- + +def test_names_glue_affects_output_names(): + df = tibble(x=['X', 'Y'], y=f[1:2], a=f.y, b=f.y) + out = pivot_wider( + df, + names_from=[f.x, f.y], + values_from=[f.a, f.b], + names_glue='{x}{y}_{_value}' + ) + assert out.columns.tolist() == ['X1_a', 'Y2_a', 'X1_b', 'Y2_b'] + +def test_can_sort_column_names(): + df = tibble( + int=[1,3,2], + fac=factor(list('abc'), + levels=list('acb') + ) + ) + out = pivot_wider(df, names_from=f.fac, values_from=f.int, names_sort=False) + assert out.columns.tolist() == list('acb') + out = pivot_wider(df, names_from=f.fac, values_from=f.int, names_sort=True) + assert out.columns.tolist() == list('abc') + + +# keys --------------------------------------------------------- + +def test_can_override_default_keys(): + df = tribble( + f.row, f.name, f.var, f.value, + 1, "Sam", "age", 10, + 2, "Sam", "height", 1.5, + 3, "Bob", "age", 20, + ) + pv = df >> pivot_wider(id_cols = f.name, names_from = f.var, values_from = f.value) + assert nrow(pv) == 2 + + +# non-unqiue keys --------------------------------------------------------- + +# instead of list-columns +def test_duplicated_keys_aggregated_by_values_fn(): + df = tibble(a = c(1, 1, 2), key = c("x", "x", "x"), val = f[1:3]) + pv = pivot_wider(df, names_from = f.key, values_from = f.val) # mean by default + assert_iterable_equal(pv.x, [1.5, 3.0]) + pv = pivot_wider(df, names_from = f.key, values_from = f.val, values_fn=sum) + assert_iterable_equal(pv.x, [3.0, 3.0]) + +# test_that("duplicated keys produce list column with warning", { +# df <- tibble(a = c(1, 1, 
2), key = c("x", "x", "x"), val = 1:3) +# expect_warning( +# pv <- pivot_wider(df, names_from = key, values_from = val), +# "list-col" +# ) + +# expect_equal(pv$a, c(1, 2)) +# expect_equal(as.list(pv$x), list(c(1L, 2L), 3L)) +# }) + +def test_values_fn_can_keep_list(): + df = tibble(a = c(1, 1, 2), key = c("x", "x", "x"), val = f[1:3]) + pv = pivot_wider(df, names_from = f.key, values_from = f.val, values_fn=list) + assert_iterable_equal(pv.a, [1,2]) + assert pv.x.tolist() == [[1,2], [3]] + +def test_values_fn_to_be_a_single_func(): + df = tibble(a = c(1, 1, 2), key = c("x", "x", "x"), val = c(1, 10, 100)) + pv = pivot_wider(df, names_from=f.key, values_from=f.val, values_fn=sum) + assert_iterable_equal(pv.x, [11,100]) + +# test_that("values_summarize applied even when no-duplicates", { +# df <- tibble(a = c(1, 2), key = c("x", "x"), val = 1:2) +# pv <- pivot_wider(df, +# names_from = key, +# values_from = val, +# values_fn = list(val = list) +# ) + +# expect_equal(pv$a, c(1, 2)) +# expect_equal(as.list(pv$x), list(1L, 2L)) +# }) + + +# can fill missing cells -------------------------------------------------- + +def test_can_fill_in_missing_cells(): + df = tibble(g = c(1, 2), var = c("x", "y"), val = c(1, 2)) + widen = lambda **kwargs: df >> pivot_wider(names_from=f.var, values_from=f.val, **kwargs) + + assert_iterable_equal(widen().x, [1, NA]) + assert_iterable_equal(widen(values_fill=0).x, [1,0]) + assert_iterable_equal(widen(values_fill={'val': 0}).x, [1,0]) + +def test_values_fill_only_affects_missing_cells(): + df = tibble(g = c(1, 2), names = c("x", "y"), value = c(1, NA)) + out = pivot_wider(df, names_from=f.names, values_from=f.value, values_fill=0) + assert_iterable_equal(out.y, [0, NA]) + +# multiple values ---------------------------------------------------------- + +def test_can_pivot_from_multiple_measure_cols(): + df = tibble(row = 1, var = c("x", "y"), a = f[1:2], b = f[3:4]) + sp = pivot_wider(df, names_from=f.var, values_from=[f.a, f.b]) + assert sp.columns.tolist() == ['row', 'a_x', 'a_y', 'b_x', 'b_y'] + assert_iterable_equal(sp.a_x, [1]) + assert_iterable_equal(sp.b_y, [4]) + +def test_can_pivot_from_multiple_measure_cols_using_all_keys(): + df = tibble(var = c("x", "y"), a = f[1:2], b = f[3:4]) + sp = pivot_wider(df, names_from=f.var, values_from=[f.a, f.b]) + assert sp.columns.tolist() == ['a_x', 'a_y', 'b_x', 'b_y'] + assert_iterable_equal(sp.a_x, [1]) + assert_iterable_equal(sp.b_y, [4]) + +# test_that("column order in output matches spec", { +# df <- tribble( +# ~hw, ~name, ~mark, ~pr, +# "hw1", "anna", 95, "ok", +# "hw2", "anna", 70, "meh", +# ) + +# # deliberately create weird order +# sp <- tribble( +# ~hw, ~.value, ~.name, +# "hw1", "mark", "hw1_mark", +# "hw1", "pr", "hw1_pr", +# "hw2", "pr", "hw2_pr", +# "hw2", "mark", "hw2_mark", +# ) + +# pv <- pivot_wider_spec(df, sp) +# expect_named(pv, c("name", sp$.name)) +# }) diff --git a/tests/test_tidyr_replace_na.py b/tests/test_tidyr_replace_na.py new file mode 100644 index 00000000..bf9dcb2d --- /dev/null +++ b/tests/test_tidyr_replace_na.py @@ -0,0 +1,47 @@ +# tests grabbed from: +# https://github.com/tidyverse/tidyr/blob/HEAD/tests/testthat/test-replace_na.R +import pytest +import numpy +from datar.all import * +from .conftest import assert_iterable_equal +from pandas.testing import assert_frame_equal +# vector ------------------------------------------------------------------ + +def test_empty_call_does_nothing(): + x = c(1, NA) + assert_iterable_equal(replace_na(x), x) + + x = numpy.array(x) + 
+    assert_iterable_equal(replace_na(x), x)
+
+def test_missing_values_are_replaced():
+    x = c(1, NA)
+    assert_iterable_equal(replace_na(x, 0), c(1,0))
+    assert_iterable_equal(replace_na([], x, 0), c(1,0))
+
+    x = numpy.array(x)
+    assert_iterable_equal(replace_na(x, 0), c(1,0))
+    assert_iterable_equal(replace_na([], x, 0), c(1,0))
+
+# data frame -------------------------------------------------------------
+
+def test_df_empty_call_does_nothing():
+    df = tibble(x=c(1, NA))
+    out = replace_na(df)
+    assert_frame_equal(out, df)
+
+def test_df_missing_values_are_replaced():
+    df = tibble(x=c(1, NA))
+    out = replace_na(df, {'x': 0})
+    assert_iterable_equal(out.x, c(1,0))
+
+def test_df_no_complain_about_non_existing_vars():
+    df = tibble(a=c(1, NA))
+    out = replace_na(df, {'a': 100, 'b':0})
+    assert_frame_equal(out, tibble(a=c(1,100), _dtypes=float))
+
+def test_df_can_replace_NULLs_in_list_column():
+    df = tibble(x=[[1], NULL])
+    # replace with list not supported yet
+    rs = replace_na(df, {'x': 2})
+    assert_frame_equal(rs, tibble(x=[[1], 2]))
diff --git a/tests/test_tidyr_separate.py b/tests/test_tidyr_separate.py
new file mode 100644
index 00000000..f644ed8c
--- /dev/null
+++ b/tests/test_tidyr_separate.py
@@ -0,0 +1,217 @@
+# tests grabbed from:
+# https://github.com/tidyverse/tidyr/blob/HEAD/tests/testthat/test-separate.R
+from datar.core.grouped import DataFrameGroupBy
+import pytest
+from datar.all import *
+from pandas.testing import assert_frame_equal
+from .conftest import assert_iterable_equal
+
+def test_missing_values_in_input_are_missing_in_output():
+    df = tibble(x=c(NA, "a b"))
+    out = separate(df, f.x, c("x", "y"))
+
+    assert_iterable_equal(out.x, [NA, "a"])
+    assert_iterable_equal(out.y, [NA, "b"])
+
+def test_positive_integer_values_specific_position_between_strings():
+    df = tibble(x = c(NA, "ab", "cd"))
+    out = separate(df, f.x, c("x", "y"), 1)
+    assert_iterable_equal(out.x, c(NA, "a", "c"))
+    assert_iterable_equal(out.y, c(NA, "b", "d"))
+
+def test_negative_integer_values_specific_position_between_strings():
+    df = tibble(x = c(NA, "ab", "cd"))
+    out = separate(df, f.x, c("x", "y"), -1)
+    assert_iterable_equal(out.x, c(NA, "a", "c"))
+    assert_iterable_equal(out.y, c(NA, "b", "d"))
+
+def test_extreme_integer_values_handled_sensibly():
+    df = tibble(x = c(NA, "a", "bc", "def"))
+
+    out = separate(df, f.x, c("x", "y"), 3)
+    assert_iterable_equal(out.x, c(NA, "a", "bc", "def"))
+    assert_iterable_equal(out.y, c(NA, "", "", ""))
+
+    out = separate(df, f.x, c("x", "y"), -3)
+    assert_iterable_equal(out.x, c(NA, "", "", ""))
+    assert_iterable_equal(out.y, c(NA, "a", "bc", "def"))
+
+def test_convert_produces_integers_etc():
+    df = tibble(x = "1-1.5-")
+    out = separate(df, f.x, c("x", "y", "z"), "-", convert = {
+        'x': int,
+        'y': float,
+        'z': bool,
+    })
+    assert_iterable_equal(out.x, [1])
+    assert_iterable_equal(out.y, [1.5])
+    assert_iterable_equal(out.z, [FALSE])
+
+def test_convert_keeps_characters_as_character():
+    df = tibble(x = "X-1")
+    out = separate(df, f.x, c("x", "y"), "-", convert = {
+        'x': str, 'y': int
+    })
+    assert_iterable_equal(out.x, ["X"])
+    assert_iterable_equal(out.y, [1])
+
+def test_too_many_pieces_dealt_with_as_requested(caplog):
+    df = tibble(x = c("a b", "a b c"))
+    separate(df, f.x, c("x", "y"))
+    assert "Additional pieces discarded" in caplog.text
+    caplog.clear()
+
+    merge = separate(df, f.x, c("x", "y"), extra = "merge")
+    assert_iterable_equal(merge.iloc[:, 0], c("a", "a"))
+    assert_iterable_equal(merge.iloc[:, 1], c("b", "b c"))
+
+    drop = separate(df, f.x, c("x", "y"), extra = "drop")
+    assert_iterable_equal(drop.iloc[:, 0], c("a", "a"))
+    assert_iterable_equal(drop.iloc[:, 1], c("b", "b"))
+
+# suppressWarnings(
+# expect_warning(separate(df, x, c("x", "y"), extra = "error"), "deprecated")
+# )
+
+
+def test_too_few_pieces_dealt_with_as_requested(caplog):
+    df = tibble(x = c("a b", "a b c"))
+
+    separate(df, f.x, c("x", "y", "z"))
+    assert "Missing pieces filled" in caplog.text
+    caplog.clear()
+
+    left = separate(df, f.x, c("x", "y", "z"), fill = "left")
+    assert_iterable_equal(left.x, c(NA, "a"))
+    assert_iterable_equal(left.y, c("a", "b"))
+    assert_iterable_equal(left.z, c("b", "c"))
+
+    right = separate(df, f.x, c("x", "y", "z"), fill = "right")
+    assert_iterable_equal(right.z, c(NA, "c"))
+
+
+def test_preserves_grouping():
+    df = tibble(g = 1, x = "a:b") >> group_by(f.g)
+    rs = df >> separate(f.x, c("a", "b"))
+    assert group_vars(df) == group_vars(rs)
+
+
+def test_drops_grouping_when_needed():
+    df = tibble(x = "a:b") >> group_by(f.x)
+    rs = df >> separate(f.x, c("a", "b"))
+    assert_iterable_equal(rs.a, ["a"])
+    assert group_vars(rs) == []
+
+
+def test_overwrites_existing_columns():
+    df = tibble(x = "a:b")
+    rs = df >> separate(f.x, c("x", "y"))
+
+    assert_iterable_equal(rs.columns, c("x", "y"))
+    assert_iterable_equal(rs.x, ["a"])
+
+
+def test_drops_NA_columns():
+    df = tibble(x = c(NA, "ab", "cd"))
+    out = separate(df, f.x, c(NA, "y"), 1)
+    assert_iterable_equal(names(out), "y")
+    assert_iterable_equal(out.y, c(NA, "b", "d"))
+
+
+def test_checks_type_of_into_and_sep():
+    df = tibble(x = "a:b")
+    with pytest.raises(ValueError, match="Index 0 given for 1-based indexing"):
+        # False for sep interpreted as 0
+        separate(df, f.x, "x", FALSE)
+
+    with pytest.raises(ValueError, match="must be a string"):
+        separate(df, f.x, FALSE)
+
+def test_remove_false():
+    df = tibble(x=c("a b"))
+    out = separate(df, f.x, c("x", "y"), remove=False)
+    assert out.columns.tolist() == ['x', 'y']
+    out = separate(df, f.x, c("a", "b"), remove=False)
+    assert out.columns.tolist() == ['x', 'a', 'b']
+
+def test_separate_on_group_vars():
+    df = tibble(x=c("a b")) >> group_by(f.x)
+    out = separate(df, f.x, c("x", "y"), remove=False)
+    assert group_vars(out) == ['x']
+
+    df = tibble(x=c("a b"), y=1) >> group_by(f.x, f.y)
+    out = separate(df, f.x, c("x", "y"), remove=False)
+    assert group_vars(out) == ['x', 'y']
+
+# separate_rows --------------------------------
+
+def test_can_handle_collapsed_rows():
+    df = tibble(x=f[1:3], y=c("a", "d,e,f", "g,h"))
+    out = separate_rows(df, f.y)
+    assert_iterable_equal(out.y, list("adefgh"))
+
+def test_can_handle_empty_dfs():
+    df = tibble(a=[], b=[], _dtypes=str)
+    rs = separate_rows(df, f.b)
+    assert_frame_equal(rs, df)
+
+# test_that("default pattern does not split decimals in nested strings", {
+# df <- dplyr::tibble(x = 1:3, y = c("1", "1.0,1.1", "2.1"))
+# expect_equal(separate_rows(df, y)$y, unlist(strsplit(df$y, ",")))
+# })
+
+def test_separate_rows_preserves_grouping():
+    df = tibble(g=1, x="a:b") >> group_by(f.g)
+    rs = df >> separate_rows(f.x)
+    assert group_vars(df) == group_vars(rs)
+
+def test_separate_rows_drops_grouping_when_needed():
+    df = tibble(x=1, y="a:b") >> group_by(f.x, f.y)
+    out = df >> separate_rows(f.y)
+    assert_iterable_equal(out.y, c("a", "b"))
+    assert group_vars(out) == ['x']
+
+    out = df >> group_by(f.y) >> separate_rows(f.y)
+    assert not isinstance(out, DataFrameGroupBy)
+
+def test_drops_grouping_on_zero_row_dfs_when_needed():
+    df = tibble(x = [], y = []) >> group_by(f.y)
+    out = df >> separate_rows(f.y)
+    assert not isinstance(out, DataFrameGroupBy)
+
+
+
+
+
+# test_that("drops grouping on zero row data frames when needed (#886)", {
+# df <- tibble(x = numeric(), y = character()) %>% dplyr::group_by(y)
+# out <- df %>% separate_rows(y)
+# expect_equal(dplyr::group_vars(out), character())
+# })
+
+# test_that("convert produces integers etc", {
+# df <- tibble(x = "1,2,3", y = "T,F,T", z = "a,b,c")
+
+# out <- separate_rows(df, x, y, z, convert = TRUE)
+# expect_equal(class(out$x), "integer")
+# expect_equal(class(out$y), "logical")
+# expect_equal(class(out$z), "character")
+# })
+
+# test_that("leaves list columns intact (#300)", {
+# df <- tibble(x = "1,2,3", y = list(1))
+
+# out <- separate_rows(df, x)
+# # Can't compare tibbles with list columns directly
+# expect_equal(names(out), c("x", "y"))
+# expect_equal(out$x, as.character(1:3))
+# expect_equal(out$y, rep(list(1), 3))
+# })
+
+# test_that("does not silently drop blank values (#1014)", {
+# df <- tibble(x = 1:3, y = c("a", "d,e,f", ""))
+
+# out <- separate_rows(df, y)
+# expect_equal(out, tibble(x = c(1, 2, 2, 2, 3),
+# y = c("a", "d", "e", "f", "")))
+# })
diff --git a/tests/test_tidyr_uncount.py b/tests/test_tidyr_uncount.py
new file mode 100644
index 00000000..70d60820
--- /dev/null
+++ b/tests/test_tidyr_uncount.py
@@ -0,0 +1,53 @@
+# tests grabbed from:
+# https://github.com/tidyverse/tidyr/blob/HEAD/tests/testthat/test-uncount.R
+import pytest
+from datar.all import *
+from pandas.testing import assert_frame_equal
+
+def test_symbols_weights_are_dropped_in_output():
+    df = tibble(x=1, w=1)
+    assert_frame_equal(uncount(df, f.w), tibble(x=1))
+
+def test_can_request_to_preserve_symbols():
+    df = tibble(x=1, w=1)
+    assert_frame_equal(uncount(df, f.w, _remove=False), df)
+
+def test_unique_identifiers_created_on_request():
+    df = tibble(w=f[1:3])
+    assert_frame_equal(
+        uncount(df, f.w, _id="id"),
+        tibble(id=c(1, f[1:2], f[1:3]))
+    )
+
+def test_expands_constants_and_expressions():
+    df = tibble(x=1, w=2)
+    out = uncount(df, 2)
+    assert_frame_equal(out, df.iloc[[0,0], :].reset_index(drop=True))
+
+    out = uncount(df, 1+1)
+    assert_frame_equal(out, df.iloc[[0,0], :].reset_index(drop=True))
+
+def test_works_with_groups():
+    df = tibble(g=1, x=1, w=1) >> group_by(f.g)
+    out = uncount(df, f.w)
+    exp = df >> select(~f.w)
+    assert_frame_equal(out, exp)
+
+def test_must_evaluate_to_integer():
+    df = tibble(x=1, w=.5)
+
+    out = uncount(df, f.w)
+    assert nrow(out) == 0
+
+    df = tibble(x=1)
+    with pytest.raises(ValueError, match="`weights` must evaluate to numerics"):
+        uncount(df, "W")
+
+def test_works_with_0_weights():
+    df = tibble(x=f[1:2], w=[0,1])
+    assert_frame_equal(uncount(df, f.w), tibble(x=2))
+
+def test_errors_on_negative_weights():
+    df = tibble(x=1, w=-1)
+    with pytest.raises(ValueError, match="must be >= 0"):
+        uncount(df, f.w)
diff --git a/tests/test_tidyr_unite.py b/tests/test_tidyr_unite.py
new file mode 100644
index 00000000..85040697
--- /dev/null
+++ b/tests/test_tidyr_unite.py
@@ -0,0 +1,62 @@
+# tests grabbed from:
+# https://github.com/tidyverse/tidyr/blob/HEAD/tests/testthat/test-unite.R
+import pytest
+from datar.all import *
+import pandas
+from pandas.testing import assert_frame_equal
+from .conftest import assert_iterable_equal
+
+
+def test_unite_pastes_columns_together_and_removes_old_col():
+    df = tibble(x="a", y="b")
+    out = unite(df, 'z', f[f.x:f.y])
+    assert_frame_equal(out, tibble(z="a_b"))
+
+
+def test_unite_does_not_remove_new_col_in_case_of_name_clash():
+    df = tibble(x = "a", y = "b")
+    out = unite(df, 'x', f[f.x:f.y])
+    assert_iterable_equal(names(out), ["x"])
+    assert_iterable_equal(out.x, ["a_b"])
+
+
+def test_unite_preserves_grouping():
+    df = tibble(g = 1, x = "a") >> group_by(f.g)
+    rs = df >> unite('x', f.x)
+    assert_frame_equal(df, rs)
+    assert group_vars(df) == group_vars(rs)
+
+
+def test_drops_grouping_when_needed():
+    df = tibble(g = 1, x = "a") >> group_by(f.g)
+    rs = df >> unite('gx', f.g, f.x)
+    assert_iterable_equal(rs.gx, ["1_a"])
+    assert group_vars(rs) == []
+
+def test_empty_var_spec_uses_all_vars():
+    df = tibble(x = "a", y = "b")
+    assert_iterable_equal(unite(df, "z"), tibble(z = "a_b"))
+
+def test_can_remove_missing_vars_on_request():
+    df = expand_grid(x = c("a", NA), y = c("b", NA))
+    out = unite(df, "z", f[f.x:f.y], na_rm = TRUE)
+
+    assert_iterable_equal(out.z, c("a_b", "a", "b", ""))
+
+
+# test_that("regardless of the type of the NA", {
+# vec_unite <- function(df, vars) {
+# unite(df, "out", any_of(vars), na.rm = TRUE)$out
+# }
+
+# df <- tibble(
+# x = c("x", "y", "z"),
+# lgl = NA,
+# dbl = NA_real_,
+# chr = NA_character_
+# )
+
+# expect_equal(vec_unite(df, c("x", "lgl")), c("x", "y", "z"))
+# expect_equal(vec_unite(df, c("x", "dbl")), c("x", "y", "z"))
+# expect_equal(vec_unite(df, c("x", "chr")), c("x", "y", "z"))
+# })