0.0.7 (#10)

* Add dplyr rows verbs * Add tidyr.chop/unchop * Add tidyr.pack/unpack * Allow mixed numbering for tibble construction * Add tidyr.nest/unnest * Add tidyr.expand/expand_grid * Add tidyr.complete * Move tidyr.drop_na * Move tidyr.extract * Move tidyr.fill * Move and fix tidyr.pivot_longer * Move and fix tidyr.pivot_wider * Move and fix tidyr.separate/separate_rows/uncount * Move and fix tidyr.unite * Move and fix tidyr.replace_na * Update notebook for tidyr.full_seq * 0.0.7 * Update CHANGELOG
pwwang · Jun 8, 2021 · 69c2add · 69c2add
1 parent 15c00d2
commit 69c2add
Show file tree

Hide file tree

Showing 75 changed files with 7,976 additions and 9,576 deletions.
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 Port of [dplyr][2] and other related R packages in python, using [pipda][3].
 
-Unlike other similar packages in python that just mimic the piping sign, `datar` follows the API designs from the original packages as possible. So that nearly no extra effort is needed for those who are familar with those R packages to transition to python.
+Unlike other similar packages in python that just mimic the piping sign, `datar` follows the API designs from the original packages as much as possible, so that minimal effort is needed for those who are familiar with those R packages to transition to python.
 
 <!-- badges -->
 [![Pypi][6]][7] [![Github][8]][9] ![Building][10] [![Docs and API][11]][5] [![Codacy][12]][13] [![Codacy coverage][14]][13]

diff --git a/datar/__init__.py b/datar/__init__.py
@@ -3,4 +3,4 @@
 from .core import operator as _datar_operator
 from .core.defaults import f
 
-__version__ = '0.0.6'
+__version__ = '0.0.7'
diff --git a/datar/base/__init__.py b/datar/base/__init__.py
@@ -10,7 +10,7 @@
 from .options import options, getOption, options_context
 from .verbs import (
  colnames, rownames, dim, nrow, ncol, diag, t, names,
- intersect, union, setdiff, setequal
+ intersect, union, setdiff, setequal, duplicated
 )
 from .funcs import (
  as_date, as_character, as_double, as_factor, as_categorical,

diff --git a/datar/base/constants.py b/datar/base/constants.py
@@ -18,6 +18,6 @@
 LETTERS = numpy.array(list(ascii_letters[26:]))
 
 NA_character_ = f"<NA_{uuid.uuid4()}_>"
-NA_integer_ = numpy.random.randint(numpy.iinfo(numpy.int64).max)
+NA_integer_ = numpy.random.randint(numpy.iinfo(numpy.int32).max)
 NA_real_ = NA
 NA_compex_ = complex(NA, NA)
diff --git a/datar/base/funcs.py b/datar/base/funcs.py
@@ -21,7 +21,7 @@
 from pipda import Context, register_func
 
 from .constants import NA
-from ..core.utils import categorize, get_option, logger
+from ..core.utils import categorized, get_option, logger
 from ..core.middlewares import WithDataEnv
 from ..core.collections import Collection
 from ..core.types import (
@@ -281,7 +281,7 @@ def as_int(x: Any) -> Union[int, Iterable[int]]:
  Otherwise, convert x to int.
  """
  if is_categorical_dtype(x):
- return categorize(x).codes
+ return categorized(x).codes
  return _as_type(x, int)
 
 @register_func(None, context=Context.EVAL)
@@ -297,7 +297,7 @@ def as_integer(x: Any) -> Union[numpy.int64, Iterable[numpy.int64]]:
  Otherwise, convert x to numpy.int64.
  """
  if is_categorical_dtype(x):
- return categorize(x).codes
+ return categorized(x).codes
  return _as_type(x, numpy.int64)
 
 as_int64 = as_integer
@@ -812,7 +812,7 @@ def droplevels(x: Categorical) -> Categorical:
  Returns:
  The categorical data with unused categories dropped.
  """
- return categorize(x).remove_unused_categories()
+ return categorized(x).remove_unused_categories()
 
 @register_func(None, context=Context.EVAL)
 def levels(x: CategoricalLikeType) -> Optional[List[Any]]:
@@ -904,7 +904,7 @@ def lengths(x: Any) -> List[int]:
 # ---------------------------------
 
 def factor(
- x: Iterable[Any],
+ x: Optional[Iterable[Any]] = None,
  # pylint: disable=redefined-outer-name
  levels: Optional[Iterable[Any]] = None,
  exclude: Any = NA,
@@ -925,6 +925,9 @@ def factor(
  ordered: logical flag to determine if the levels should be regarded
  as ordered (in the order given).
  """
+ if x is None:
+ x = []
+
  if is_categorical_dtype(x):
  x = x.to_numpy()
  ret = Categorical(

diff --git a/datar/base/verbs.py b/datar/base/verbs.py
@@ -1,9 +1,11 @@
 """Function from R-base that can be used as verbs"""
 # TODO: add tests
-from typing import Any, Iterable, List, Optional, Tuple, Union
+from typing import (
+ Any, Iterable, List, Mapping, Optional, Tuple, Union
+)
 
 import numpy
-from pandas import DataFrame
+from pandas import DataFrame, Series, Categorical
 from pipda import register_verb
 
 from ..core.types import IntType, is_scalar
@@ -14,7 +16,8 @@
 @register_verb(DataFrame, context=Context.EVAL)
 def colnames(
  df: DataFrame,
- names: Optional[Iterable[str]] = None
+ names: Optional[Iterable[str]] = None,
+ stack: bool = True
 ) -> Union[List[Any], DataFrame]:
  """Get or set the column names of a dataframe
 
@@ -28,10 +31,42 @@ def colnames(
  if the input dataframe is grouped, the structure is kept.
  """
  from ..stats.verbs import setNames
+ if not stack:
+ if names is not None:
+ return setNames(df, names)
+ return df.columns.tolist()
+
  if names is not None:
- return setNames(df, names)
+ namei = 0
+ newnames = []
+ for colname in df.columns:
+ parts = colname.split('$', 1)
+ if not newnames:
+ if len(parts) < 2:
+ newnames.append(names[namei])
+ namei += 1
+ else:
+ newnames.append(f"{names[namei]}${parts[1]}")
+ elif len(parts) < 2:
+ newnames.append(names[namei])
+ namei += 1
+ elif newnames[-1].startswith(f"{parts[0]}$"):
+ newnames.append(f"{names[namei]}${parts[1]}")
+ else:
+ namei += 1
+ newnames.append(f"{names[namei]}${parts[1]}")
+ return setNames(df, newnames)
+
+ cols = [
+ col.split('$', 1)[0] if isinstance(col, str) else col
+ for col in df.columns
+ ]
+ out = []
+ for col in cols:
+ if col not in out:
+ out.append(col)
+ return out
 
- return df.columns.tolist()
 
 @register_verb(DataFrame, context=Context.EVAL)
 def rownames(
@@ -176,6 +211,14 @@ def names(x: DataFrame) -> List[str]:
  """Get the column names of a dataframe"""
  return x.columns.tolist()
 
@names.register(dict)
def _(x: Mapping[str, Any]) -> List[str]:
    """Return the keys of a dict as a list

    A dict plays the role of an R list here, so this mimics
    `names(<list>)` in R.

    Args:
        x: The dict to take the keys from

    Returns:
        The keys of `x`, in iteration order
    """
    return [key for key in x]
+
 @register_verb(context=Context.EVAL)
 def setdiff(x: Any, y: Any) -> List[Any]:
  """Diff of two iterables"""
@@ -216,3 +259,52 @@ def setequal(x: Any, y: Any) -> List[Any]:
  x = sorted(x)
  y = sorted(y)
  return x == y
+
@register_verb((list, tuple, numpy.ndarray, Series, Categorical))
def duplicated( # pylint: disable=invalid-name
    x: Iterable[Any],
    incomparables: Optional[Iterable[Any]] = None,
    fromLast: bool = False
) -> numpy.ndarray:
    """Determine Duplicate Elements

    Args:
        x: The iterable to detect duplicates
            Currently, elements in `x` must be hashable.
        incomparables: Values that are never flagged as duplicates.
            `None` means there are no such values.
        fromLast: Whether start to detect from the last element

    Returns:
        A bool array with the same length as `x`
    """
    if incomparables is None:
        incomparables = []

    # Materialize once so the elements can be walked backwards no matter
    # what kind of iterable was passed in.
    elems = list(x)
    if fromLast:
        elems.reverse()

    seen = set()
    out = []
    out_append = out.append
    for elem in elems:
        if elem in incomparables:
            out_append(False)
        # `elif` guarantees exactly one flag per element, so the result
        # always has the same length as `x`.
        elif elem in seen:
            out_append(True)
        else:
            seen.add(elem)
            out_append(False)

    if fromLast:
        # Put the flags back into the original element order
        out.reverse()
    return numpy.array(out, dtype=bool)
+
@duplicated.register(DataFrame)
def _( # pylint: disable=invalid-name,unused-argument
    x: DataFrame,
    incomparables: Optional[Iterable[Any]] = None,
    fromLast: bool = False
) -> numpy.ndarray:
    """Check if rows in a data frame are duplicated

    Args:
        x: The data frame whose rows are checked
        incomparables: Accepted for signature compatibility only;
            it is not used here.
        fromLast: If true, the last occurrence of a row is the one that
            is kept unflagged (`keep='last'`); otherwise the first one is.

    Returns:
        The values of `x.duplicated(keep=...)`
    """
    if fromLast:
        keep = 'last'
    else:
        keep = 'first'
    flags = x.duplicated(keep=keep)
    return flags.values
diff --git a/datar/core/contexts.py b/datar/core/contexts.py
@@ -20,6 +20,10 @@ def __init__(self):
 
  def getitem(self, parent, ref):
  """Interpret f[ref]"""
+ if isinstance(ref, slice):
+ from .collections import Collection
+ return Collection(ref)
+
  self.used_refs[ref] += 1
  if isinstance(parent, DataFrame) and ref not in parent:
  cols = [col for col in parent.columns if col.startswith(f'{ref}$')]

diff --git a/datar/core/middlewares.py b/datar/core/middlewares.py
@@ -2,8 +2,6 @@
 from typing import Any, Mapping, Tuple
 from pipda.utils import DataEnv
 
-from .utils import logger
-
 class CurColumn:
  """Current column in across"""
  @classmethod
@@ -34,33 +32,3 @@ def __enter__(self) -> Any:
 
  def __exit__(self, *exc_info) -> None:
  self.data.delete()
-
-class Nesting:
- """Nesting objects for calls from tidyr.nesting"""
- def __init__(self, *columns: Any, **kwargs: Any) -> None:
- self.columns = []
- self.names = []
-
- id_prefix = hex(id(self))[2:6]
- for i, column in enumerate(columns):
- self.columns.append(column)
- if isinstance(column, str):
- self.names.append(column)
- continue
- try:
- # series
- name = column.name
- except AttributeError:
- name = f'_tmp{id_prefix}_{i}'
- logger.warning(
- 'Temporary name used for a nesting column, use '
- 'keyword argument instead to specify the key as name.'
- )
- self.names.append(name)
-
- for key, val in kwargs.items():
- self.columns.append(val)
- self.names.append(key)
-
- def __len__(self):
- return len(self.columns)
diff --git a/datar/core/types.py b/datar/core/types.py
@@ -7,7 +7,6 @@
 from pandas.core.frame import DataFrame
 from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy
 from pandas.core.series import Series
-from pipda.function import Function
 
 # used for type annotations
 NumericType = Union[int, float, complex, numpy.number]
@@ -23,6 +22,7 @@
 BoolOrIter = Union[bool, Iterable[bool]]
 FloatOrIter = Union[FloatType, Iterable[FloatType]]
 NumericOrIter = Union[NumericType, Iterable[NumericType]]
+DTypeType = Union[str, type, numpy.dtype]
 
 NoneType = type(None)
 # used for type checks

diff --git a/datar/core/utils.py b/datar/core/utils.py
@@ -14,7 +14,7 @@
 from varname import argname
 
 from .exceptions import ColumnNotExistingError, NameNonUniqueError
-from .types import is_scalar
+from .types import is_scalar, DTypeType
 from .defaults import DEFAULT_COLUMN_PREFIX
 
 # logger
@@ -33,6 +33,7 @@ def vars_select(
  raise_nonexists: bool = True,
  base0: Optional[bool] = None
 ) -> List[int]:
+ # TODO: support selecting data-frame columns
  """Select columns
 
  Args:
@@ -156,7 +157,7 @@ def df_assign_item(
  else:
  df.insert(df.shape[1], item, value, allow_duplicates=True)
 
-def categorize(data: Any) -> Any:
+def categorized(data: Any) -> Any:
  """Get the Categorical object"""
  if not is_categorical_dtype(data):
  return data
@@ -381,3 +382,78 @@ def get_option(key: str, value: Any = None) -> Any:
  return value
  from ..base import getOption
  return getOption(key)
+
def apply_dtypes(
    df: DataFrame,
    dtypes: Optional[Union[bool, DTypeType, Mapping[str, DTypeType]]]
) -> None:
    """Apply dtypes to data frame, in place

    Args:
        df: The data frame whose columns are converted
        dtypes: The dtypes to apply
            - `None` or `False`: `df` is left untouched
            - `True`: each column is replaced by the corresponding column
              of `df.convert_dtypes()`
            - a dict: maps a column name to a dtype. A key that is not
              a column of `df` is applied to every column whose name
              starts with `<key>$`.
            - anything else: used as the dtype for all columns
    """
    if dtypes is None or dtypes is False:
        return

    if dtypes is True:
        inferred = df.convert_dtypes()
        for col in df:
            df[col] = inferred[col]
        return

    if not isinstance(dtypes, dict):
        # A single dtype: apply it to every column
        dtypes = dict.fromkeys(df.columns, dtypes)

    for column, dtype in dtypes.items():
        if column in df:
            df[column] = df[column].astype(dtype)
            continue

        prefix = f"{column}$"
        for col in df:
            # Column labels are not necessarily strings; only string
            # labels can carry the `<name>$` prefix.
            if isinstance(col, str) and col.startswith(prefix):
                df[col] = df[col].astype(dtype)
+
def keep_column_order(df: DataFrame, order: Iterable[str]):
    """Keep the order of columns as given `order`

    We cannot do `df[order]` directly, since `df` may have nested df columns.

    Args:
        df: The data frame to reorder
        order: The column names in the desired order. A name that is not
            a column of `df` selects, in their existing order, all columns
            whose name starts with `<name>$`.

    Returns:
        `df` subset to the selected columns, in the resulting order

    Raises:
        ValueError: When the selected columns are not exactly the set of
            columns of `df`
    """
    out_columns = []
    for col in order:
        if col in df:
            out_columns.append(col)
            continue

        prefix = f"{col}$"
        # Column labels are not necessarily strings; only string labels
        # can carry the `<name>$` prefix.
        out_columns.extend(
            dfcol for dfcol in df.columns
            if isinstance(dfcol, str) and dfcol.startswith(prefix)
        )

    if set(out_columns) != set(df.columns):
        raise ValueError("Given `order` does not select all columns.")

    return df[out_columns]
+
def reconstruct_tibble(
    input: DataFrame, # pylint: disable=redefined-builtin
    output: DataFrame,
    ungrouped_vars: Optional[List[str]] = None,
    keep_rowwise: bool = False
) -> DataFrame:
    """Reconstruct the output dataframe based on input

    The grouping variables of `input`, minus `ungrouped_vars` and
    restricted to those present in `output.columns`, are carried over to
    the result.

    Args:
        input: The data frame the grouping structure is taken from
        output: The data frame to wrap
        ungrouped_vars: Grouping variables to drop; `None` means none
        keep_rowwise: Whether a rowwise `input` yields a rowwise result.
            If false, `output` is returned as is for a rowwise `input`.

    Returns:
        A `DataFrameRowwise` or `DataFrameGroupBy` built from `output`
        when `input` is of that type (rowwise is checked first, and only
        kept if `keep_rowwise` is true), otherwise `output` itself.
    """
    from ..base import setdiff, intersect
    from ..dplyr import group_vars, group_by_drop_default
    from .grouped import DataFrameGroupBy, DataFrameRowwise

    excluded = [] if ungrouped_vars is None else ungrouped_vars
    remaining = setdiff(group_vars(input), excluded)
    new_groups = intersect(remaining, output.columns)

    if isinstance(input, DataFrameRowwise):
        if not keep_rowwise:
            return output
        return DataFrameRowwise(
            output,
            _group_vars=new_groups,
            _drop=group_by_drop_default(input)
        )

    if not isinstance(input, DataFrameGroupBy):
        return output

    return DataFrameGroupBy(
        output,
        _group_vars=new_groups,
        _drop=group_by_drop_default(input)
    )