Skip to content

Commit

Permalink
0.0.7 (#10)
Browse files Browse the repository at this point in the history
* Add dplyr rows verbs

* Add tidyr.chop/unchop

* Add tidyr.pack/unpack

* Allow mixed numbering for tibble construction

* Add tidyr.nest/unnest

* Add tidyr.expand/expand_grid

* Add tidyr.complete

* Move tidyr.drop_na

* Move tidyr.extract

* Move tidyr.fill

* Move and fix tidyr.pivot_longer

* Move and fix tidyr.pivot_wider

* Move and fix tidyr.separate/separate_rows/uncount

* Move and fix tidyr.unite

* Move and fix tidyr.replace_na

* Update notebook for tidyr.full_seq

* 0.0.7

* Update CHANGELOG
  • Loading branch information
pwwang authored Jun 8, 2021
1 parent 15c00d2 commit 69c2add
Show file tree
Hide file tree
Showing 75 changed files with 7,976 additions and 9,576 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

Port of [dplyr][2] and other related R packages in python, using [pipda][3].

Unlike other similar packages in python that just mimic the piping sign, `datar` follows the API designs from the original packages as possible. So that nearly no extra effort is needed for those who are familar with those R packages to transition to python.
Unlike other similar packages in python that just mimic the piping sign, `datar` follows the API designs from the original packages as much as possible, so that minimal effort is needed for those who are familiar with those R packages to transition to python.

<!-- badges -->
[![Pypi][6]][7] [![Github][8]][9] ![Building][10] [![Docs and API][11]][5] [![Codacy][12]][13] [![Codacy coverage][14]][13]
Expand Down
2 changes: 1 addition & 1 deletion datar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
from .core import operator as _datar_operator
from .core.defaults import f

__version__ = '0.0.6'
__version__ = '0.0.7'
2 changes: 1 addition & 1 deletion datar/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from .options import options, getOption, options_context
from .verbs import (
colnames, rownames, dim, nrow, ncol, diag, t, names,
intersect, union, setdiff, setequal
intersect, union, setdiff, setequal, duplicated
)
from .funcs import (
as_date, as_character, as_double, as_factor, as_categorical,
Expand Down
2 changes: 1 addition & 1 deletion datar/base/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@
LETTERS = numpy.array(list(ascii_letters[26:]))

NA_character_ = f"<NA_{uuid.uuid4()}_>"
NA_integer_ = numpy.random.randint(numpy.iinfo(numpy.int64).max)
NA_integer_ = numpy.random.randint(numpy.iinfo(numpy.int32).max)
NA_real_ = NA
NA_compex_ = complex(NA, NA)
13 changes: 8 additions & 5 deletions datar/base/funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from pipda import Context, register_func

from .constants import NA
from ..core.utils import categorize, get_option, logger
from ..core.utils import categorized, get_option, logger
from ..core.middlewares import WithDataEnv
from ..core.collections import Collection
from ..core.types import (
Expand Down Expand Up @@ -281,7 +281,7 @@ def as_int(x: Any) -> Union[int, Iterable[int]]:
Otherwise, convert x to int.
"""
if is_categorical_dtype(x):
return categorize(x).codes
return categorized(x).codes
return _as_type(x, int)

@register_func(None, context=Context.EVAL)
Expand All @@ -297,7 +297,7 @@ def as_integer(x: Any) -> Union[numpy.int64, Iterable[numpy.int64]]:
Otherwise, convert x to numpy.int64.
"""
if is_categorical_dtype(x):
return categorize(x).codes
return categorized(x).codes
return _as_type(x, numpy.int64)

as_int64 = as_integer
Expand Down Expand Up @@ -812,7 +812,7 @@ def droplevels(x: Categorical) -> Categorical:
Returns:
The categorical data with unused categories dropped.
"""
return categorize(x).remove_unused_categories()
return categorized(x).remove_unused_categories()

@register_func(None, context=Context.EVAL)
def levels(x: CategoricalLikeType) -> Optional[List[Any]]:
Expand Down Expand Up @@ -904,7 +904,7 @@ def lengths(x: Any) -> List[int]:
# ---------------------------------

def factor(
x: Iterable[Any],
x: Optional[Iterable[Any]] = None,
# pylint: disable=redefined-outer-name
levels: Optional[Iterable[Any]] = None,
exclude: Any = NA,
Expand All @@ -925,6 +925,9 @@ def factor(
ordered: logical flag to determine if the levels should be regarded
as ordered (in the order given).
"""
if x is None:
x = []

if is_categorical_dtype(x):
x = x.to_numpy()
ret = Categorical(
Expand Down
102 changes: 97 additions & 5 deletions datar/base/verbs.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""Function from R-base that can be used as verbs"""
# TODO: add tests
from typing import Any, Iterable, List, Optional, Tuple, Union
from typing import (
Any, Iterable, List, Mapping, Optional, Tuple, Union
)

import numpy
from pandas import DataFrame
from pandas import DataFrame, Series, Categorical
from pipda import register_verb

from ..core.types import IntType, is_scalar
Expand All @@ -14,7 +16,8 @@
@register_verb(DataFrame, context=Context.EVAL)
def colnames(
df: DataFrame,
names: Optional[Iterable[str]] = None
names: Optional[Iterable[str]] = None,
stack: bool = True
) -> Union[List[Any], DataFrame]:
"""Get or set the column names of a dataframe
Expand All @@ -28,10 +31,42 @@ def colnames(
if the input dataframe is grouped, the structure is kept.
"""
from ..stats.verbs import setNames
if not stack:
if names is not None:
return setNames(df, names)
return df.columns.tolist()

if names is not None:
return setNames(df, names)
namei = 0
newnames = []
for colname in df.columns:
parts = colname.split('$', 1)
if not newnames:
if len(parts) < 2:
newnames.append(names[namei])
namei += 1
else:
newnames.append(f"{names[namei]}${parts[1]}")
elif len(parts) < 2:
newnames.append(names[namei])
namei += 1
elif newnames[-1].startswith(f"{parts[0]}$"):
newnames.append(f"{names[namei]}${parts[1]}")
else:
namei += 1
newnames.append(f"{names[namei]}${parts[1]}")
return setNames(df, newnames)

cols = [
col.split('$', 1)[0] if isinstance(col, str) else col
for col in df.columns
]
out = []
for col in cols:
if col not in out:
out.append(col)
return out

return df.columns.tolist()

@register_verb(DataFrame, context=Context.EVAL)
def rownames(
Expand Down Expand Up @@ -176,6 +211,14 @@ def names(x: DataFrame) -> List[str]:
"""Get the column names of a dataframe"""
return x.columns.tolist()

@names.register(dict)
def _(x: Mapping[str, Any]) -> List[str]:
    """Get the keys of a dict

    A dict plays the role of an R list here, so this mimics
    `names(<list>)` in R.

    Args:
        x: The dict to get the keys from

    Returns:
        The keys of `x` as a list, in iteration order
    """
    return [*x]

@register_verb(context=Context.EVAL)
def setdiff(x: Any, y: Any) -> List[Any]:
"""Diff of two iterables"""
Expand Down Expand Up @@ -216,3 +259,52 @@ def setequal(x: Any, y: Any) -> List[Any]:
x = sorted(x)
y = sorted(y)
return x == y

@register_verb((list, tuple, numpy.ndarray, Series, Categorical))
def duplicated( # pylint: disable=invalid-name
        x: Iterable[Any],
        incomparables: Optional[Iterable[Any]] = None,
        fromLast: bool = False
) -> numpy.ndarray:
    """Determine Duplicate Elements

    An element is flagged as duplicated when an equal element has already
    been seen while scanning `x` (from the start, or from the end when
    `fromLast` is true).

    Args:
        x: The iterable to detect duplicates
            Currently, elements in `x` must be hashable.
        incomparables: Values that cannot be compared. Elements found in
            `incomparables` are never flagged as duplicated.
        fromLast: Whether start to detect from the last element

    Returns:
        A bool array with the same length as `x`
    """
    if incomparables is None:
        incomparables = []

    # Materialize first so that any iterable can be scanned backwards;
    # `reversed()` only accepts sequences / objects defining `__reversed__`.
    elems = list(x)
    if fromLast:
        elems.reverse()

    seen = set()
    out = []
    for elem in elems:
        if elem in incomparables:
            # Exactly one flag per element: an incomparable element must not
            # fall through to the duplicate check below, otherwise the
            # output would get longer than `x`.
            out.append(False)
        elif elem in seen:
            out.append(True)
        else:
            seen.add(elem)
            out.append(False)

    if fromLast:
        # Restore the original element order of the flags
        out.reverse()
    return numpy.array(out, dtype=bool)

@duplicated.register(DataFrame)
def _( # pylint: disable=invalid-name,unused-argument
        x: DataFrame,
        incomparables: Optional[Iterable[Any]] = None,
        fromLast: bool = False
) -> numpy.ndarray:
    """Check if rows in a data frame are duplicated

    Args:
        x: The data frame whose rows are checked
        incomparables: Not working here; accepted but ignored
        fromLast: Whether the last occurrence (instead of the first one)
            of a set of equal rows is the one left unflagged

    Returns:
        The values of `x.duplicated(...)`, one flag per row
    """
    keep = 'last' if fromLast else 'first'
    flags = x.duplicated(keep=keep)
    return flags.values
4 changes: 4 additions & 0 deletions datar/core/contexts.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ def __init__(self):

def getitem(self, parent, ref):
"""Interpret f[ref]"""
if isinstance(ref, slice):
from .collections import Collection
return Collection(ref)

self.used_refs[ref] += 1
if isinstance(parent, DataFrame) and ref not in parent:
cols = [col for col in parent.columns if col.startswith(f'{ref}$')]
Expand Down
32 changes: 0 additions & 32 deletions datar/core/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
from typing import Any, Mapping, Tuple
from pipda.utils import DataEnv

from .utils import logger

class CurColumn:
"""Current column in across"""
@classmethod
Expand Down Expand Up @@ -34,33 +32,3 @@ def __enter__(self) -> Any:

def __exit__(self, *exc_info) -> None:
self.data.delete()

class Nesting:
    """Nesting objects for calls from tidyr.nesting

    Collects the given columns together with a name for each of them:
    positional columns come first (in call order), followed by the
    keyword columns, which are named by their keywords.
    """
    def __init__(self, *columns: Any, **kwargs: Any) -> None:
        # `columns` and `names` are parallel lists
        self.columns = [*columns, *kwargs.values()]

        id_prefix = hex(id(self))[2:6]
        self.names = [
            self._name_of(column, i, id_prefix)
            for i, column in enumerate(columns)
        ]
        self.names.extend(kwargs)

    @staticmethod
    def _name_of(column: Any, index: int, id_prefix: str) -> Any:
        """Work out the name of a positional column"""
        # A string names itself
        if isinstance(column, str):
            return column
        try:
            # series
            return column.name
        except AttributeError:
            # Nothing to take the name from, make one up and tell the user
            logger.warning(
                'Temporary name used for a nesting column, use '
                'keyword argument instead to specify the key as name.'
            )
            return f'_tmp{id_prefix}_{index}'

    def __len__(self):
        return len(self.columns)
2 changes: 1 addition & 1 deletion datar/core/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from pandas.core.frame import DataFrame
from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy
from pandas.core.series import Series
from pipda.function import Function

# used for type annotations
NumericType = Union[int, float, complex, numpy.number]
Expand All @@ -23,6 +22,7 @@
BoolOrIter = Union[bool, Iterable[bool]]
FloatOrIter = Union[FloatType, Iterable[FloatType]]
NumericOrIter = Union[NumericType, Iterable[NumericType]]
DTypeType = Union[str, type, numpy.dtype]

NoneType = type(None)
# used for type checks
Expand Down
80 changes: 78 additions & 2 deletions datar/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from varname import argname

from .exceptions import ColumnNotExistingError, NameNonUniqueError
from .types import is_scalar
from .types import is_scalar, DTypeType
from .defaults import DEFAULT_COLUMN_PREFIX

# logger
Expand All @@ -33,6 +33,7 @@ def vars_select(
raise_nonexists: bool = True,
base0: Optional[bool] = None
) -> List[int]:
# TODO: support selecting data-frame columns
"""Select columns
Args:
Expand Down Expand Up @@ -156,7 +157,7 @@ def df_assign_item(
else:
df.insert(df.shape[1], item, value, allow_duplicates=True)

def categorize(data: Any) -> Any:
def categorized(data: Any) -> Any:
"""Get the Categorical object"""
if not is_categorical_dtype(data):
return data
Expand Down Expand Up @@ -381,3 +382,78 @@ def get_option(key: str, value: Any = None) -> Any:
return value
from ..base import getOption
return getOption(key)

def apply_dtypes(
        df: DataFrame,
        dtypes: Optional[Union[bool, DTypeType, Mapping[str, DTypeType]]]
) -> None:
    """Apply dtypes to data frame

    The columns of `df` are converted in place; nothing is returned.

    Args:
        df: The data frame whose columns are to be converted
        dtypes: How to convert the columns
            - `None` or `False`: leave `df` untouched
            - `True`: use the dtypes inferred by `df.convert_dtypes()`
            - a single dtype: apply it to every column
            - a mapping of column name to dtype: apply the dtype to the
              named column. A name that is not a column of `df` is used as
              a prefix, and the dtype is applied to all columns named
              `<name>$<something>`.
    """
    if dtypes is None or dtypes is False:
        return

    if dtypes is True:
        inferred = df.convert_dtypes()
        for col in df:
            df[col] = inferred[col]
        return

    # Accept any mapping, as annotated, not only plain dicts; anything else
    # is a single dtype to broadcast to all columns.
    if not isinstance(dtypes, Mapping):
        dtypes = dict.fromkeys(df.columns, dtypes)

    for column, dtype in dtypes.items():
        if column in df:
            df[column] = df[column].astype(dtype)
            continue

        prefix = f"{column}$"
        for col in df:
            # Column labels are not necessarily strings (e.g. integers);
            # only string labels can carry the `<name>$` prefix.
            if isinstance(col, str) and col.startswith(prefix):
                df[col] = df[col].astype(dtype)

def keep_column_order(df: DataFrame, order: Iterable[str]) -> DataFrame:
    """Keep the order of columns as given `order`

    We cannot do `df[order]` directly, since `df` may have nested df columns.

    Args:
        df: The data frame to reorder the columns for
        order: The column names in the desired order. A name that is not a
            column of `df` is used as a prefix and stands for all columns
            named `<name>$<something>`, in their order in `df`.

    Returns:
        `df` with the columns selected in the given order

    Raises:
        ValueError: When `order` does not cover all columns of `df`
    """
    out_columns = []
    for col in order:
        if col in df:
            out_columns.append(col)
            continue

        prefix = f"{col}$"
        out_columns.extend(
            dfcol for dfcol in df.columns
            # Column labels are not necessarily strings (e.g. integers);
            # only string labels can carry the `<name>$` prefix.
            if isinstance(dfcol, str) and dfcol.startswith(prefix)
        )

    if set(out_columns) != set(df.columns):
        raise ValueError("Given `order` does not select all columns.")

    return df[out_columns]

def reconstruct_tibble(
        input: DataFrame, # pylint: disable=redefined-builtin
        output: DataFrame,
        ungrouped_vars: Optional[List[str]] = None,
        keep_rowwise: bool = False
) -> DataFrame:
    """Reconstruct the output dataframe based on input

    The grouping of `input` is carried over to `output`, restricted to the
    grouping variables that are not listed in `ungrouped_vars` and that are
    columns of `output`.

    Args:
        input: The data frame to take the grouping structure from
        output: The data frame to apply the structure to
        ungrouped_vars: Grouping variables of `input` to be left out
        keep_rowwise: Whether a rowwise `input` yields a rowwise result;
            if not, `output` is returned as is for a rowwise `input`

    Returns:
        `output`, wrapped the same way as `input` (rowwise or grouped)
        where applicable, otherwise `output` itself
    """
    from ..base import setdiff, intersect
    from ..dplyr import group_vars, group_by_drop_default
    from .grouped import DataFrameGroupBy, DataFrameRowwise

    excluded = [] if ungrouped_vars is None else ungrouped_vars
    remaining = setdiff(group_vars(input), excluded)
    new_groups = intersect(remaining, output.columns)

    # Rowwise is tested before grouped, so a rowwise input never reaches
    # the grouped branch (presumably DataFrameRowwise derives from
    # DataFrameGroupBy -- verify in .grouped).
    if isinstance(input, DataFrameRowwise):
        if not keep_rowwise:
            return output
        return DataFrameRowwise(
            output,
            _group_vars=new_groups,
            _drop=group_by_drop_default(input)
        )

    if not isinstance(input, DataFrameGroupBy):
        return output

    return DataFrameGroupBy(
        output,
        _group_vars=new_groups,
        _drop=group_by_drop_default(input)
    )
Loading

0 comments on commit 69c2add

Please sign in to comment.