0.2.3 (#21)

* Update logo * Add favicon for docs * Fix compatibility with pandas v1.2.0~4 * Fix ci configuration * Fix ci configuration * Fix linting * Fix linting * Remove pandas 1.2.1~4 from CI * Fix base.table when inputs are factors and exclude is NA; Add base.scale/col_sums/row_sums/col_means/row_means/col_sds/row_sds/col_medians/row_medians * Fix linting
pwwang · Jun 29, 2021 · 72ab957 · 72ab957
1 parent 50daefb
commit 72ab957
Show file tree

Hide file tree

Showing 28 changed files with 449 additions and 66 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -7,8 +7,17 @@ jobs:
   build:
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         python-version: [3.7, 3.8, 3.9]
+        pandas: [
+          pandas==1.2.0,
+          # pandas==1.2.1,
+          # pandas==1.2.2,
+          # pandas==1.2.3,
+          # pandas==1.2.4,
+          pandas # latest 1.2.5
+        ]
 
     steps:
       - uses: actions/checkout@v2
@@ -19,15 +28,20 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install pylint
+          python -m pip install pylint=='2.8.*'
           python -m pip install poetry
+          poetry config virtualenvs.create false
           poetry install -v
-          poetry run pip install scipy
-          poetry run pip install wcwidth
-      # - name: Run pylint
-      #   run: pylint datar
+          pip install wcwidth
+          pip install scipy
+          # reinstall pandas to specific version
+          pip install $PANDAS
+        env:
+          PANDAS: ${{ matrix.pandas }}
+      - name: Run pylint
+        run: pylint datar
       - name: Test with pytest
-        run: poetry run pytest tests/ --junitxml=junit/test-results-${{ matrix.python-version }}.xml
+        run: pytest tests/ --junitxml=junit/test-results-${{ matrix.python-version }}.xml
       - name: Upload pytest test results
         uses: actions/upload-artifact@v2
         with:
@@ -45,7 +59,7 @@ jobs:
   deploy:
     needs: build
     runs-on: ubuntu-latest
-    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+    if: github.event_name == 'release'
     strategy:
       matrix:
         python-version: [3.8]

diff --git a/.pylintrc b/.pylintrc
@@ -158,7 +158,8 @@ disable=print-statement,
         not-callable,
         unsubscriptable-object,
         unused-arguments,
-        fixme
+        fixme,
+        consider-using-dict-items
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option

diff --git a/datar/__init__.py b/datar/__init__.py
@@ -4,4 +4,4 @@
 from .core import frame_format_patch as _
 from .core.defaults import f
 
-__version__ = '0.2.2'
+__version__ = '0.2.3'
diff --git a/datar/base/__init__.py b/datar/base/__init__.py
@@ -11,6 +11,15 @@
     pmin,
     sqrt,
     var,
+    scale,
+    col_sums,
+    row_sums,
+    col_means,
+    row_means,
+    col_sds,
+    row_sds,
+    col_medians,
+    row_medians,
     min as min_,
     max as max_,
     round as round_,

diff --git a/datar/base/arithmetic.py b/datar/base/arithmetic.py
@@ -1,19 +1,17 @@
 """Arithmetic or math functions"""
 
-from multiprocessing.dummy import Array
-from typing import Any, Callable, Iterable, Optional
+from typing import Any, Callable, Iterable, Optional, Union
 
 import numpy
 from pandas import DataFrame, Series
 from pipda import register_func, register_verb
 
 from ..core.contexts import Context
-from ..core.types import NumericOrIter, NumericType, is_not_null
+from ..core.types import NumericOrIter, NumericType, is_not_null, is_scalar
 from ..core.utils import Array, register_numpy_func_x, recycle_value, length_of
 from ..core.collections import Collection
 
-# TODO: docstring
-# weighted_mean, sd, cor?, range, quantile, summary, iqr
+# cor?, range, summary, iqr
 
 def _register_arithmetic_agg(
         name: str,
@@ -22,11 +20,15 @@ def _register_arithmetic_agg(
 ) -> Callable:
     """Register an arithmetic function"""
     @register_func(None, context=Context.EVAL)
-    def _arithmetric(x: Iterable, na_rm: bool = False):
+    def _arithmetric(x: Iterable, na_rm: bool = False) -> Iterable:
         """Arithmetric function"""
-        if na_rm:
-            x = Array(x)[is_not_null(x)]
-        return getattr(numpy, np_name)(x)
+        # na_rm does not work with numpy functions
+        # when x is a Series object
+        if isinstance(x, Series):
+            return getattr(x, np_name)(skipna=na_rm)
+
+        fun_name = f"nan{np_name}" if na_rm else np_name
+        return getattr(numpy, fun_name)(x)
 
     _arithmetric.__name__ = name
     _arithmetric.__doc__ = doc
@@ -232,3 +234,260 @@ def _(x: Iterable, y: Iterable, ddof: int = 1) -> DataFrame:
     """Compute covariance for two iterables"""
     # ddof: numpy v1.5+
     return numpy.cov(x, y, ddof=ddof)[0][1]
+
+@register_verb(DataFrame, context=Context.EVAL)
+def scale(
+        x: DataFrame,
+        center: Union[bool, Iterable[NumericType]] = True,
+        # pylint: disable=redefined-outer-name
+        scale: Union[bool, Iterable[NumericType]] = True
+) -> DataFrame:
+    """Scaling and Centering of a numeric data frame
+
+    See Details in `?scale` in `R`
+
+    Args:
+        x: The numeric data frame to scale
+        center: either a logical value or numeric-alike vector of length
+            equal to the number of columns of `x`. `True` subtracts the
+            column means (`col_means`), `False` skips centering; a scalar
+            is treated as a vector of length 1.
+        scale: either a logical value or a numeric-alike vector of length
+            equal to the number of columns of `x`. `True` divides by the
+            column standard deviations (`col_sds`) when `center` is `True`,
+            otherwise by the root mean square of the non-missing values of
+            each column (with an `n - 1` denominator); `False` skips
+            scaling; a scalar is treated as a vector of length 1.
+
+    Returns:
+        The centered, scaled data frame. The values that were applied are
+        stored in its `attrs` under `scaled:center` and `scaled:scale`.
+
+    Raises:
+        ValueError: If `center` or `scale` is given as values whose length
+            differs from the number of columns of `x`.
+    """
+    ncols = x.shape[1]
+    # remember what was requested: `center` is rebound to the actual
+    # values below, but the scaling step needs to know it was `True`
+    center_is_true = center is True
+    out_attrs = {}
+
+    # center
+    if center_is_true:
+        center = col_means(x)
+
+    elif center is not False:
+        if is_scalar(center):
+            center = [center]
+        if len(center) != ncols:
+            raise ValueError(
+                f"length of `center` ({len(center)}) must equal "
+                f"the number of columns of `x` ({ncols})"
+            )
+
+    if center is not False:
+        x = x - center
+        out_attrs['scaled:center'] = Array(center)
+
+    # scale
+    if scale is True:
+        def _rms(col: Series) -> Series:
+            """Root mean square of the non-missing values of a column"""
+            nonnas = col[is_not_null(col)] ** 2
+            return sqrt(nonnas.sum() / (len(nonnas) - 1))
+
+        # `x` has already been centered at this point (if requested)
+        scale = col_sds(x) if center_is_true else x.agg(_rms)
+
+    elif scale is not False:
+        if is_scalar(scale):
+            scale = [scale]
+        if len(scale) != ncols:
+            # report the length of `scale` itself; `center` may be `False`
+            # here, which has no length at all
+            raise ValueError(
+                f"length of `scale` ({len(scale)}) must equal "
+                f"the number of columns of `x` ({ncols})"
+            )
+
+    if scale is not False:
+        x = x / scale
+        out_attrs['scaled:scale'] = Array(scale)
+
+    if center is False and scale is False:
+        # no arithmetic was applied, so `x` is still the object passed in;
+        # copy it so that the caller's frame is not the one returned
+        x = x.copy()
+
+    x.attrs.update(out_attrs)
+    return x
+
+# Module-level alias of the verb, so that it can still be called inside
+# functions where a `scale` argument shadows the verb's name
+_scale = scale
+
+@scale.register(Series)
+def _(
+        x: Series,
+        center: Union[bool, Iterable[NumericType]] = True,
+        # pylint: disable=redefined-outer-name
+        scale: Union[bool, Iterable[NumericType]] = True
+) -> DataFrame:
+    """Scale a series by handling it as a one-column data frame"""
+    as_frame = x.to_frame()
+    return _scale(as_frame, center, scale)
+
+@scale.register((numpy.ndarray, list, tuple))
+def _(
+        x: Iterable,
+        center: Union[bool, Iterable[NumericType]] = True,
+        # pylint: disable=redefined-outer-name
+        scale: Union[bool, Iterable[NumericType]] = True
+) -> DataFrame:
+    """Scale a plain iterable, wrapped into a series named `scaled`"""
+    wrapped = Series(x, name='scaled')
+    return _scale(wrapped, center, scale)
+
+@register_verb(DataFrame)
+def col_sums(
+        x: DataFrame,
+        na_rm: bool = False,
+        # dims: int = 1,
+        # weights = None,
+        # freq = None,
+        # n = None
+) -> Iterable[NumericType]:
+    """Sum up every column of a data frame
+
+    Args:
+        x: The data frame
+        na_rm: Missing-value handling flag, passed on to `sum`.
+
+    Returns:
+        The sums by column.
+    """
+    by_column = x.aggregate(sum, axis=0, na_rm=na_rm)
+    return by_column
+
+@register_verb(DataFrame)
+def row_sums(
+        x: DataFrame,
+        na_rm: bool = False,
+        # dims: int = 1,
+        # weights = None,
+        # freq = None,
+        # n = None
+) -> Iterable[NumericType]:
+    """Sum up every row of a data frame
+
+    Args:
+        x: The data frame
+        na_rm: Missing-value handling flag, passed on to `sum`.
+
+    Returns:
+        The sums by row.
+    """
+    by_row = x.aggregate(sum, axis=1, na_rm=na_rm)
+    return by_row
+
+
+@register_verb(DataFrame)
+def col_means(
+        x: DataFrame,
+        na_rm: bool = False,
+        # dims: int = 1,
+        # weights = None,
+        # freq = None,
+        # n = None
+) -> Iterable[NumericType]:
+    """Average every column of a data frame
+
+    Args:
+        x: The data frame
+        na_rm: Missing-value handling flag, passed on to `mean`.
+
+    Returns:
+        The means by column.
+    """
+    by_column = x.aggregate(mean, axis=0, na_rm=na_rm)
+    return by_column
+
+@register_verb(DataFrame)
+def row_means(
+        x: DataFrame,
+        na_rm: bool = False,
+        # dims: int = 1,
+        # weights = None,
+        # freq = None,
+        # n = None
+) -> Iterable[NumericType]:
+    """Average every row of a data frame
+
+    Args:
+        x: The data frame
+        na_rm: Missing-value handling flag, passed on to `mean`.
+
+    Returns:
+        The means by row.
+    """
+    by_row = x.aggregate(mean, axis=1, na_rm=na_rm)
+    return by_row
+
+@register_verb(DataFrame)
+def col_sds(
+        x: DataFrame,
+        na_rm: bool = False,
+        # dims: int = 1,
+        # weights = None,
+        # freq = None,
+        # n = None
+) -> Iterable[NumericType]:
+    """Compute the standard deviation of every column of a data frame
+
+    Args:
+        x: The data frame
+        na_rm: Missing-value handling flag, passed on to `sd`.
+
+    Returns:
+        The stdevs by column.
+    """
+    # `sd` is imported at call time rather than at module level
+    from ..stats import sd
+    by_column = x.aggregate(sd, axis=0, na_rm=na_rm)
+    return by_column
+
+@register_verb(DataFrame)
+def row_sds(
+        x: DataFrame,
+        na_rm: bool = False,
+        # dims: int = 1,
+        # weights = None,
+        # freq = None,
+        # n = None
+) -> Iterable[NumericType]:
+    """Compute the standard deviation of every row of a data frame
+
+    Args:
+        x: The data frame
+        na_rm: Missing-value handling flag, passed on to `sd`.
+
+    Returns:
+        The stdevs by row.
+    """
+    # `sd` is imported at call time rather than at module level
+    from ..stats import sd
+    by_row = x.aggregate(sd, axis=1, na_rm=na_rm)
+    return by_row
+
+
+@register_verb(DataFrame)
+def col_medians(
+        x: DataFrame,
+        na_rm: bool = False,
+        # dims: int = 1,
+        # weights = None,
+        # freq = None,
+        # n = None
+) -> Iterable[NumericType]:
+    """Take the median of every column of a data frame
+
+    Args:
+        x: The data frame
+        na_rm: Missing-value handling flag, passed on to `median`.
+
+    Returns:
+        The medians by column.
+    """
+    by_column = x.aggregate(median, axis=0, na_rm=na_rm)
+    return by_column
+
+@register_verb(DataFrame)
+def row_medians(
+        x: DataFrame,
+        na_rm: bool = False,
+        # dims: int = 1,
+        # weights = None,
+        # freq = None,
+        # n = None
+) -> Iterable[NumericType]:
+    """Take the median of every row of a data frame
+
+    Args:
+        x: The data frame
+        na_rm: Missing-value handling flag, passed on to `median`.
+
+    Returns:
+        The medians by row.
+    """
+    by_row = x.aggregate(median, axis=1, na_rm=na_rm)
+    return by_row