Use np.int64 type for day to nanosecond conversion (NEP50) (#922)

* Use np.int64 type for day to nanosecond conversion (NEP50)
* Ditch dask CI, fix pandas

---------

Co-authored-by: Martin Durant <martin.durant@alumni.utoronto.ca>
dask · May 8, 2024 · bb00f37 · bb00f37
1 parent ec26733
commit bb00f37
Show file tree

Hide file tree

Showing 7 changed files with 9 additions and 47 deletions.
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -70,36 +70,6 @@ jobs:
         run: |
           echo "FASTPARQUET_DATAPAGE_V2=$FASTPARQUET_DATAPAGE_V2"
           pytest --verbose --cov=fastparquet
-  dask:
-    name: dask
-    runs-on: ubuntu-latest
-    steps:
-      - name: APT
-        run: sudo apt-get install liblzo2-dev
-
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Setup conda
-        uses: mamba-org/provision-with-micromamba@main
-        with:
-          environment-file: ci/environment-py39.yml
-
-      - name: pip-install
-        shell: bash -l {0}
-        run: |
-          git clone https://github.com/dask/dask
-          pip install pyarrow
-          pip install -e dask/
-          pip install -e . --no-deps
-
-      - name: Run Tests
-        shell: bash -l {0}
-        run: |
-          pytest --verbose dask/dask/dataframe/io/tests/test_parquet.py
-  
   pandas:
     name: pandas
     runs-on: ubuntu-latest

diff --git a/fastparquet/converted_types.py b/fastparquet/converted_types.py
@@ -31,8 +31,9 @@ def unbson(x):
         def tobson(x):
             raise ImportError("BSON not found")
 
-DAYS_TO_MILLIS = 86400000000000
-"""Number of millis in a day. Used to convert a Date to a date"""
+# Explicitly use numpy type in order to avoid promotion errors due to NEP 50 in numpy >= 2
+DAYS_TO_NANOS = np.int64(86400000000000)
+"""Number of nanoseconds in a day. Used to convert a Date to a date"""
 nat = np.datetime64('NaT').view('int64')
 
 simple = {
@@ -158,7 +159,7 @@ def convert(data, se, timestamp96=True, dtype=None):
     if se.type == parquet_thrift.Type.INT96 and timestamp96:
         data2 = data.view([('ns', 'i8'), ('day', 'i4')])
         # TODO: this should be ms unit, now that we can?
-        return ((data2['day'] - 2440588) * 86400000000000 +
+        return ((data2['day'] - np.int64(2440588)) * DAYS_TO_NANOS +
                 data2['ns']).view('M8[ns]')
     if se.logicalType is not None and se.logicalType.TIMESTAMP is not None:
         dt = _logical_to_time_dtype(se.logicalType.TIMESTAMP)
@@ -188,7 +189,7 @@ def convert(data, se, timestamp96=True, dtype=None):
                 for i in range(len(data))
             ])
     elif ctype == parquet_thrift.ConvertedType.DATE:
-        data = data * DAYS_TO_MILLIS
+        data = data * DAYS_TO_NANOS
         return data.view('datetime64[ns]')
     elif ctype == parquet_thrift.ConvertedType.TIME_MILLIS:
         # this was not covered by new pandas time units

diff --git a/fastparquet/dataframe.py b/fastparquet/dataframe.py
@@ -153,7 +153,7 @@ def cat(col):
                     tz_to_dt_tz(timezones[str(col)]))
             else:
                 index = Index(d)
-            views[col] = index.values
+            views[col] = d
     else:
         index = MultiIndex([[]], [[]])
         # index = MultiIndex.from_arrays(indexes)

diff --git a/fastparquet/test/test_output.py b/fastparquet/test/test_output.py
@@ -1022,9 +1022,7 @@ def test_no_string(tmpdir):
     df["A"] = df["A"].astype(pd.StringDtype())
 
     # set *all* values to NA
-    df["A"].iloc[0] = pd.NA
-    df["A"].iloc[1] = pd.NA
-    df["A"].iloc[2] = pd.NA
+    df.loc[:, "A"] = pd.NA
     df.to_parquet(fn, engine="fastparquet")
     df2 = pd.read_parquet(fn)
     assert pd.isna(df2.A).all()

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,2 +1,2 @@
 [build-system]
-requires = ["setuptools", "wheel", "Cython >= 0.29.23", "oldest-supported-numpy", "pytest-runner"]
+requires = ["setuptools", "setuptools_scm", "Cython >= 0.29.23", "numpy>=2.0.0rc1"]
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 pandas>=1.5.0
-numpy>=1.20.3
+numpy
 cramjam>=2.3
 fsspec
 packaging
diff --git a/setup.py b/setup.py
@@ -53,13 +53,6 @@ def fix_exts(sources):
         'local_scheme': 'no-local-version',
         'write_to': 'fastparquet/_version.py'
     },
-    setup_requires=[
-        'setuptools>18.0',
-        'setuptools-scm>1.5.4',
-        'Cython',
-        'pytest-runner',
-        'oldest-supported-numpy'
-    ],
     description='Python support for Parquet file format',
     author='Martin Durant',
     author_email='mdurant@anaconda.com',