From b4b2c1a1e4a21099c236e679d89d97d3fca5443c Mon Sep 17 00:00:00 2001
From: Ben Greiner
Date: Fri, 3 May 2024 10:31:55 +0200
Subject: [PATCH 1/2] Use np.int64 type for day to nanosecond conversion (NEP50)

---
 fastparquet/converted_types.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fastparquet/converted_types.py b/fastparquet/converted_types.py
index d21a5e37..f7876621 100644
--- a/fastparquet/converted_types.py
+++ b/fastparquet/converted_types.py
@@ -31,8 +31,9 @@ def unbson(x):
         def tobson(x):
             raise ImportError("BSON not found")
 
-DAYS_TO_MILLIS = 86400000000000
-"""Number of millis in a day. Used to convert a Date to a date"""
+# Explicitly use numpy type in order to avoid promotion errors due to NEP 50 in numpy >= 2
+DAYS_TO_NANOS = np.int64(86400000000000)
+"""Number of nanoseconds in a day. Used to convert a Date to a date"""
 nat = np.datetime64('NaT').view('int64')
 
 simple = {
@@ -158,7 +159,7 @@ def convert(data, se, timestamp96=True, dtype=None):
     if se.type == parquet_thrift.Type.INT96 and timestamp96:
         data2 = data.view([('ns', 'i8'), ('day', 'i4')])
         # TODO: this should be ms unit, now that we can?
-        return ((data2['day'] - 2440588) * 86400000000000 +
+        return ((data2['day'] - np.int64(2440588)) * DAYS_TO_NANOS +
                 data2['ns']).view('M8[ns]')
     if se.logicalType is not None and se.logicalType.TIMESTAMP is not None:
         dt = _logical_to_time_dtype(se.logicalType.TIMESTAMP)
@@ -188,7 +189,7 @@ def convert(data, se, timestamp96=True, dtype=None):
             for i in range(len(data))
         ])
     elif ctype == parquet_thrift.ConvertedType.DATE:
-        data = data * DAYS_TO_MILLIS
+        data = data * DAYS_TO_NANOS
         return data.view('datetime64[ns]')
     elif ctype == parquet_thrift.ConvertedType.TIME_MILLIS:
         # this was not covered by new pandas time units

From f6989bd5b0e29197546fa9bf6699c36af53d1e82 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Tue, 7 May 2024 21:21:41 -0400
Subject: [PATCH 2/2] Ditch dask CI, fix pandas

---
 .github/workflows/main.yaml     | 30 ------------------------------
 fastparquet/dataframe.py        |  2 +-
 fastparquet/test/test_output.py |  4 +---
 pyproject.toml                  |  2 +-
 requirements.txt                |  2 +-
 setup.py                        |  7 -------
 6 files changed, 4 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 87eb224a..dcf72914 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -70,36 +70,6 @@ jobs:
         run: |
           echo "FASTPARQUET_DATAPAGE_V2=$FASTPARQUET_DATAPAGE_V2"
           pytest --verbose --cov=fastparquet
-  dask:
-    name: dask
-    runs-on: ubuntu-latest
-    steps:
-      - name: APT
-        run: sudo apt-get install liblzo2-dev
-
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Setup conda
-        uses: mamba-org/provision-with-micromamba@main
-        with:
-          environment-file: ci/environment-py39.yml
-
-      - name: pip-install
-        shell: bash -l {0}
-        run: |
-          git clone https://github.com/dask/dask
-          pip install pyarrow
-          pip install -e dask/
-          pip install -e . --no-deps
-
-      - name: Run Tests
-        shell: bash -l {0}
-        run: |
-          pytest --verbose dask/dask/dataframe/io/tests/test_parquet.py
-
   pandas:
     name: pandas
     runs-on: ubuntu-latest
diff --git a/fastparquet/dataframe.py b/fastparquet/dataframe.py
index afa7d01b..1e2aa583 100644
--- a/fastparquet/dataframe.py
+++ b/fastparquet/dataframe.py
@@ -153,7 +153,7 @@ def cat(col):
                               tz_to_dt_tz(timezones[str(col)]))
             else:
                 index = Index(d)
-            views[col] = index.values
+            views[col] = d
         else:
             index = MultiIndex([[]], [[]])
             # index = MultiIndex.from_arrays(indexes)
diff --git a/fastparquet/test/test_output.py b/fastparquet/test/test_output.py
index 827603ed..01c265a9 100644
--- a/fastparquet/test/test_output.py
+++ b/fastparquet/test/test_output.py
@@ -1022,9 +1022,7 @@ def test_no_string(tmpdir):
     df["A"] = df["A"].astype(pd.StringDtype())
 
     # set *all* values to NA
-    df["A"].iloc[0] = pd.NA
-    df["A"].iloc[1] = pd.NA
-    df["A"].iloc[2] = pd.NA
+    df.loc[:, "A"] = pd.NA
     df.to_parquet(fn, engine="fastparquet")
     df2 = pd.read_parquet(fn)
     assert pd.isna(df2.A).all()
diff --git a/pyproject.toml b/pyproject.toml
index fd80deb3..61c1f634 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,2 +1,2 @@
 [build-system]
-requires = ["setuptools", "wheel", "Cython >= 0.29.23", "oldest-supported-numpy", "pytest-runner"]
+requires = ["setuptools", "setuptools_scm", "Cython >= 0.29.23", "numpy>=2.0.0rc1"]
diff --git a/requirements.txt b/requirements.txt
index 384b66be..25f444ff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 pandas>=1.5.0
-numpy>=1.20.3
+numpy
 cramjam>=2.3
 fsspec
 packaging
diff --git a/setup.py b/setup.py
index d3053c6c..b07c16f1 100644
--- a/setup.py
+++ b/setup.py
@@ -53,13 +53,6 @@ def fix_exts(sources):
         'local_scheme': 'no-local-version',
         'write_to': 'fastparquet/_version.py'
     },
-    setup_requires=[
-        'setuptools>18.0',
-        'setuptools-scm>1.5.4',
-        'Cython',
-        'pytest-runner',
-        'oldest-supported-numpy'
-    ],
     description='Python support for Parquet file format',
     author='Martin Durant',
     author_email='mdurant@anaconda.com',
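
Note (not part of the patches above): a minimal sketch of the NEP 50 promotion behaviour that PATCH 1/2 works around. The `days` array below is a hand-made stand-in for a decoded Parquet DATE column (int32 days since the Unix epoch), not data from the test suite. On NumPy >= 2 a plain Python int literal is cast to the array's int32 dtype and overflows, while an explicit np.int64 scalar promotes the product to int64; on NumPy 1.x both spellings work.

    import numpy as np

    DAYS_TO_NANOS = np.int64(86400000000000)  # nanoseconds per day, as in PATCH 1/2

    # Hypothetical stand-in for a decoded Parquet DATE column: int32 days since epoch.
    days = np.array([0, 1, 19852], dtype=np.int32)

    # NumPy >= 2 (NEP 50): the plain Python int is cast to the array's int32 dtype,
    # which raises an OverflowError because the literal does not fit in int32.
    try:
        days * 86400000000000
    except OverflowError as exc:
        print("plain int literal:", exc)

    # An explicit np.int64 scalar promotes int32 * int64 -> int64, so the nanosecond
    # values fit and can be viewed as datetime64[ns] on both NumPy 1.x and 2.x.
    nanos = days * DAYS_TO_NANOS
    print(nanos.view("datetime64[ns]"))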