From b4b2c1a1e4a21099c236e679d89d97d3fca5443c Mon Sep 17 00:00:00 2001
From: Ben Greiner
Date: Fri, 3 May 2024 10:31:55 +0200
Subject: [PATCH 1/2] Use np.int64 type for day to nanosecond conversion (NEP50)

---
 fastparquet/converted_types.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fastparquet/converted_types.py b/fastparquet/converted_types.py
index d21a5e37..f7876621 100644
--- a/fastparquet/converted_types.py
+++ b/fastparquet/converted_types.py
@@ -31,8 +31,9 @@ def unbson(x):
         def tobson(x):
             raise ImportError("BSON not found")
 
-DAYS_TO_MILLIS = 86400000000000
-"""Number of millis in a day. Used to convert a Date to a date"""
+# Explicitly use numpy type in order to avoid promotion errors due to NEP 50 in numpy >= 2
+DAYS_TO_NANOS = np.int64(86400000000000)
+"""Number of nanoseconds in a day. Used to convert a Date to a date"""
 nat = np.datetime64('NaT').view('int64')
 
 simple = {
@@ -158,7 +159,7 @@ def convert(data, se, timestamp96=True, dtype=None):
     if se.type == parquet_thrift.Type.INT96 and timestamp96:
         data2 = data.view([('ns', 'i8'), ('day', 'i4')])
         # TODO: this should be ms unit, now that we can?
-        return ((data2['day'] - 2440588) * 86400000000000 +
+        return ((data2['day'] - np.int64(2440588)) * DAYS_TO_NANOS +
                 data2['ns']).view('M8[ns]')
     if se.logicalType is not None and se.logicalType.TIMESTAMP is not None:
         dt = _logical_to_time_dtype(se.logicalType.TIMESTAMP)
@@ -188,7 +189,7 @@ def convert(data, se, timestamp96=True, dtype=None):
             for i in range(len(data))
         ])
     elif ctype == parquet_thrift.ConvertedType.DATE:
-        data = data * DAYS_TO_MILLIS
+        data = data * DAYS_TO_NANOS
         return data.view('datetime64[ns]')
     elif ctype == parquet_thrift.ConvertedType.TIME_MILLIS:
         # this was not covered by new pandas time units

From f6989bd5b0e29197546fa9bf6699c36af53d1e82 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Tue, 7 May 2024 21:21:41 -0400
Subject: [PATCH 2/2] Ditch dask CI, fix pandas

---
 .github/workflows/main.yaml     | 30 ------------------------------
 fastparquet/dataframe.py        |  2 +-
 fastparquet/test/test_output.py |  4 +---
 pyproject.toml                  |  2 +-
 requirements.txt                |  2 +-
 setup.py                        |  7 -------
 6 files changed, 4 insertions(+), 43 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 87eb224a..dcf72914 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -70,36 +70,6 @@ jobs:
         run: |
           echo "FASTPARQUET_DATAPAGE_V2=$FASTPARQUET_DATAPAGE_V2"
           pytest --verbose --cov=fastparquet
-  dask:
-    name: dask
-    runs-on: ubuntu-latest
-    steps:
-      - name: APT
-        run: sudo apt-get install liblzo2-dev
-
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Setup conda
-        uses: mamba-org/provision-with-micromamba@main
-        with:
-          environment-file: ci/environment-py39.yml
-
-      - name: pip-install
-        shell: bash -l {0}
-        run: |
-          git clone https://github.com/dask/dask
-          pip install pyarrow
-          pip install -e dask/
-          pip install -e . --no-deps
-
-      - name: Run Tests
-        shell: bash -l {0}
-        run: |
-          pytest --verbose dask/dask/dataframe/io/tests/test_parquet.py
-
   pandas:
     name: pandas
     runs-on: ubuntu-latest
diff --git a/fastparquet/dataframe.py b/fastparquet/dataframe.py
index afa7d01b..1e2aa583 100644
--- a/fastparquet/dataframe.py
+++ b/fastparquet/dataframe.py
@@ -153,7 +153,7 @@ def cat(col):
                               tz_to_dt_tz(timezones[str(col)]))
             else:
                 index = Index(d)
-            views[col] = index.values
+            views[col] = d
         else:
             index = MultiIndex([[]], [[]])
             # index = MultiIndex.from_arrays(indexes)
diff --git a/fastparquet/test/test_output.py b/fastparquet/test/test_output.py
index 827603ed..01c265a9 100644
--- a/fastparquet/test/test_output.py
+++ b/fastparquet/test/test_output.py
@@ -1022,9 +1022,7 @@ def test_no_string(tmpdir):
     df["A"] = df["A"].astype(pd.StringDtype())
 
     # set *all* values to NA
-    df["A"].iloc[0] = pd.NA
-    df["A"].iloc[1] = pd.NA
-    df["A"].iloc[2] = pd.NA
+    df.loc[:, "A"] = pd.NA
     df.to_parquet(fn, engine="fastparquet")
     df2 = pd.read_parquet(fn)
     assert pd.isna(df2.A).all()
diff --git a/pyproject.toml b/pyproject.toml
index fd80deb3..61c1f634 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,2 +1,2 @@
 [build-system]
-requires = ["setuptools", "wheel", "Cython >= 0.29.23", "oldest-supported-numpy", "pytest-runner"]
+requires = ["setuptools", "setuptools_scm", "Cython >= 0.29.23", "numpy>=2.0.0rc1"]
diff --git a/requirements.txt b/requirements.txt
index 384b66be..25f444ff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 pandas>=1.5.0
-numpy>=1.20.3
+numpy
 cramjam>=2.3
 fsspec
 packaging
diff --git a/setup.py b/setup.py
index d3053c6c..b07c16f1 100644
--- a/setup.py
+++ b/setup.py
@@ -53,13 +53,6 @@ def fix_exts(sources):
         'local_scheme': 'no-local-version',
         'write_to': 'fastparquet/_version.py'
     },
-    setup_requires=[
-        'setuptools>18.0',
-        'setuptools-scm>1.5.4',
-        'Cython',
-        'pytest-runner',
-        'oldest-supported-numpy'
-    ],
     description='Python support for Parquet file format',
     author='Martin Durant',
     author_email='mdurant@anaconda.com',
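
Note (not part of the patches above): a minimal sketch of the NEP 50 promotion behaviour that PATCH 1/2 works around. The `days` array below is a hand-made stand-in for a decoded Parquet DATE column (int32 days since the Unix epoch), not data from the test suite. On NumPy >= 2 a plain Python int literal is cast to the array's int32 dtype and overflows, while an explicit np.int64 scalar promotes the product to int64; on NumPy 1.x both spellings work.

    import numpy as np

    DAYS_TO_NANOS = np.int64(86400000000000)  # nanoseconds per day, as in PATCH 1/2

    # Hypothetical stand-in for a decoded Parquet DATE column: int32 days since epoch.
    days = np.array([0, 1, 19852], dtype=np.int32)

    # NumPy >= 2 (NEP 50): the plain Python int is cast to the array's int32 dtype,
    # which raises an OverflowError because the literal does not fit in int32.
    try:
        days * 86400000000000
    except OverflowError as exc:
        print("plain int literal:", exc)

    # An explicit np.int64 scalar promotes int32 * int64 -> int64, so the nanosecond
    # values fit and can be viewed as datetime64[ns] on both NumPy 1.x and 2.x.
    nanos = days * DAYS_TO_NANOS
    print(nanos.view("datetime64[ns]"))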