diff -Nru pandas-2.1.4+dfsg/.circleci/config.yml pandas-2.2.2+dfsg/.circleci/config.yml --- pandas-2.1.4+dfsg/.circleci/config.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.circleci/config.yml 2024-04-10 17:42:52.000000000 +0000 @@ -3,7 +3,7 @@ jobs: test-arm: machine: - image: ubuntu-2004:2022.04.1 + image: default resource_class: arm.large environment: ENV_FILE: ci/deps/circle-310-arm64.yaml @@ -18,12 +18,35 @@ PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD ci/run_tests.sh + linux-musl: + docker: + - image: quay.io/pypa/musllinux_1_1_aarch64 + resource_class: arm.large + steps: + # Install pkgs first to have git in the image + # (needed for checkout) + - run: | + apk update + apk add git + apk add musl-locales + - checkout + - run: | + /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + . ~/virtualenvs/pandas-dev/bin/activate + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 + python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" + python -m pip list --no-cache-dir + - run: | + . ~/virtualenvs/pandas-dev/bin/activate + export PANDAS_CI=1 + python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml build-aarch64: parameters: cibw-build: type: string machine: - image: ubuntu-2004:2022.04.1 + image: default resource_class: arm.large environment: TRIGGER_SOURCE: << pipeline.trigger_source >> @@ -50,6 +73,7 @@ command: | pip3 install cibuildwheel==2.15.0 cibuildwheel --prerelease-pythons --output-dir wheelhouse + environment: CIBW_BUILD: << parameters.cibw-build >> @@ -84,6 +108,13 @@ equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] jobs: - test-arm + test-musl: + # Don't run trigger this one when scheduled pipeline runs + when: + not: + equal: [ scheduled_pipeline, << pipeline.trigger_source >> ] + jobs: + - linux-musl build-wheels: jobs: - build-aarch64: @@ -92,4 +123,11 @@ only: /^v.*/ matrix: parameters: - cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64"] + cibw-build: ["cp39-manylinux_aarch64", + "cp310-manylinux_aarch64", + "cp311-manylinux_aarch64", + "cp312-manylinux_aarch64", + "cp39-musllinux_aarch64", + "cp310-musllinux_aarch64", + "cp311-musllinux_aarch64", + "cp312-musllinux_aarch64",] diff -Nru pandas-2.1.4+dfsg/.circleci/setup_env.sh pandas-2.2.2+dfsg/.circleci/setup_env.sh --- pandas-2.1.4+dfsg/.circleci/setup_env.sh 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.circleci/setup_env.sh 2024-04-10 17:42:52.000000000 +0000 @@ -55,6 +55,6 @@ fi echo "Install pandas" -python -m pip install --no-build-isolation -ve . +python -m pip install --no-build-isolation -ve . 
--config-settings=setup-args="--werror" echo "done" diff -Nru pandas-2.1.4+dfsg/.github/actions/build_pandas/action.yml pandas-2.2.2+dfsg/.github/actions/build_pandas/action.yml --- pandas-2.1.4+dfsg/.github/actions/build_pandas/action.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.github/actions/build_pandas/action.yml 2024-04-10 17:42:52.000000000 +0000 @@ -4,6 +4,12 @@ editable: description: Whether to build pandas in editable mode (default true) default: true + meson_args: + description: Extra flags to pass to meson + required: false + cflags_adds: + description: Items to append to the CFLAGS variable + required: false runs: using: composite steps: @@ -24,9 +30,12 @@ - name: Build Pandas run: | + export CFLAGS="$CFLAGS ${{ inputs.cflags_adds }}" if [[ ${{ inputs.editable }} == "true" ]]; then - pip install -e . --no-build-isolation -v --no-deps + pip install -e . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \ + --config-settings=setup-args="--werror" else - pip install . --no-build-isolation -v --no-deps + pip install . --no-build-isolation -v --no-deps ${{ inputs.meson_args }} \ + --config-settings=setup-args="--werror" fi shell: bash -el {0} diff -Nru pandas-2.1.4+dfsg/.github/actions/setup-conda/action.yml pandas-2.2.2+dfsg/.github/actions/setup-conda/action.yml --- pandas-2.1.4+dfsg/.github/actions/setup-conda/action.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.github/actions/setup-conda/action.yml 2024-04-10 17:42:52.000000000 +0000 @@ -11,6 +11,6 @@ with: environment-file: ${{ inputs.environment-file }} environment-name: test - condarc-file: ci/condarc.yml + condarc-file: ci/.condarc cache-environment: true cache-downloads: true diff -Nru pandas-2.1.4+dfsg/.github/workflows/broken-linkcheck.yml pandas-2.2.2+dfsg/.github/workflows/broken-linkcheck.yml --- pandas-2.1.4+dfsg/.github/workflows/broken-linkcheck.yml 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/.github/workflows/broken-linkcheck.yml 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,39 @@ +name: Linkcheck +on: + schedule: + # Run monthly on the 1st day of the month + - cron: '0 0 1 * *' + pull_request: + paths: + - ".github/workflows/broken-linkcheck.yml" + - "doc/make.py" +jobs: + linkcheck: + if: false + runs-on: ubuntu-latest + defaults: + run: + shell: bash -el {0} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + + - name: Build Pandas + uses: ./.github/actions/build_pandas + + - name: Run linkcheck script + working-directory: ./doc + run: | + set -o pipefail + python make.py linkcheck | tee linkcheck.txt + + - name: Display broken links + if: failure() + working-directory: ./doc + run: grep broken linkcheck.txt diff -Nru pandas-2.1.4+dfsg/.github/workflows/code-checks.yml pandas-2.2.2+dfsg/.github/workflows/code-checks.yml --- pandas-2.1.4+dfsg/.github/workflows/code-checks.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.github/workflows/code-checks.yml 2024-04-10 17:42:52.000000000 +0000 @@ -4,11 +4,11 @@ push: branches: - main - - 2.1.x + - 2.2.x pull_request: branches: - main - - 2.1.x + - 2.2.x env: ENV_FILE: environment.yml @@ -33,7 +33,7 @@ steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -109,7 +109,7 @@ steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -124,7 +124,7 @@ run: | cd asv_bench asv machine --yes - asv run --quick 
--dry-run --durations=30 --python=same + asv run --quick --dry-run --durations=30 --python=same --show-stderr build_docker_dev_environment: name: Build Docker Dev Environment @@ -143,7 +143,7 @@ run: docker image prune -f - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -164,13 +164,13 @@ steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup Python id: setup_python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.10' cache: 'pip' diff -Nru pandas-2.1.4+dfsg/.github/workflows/codeql.yml pandas-2.2.2+dfsg/.github/workflows/codeql.yml --- pandas-2.1.4+dfsg/.github/workflows/codeql.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.github/workflows/codeql.yml 2024-04-10 17:42:52.000000000 +0000 @@ -27,9 +27,9 @@ - python steps: - - uses: actions/checkout@v3 - - uses: github/codeql-action/init@v2 + - uses: actions/checkout@v4 + - uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} - - uses: github/codeql-action/autobuild@v2 - - uses: github/codeql-action/analyze@v2 + - uses: github/codeql-action/autobuild@v3 + - uses: github/codeql-action/analyze@v3 diff -Nru pandas-2.1.4+dfsg/.github/workflows/comment-commands.yml pandas-2.2.2+dfsg/.github/workflows/comment-commands.yml --- pandas-2.1.4+dfsg/.github/workflows/comment-commands.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.github/workflows/comment-commands.yml 2024-04-10 17:42:52.000000000 +0000 @@ -51,7 +51,7 @@ steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -77,7 +77,7 @@ echo 'EOF' >> $GITHUB_ENV echo "REGEX=$REGEX" >> $GITHUB_ENV - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 env: BENCH_OUTPUT: ${{env.BENCH_OUTPUT}} REGEX: ${{env.REGEX}} diff -Nru pandas-2.1.4+dfsg/.github/workflows/deprecation-tracking-bot.yml pandas-2.2.2+dfsg/.github/workflows/deprecation-tracking-bot.yml --- pandas-2.1.4+dfsg/.github/workflows/deprecation-tracking-bot.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.github/workflows/deprecation-tracking-bot.yml 2024-04-10 17:42:52.000000000 +0000 @@ -21,7 +21,7 @@ env: DEPRECATION_TRACKER_ISSUE: 50578 steps: - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 id: update-deprecation-issue with: script: | diff -Nru pandas-2.1.4+dfsg/.github/workflows/docbuild-and-upload.yml pandas-2.2.2+dfsg/.github/workflows/docbuild-and-upload.yml --- pandas-2.1.4+dfsg/.github/workflows/docbuild-and-upload.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.github/workflows/docbuild-and-upload.yml 2024-04-10 17:42:52.000000000 +0000 @@ -4,13 +4,13 @@ push: branches: - main - - 2.1.x + - 2.2.x tags: - '*' pull_request: branches: - main - - 2.1.x + - 2.2.x env: ENV_FILE: environment.yml @@ -36,7 +36,7 @@ steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -46,6 +46,9 @@ - name: Build Pandas uses: ./.github/actions/build_pandas + - name: Test website + run: python -m pytest web/ + - name: Build website run: python web/pandas_web.py web/pandas --target-path=web/build @@ -82,7 +85,7 @@ run: mv doc/build/html web/build/docs - name: Save website as an artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: website path: web/build diff -Nru pandas-2.1.4+dfsg/.github/workflows/package-checks.yml 
pandas-2.2.2+dfsg/.github/workflows/package-checks.yml --- pandas-2.1.4+dfsg/.github/workflows/package-checks.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.github/workflows/package-checks.yml 2024-04-10 17:42:52.000000000 +0000 @@ -4,11 +4,11 @@ push: branches: - main - - 2.1.x + - 2.2.x pull_request: branches: - main - - 2.1.x + - 2.2.x types: [ labeled, opened, synchronize, reopened ] permissions: @@ -24,7 +24,7 @@ runs-on: ubuntu-22.04 strategy: matrix: - extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] + extra: ["test", "pyarrow", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] fail-fast: false name: Install Extras - ${{ matrix.extra }} concurrency: @@ -34,13 +34,13 @@ steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup Python id: setup_python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.10' @@ -62,7 +62,7 @@ cancel-in-progress: true steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 diff -Nru pandas-2.1.4+dfsg/.github/workflows/stale-pr.yml pandas-2.2.2+dfsg/.github/workflows/stale-pr.yml --- pandas-2.1.4+dfsg/.github/workflows/stale-pr.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.github/workflows/stale-pr.yml 2024-04-10 17:42:52.000000000 +0000 @@ -14,7 +14,7 @@ if: github.repository_owner == 'pandas-dev' runs-on: ubuntu-22.04 steps: - - uses: actions/stale@v8 + - uses: actions/stale@v9 with: repo-token: ${{ secrets.GITHUB_TOKEN }} stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity. Please [update](https://pandas.pydata.org/pandas-docs/stable/development/contributing.html#updating-your-pull-request) and respond to this comment if you're still interested in working on this." 
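A change that recurs throughout these workflow hunks is the extra --config-settings=setup-args="--werror" option on every pip install of pandas itself (the CircleCI setup script, the musllinux job, and the build_pandas action all gain it). Assuming the usual meson-python behaviour, values given via setup-args are forwarded to `meson setup`, so --werror makes the C/Cython extension build fail on any compiler warning instead of only printing it. A rough local reproduction of that CI build step, following the commands added in the hunks above, would look like:

    # Editable build as run by the updated CI jobs; meson-python passes
    # "setup-args" through to `meson setup`, so --werror turns compiler
    # warnings in the C/Cython extensions into hard build errors.
    python -m pip install -ve . --no-build-isolation \
        --config-settings=setup-args="--werror"
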
diff -Nru pandas-2.1.4+dfsg/.github/workflows/unit-tests.yml pandas-2.2.2+dfsg/.github/workflows/unit-tests.yml --- pandas-2.1.4+dfsg/.github/workflows/unit-tests.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.github/workflows/unit-tests.yml 2024-04-10 17:42:52.000000000 +0000 @@ -4,11 +4,11 @@ push: branches: - main - - 2.1.x + - 2.2.x pull_request: branches: - main - - 2.1.x + - 2.2.x paths-ignore: - "doc/**" - "web/**" @@ -23,7 +23,7 @@ jobs: ubuntu: runs-on: ubuntu-22.04 - timeout-minutes: 180 + timeout-minutes: 90 strategy: matrix: env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] @@ -100,17 +100,19 @@ name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} env: PATTERN: ${{ matrix.pattern }} - EXTRA_APT: ${{ matrix.extra_apt || '' }} LANG: ${{ matrix.lang || 'C.UTF-8' }} LC_ALL: ${{ matrix.lc_all || '' }} PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} PANDAS_CI: ${{ matrix.pandas_ci || '1' }} TEST_ARGS: ${{ matrix.test_args || '' }} - PYTEST_WORKERS: 'auto' + PYTEST_WORKERS: ${{ matrix.pytest_workers || 'auto' }} PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} + NPY_PROMOTION_STATE: ${{ matrix.env_file == 'actions-311-numpydev.yaml' && 'weak' || 'legacy' }} + # Clipboard tests + QT_QPA_PLATFORM: offscreen concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }} + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_copy_on_write || '' }} cancel-in-progress: true services: @@ -152,13 +154,13 @@ steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Extra installs - # xsel for clipboard tests - run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} + run: sudo apt-get update && sudo apt-get install -y ${{ matrix.extra_apt }} + if: ${{ matrix.extra_apt }} - name: Generate extra locales # These extra locales will be available for locale.setlocale() calls in tests @@ -173,6 +175,9 @@ - name: Build Pandas id: build uses: ./.github/actions/build_pandas + with: + meson_args: ${{ matrix.meson_args }} + cflags_adds: ${{ matrix.cflags_adds }} - name: Test (not single_cpu) uses: ./.github/actions/run-tests @@ -189,10 +194,11 @@ if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}} macos-windows: - timeout-minutes: 180 + timeout-minutes: 90 strategy: matrix: - os: [macos-latest, windows-latest] + # Note: Don't use macOS latest since macos 14 appears to be arm64 only + os: [macos-13, macos-14, windows-latest] env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] fail-fast: false runs-on: ${{ matrix.os }} @@ -205,12 +211,11 @@ PANDAS_CI: 1 PYTEST_TARGET: pandas PATTERN: "not slow and not db and not network and not single_cpu" - # GH 47443: PYTEST_WORKERS > 0 crashes Windows builds with memory related errors - PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '0' }} + PYTEST_WORKERS: 'auto' steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -253,7 +258,7 @@ python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy 
--config-settings=setup-args="-Dallow-noblas=true" python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 - python -m pip install --no-cache-dir --no-build-isolation -e . + python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir export PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml @@ -291,7 +296,7 @@ . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 - python -m pip install --no-cache-dir --no-build-isolation -e . + python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror" python -m pip list --no-cache-dir - name: Run Tests @@ -332,9 +337,10 @@ strategy: fail-fast: false matrix: - os: [ubuntu-22.04, macOS-latest, windows-latest] + # Separate out macOS 13 and 14, since macOS 14 is arm64 only + os: [ubuntu-22.04, macOS-13, macOS-14, windows-latest] - timeout-minutes: 180 + timeout-minutes: 90 concurrency: #https://github.community/t/concurrecy-not-work-for-push/183068/7 @@ -348,12 +354,12 @@ PYTEST_TARGET: pandas steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up Python Dev Version - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.12-dev' @@ -364,7 +370,7 @@ python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov - python -m pip install -ve . --no-build-isolation --no-index --no-deps + python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror" python -m pip list - name: Run Tests diff -Nru pandas-2.1.4+dfsg/.github/workflows/wheels.yml pandas-2.2.2+dfsg/.github/workflows/wheels.yml --- pandas-2.1.4+dfsg/.github/workflows/wheels.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.github/workflows/wheels.yml 2024-04-10 17:42:52.000000000 +0000 @@ -48,12 +48,12 @@ sdist_file: ${{ steps.save-path.outputs.sdist_name }} steps: - name: Checkout pandas - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.11' @@ -62,7 +62,7 @@ python -m pip install build python -m build --sdist - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: sdist path: ./dist/* @@ -94,7 +94,9 @@ buildplat: - [ubuntu-22.04, manylinux_x86_64] - [ubuntu-22.04, musllinux_x86_64] - - [macos-12, macosx_*] + - [macos-12, macosx_x86_64] + # Note: M1 images on Github Actions start from macOS 14 + - [macos-14, macosx_arm64] - [windows-2022, win_amd64] # TODO: support PyPy? 
python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] @@ -103,7 +105,7 @@ IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} steps: - name: Checkout pandas - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -115,7 +117,7 @@ # removes unnecessary files from the release - name: Download sdist (not macOS) #if: ${{ matrix.buildplat[1] != 'macosx_*' }} - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: sdist path: ./dist @@ -128,7 +130,7 @@ # Python version used to build sdist doesn't matter # wheel will be built from sdist with the correct version - name: Unzip sdist (macOS) - if: ${{ matrix.buildplat[1] == 'macosx_*' }} + if: ${{ startsWith(matrix.buildplat[1], 'macosx') }} run: | tar -xzf ./dist/${{ env.sdist_name }} -C ./dist @@ -138,9 +140,9 @@ run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.16.2 + uses: pypa/cibuildwheel@v2.17.0 with: - package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} + package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: CIBW_PRERELEASE_PYTHONS: True CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} @@ -170,15 +172,15 @@ $TST_CMD = @" python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); - python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; + python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; "@ # add rc to the end of the image name if the Python version is unreleased docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} powershell -Command $TST_CMD - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: - name: ${{ matrix.python[0] }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }} + name: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} path: ./wheelhouse/*.whl - name: Upload wheels & sdist diff -Nru pandas-2.1.4+dfsg/.pre-commit-config.yaml pandas-2.2.2+dfsg/.pre-commit-config.yaml --- pandas-2.1.4+dfsg/.pre-commit-config.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/.pre-commit-config.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -20,11 +20,11 @@ repos: - repo: https://github.com/hauntsaninja/black-pre-commit-mirror # black compiled with mypyc - rev: 23.7.0 + rev: 23.11.0 hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.282 + rev: v0.1.6 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -32,27 +32,29 @@ # TODO: remove autofixe-only rules when they are checked by ruff name: ruff-selected-autofixes alias: ruff-selected-autofixes - args: [--select, "ANN001,ANN204", --fix-only, --exit-non-zero-on-fix] + files: ^pandas + exclude: ^pandas/tests + args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix] - repo: 
https://github.com/jendrikseipp/vulture - rev: 'v2.7' + rev: 'v2.10' hooks: - id: vulture entry: python scripts/run_vulture.py pass_filenames: true require_serial: false - repo: https://github.com/codespell-project/codespell - rev: v2.2.5 + rev: v2.2.6 hooks: - id: codespell types_or: [python, rst, markdown, cython, c] additional_dependencies: [tomli] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.15.0 + rev: v0.16.0 hooks: - id: cython-lint - id: double-quote-cython-strings - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-ast - id: check-case-conflict @@ -70,21 +72,8 @@ - id: fix-encoding-pragma args: [--remove] - id: trailing-whitespace -- repo: https://github.com/cpplint/cpplint - rev: 1.6.1 - hooks: - - id: cpplint - exclude: ^pandas/_libs/include/pandas/vendored/klib - args: [ - --quiet, - '--extensions=c,h', - '--headers=h', - --recursive, - --linelength=88, - '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' - ] - repo: https://github.com/pylint-dev/pylint - rev: v3.0.0a6 + rev: v3.0.1 hooks: - id: pylint stages: [manual] @@ -107,7 +96,7 @@ hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.10.1 + rev: v3.15.0 hooks: - id: pyupgrade args: [--py39-plus] @@ -124,9 +113,16 @@ types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/sphinx-contrib/sphinx-lint - rev: v0.6.7 + rev: v0.9.1 hooks: - id: sphinx-lint +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v17.0.6 + hooks: + - id: clang-format + files: ^pandas/_libs/src|^pandas/_libs/include + args: [-i] + types_or: [c, c++] - repo: local hooks: - id: pyright @@ -138,11 +134,11 @@ types: [python] stages: [manual] additional_dependencies: &pyright_dependencies - - pyright@1.1.318 + - pyright@1.1.339 - id: pyright # note: assumes python env is setup and activated name: pyright reportGeneralTypeIssues - entry: pyright --skipunannotated -p pyright_reportGeneralTypeIssues.json --level warning + entry: pyright -p pyright_reportGeneralTypeIssues.json --level warning language: node pass_filenames: false types: [python] @@ -246,8 +242,9 @@ # pytest raises without context |\s\ pytest.raises + # TODO # pytest.warns (use tm.assert_produces_warning instead) - |pytest\.warns + # |pytest\.warns # os.remove |os\.remove @@ -361,18 +358,6 @@ files: ^pandas/ exclude: ^(pandas/_libs/|pandas/tests/|pandas/errors/__init__.py$|pandas/_version.py) types: [python] - - id: future-annotations - name: import annotations from __future__ - entry: 'from __future__ import annotations' - language: pygrep - args: [--negate] - files: ^pandas/ - types: [python] - exclude: | - (?x) - /(__init__\.py)|(api\.py)|(_version\.py)|(testing\.py)|(conftest\.py)$ - |/tests/ - |/_testing/ - id: check-test-naming name: check that test names start with 'test' entry: python -m scripts.check_test_naming diff -Nru pandas-2.1.4+dfsg/LICENSES/BOTTLENECK_LICENCE pandas-2.2.2+dfsg/LICENSES/BOTTLENECK_LICENCE --- pandas-2.1.4+dfsg/LICENSES/BOTTLENECK_LICENCE 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/BOTTLENECK_LICENCE 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,25 @@ +Copyright (c) 2010-2019 Keith Goodman +Copyright (c) 2019 Bottleneck Developers +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff -Nru pandas-2.1.4+dfsg/LICENSES/DATEUTIL_LICENSE pandas-2.2.2+dfsg/LICENSES/DATEUTIL_LICENSE --- pandas-2.1.4+dfsg/LICENSES/DATEUTIL_LICENSE 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/DATEUTIL_LICENSE 2024-04-10 17:42:52.000000000 +0000 @@ -51,4 +51,4 @@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -The above BSD License Applies to all code, even that also covered by Apache 2.0. +The above BSD License Applies to all code, even that also covered by Apache 2.0. \ No newline at end of file diff -Nru pandas-2.1.4+dfsg/LICENSES/KLIB_LICENSE pandas-2.2.2+dfsg/LICENSES/KLIB_LICENSE --- pandas-2.1.4+dfsg/LICENSES/KLIB_LICENSE 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/KLIB_LICENSE 2024-04-10 17:42:52.000000000 +0000 @@ -20,4 +20,4 @@ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +SOFTWARE. \ No newline at end of file diff -Nru pandas-2.1.4+dfsg/LICENSES/MUSL_LICENSE pandas-2.2.2+dfsg/LICENSES/MUSL_LICENSE --- pandas-2.1.4+dfsg/LICENSES/MUSL_LICENSE 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/MUSL_LICENSE 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,7 @@ musl as a whole is licensed under the following standard MIT license: ---------------------------------------------------------------------- -Copyright © 2005-2014 Rich Felker, et al. +Copyright © 2005-2020 Rich Felker, et al. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -25,37 +25,88 @@ Authors/contributors include: +A. Wilcox +Ada Worcester +Alex Dowad +Alex Suykov +Alexander Monakov +Andre McCurdy +Andrew Kelley Anthony G. Basile +Aric Belsito Arvid Picciani +Bartosz Brachaczek +Benjamin Peterson Bobby Bingham Boris Brezillon Brent Cook Chris Spiegel Clément Vasseur +Daniel Micay +Daniel Sabogal +Daurnimator +David Carlier +David Edelsohn +Denys Vlasenko +Dmitry Ivanov +Dmitry V. 
Levin +Drew DeVault Emil Renner Berthing +Fangrui Song +Felix Fietkau +Felix Janda +Gianluca Anzolin +Hauke Mehrtens +He X Hiltjo Posthuma Isaac Dunham +Jaydeep Patil Jens Gustedt Jeremy Huntwork +Jo-Philipp Wich +Joakim Sindholt John Spencer +Julien Ramseier Justin Cormack +Kaarle Ritvanen +Khem Raj +Kylie McClain +Leah Neukirchen Luca Barbato Luka Perkov M Farkas-Dyck (Strake) +Mahesh Bodapati +Markus Wichmann +Masanori Ogino +Michael Clark Michael Forney +Mikhail Kremnyov +Natanael Copa Nicholas J. Kain orc Pascal Cuoq +Patrick Oppenlander +Petr Hosek +Petr Skocik Pierre Carrier +Reini Urban Rich Felker Richard Pennington +Ryan Fairfax +Samuel Holland +Segev Finer +Shiz sin Solar Designer Stefan Kristiansson +Stefan O'Rear Szabolcs Nagy Timo Teräs +Trutz Behn Valentin Ochs +Will Dietz William Haddon +William Pitcock Portions of this software are derived from third-party works licensed under terms compatible with the above MIT license: @@ -71,18 +122,22 @@ Copyright © 2003-2011 David Schultz or Copyright © 2003-2009 Steven G. Kargl or Copyright © 2003-2009 Bruce D. Evans or -Copyright © 2008 Stephen L. Moshier +Copyright © 2008 Stephen L. Moshier or +Copyright © 2017-2018 Arm Limited and labelled as such in comments in the individual source files. All have been licensed under extremely permissive terms. -The ARM memcpy code (src/string/armel/memcpy.s) is Copyright © 2008 +The ARM memcpy code (src/string/arm/memcpy.S) is Copyright © 2008 The Android Open Source Project and is licensed under a two-clause BSD license. It was taken from Bionic libc, used on Android. -The implementation of DES for crypt (src/misc/crypt_des.c) is +The AArch64 memcpy and memset code (src/string/aarch64/*) are +Copyright © 1999-2019, Arm Limited. + +The implementation of DES for crypt (src/crypt/crypt_des.c) is Copyright © 1994 David Burren. It is licensed under a BSD license. -The implementation of blowfish crypt (src/misc/crypt_blowfish.c) was +The implementation of blowfish crypt (src/crypt/crypt_blowfish.c) was originally written by Solar Designer and placed into the public domain. The code also comes with a fallback permissive license for use in jurisdictions that may not recognize the public domain. @@ -90,22 +145,17 @@ The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011 Valentin Ochs and is licensed under an MIT-style license. -The BSD PRNG implementation (src/prng/random.c) and XSI search API -(src/search/*.c) functions are Copyright © 2011 Szabolcs Nagy and -licensed under following terms: "Permission to use, copy, modify, -and/or distribute this code for any purpose with or without fee is -hereby granted. There is no warranty." - -The x86_64 port was written by Nicholas J. Kain. Several files (crt) -were released into the public domain; others are licensed under the -standard MIT license terms at the top of this file. See individual -files for their copyright status. +The x86_64 port was written by Nicholas J. Kain and is licensed under +the standard MIT terms. The mips and microblaze ports were originally written by Richard Pennington for use in the ellcc project. The original code was adapted by Rich Felker for build system and code conventions during upstream integration. It is licensed under the standard MIT terms. +The mips64 port was contributed by Imagination Technologies and is +licensed under the standard MIT terms. + The powerpc port was also originally written by Richard Pennington, and later supplemented and integrated by John Spencer. 
It is licensed under the standard MIT terms. @@ -118,15 +168,26 @@ omission of copyright and license comments in each file is in the interest of source tree size. -All public header files (include/* and arch/*/bits/*) should be -treated as Public Domain as they intentionally contain no content -which can be covered by copyright. Some source modules may fall in -this category as well. If you believe that a file is so trivial that -it should be in the Public Domain, please contact the authors and -request an explicit statement releasing it from copyright. +In addition, permission is hereby granted for all public header files +(include/* and arch/*/bits/*) and crt files intended to be linked into +applications (crt/*, ldso/dlstart.c, and arch/*/crt_arch.h) to omit +the copyright notice and permission notice otherwise required by the +license, and to use these files without any requirement of +attribution. These files include substantial contributions from: + +Bobby Bingham +John Spencer +Nicholas J. Kain +Rich Felker +Richard Pennington +Stefan Kristiansson +Szabolcs Nagy -The following files are trivial, believed not to be copyrightable in -the first place, and hereby explicitly released to the Public Domain: +all of whom have explicitly granted such permission. -All public headers: include/*, arch/*/bits/* -Startup files: crt/* +This file previously contained text expressing a belief that most of +the files covered by the above exception were sufficiently trivial not +to be subject to copyright, resulting in confusion over whether it +negated the permissions granted in the license. In the spirit of +permissive licensing, and of not having licensing issues being an +obstacle to adoption, that text has been removed. \ No newline at end of file diff -Nru pandas-2.1.4+dfsg/LICENSES/NUMPY_LICENSE pandas-2.2.2+dfsg/LICENSES/NUMPY_LICENSE --- pandas-2.1.4+dfsg/LICENSES/NUMPY_LICENSE 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/NUMPY_LICENSE 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,4 @@ -Copyright (c) 2005-2011, NumPy Developers. +Copyright (c) 2005-2023, NumPy Developers. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -27,4 +27,4 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff -Nru pandas-2.1.4+dfsg/LICENSES/OTHER pandas-2.2.2+dfsg/LICENSES/OTHER --- pandas-2.1.4+dfsg/LICENSES/OTHER 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/OTHER 1970-01-01 00:00:00.000000000 +0000 @@ -1,57 +0,0 @@ -Bottleneck license ------------------- - -Copyright (c) 2010-2012 Archipel Asset Management AB. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -Pyperclip v1.3 license ----------------------- - -Copyright (c) 2010, Albert Sweigart -All rights reserved. - -BSD-style license: - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the pyperclip nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY Albert Sweigart "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL Albert Sweigart BE LIABLE FOR ANY -DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff -Nru pandas-2.1.4+dfsg/LICENSES/PACKAGING_LICENSE pandas-2.2.2+dfsg/LICENSES/PACKAGING_LICENSE --- pandas-2.1.4+dfsg/LICENSES/PACKAGING_LICENSE 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/PACKAGING_LICENSE 2024-04-10 17:42:52.000000000 +0000 @@ -199,4 +199,4 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff -Nru pandas-2.1.4+dfsg/LICENSES/PSF_LICENSE pandas-2.2.2+dfsg/LICENSES/PSF_LICENSE --- pandas-2.1.4+dfsg/LICENSES/PSF_LICENSE 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/PSF_LICENSE 2024-04-10 17:42:52.000000000 +0000 @@ -2,25 +2,24 @@ ========================== Python was created in the early 1990s by Guido van Rossum at Stichting -Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands +Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands as a successor of a language called ABC. 
Guido remains Python's principal author, although it includes many contributions from others. In 1995, Guido continued his work on Python at the Corporation for -National Research Initiatives (CNRI, see http://www.cnri.reston.va.us) +National Research Initiatives (CNRI, see https://www.cnri.reston.va.us) in Reston, Virginia where he released several versions of the software. In May 2000, Guido and the Python core development team moved to BeOpen.com to form the BeOpen PythonLabs team. In October of the same -year, the PythonLabs team moved to Digital Creations (now Zope -Corporation, see http://www.zope.com). In 2001, the Python Software -Foundation (PSF, see http://www.python.org/psf/) was formed, a -non-profit organization created specifically to own Python-related -Intellectual Property. Zope Corporation is a sponsoring member of -the PSF. +year, the PythonLabs team moved to Digital Creations, which became +Zope Corporation. In 2001, the Python Software Foundation (PSF, see +https://www.python.org/psf/) was formed, a non-profit organization +created specifically to own Python-related Intellectual Property. +Zope Corporation was a sponsoring member of the PSF. -All Python releases are Open Source (see http://www.opensource.org for +All Python releases are Open Source (see https://opensource.org for the Open Source Definition). Historically, most, but not all, Python releases have also been GPL-compatible; the table below summarizes the various releases. @@ -36,34 +35,9 @@ 2.1 2.0+1.6.1 2001 PSF no 2.0.1 2.0+1.6.1 2001 PSF yes 2.1.1 2.1+2.0.1 2001 PSF yes - 2.2 2.1.1 2001 PSF yes 2.1.2 2.1.1 2002 PSF yes 2.1.3 2.1.2 2002 PSF yes - 2.2.1 2.2 2002 PSF yes - 2.2.2 2.2.1 2002 PSF yes - 2.2.3 2.2.2 2003 PSF yes - 2.3 2.2.2 2002-2003 PSF yes - 2.3.1 2.3 2002-2003 PSF yes - 2.3.2 2.3.1 2002-2003 PSF yes - 2.3.3 2.3.2 2002-2003 PSF yes - 2.3.4 2.3.3 2004 PSF yes - 2.3.5 2.3.4 2005 PSF yes - 2.4 2.3 2004 PSF yes - 2.4.1 2.4 2005 PSF yes - 2.4.2 2.4.1 2005 PSF yes - 2.4.3 2.4.2 2006 PSF yes - 2.4.4 2.4.3 2006 PSF yes - 2.5 2.4 2006 PSF yes - 2.5.1 2.5 2007 PSF yes - 2.5.2 2.5.1 2008 PSF yes - 2.5.3 2.5.2 2008 PSF yes - 2.6 2.5 2008 PSF yes - 2.6.1 2.6 2008 PSF yes - 2.6.2 2.6.1 2009 PSF yes - 2.6.3 2.6.2 2009 PSF yes - 2.6.4 2.6.3 2009 PSF yes - 2.6.5 2.6.4 2010 PSF yes - 2.7 2.6 2010 PSF yes + 2.2 and above 2.1.1 2001-now PSF yes Footnotes: @@ -85,6 +59,17 @@ B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON =============================================================== +Python software and documentation are licensed under the +Python Software Foundation License Version 2. + +Starting with Python 3.8.6, examples, recipes, and other code in +the documentation are dual licensed under the PSF License Version 2 +and the Zero-Clause BSD license. + +Some software incorporated into Python is under different licenses. +The licenses are listed with code falling under that license. + + PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 -------------------------------------------- @@ -98,9 +83,10 @@ analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use Python alone or in any derivative version, provided, however, that PSF's License Agreement and PSF's notice of copyright, -i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 -Python Software Foundation; All Rights Reserved" are retained in Python alone or -in any derivative version prepared by Licensee. 
+i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation; +All Rights Reserved" are retained in Python alone or in any derivative version +prepared by Licensee. 3. In the event Licensee prepares a derivative work that is based on or incorporates Python or any part thereof, and wants to make @@ -205,9 +191,9 @@ Agreement, Licensee may substitute the following text (omitting the quotes): "Python 1.6.1 is made available subject to the terms and conditions in CNRI's License Agreement. This Agreement together with -Python 1.6.1 may be located on the Internet using the following +Python 1.6.1 may be located on the internet using the following unique, persistent identifier (known as a handle): 1895.22/1013. This -Agreement may also be obtained from a proxy server on the Internet +Agreement may also be obtained from a proxy server on the internet using the following URL: http://hdl.handle.net/1895.22/1013". 3. In the event Licensee prepares a derivative work that is based on @@ -277,3 +263,17 @@ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +ZERO-CLAUSE BSD LICENSE FOR CODE IN THE PYTHON DOCUMENTATION +---------------------------------------------------------------------- + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. diff -Nru pandas-2.1.4+dfsg/LICENSES/PYPERCLIP_LICENSE pandas-2.2.2+dfsg/LICENSES/PYPERCLIP_LICENSE --- pandas-2.1.4+dfsg/LICENSES/PYPERCLIP_LICENSE 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/PYPERCLIP_LICENSE 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,27 @@ +Copyright (c) 2014, Al Sweigart +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the {organization} nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff -Nru pandas-2.1.4+dfsg/LICENSES/PYUPGRADE_LICENSE pandas-2.2.2+dfsg/LICENSES/PYUPGRADE_LICENSE --- pandas-2.1.4+dfsg/LICENSES/PYUPGRADE_LICENSE 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/PYUPGRADE_LICENSE 2024-04-10 17:42:52.000000000 +0000 @@ -16,4 +16,4 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. +THE SOFTWARE. \ No newline at end of file diff -Nru pandas-2.1.4+dfsg/LICENSES/SAS7BDAT_LICENSE pandas-2.2.2+dfsg/LICENSES/SAS7BDAT_LICENSE --- pandas-2.1.4+dfsg/LICENSES/SAS7BDAT_LICENSE 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/SAS7BDAT_LICENSE 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,4 @@ -Copyright (c) 2015 Jared Hobbs +Copyright (c) 2015-2019 Jared Hobbs Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -16,4 +16,4 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +SOFTWARE. \ No newline at end of file diff -Nru pandas-2.1.4+dfsg/LICENSES/SCIPY_LICENSE pandas-2.2.2+dfsg/LICENSES/SCIPY_LICENSE --- pandas-2.1.4+dfsg/LICENSES/SCIPY_LICENSE 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/SCIPY_LICENSE 1970-01-01 00:00:00.000000000 +0000 @@ -1,31 +0,0 @@ -Copyright (c) 2001, 2002 Enthought, Inc. -All rights reserved. - -Copyright (c) 2003-2012 SciPy Developers. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - a. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - b. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - c. Neither the name of Enthought nor the names of the SciPy Developers - may be used to endorse or promote products derived from this software - without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -DAMAGE. - diff -Nru pandas-2.1.4+dfsg/LICENSES/ULTRAJSON_LICENSE pandas-2.2.2+dfsg/LICENSES/ULTRAJSON_LICENSE --- pandas-2.1.4+dfsg/LICENSES/ULTRAJSON_LICENSE 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/ULTRAJSON_LICENSE 2024-04-10 17:42:52.000000000 +0000 @@ -1,21 +1,22 @@ -Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom +Developed by ESN, an Electronic Arts Inc. studio. +Copyright (c) 2014, Electronic Arts Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the ESN Social Software AB nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of ESN, Electronic Arts Inc. nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +DISCLAIMED. IN NO EVENT SHALL ELECTRONIC ARTS INC. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -23,12 +24,91 @@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +---- Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. -Numeric decoder derived from TCL library -http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms + Copyright 2005, 2006, 2007 + Nick Galbreath -- nickg [at] modp [dot] com + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + Neither the name of the modp.com nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + This is the standard "new" BSD license: + http://www.opensource.org/licenses/bsd-license.php + +https://github.com/client9/stringencoders/blob/cfd5c1507325ae497ea9bacdacba12c0ffd79d30/COPYING + +---- + +Numeric decoder derived from from TCL library +https://opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. + + This software is copyrighted by the Regents of the University of + California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState + Corporation and other parties. The following terms apply to all files + associated with the software unless explicitly disclaimed in + individual files. + + The authors hereby grant permission to use, copy, modify, distribute, + and license this software and its documentation for any purpose, provided + that existing copyright notices are retained in all copies and that this + notice is included verbatim in any distributions. No written agreement, + license, or royalty fee is required for any of the authorized uses. + Modifications to this software may be copyrighted by their authors + and need not follow the licensing terms described here, provided that + the new terms are clearly indicated on the first page of each file where + they apply. + + IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY + FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY + DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE + IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE + NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR + MODIFICATIONS. 
+ + GOVERNMENT USE: If you are acquiring this software on behalf of the + U.S. government, the Government shall have only "Restricted Rights" + in the software and related documentation as defined in the Federal + Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you + are acquiring the software on behalf of the Department of Defense, the + software shall be classified as "Commercial Computer Software" and the + Government shall have only "Restricted Rights" as defined in Clause + 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the + authors grant the U.S. Government and others acting in its behalf + permission to use and distribute the software in accordance with the + terms specified in this license. \ No newline at end of file diff -Nru pandas-2.1.4+dfsg/LICENSES/XARRAY_LICENSE pandas-2.2.2+dfsg/LICENSES/XARRAY_LICENSE --- pandas-2.1.4+dfsg/LICENSES/XARRAY_LICENSE 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/LICENSES/XARRAY_LICENSE 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,3 @@ -Copyright 2014-2019, xarray Developers - --------------------------------------------------------------------------------- - Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -192,4 +188,4 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and - limitations under the License. + limitations under the License. \ No newline at end of file diff -Nru pandas-2.1.4+dfsg/asv_bench/asv.conf.json pandas-2.2.2+dfsg/asv_bench/asv.conf.json --- pandas-2.1.4+dfsg/asv_bench/asv.conf.json 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/asv.conf.json 2024-04-10 17:42:52.000000000 +0000 @@ -41,7 +41,7 @@ // pip (with all the conda available packages installed first, // followed by the pip installed packages). 
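The remainder of this diff modifies the asv benchmark suite under asv_bench/. For orientation, an asv benchmark is an ordinary Python class: asv calls setup() once per parameter combination and then times every method whose name starts with time_ (methods starting with peakmem_ track peak memory instead). The class below is an illustrative sketch only, not part of the patch — the name ExampleBenchmark does not appear in pandas — but it follows the conventions visible in the hunks that follow:

    import numpy as np
    import pandas as pd

    class ExampleBenchmark:
        # asv runs the benchmark once per combination in `params`,
        # passing each value to setup() and to the measured methods.
        params = [["int64", "float64"]]
        param_names = ["dtype"]

        def setup(self, dtype):
            # Expensive object construction belongs here so it is
            # excluded from the measured time.
            self.ser = pd.Series(np.arange(10**5, dtype=dtype))

        def time_sum(self, dtype):
            self.ser.sum()

        def peakmem_to_list(self, dtype):
            self.ser.tolist()
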
"matrix": { - "Cython": ["0.29.33"], + "Cython": ["3.0.5"], "matplotlib": [], "sqlalchemy": [], "scipy": [], diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/algorithms.py pandas-2.2.2+dfsg/asv_bench/benchmarks/algorithms.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/algorithms.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/algorithms.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,8 +4,6 @@ import pandas as pd -from .pandas_vb_common import tm - for imp in ["pandas.util", "pandas.tools.hashing"]: try: hashing = import_module(imp) @@ -19,9 +17,9 @@ [True, False], [True, False], [ - "int", - "uint", - "float", + "int64", + "uint64", + "float64", "object", "object_str", "datetime64[ns]", @@ -35,28 +33,27 @@ def setup(self, unique, sort, dtype): N = 10**5 - string_index = tm.makeStringIndex(N) - string_arrow = None - if dtype == "string[pyarrow]": - try: - string_arrow = pd.array(string_index, dtype="string[pyarrow]") - except ImportError: - raise NotImplementedError - - data = { - "int": pd.Index(np.arange(N), dtype="int64"), - "uint": pd.Index(np.arange(N), dtype="uint64"), - "float": pd.Index(np.random.randn(N), dtype="float64"), - "object_str": string_index, - "object": pd.Index(np.arange(N), dtype="object"), - "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), - "datetime64[ns, tz]": pd.date_range( - "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" - ), - "Int64": pd.array(np.arange(N), dtype="Int64"), - "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"), - "string[pyarrow]": string_arrow, - }[dtype] + + if dtype in ["int64", "uint64", "Int64", "object"]: + data = pd.Index(np.arange(N), dtype=dtype) + elif dtype == "float64": + data = pd.Index(np.random.randn(N), dtype=dtype) + elif dtype == "boolean": + data = pd.array(np.random.randint(0, 2, N), dtype=dtype) + elif dtype == "datetime64[ns]": + data = pd.date_range("2011-01-01", freq="h", periods=N) + elif dtype == "datetime64[ns, tz]": + data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo") + elif dtype == "object_str": + data = pd.Index([f"i-{i}" for i in range(N)], dtype=object) + elif dtype == "string[pyarrow]": + data = pd.array( + pd.Index([f"i-{i}" for i in range(N)], dtype=object), + dtype="string[pyarrow]", + ) + else: + raise NotImplementedError + if not unique: data = data.repeat(5) self.data = data @@ -72,22 +69,35 @@ params = [ [True, False], ["first", "last", False], - ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"], + [ + "int64", + "uint64", + "float64", + "string", + "datetime64[ns]", + "datetime64[ns, tz]", + "timestamp[ms][pyarrow]", + "duration[s][pyarrow]", + ], ] param_names = ["unique", "keep", "dtype"] def setup(self, unique, keep, dtype): N = 10**5 - data = { - "int": pd.Index(np.arange(N), dtype="int64"), - "uint": pd.Index(np.arange(N), dtype="uint64"), - "float": pd.Index(np.random.randn(N), dtype="float64"), - "string": tm.makeStringIndex(N), - "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), - "datetime64[ns, tz]": pd.date_range( - "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" - ), - }[dtype] + if dtype in ["int64", "uint64"]: + data = pd.Index(np.arange(N), dtype=dtype) + elif dtype == "float64": + data = pd.Index(np.random.randn(N), dtype="float64") + elif dtype == "string": + data = pd.Index([f"i-{i}" for i in range(N)], dtype=object) + elif dtype == "datetime64[ns]": + data = pd.date_range("2011-01-01", freq="h", periods=N) + elif dtype == "datetime64[ns, tz]": + data = 
pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo") + elif dtype in ["timestamp[ms][pyarrow]", "duration[s][pyarrow]"]: + data = pd.Index(np.arange(N), dtype=dtype) + else: + raise NotImplementedError if not unique: data = data.repeat(5) self.idx = data @@ -127,7 +137,9 @@ df = pd.DataFrame( { "strings": pd.Series( - tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N)) + pd.Index([f"i-{i}" for i in range(10000)], dtype=object).take( + np.random.randint(0, 10000, size=N) + ) ), "floats": np.random.randn(N), "ints": np.arange(N), @@ -165,21 +177,22 @@ params = [ [0, 0.5, 1], ["linear", "nearest", "lower", "higher", "midpoint"], - ["float", "int", "uint"], + ["float64", "int64", "uint64"], ] param_names = ["quantile", "interpolation", "dtype"] def setup(self, quantile, interpolation, dtype): N = 10**5 - data = { - "int": np.arange(N), - "uint": np.arange(N).astype(np.uint64), - "float": np.random.randn(N), - } - self.idx = pd.Series(data[dtype].repeat(5)) + if dtype in ["int64", "uint64"]: + data = np.arange(N, dtype=dtype) + elif dtype == "float64": + data = np.random.randn(N) + else: + raise NotImplementedError + self.ser = pd.Series(data.repeat(5)) def time_quantile(self, quantile, interpolation, dtype): - self.idx.quantile(quantile, interpolation=interpolation) + self.ser.quantile(quantile, interpolation=interpolation) class SortIntegerArray: diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/algos/isin.py pandas-2.2.2+dfsg/asv_bench/benchmarks/algos/isin.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/algos/isin.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/algos/isin.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,8 +8,6 @@ date_range, ) -from ..pandas_vb_common import tm - class IsIn: params = [ @@ -60,7 +58,9 @@ elif dtype in ["str", "string[python]", "string[pyarrow]"]: try: - self.series = Series(tm.makeStringIndex(N), dtype=dtype) + self.series = Series( + Index([f"i-{i}" for i in range(N)], dtype=object), dtype=dtype + ) except ImportError: raise NotImplementedError self.values = list(self.series[:2]) diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/arithmetic.py pandas-2.2.2+dfsg/asv_bench/benchmarks/arithmetic.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/arithmetic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/arithmetic.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,13 +6,12 @@ import pandas as pd from pandas import ( DataFrame, + Index, Series, Timestamp, date_range, to_timedelta, ) -import pandas._testing as tm -from pandas.core.algorithms import checked_add_with_arr from .pandas_vb_common import numeric_dtypes @@ -262,7 +261,7 @@ def setup(self, tz): N = 10**6 halfway = (N // 2) - 1 - self.s = Series(date_range("20010101", periods=N, freq="T", tz=tz)) + self.s = Series(date_range("20010101", periods=N, freq="min", tz=tz)) self.ts = self.s[halfway] self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz)) @@ -323,8 +322,10 @@ def setup(self, dtype): N = 10**6 - indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"} - self.index = getattr(tm, indexes[dtype])(N) + if dtype == "float": + self.index = Index(np.arange(N), dtype=np.float64) + elif dtype == "int": + self.index = Index(np.arange(N), dtype=np.int64) def time_add(self, dtype): self.index + 2 @@ -387,42 +388,6 @@ df["timedelta"] + df["timedelta"] -class AddOverflowScalar: - params = [1, -1, 0] - param_names = ["scalar"] - - def setup(self, scalar): - N = 10**6 - self.arr = np.arange(N) - - def 
time_add_overflow_scalar(self, scalar): - checked_add_with_arr(self.arr, scalar) - - -class AddOverflowArray: - def setup(self): - N = 10**6 - self.arr = np.arange(N) - self.arr_rev = np.arange(-N, 0) - self.arr_mixed = np.array([1, -1]).repeat(N / 2) - self.arr_nan_1 = np.random.choice([True, False], size=N) - self.arr_nan_2 = np.random.choice([True, False], size=N) - - def time_add_overflow_arr_rev(self): - checked_add_with_arr(self.arr, self.arr_rev) - - def time_add_overflow_arr_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) - - def time_add_overflow_b_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, b_mask=self.arr_nan_1) - - def time_add_overflow_both_arg_nan(self): - checked_add_with_arr( - self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, b_mask=self.arr_nan_2 - ) - - hcal = pd.tseries.holiday.USFederalHolidayCalendar() # These offsets currently raise a NotImplementedError with .apply_index() non_apply = [ @@ -460,7 +425,7 @@ def setup(self, offset): N = 10000 - rng = date_range(start="1/1/2000", periods=N, freq="T") + rng = date_range(start="1/1/2000", periods=N, freq="min") self.rng = rng self.ser = Series(rng) @@ -479,7 +444,7 @@ def setup(self, offset): N = 10000 - rng = date_range(start="1/1/2000", periods=N, freq="T") + rng = date_range(start="1/1/2000", periods=N, freq="min") self.rng = rng def time_apply_index(self, offset): @@ -491,7 +456,7 @@ param_names = ["func"] def setup(self, func): - array = date_range("20200101 00:00", "20200102 0:00", freq="S") + array = date_range("20200101 00:00", "20200102 0:00", freq="s") level_0_names = [str(i) for i in range(30)] index = pd.MultiIndex.from_product([level_0_names, array]) diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/array.py pandas-2.2.2+dfsg/asv_bench/benchmarks/array.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/array.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/array.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,9 +31,9 @@ class IntegerArray: def setup(self): N = 250_000 - self.values_integer = np.array([1, 0, 1, 0] * N) - self.data = np.array([1, 2, 3, 4] * N, dtype="int64") - self.mask = np.array([False, False, True, False] * N) + self.values_integer = np.tile(np.array([1, 0, 1, 0]), N) + self.data = np.tile(np.array([1, 2, 3, 4], dtype="int64"), N) + self.mask = np.tile(np.array([False, False, True, False]), N) def time_constructor(self): pd.arrays.IntegerArray(self.data, self.mask) @@ -90,7 +90,7 @@ self.array[i] = "foo" def time_setitem_list(self, multiple_chunks): - indexer = list(range(0, 50)) + list(range(-1000, 0, 50)) + indexer = list(range(50)) + list(range(-1000, 0, 50)) self.array[indexer] = ["foo"] * len(indexer) def time_setitem_slice(self, multiple_chunks): diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/categoricals.py pandas-2.2.2+dfsg/asv_bench/benchmarks/categoricals.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/categoricals.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/categoricals.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,8 +6,6 @@ import pandas as pd -from .pandas_vb_common import tm - try: from pandas.api.types import union_categoricals except ImportError: @@ -189,7 +187,7 @@ N = 10**5 ncats = 15 - self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) + self.s_str = pd.Series(np.random.randint(0, ncats, size=N).astype(str)) self.s_str_cat = pd.Series(self.s_str, dtype="category") with warnings.catch_warnings(record=True): str_cat_type = 
pd.CategoricalDtype(set(self.s_str), ordered=True) @@ -242,7 +240,7 @@ class Contains: def setup(self): N = 10**5 - self.ci = tm.makeCategoricalIndex(N) + self.ci = pd.CategoricalIndex(np.arange(N)) self.c = self.ci.values self.key = self.ci.categories[0] @@ -260,18 +258,16 @@ def setup(self, index): N = 10**6 categories = ["a", "b", "c"] - values = [0] * N + [1] * N + [2] * N if index == "monotonic_incr": - self.data = pd.Categorical.from_codes(values, categories=categories) + codes = np.repeat([0, 1, 2], N) elif index == "monotonic_decr": - self.data = pd.Categorical.from_codes( - list(reversed(values)), categories=categories - ) + codes = np.repeat([2, 1, 0], N) elif index == "non_monotonic": - self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories) + codes = np.tile([0, 1, 2], N) else: raise ValueError(f"Invalid index param: {index}") + self.data = pd.Categorical.from_codes(codes, categories=categories) self.scalar = 10000 self.list = list(range(10000)) self.cat_scalar = "b" @@ -327,7 +323,7 @@ class SearchSorted: def setup(self): N = 10**5 - self.ci = tm.makeCategoricalIndex(N).sort_values() + self.ci = pd.CategoricalIndex(np.arange(N)).sort_values() self.c = self.ci.values self.key = self.ci.categories[1] diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/ctors.py pandas-2.2.2+dfsg/asv_bench/benchmarks/ctors.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/ctors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/ctors.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,8 +9,6 @@ date_range, ) -from .pandas_vb_common import tm - def no_change(arr): return arr @@ -115,7 +113,7 @@ class MultiIndexConstructor: def setup(self): N = 10**4 - self.iterables = [tm.makeStringIndex(N), range(20)] + self.iterables = [Index([f"i-{i}" for i in range(N)], dtype=object), range(20)] def time_multiindex_from_iterables(self): MultiIndex.from_product(self.iterables) diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/dtypes.py pandas-2.2.2+dfsg/asv_bench/benchmarks/dtypes.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/dtypes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/dtypes.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,7 +3,10 @@ import numpy as np import pandas as pd -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm from pandas.api.types import ( is_extension_array_dtype, @@ -73,8 +76,8 @@ def setup(self, dtype): N, K = 5000, 50 - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) + self.index = Index([f"i-{i}" for i in range(N)], dtype=object) + self.columns = Index([f"i-{i}" for i in range(K)], dtype=object) def create_df(data): return DataFrame(data, index=self.index, columns=self.columns) diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/eval.py pandas-2.2.2+dfsg/asv_bench/benchmarks/eval.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/eval.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/eval.py 2024-04-10 17:42:52.000000000 +0000 @@ -44,7 +44,7 @@ def setup(self): N = 10**6 halfway = (N // 2) - 1 - index = pd.date_range("20010101", periods=N, freq="T") + index = pd.date_range("20010101", periods=N, freq="min") s = pd.Series(index) self.ts = s.iloc[halfway] self.df = pd.DataFrame({"a": np.random.randn(N), "dates": index}, index=index) diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/frame_ctor.py pandas-2.2.2+dfsg/asv_bench/benchmarks/frame_ctor.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/frame_ctor.py 2023-12-08 
14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/frame_ctor.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,8 +12,6 @@ date_range, ) -from .pandas_vb_common import tm - try: from pandas.tseries.offsets import ( Hour, @@ -30,8 +28,8 @@ class FromDicts: def setup(self): N, K = 5000, 50 - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) + self.index = pd.Index([f"i-{i}" for i in range(N)], dtype=object) + self.columns = pd.Index([f"i-{i}" for i in range(K)], dtype=object) frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) self.data = frame.to_dict() self.dict_list = frame.to_dict(orient="records") diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/frame_methods.py pandas-2.2.2+dfsg/asv_bench/benchmarks/frame_methods.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/frame_methods.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/frame_methods.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,7 @@ from pandas import ( DataFrame, + Index, MultiIndex, NaT, Series, @@ -14,8 +15,6 @@ timedelta_range, ) -from .pandas_vb_common import tm - class AsType: params = [ @@ -439,9 +438,9 @@ N, M = 10000, 100 if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"): data = { - "datetime64[ns]": date_range("2011-01-01", freq="H", periods=N), + "datetime64[ns]": date_range("2011-01-01", freq="h", periods=N), "datetime64[ns, tz]": date_range( - "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" + "2011-01-01", freq="h", periods=N, tz="Asia/Tokyo" ), "timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"), } @@ -640,7 +639,8 @@ class SeriesNuniqueWithNan: def setup(self): - self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float) + values = 100 * [np.nan] + list(range(100)) + self.ser = Series(np.tile(values, 10000), dtype=float) def time_series_nunique_nan(self): self.ser.nunique() @@ -649,7 +649,7 @@ class Duplicated: def setup(self): n = 1 << 20 - t = date_range("2015-01-01", freq="S", periods=(n // 64)) + t = date_range("2015-01-01", freq="s", periods=(n // 64)) xs = np.random.randn(n // 64).round(2) self.df = DataFrame( { @@ -693,20 +693,34 @@ self.df.sort_values(by="A", ascending=ascending) -class SortIndexByColumns: - def setup(self): +class SortMultiKey: + params = [True, False] + param_names = ["monotonic"] + + def setup(self, monotonic): N = 10000 K = 10 - self.df = DataFrame( + df = DataFrame( { - "key1": tm.makeStringIndex(N).values.repeat(K), - "key2": tm.makeStringIndex(N).values.repeat(K), + "key1": Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat( + K + ), + "key2": Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat( + K + ), "value": np.random.randn(N * K), } ) + if monotonic: + df = df.sort_values(["key1", "key2"]) + self.df_by_columns = df + self.df_by_index = df.set_index(["key1", "key2"]) + + def time_sort_values(self, monotonic): + self.df_by_columns.sort_values(by=["key1", "key2"]) - def time_frame_sort_values_by_columns(self): - self.df.sort_values(by=["key1", "key2"]) + def time_sort_index(self, monotonic): + self.df_by_index.sort_index() class Quantile: diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/gil.py pandas-2.2.2+dfsg/asv_bench/benchmarks/gil.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/gil.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/gil.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,7 @@ from pandas import ( DataFrame, + Index, Series, date_range, factorize, @@ 
-12,8 +13,6 @@ ) from pandas.core.algorithms import take_nd -from .pandas_vb_common import tm - try: from pandas import ( rolling_kurt, @@ -34,7 +33,6 @@ except ImportError: from pandas import algos - from .pandas_vb_common import BaseIO # isort:skip @@ -178,7 +176,7 @@ class ParallelDatetimeFields: def setup(self): N = 10**6 - self.dti = date_range("1900-01-01", periods=N, freq="T") + self.dti = date_range("1900-01-01", periods=N, freq="min") self.period = self.dti.to_period("D") def time_datetime_field_year(self): @@ -212,7 +210,7 @@ def time_datetime_to_period(self): @test_parallel(num_threads=2) def run(dti): - dti.to_period("S") + dti.to_period("s") run(self.dti) @@ -272,18 +270,20 @@ def setup(self, dtype): rows = 10000 cols = 50 - data = { - "float": DataFrame(np.random.randn(rows, cols)), - "datetime": DataFrame( + if dtype == "float": + df = DataFrame(np.random.randn(rows, cols)) + elif dtype == "datetime": + df = DataFrame( np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows) - ), - "object": DataFrame( + ) + elif dtype == "object": + df = DataFrame( "foo", index=range(rows), columns=["object%03d" for _ in range(5)] - ), - } + ) + else: + raise NotImplementedError self.fname = f"__test_{dtype}__.csv" - df = data[dtype] df.to_csv(self.fname) @test_parallel(num_threads=2) @@ -303,7 +303,7 @@ param_names = ["threads"] def setup(self, threads): - strings = tm.makeStringIndex(100000) + strings = Index([f"i-{i}" for i in range(100000)], dtype=object) @test_parallel(num_threads=threads) def parallel(): diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/groupby.py pandas-2.2.2+dfsg/asv_bench/benchmarks/groupby.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/groupby.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/groupby.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,8 +17,6 @@ to_timedelta, ) -from .pandas_vb_common import tm - method_blocklist = { "object": { "diff", @@ -73,6 +71,8 @@ "ffill", "first", "head", + "idxmax", + "idxmin", "last", "median", "nunique", @@ -165,10 +165,14 @@ "int64_small": Series(np.random.randint(0, 100, size=size)), "int64_large": Series(np.random.randint(0, 10000, size=size)), "object_small": Series( - tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size)) + Index([f"i-{i}" for i in range(100)], dtype=object).take( + np.random.randint(0, 100, size=size) + ) ), "object_large": Series( - tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size)) + Index([f"i-{i}" for i in range(10000)], dtype=object).take( + np.random.randint(0, 10000, size=size) + ) ), } return data @@ -236,7 +240,7 @@ class DateAttributes: def setup(self): - rng = date_range("1/1/2000", "12/31/2005", freq="H") + rng = date_range("1/1/2000", "12/31/2005", freq="h") self.year, self.month, self.day = rng.year, rng.month, rng.day self.ts = Series(np.random.randn(len(rng)), index=rng) @@ -588,6 +592,8 @@ "prod", "min", "max", + "idxmin", + "idxmax", "mean", "median", "var", @@ -709,7 +715,7 @@ if dtype == "datetime64": data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype) else: - data = np.array([1] * N, dtype=dtype) + data = np.ones(N, dtype=dtype) self.df = DataFrame({"values": data, "key": ["foo"] * N}) def time_rank_ties(self, dtype, tie_method): @@ -798,6 +804,51 @@ self.df_extra_cat.groupby("a", observed=observed, sort=False)["b"].count() +class MultipleCategories: + def setup(self): + N = 10**3 + arr = np.random.random(N) + data = { + "a1": Categorical(np.random.randint(10000, size=N)), + "a2": 
Categorical(np.random.randint(10000, size=N)), + "b": arr, + } + self.df = DataFrame(data) + data = { + "a1": Categorical(np.random.randint(10000, size=N), ordered=True), + "a2": Categorical(np.random.randint(10000, size=N), ordered=True), + "b": arr, + } + self.df_ordered = DataFrame(data) + data = { + "a1": Categorical(np.random.randint(100, size=N), categories=np.arange(N)), + "a2": Categorical(np.random.randint(100, size=N), categories=np.arange(N)), + "b": arr, + } + self.df_extra_cat = DataFrame(data) + + def time_groupby_sort(self): + self.df.groupby(["a1", "a2"], observed=False)["b"].count() + + def time_groupby_nosort(self): + self.df.groupby(["a1", "a2"], observed=False, sort=False)["b"].count() + + def time_groupby_ordered_sort(self): + self.df_ordered.groupby(["a1", "a2"], observed=False)["b"].count() + + def time_groupby_ordered_nosort(self): + self.df_ordered.groupby(["a1", "a2"], observed=False, sort=False)["b"].count() + + def time_groupby_extra_cat_sort(self): + self.df_extra_cat.groupby(["a1", "a2"], observed=False)["b"].count() + + def time_groupby_extra_cat_nosort(self): + self.df_extra_cat.groupby(["a1", "a2"], observed=False, sort=False)["b"].count() + + def time_groupby_transform(self): + self.df_extra_cat.groupby(["a1", "a2"], observed=False)["b"].cumsum() + + class Datelike: # GH 14338 params = ["period_range", "date_range", "date_range_tz"] @@ -841,12 +892,29 @@ self.df.groupby(level=[0, 1]).sum() +class SumTimeDelta: + # GH 20660 + def setup(self): + N = 10**4 + self.df = DataFrame( + np.random.randint(1000, 100000, (N, 100)), + index=np.random.randint(200, size=(N,)), + ).astype("timedelta64[ns]") + self.df_int = self.df.copy().astype("int64") + + def time_groupby_sum_timedelta(self): + self.df.groupby(lambda x: x).sum() + + def time_groupby_sum_int(self): + self.df_int.groupby(lambda x: x).sum() + + class Transform: def setup(self): n1 = 400 n2 = 250 index = MultiIndex( - levels=[np.arange(n1), tm.makeStringIndex(n2)], + levels=[np.arange(n1), Index([f"i-{i}" for i in range(n2)], dtype=object)], codes=[np.repeat(range(n1), n2).tolist(), list(range(n2)) * n1], names=["lev1", "lev2"], ) diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/index_cached_properties.py pandas-2.2.2+dfsg/asv_bench/benchmarks/index_cached_properties.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/index_cached_properties.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/index_cached_properties.py 2024-04-10 17:42:52.000000000 +0000 @@ -25,14 +25,14 @@ N = 10**5 if index_type == "MultiIndex": self.idx = pd.MultiIndex.from_product( - [pd.date_range("1/1/2000", freq="T", periods=N // 2), ["a", "b"]] + [pd.date_range("1/1/2000", freq="min", periods=N // 2), ["a", "b"]] ) elif index_type == "DatetimeIndex": - self.idx = pd.date_range("1/1/2000", freq="T", periods=N) + self.idx = pd.date_range("1/1/2000", freq="min", periods=N) elif index_type == "Int64Index": self.idx = pd.Index(range(N), dtype="int64") elif index_type == "PeriodIndex": - self.idx = pd.period_range("1/1/2000", freq="T", periods=N) + self.idx = pd.period_range("1/1/2000", freq="min", periods=N) elif index_type == "RangeIndex": self.idx = pd.RangeIndex(start=0, stop=N) elif index_type == "IntervalIndex": diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/index_object.py pandas-2.2.2+dfsg/asv_bench/benchmarks/index_object.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/index_object.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/index_object.py 2024-04-10 
17:42:52.000000000 +0000 @@ -12,8 +12,6 @@ date_range, ) -from .pandas_vb_common import tm - class SetOperations: params = ( @@ -25,12 +23,12 @@ def setup(self, index_structure, dtype, method): N = 10**5 - dates_left = date_range("1/1/2000", periods=N, freq="T") + dates_left = date_range("1/1/2000", periods=N, freq="min") fmt = "%Y-%m-%d %H:%M:%S" date_str_left = Index(dates_left.strftime(fmt)) int_left = Index(np.arange(N)) ea_int_left = Index(np.arange(N), dtype="Int64") - str_left = tm.makeStringIndex(N) + str_left = Index([f"i-{i}" for i in range(N)], dtype=object) data = { "datetime": dates_left, @@ -155,15 +153,18 @@ def setup(self, dtype): N = 10**6 - self.idx = getattr(tm, f"make{dtype}Index")(N) + if dtype == "String": + self.idx = Index([f"i-{i}" for i in range(N)], dtype=object) + elif dtype == "Float": + self.idx = Index(np.arange(N), dtype=np.float64) + elif dtype == "Int": + self.idx = Index(np.arange(N), dtype=np.int64) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask = Series(self.array_mask) self.sorted = self.idx.sort_values() half = N // 2 self.non_unique = self.idx[:half].append(self.idx[:half]) - self.non_unique_sorted = ( - self.sorted[:half].append(self.sorted[:half]).sort_values() - ) + self.non_unique_sorted = self.sorted[:half].repeat(2) self.key = self.sorted[N // 4] def time_boolean_array(self, dtype): diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/indexing.py pandas-2.2.2+dfsg/asv_bench/benchmarks/indexing.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -22,8 +22,6 @@ period_range, ) -from .pandas_vb_common import tm - class NumericSeriesIndexing: params = [ @@ -124,7 +122,7 @@ def setup(self, index, index_structure): N = 10**6 if index == "string": - index = tm.makeStringIndex(N) + index = Index([f"i-{i}" for i in range(N)], dtype=object) elif index == "datetime": index = date_range("1900", periods=N, freq="s") elif index == "period": @@ -156,8 +154,8 @@ class DataFrameStringIndexing: def setup(self): - index = tm.makeStringIndex(1000) - columns = tm.makeStringIndex(30) + index = Index([f"i-{i}" for i in range(1000)], dtype=object) + columns = Index([f"i-{i}" for i in range(30)], dtype=object) with warnings.catch_warnings(record=True): self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns) self.idx_scalar = index[100] @@ -232,7 +230,7 @@ N = 100000 indexes = { "int": Index(np.arange(N), dtype=np.int64), - "datetime": date_range("2011-01-01", freq="S", periods=N), + "datetime": date_range("2011-01-01", freq="s", periods=N), } index = indexes[index] self.s = Series(np.random.rand(N), index=index) @@ -306,6 +304,10 @@ target = (self.tgt_null_slice, self.tgt_slice) self.df.loc[target, :] + def time_loc_multiindex(self, unique_levels): + target = self.df.index[::10] + self.df.loc[target] + def time_xs_level_0(self, unique_levels): target = self.tgt_scalar self.df.xs(target, level=0) @@ -465,7 +467,7 @@ class AssignTimeseriesIndex: def setup(self): N = 100000 - idx = date_range("1/1/2000", periods=N, freq="H") + idx = date_range("1/1/2000", periods=N, freq="h") self.df = DataFrame(np.random.randn(N, 1), columns=["A"], index=idx) def time_frame_assign_timeseries_index(self): @@ -515,6 +517,18 @@ self.df[[100, 200, 300]] = 100 +class SetitemObjectDtype: + # GH#19299 + + def setup(self): + N = 1000 + cols = 500 + self.df = DataFrame(index=range(N), columns=range(cols), dtype=object) + + def 
time_setitem_object_dtype(self): + self.df.loc[0, 1] = 1.0 + + class ChainIndexing: params = [None, "warn"] param_names = ["mode"] diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/indexing_engines.py pandas-2.2.2+dfsg/asv_bench/benchmarks/indexing_engines.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/indexing_engines.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/indexing_engines.py 2024-04-10 17:42:52.000000000 +0000 @@ -71,14 +71,12 @@ if unique: arr = np.arange(N * 3, dtype=dtype) else: - values = list([1] * N + [2] * N + [3] * N) - arr = np.array(values, dtype=dtype) + arr = np.array([1, 2, 3], dtype=dtype).repeat(N) elif index_type == "monotonic_decr": if unique: arr = np.arange(N * 3, dtype=dtype)[::-1] else: - values = list([1] * N + [2] * N + [3] * N) - arr = np.array(values, dtype=dtype)[::-1] + arr = np.array([3, 2, 1], dtype=dtype).repeat(N) else: assert index_type == "non_monotonic" if unique: @@ -86,7 +84,7 @@ arr[:N] = np.arange(N * 2, N * 3, dtype=dtype) arr[N:] = np.arange(N * 2, dtype=dtype) else: - arr = np.array([1, 2, 3] * N, dtype=dtype) + arr = np.array([1, 2, 3], dtype=dtype).repeat(N) self.data = engine(arr) # code belows avoids populating the mapping etc. while timing. @@ -115,30 +113,29 @@ def setup(self, engine_and_dtype, index_type, unique, N): engine, dtype = engine_and_dtype + dtype = dtype.lower() if index_type == "monotonic_incr": if unique: - arr = np.arange(N * 3, dtype=dtype.lower()) + arr = np.arange(N * 3, dtype=dtype) else: - values = list([1] * N + [2] * N + [3] * N) - arr = np.array(values, dtype=dtype.lower()) + arr = np.array([1, 2, 3], dtype=dtype).repeat(N) mask = np.zeros(N * 3, dtype=np.bool_) elif index_type == "monotonic_decr": if unique: - arr = np.arange(N * 3, dtype=dtype.lower())[::-1] + arr = np.arange(N * 3, dtype=dtype)[::-1] else: - values = list([1] * N + [2] * N + [3] * N) - arr = np.array(values, dtype=dtype.lower())[::-1] + arr = np.array([3, 2, 1], dtype=dtype).repeat(N) mask = np.zeros(N * 3, dtype=np.bool_) else: assert index_type == "non_monotonic" if unique: - arr = np.zeros(N * 3, dtype=dtype.lower()) - arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower()) - arr[N:] = np.arange(N * 2, dtype=dtype.lower()) + arr = np.zeros(N * 3, dtype=dtype) + arr[:N] = np.arange(N * 2, N * 3, dtype=dtype) + arr[N:] = np.arange(N * 2, dtype=dtype) else: - arr = np.array([1, 2, 3] * N, dtype=dtype.lower()) + arr = np.array([1, 2, 3], dtype=dtype).repeat(N) mask = np.zeros(N * 3, dtype=np.bool_) mask[-1] = True diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/inference.py pandas-2.2.2+dfsg/asv_bench/benchmarks/inference.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/inference.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/inference.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,7 @@ import numpy as np from pandas import ( + Index, NaT, Series, date_range, @@ -17,10 +18,7 @@ to_timedelta, ) -from .pandas_vb_common import ( - lib, - tm, -) +from .pandas_vb_common import lib class ToNumeric: @@ -31,7 +29,7 @@ N = 10000 self.float = Series(np.random.randn(N)) self.numstr = self.float.astype("str") - self.str = Series(tm.makeStringIndex(N)) + self.str = Series(Index([f"i-{i}" for i in range(N)], dtype=object)) def time_from_float(self, errors): to_numeric(self.float, errors=errors) @@ -164,7 +162,7 @@ class ToDatetimeISO8601: def setup(self): - rng = date_range(start="1/1/2000", periods=20000, freq="H") + rng = date_range(start="1/1/2000", periods=20000, freq="h") self.strings 
= rng.strftime("%Y-%m-%d %H:%M:%S").tolist() self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist() self.strings_tz_space = [ @@ -276,7 +274,7 @@ # GH 43901 class ToDatetimeInferDatetimeFormat: def setup(self): - rng = date_range(start="1/1/2000", periods=100000, freq="H") + rng = date_range(start="1/1/2000", periods=100000, freq="h") self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() def time_infer_datetime_format(self): diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/io/csv.py pandas-2.2.2+dfsg/asv_bench/benchmarks/io/csv.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/io/csv.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/io/csv.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,6 +10,7 @@ from pandas import ( Categorical, DataFrame, + Index, concat, date_range, period_range, @@ -17,10 +18,7 @@ to_datetime, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class ToCSV(BaseIO): @@ -89,7 +87,7 @@ fname = "__test__.csv" def setup(self): - rng = date_range("2000", periods=100_000, freq="S") + rng = date_range("2000", periods=100_000, freq="s") self.data = DataFrame({"a": 1}, index=rng) def time_frame_date_formatting_index(self): @@ -102,7 +100,7 @@ class ToCSVPeriod(BaseIO): fname = "__test__.csv" - params = ([1000, 10000], ["D", "H"]) + params = ([1000, 10000], ["D", "h"]) param_names = ["nobs", "freq"] def setup(self, nobs, freq): @@ -110,7 +108,7 @@ self.data = DataFrame(rng) if freq == "D": self.default_fmt = "%Y-%m-%d" - elif freq == "H": + elif freq == "h": self.default_fmt = "%Y-%m-%d %H:00" def time_frame_period_formatting_default(self, nobs, freq): @@ -130,7 +128,7 @@ class ToCSVPeriodIndex(BaseIO): fname = "__test__.csv" - params = ([1000, 10000], ["D", "H"]) + params = ([1000, 10000], ["D", "h"]) param_names = ["nobs", "freq"] def setup(self, nobs, freq): @@ -138,7 +136,7 @@ self.data = DataFrame({"a": 1}, index=rng) if freq == "D": self.default_fmt = "%Y-%m-%d" - elif freq == "H": + elif freq == "h": self.default_fmt = "%Y-%m-%d %H:00" def time_frame_period_formatting_index(self, nobs, freq): @@ -253,7 +251,7 @@ iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): - rng = date_range("1/1/2000", periods=50000, freq="S") + rng = date_range("1/1/2000", periods=50000, freq="s") self.StringIO_input = StringIO("\n".join(rng.strftime(self.iso8601).tolist())) def time_read_csv(self): @@ -288,7 +286,7 @@ def setup(self, skiprows, engine): N = 20000 - index = tm.makeStringIndex(N) + index = Index([f"i-{i}" for i in range(N)], dtype=object) df = DataFrame( { "float1": np.random.randn(N), @@ -410,6 +408,9 @@ def time_read_bytescsv(self, engine): read_csv(self.data(self.BytesIO_input), engine=engine) + def peakmem_read_csv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) + class ReadCSVCategorical(BaseIO): fname = "__test__.csv" @@ -621,4 +622,15 @@ ) +class ReadCSVCParserLowMemory: + # GH 16798 + def setup(self): + self.csv = StringIO( + "strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)]) + ) + + def peakmem_over_2gb_input(self): + read_csv(self.csv, engine="c", low_memory=False) + + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/io/excel.py pandas-2.2.2+dfsg/asv_bench/benchmarks/io/excel.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/io/excel.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/io/excel.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,12 +12,11 @@ from pandas import ( 
DataFrame, ExcelWriter, + Index, date_range, read_excel, ) -from ..pandas_vb_common import tm - def _generate_dataframe(): N = 2000 @@ -25,9 +24,9 @@ df = DataFrame( np.random.randn(N, C), columns=[f"float{i}" for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), + index=date_range("20000101", periods=N, freq="h"), ) - df["object"] = tm.makeStringIndex(N) + df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object) return df diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/io/hdf.py pandas-2.2.2+dfsg/asv_bench/benchmarks/io/hdf.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/io/hdf.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/io/hdf.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,20 +3,18 @@ from pandas import ( DataFrame, HDFStore, + Index, date_range, read_hdf, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class HDFStoreDataFrame(BaseIO): def setup(self): N = 25000 - index = tm.makeStringIndex(N) + index = Index([f"i-{i}" for i in range(N)], dtype=object) self.df = DataFrame( {"float1": np.random.randn(N), "float2": np.random.randn(N)}, index=index ) @@ -122,9 +120,9 @@ self.df = DataFrame( np.random.randn(N, C), columns=[f"float{i}" for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), + index=date_range("20000101", periods=N, freq="h"), ) - self.df["object"] = tm.makeStringIndex(N) + self.df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object) self.df.to_hdf(self.fname, "df", format=format) # Numeric df diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/io/json.py pandas-2.2.2+dfsg/asv_bench/benchmarks/io/json.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/io/json.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/io/json.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,6 +4,7 @@ from pandas import ( DataFrame, + Index, concat, date_range, json_normalize, @@ -11,10 +12,7 @@ timedelta_range, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class ReadJSON(BaseIO): @@ -26,7 +24,7 @@ N = 100000 indexes = { "int": np.arange(N), - "datetime": date_range("20000101", periods=N, freq="H"), + "datetime": date_range("20000101", periods=N, freq="h"), } df = DataFrame( np.random.randn(N, 5), @@ -48,7 +46,7 @@ N = 100000 indexes = { "int": np.arange(N), - "datetime": date_range("20000101", periods=N, freq="H"), + "datetime": date_range("20000101", periods=N, freq="h"), } df = DataFrame( np.random.randn(N, 5), @@ -108,13 +106,13 @@ def setup(self, orient, frame): N = 10**5 ncols = 5 - index = date_range("20000101", periods=N, freq="H") + index = date_range("20000101", periods=N, freq="h") timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") ints = np.random.randint(100000000, size=N) longints = sys.maxsize * np.random.randint(100000000, size=N) floats = np.random.randn(N) - strings = tm.makeStringIndex(N) + strings = Index([f"i-{i}" for i in range(N)], dtype=object) self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index) self.df_td_int_ts = DataFrame( @@ -191,7 +189,7 @@ def setup(self, orient): N = 10**5 - index = date_range("20000101", periods=N, freq="H") + index = date_range("20000101", periods=N, freq="h") timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") self.df = DataFrame( @@ -214,13 +212,13 @@ 
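Two substitutions recur throughout the benchmark hunks above and below. First, the private helper tm.makeStringIndex(N) is replaced by an explicitly constructed object-dtype Index of synthetic strings; the mechanical replacement is what the diff shows, and the motivation (phasing out the internal pandas._testing helpers) is an assumption on my part. Second, the uppercase frequency aliases "H", "T" and "S" give way to the lowercase "h", "min" and "s" spellings preferred by pandas 2.2. A minimal sketch equivalent to the new benchmark code:

    import pandas as pd

    N = 1_000

    # Replacement for tm.makeStringIndex(N): a plain object-dtype Index
    # of synthetic strings "i-0", "i-1", ...
    string_index = pd.Index([f"i-{i}" for i in range(N)], dtype=object)

    # Lowercase frequency aliases replacing the older "H", "T" and "S".
    hourly = pd.date_range("2000-01-01", periods=N, freq="h")
    minutely = pd.date_range("2000-01-01", periods=N, freq="min")
    secondly = pd.date_range("2000-01-01", periods=N, freq="s")
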
def setup(self): N = 10**5 ncols = 5 - index = date_range("20000101", periods=N, freq="H") + index = date_range("20000101", periods=N, freq="h") timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") ints = np.random.randint(100000000, size=N) longints = sys.maxsize * np.random.randint(100000000, size=N) floats = np.random.randn(N) - strings = tm.makeStringIndex(N) + strings = Index([f"i-{i}" for i in range(N)], dtype=object) self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index) self.df_td_int_ts = DataFrame( @@ -290,7 +288,7 @@ class ToJSONMem: def setup_cache(self): df = DataFrame([[1]]) - df2 = DataFrame(range(8), date_range("1/1/2000", periods=8, freq="T")) + df2 = DataFrame(range(8), date_range("1/1/2000", periods=8, freq="min")) frames = {"int": df, "float": df.astype(float), "datetime": df2} return frames diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/io/pickle.py pandas-2.2.2+dfsg/asv_bench/benchmarks/io/pickle.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/io/pickle.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/io/pickle.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,14 +2,12 @@ from pandas import ( DataFrame, + Index, date_range, read_pickle, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class Pickle(BaseIO): @@ -20,9 +18,9 @@ self.df = DataFrame( np.random.randn(N, C), columns=[f"float{i}" for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), + index=date_range("20000101", periods=N, freq="h"), ) - self.df["object"] = tm.makeStringIndex(N) + self.df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object) self.df.to_pickle(self.fname) def time_read_pickle(self): diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/io/sql.py pandas-2.2.2+dfsg/asv_bench/benchmarks/io/sql.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/io/sql.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/io/sql.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,13 +5,12 @@ from pandas import ( DataFrame, + Index, date_range, read_sql_query, read_sql_table, ) -from ..pandas_vb_common import tm - class SQL: params = ["sqlalchemy", "sqlite"] @@ -35,7 +34,7 @@ "int": np.random.randint(0, N, size=N), "datetime": date_range("2000-01-01", periods=N, freq="s"), }, - index=tm.makeStringIndex(N), + index=Index([f"i-{i}" for i in range(N)], dtype=object), ) self.df.iloc[1000:3000, 1] = np.nan self.df["date"] = self.df["datetime"].dt.date @@ -84,7 +83,7 @@ "int": np.random.randint(0, N, size=N), "datetime": date_range("2000-01-01", periods=N, freq="s"), }, - index=tm.makeStringIndex(N), + index=Index([f"i-{i}" for i in range(N)], dtype=object), ) self.df.iloc[1000:3000, 1] = np.nan self.df["date"] = self.df["datetime"].dt.date @@ -113,7 +112,7 @@ "int": np.random.randint(0, N, size=N), "datetime": date_range("2000-01-01", periods=N, freq="s"), }, - index=tm.makeStringIndex(N), + index=Index([f"i-{i}" for i in range(N)], dtype=object), ) self.df.iloc[1000:3000, 1] = np.nan self.df["date"] = self.df["datetime"].dt.date @@ -159,7 +158,7 @@ "int": np.random.randint(0, N, size=N), "datetime": date_range("2000-01-01", periods=N, freq="s"), }, - index=tm.makeStringIndex(N), + index=Index([f"i-{i}" for i in range(N)], dtype=object), ) self.df.iloc[1000:3000, 1] = np.nan self.df["date"] = self.df["datetime"].dt.date diff -Nru 
pandas-2.1.4+dfsg/asv_bench/benchmarks/io/stata.py pandas-2.2.2+dfsg/asv_bench/benchmarks/io/stata.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/io/stata.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/io/stata.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,14 +2,12 @@ from pandas import ( DataFrame, + Index, date_range, read_stata, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class Stata(BaseIO): @@ -23,9 +21,9 @@ self.df = DataFrame( np.random.randn(N, C), columns=[f"float{i}" for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), + index=date_range("20000101", periods=N, freq="h"), ) - self.df["object"] = tm.makeStringIndex(self.N) + self.df["object"] = Index([f"i-{i}" for i in range(self.N)], dtype=object) self.df["int8_"] = np.random.randint( np.iinfo(np.int8).min, np.iinfo(np.int8).max - 27, N ) diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/join_merge.py pandas-2.2.2+dfsg/asv_bench/benchmarks/join_merge.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/join_merge.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/join_merge.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,8 +14,6 @@ merge_asof, ) -from .pandas_vb_common import tm - try: from pandas import merge_ordered except ImportError: @@ -28,7 +26,7 @@ def setup(self, axis): N = 1000 - s = Series(N, index=tm.makeStringIndex(N)) + s = Series(N, index=Index([f"i-{i}" for i in range(N)], dtype=object)) self.series = [s[i:-i] for i in range(1, 10)] * 50 self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000 df = DataFrame( @@ -94,7 +92,7 @@ elif dtype in ("int64", "Int64", "int64[pyarrow]"): vals = np.arange(N, dtype=np.int64) elif dtype in ("string[python]", "string[pyarrow]"): - vals = tm.makeStringIndex(N) + vals = Index([f"i-{i}" for i in range(N)], dtype=object) else: raise NotImplementedError @@ -122,8 +120,8 @@ param_names = ["sort"] def setup(self, sort): - level1 = tm.makeStringIndex(10).values - level2 = tm.makeStringIndex(1000).values + level1 = Index([f"i-{i}" for i in range(10)], dtype=object).values + level2 = Index([f"i-{i}" for i in range(1000)], dtype=object).values codes1 = np.arange(10).repeat(1000) codes2 = np.tile(np.arange(1000), 10) index2 = MultiIndex(levels=[level1, level2], codes=[codes1, codes2]) @@ -212,8 +210,8 @@ # outer join of non-unique # GH 6329 def setup(self): - date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="T") - daily_dates = date_index.to_period("D").to_timestamp("S", "S") + date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="min") + daily_dates = date_index.to_period("D").to_timestamp("s", "s") self.fracofday = date_index.values - daily_dates.values self.fracofday = self.fracofday.astype("timedelta64[ns]") self.fracofday = self.fracofday.astype(np.float64) / 86_400_000_000_000 @@ -231,8 +229,8 @@ def setup(self, sort): N = 10000 - indices = tm.makeStringIndex(N).values - indices2 = tm.makeStringIndex(N).values + indices = Index([f"i-{i}" for i in range(N)], dtype=object).values + indices2 = Index([f"i-{i}" for i in range(N)], dtype=object).values key = np.tile(indices[:8000], 10) key2 = np.tile(indices2[:8000], 10) self.left = DataFrame( @@ -277,18 +275,21 @@ class MergeEA: params = [ - "Int64", - "Int32", - "Int16", - "UInt64", - "UInt32", - "UInt16", - "Float64", - "Float32", + [ + "Int64", + "Int32", + "Int16", + "UInt64", + "UInt32", + "UInt16", + "Float64", + "Float32", + ], + [True, False], ] - param_names = ["dtype"] + param_names = 
["dtype", "monotonic"] - def setup(self, dtype): + def setup(self, dtype, monotonic): N = 10_000 indices = np.arange(1, N) key = np.tile(indices[:8000], 10) @@ -301,8 +302,11 @@ "value2": np.random.randn(7999), } ) + if monotonic: + self.left = self.left.sort_values("key") + self.right = self.right.sort_values("key") - def time_merge(self, dtype): + def time_merge(self, dtype, monotonic): merge(self.left, self.right) @@ -332,13 +336,14 @@ ("ns", "ms"), ], [None, "Europe/Brussels"], + [True, False], ] - param_names = ["units", "tz"] + param_names = ["units", "tz", "monotonic"] - def setup(self, units, tz): + def setup(self, units, tz, monotonic): unit_left, unit_right = units N = 10_000 - keys = Series(date_range("2012-01-01", freq="T", periods=N, tz=tz)) + keys = Series(date_range("2012-01-01", freq="min", periods=N, tz=tz)) self.left = DataFrame( { "key": keys.sample(N * 10, replace=True).dt.as_unit(unit_left), @@ -351,8 +356,11 @@ "value2": np.random.randn(8000), } ) + if monotonic: + self.left = self.left.sort_values("key") + self.right = self.right.sort_values("key") - def time_merge(self, units, tz): + def time_merge(self, units, tz, monotonic): merge(self.left, self.right) @@ -360,14 +368,14 @@ def setup(self): self.left_object = DataFrame( { - "X": np.random.choice(range(0, 10), size=(10000,)), + "X": np.random.choice(range(10), size=(10000,)), "Y": np.random.choice(["one", "two", "three"], size=(10000,)), } ) self.right_object = DataFrame( { - "X": np.random.choice(range(0, 10), size=(10000,)), + "X": np.random.choice(range(10), size=(10000,)), "Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)), } ) @@ -400,7 +408,7 @@ class MergeOrdered: def setup(self): - groups = tm.makeStringIndex(10).values + groups = Index([f"i-{i}" for i in range(10)], dtype=object).values self.left = DataFrame( { "group": groups.repeat(5000), diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/libs.py pandas-2.2.2+dfsg/asv_bench/benchmarks/libs.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/libs.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/libs.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,13 +15,11 @@ from pandas import ( NA, + Index, NaT, ) -from .pandas_vb_common import ( - lib, - tm, -) +from .pandas_vb_common import lib try: from pandas.util import cache_readonly @@ -61,8 +59,8 @@ def setup(self): N = 10000 K = 10 - key1 = tm.makeStringIndex(N).values.repeat(K) - key2 = tm.makeStringIndex(N).values.repeat(K) + key1 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) + key2 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) col_array = np.vstack([key1, key2, np.random.randn(N * K)]) col_array2 = col_array.copy() col_array2[:, :10000] = np.nan diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/multiindex_object.py pandas-2.2.2+dfsg/asv_bench/benchmarks/multiindex_object.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/multiindex_object.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/multiindex_object.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,7 @@ from pandas import ( NA, DataFrame, + Index, MultiIndex, RangeIndex, Series, @@ -12,8 +13,6 @@ date_range, ) -from .pandas_vb_common import tm - class GetLoc: def setup(self): @@ -144,7 +143,11 @@ class Duplicated: def setup(self): n, k = 200, 5000 - levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)] + levels = [ + np.arange(n), + Index([f"i-{i}" for i in range(n)], dtype=object).values, + 1000 + np.arange(n), + ] codes = 
[np.random.choice(n, (k * n)) for lev in levels] self.mi = MultiIndex(levels=levels, codes=codes) @@ -249,7 +252,7 @@ level2 = range(N // 1000) int_left = MultiIndex.from_product([level1, level2]) - level2 = tm.makeStringIndex(N // 1000).values + level2 = Index([f"i-{i}" for i in range(N // 1000)], dtype=object).values str_left = MultiIndex.from_product([level1, level2]) level2 = range(N // 1000) @@ -293,7 +296,7 @@ level2[0] = NA ea_int_left = MultiIndex.from_product([level1, level2]) - level2 = tm.makeStringIndex(N // 1000).values + level2 = Index([f"i-{i}" for i in range(N // 1000)], dtype=object).values str_left = MultiIndex.from_product([level1, level2]) data = { @@ -354,7 +357,7 @@ level2 = range(N // 1000) int_midx = MultiIndex.from_product([level1, level2]) - level2 = tm.makeStringIndex(N // 1000).values + level2 = Index([f"i-{i}" for i in range(N // 1000)], dtype=object).values str_midx = MultiIndex.from_product([level1, level2]) data = { @@ -411,7 +414,7 @@ elif dtype == "int64": level2 = range(N2) elif dtype == "string": - level2 = tm.makeStringIndex(N2) + level2 = Index([f"i-{i}" for i in range(N2)], dtype=object) else: raise NotImplementedError diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/period.py pandas-2.2.2+dfsg/asv_bench/benchmarks/period.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/period.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/period.py 2024-04-10 17:42:52.000000000 +0000 @@ -45,7 +45,7 @@ class DataFramePeriodColumn: def setup(self): - self.rng = period_range(start="1/1/1990", freq="S", periods=20000) + self.rng = period_range(start="1/1/1990", freq="s", periods=20000) self.df = DataFrame(index=range(len(self.rng))) def time_setitem_period_column(self): diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/reindex.py pandas-2.2.2+dfsg/asv_bench/benchmarks/reindex.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/reindex.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/reindex.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,8 +9,6 @@ period_range, ) -from .pandas_vb_common import tm - class Reindex: def setup(self): @@ -23,8 +21,8 @@ ) N = 5000 K = 200 - level1 = tm.makeStringIndex(N).values.repeat(K) - level2 = np.tile(tm.makeStringIndex(K).values, N) + level1 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) + level2 = np.tile(Index([f"i-{i}" for i in range(K)], dtype=object).values, N) index = MultiIndex.from_arrays([level1, level2]) self.s = Series(np.random.randn(N * K), index=index) self.s_subset = self.s[::2] @@ -93,8 +91,8 @@ def setup(self, inplace): N = 10000 K = 10 - key1 = tm.makeStringIndex(N).values.repeat(K) - key2 = tm.makeStringIndex(N).values.repeat(K) + key1 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) + key2 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) self.df = DataFrame( {"key1": key1, "key2": key2, "value": np.random.randn(N * K)} ) @@ -102,7 +100,9 @@ self.df_nan.iloc[:10000, :] = np.nan self.s = Series(np.random.randint(0, 1000, size=10000)) - self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10)) + self.s_str = Series( + np.tile(Index([f"i-{i}" for i in range(1000)], dtype=object).values, 10) + ) N = 1000000 K = 10000 @@ -133,7 +133,7 @@ # blog "pandas escaped the zoo" def setup(self): n = 50000 - indices = tm.makeStringIndex(n) + indices = Index([f"i-{i}" for i in range(n)], dtype=object) subsample_size = 40000 self.x = Series(np.random.randn(n), indices) self.y = Series( diff -Nru 
pandas-2.1.4+dfsg/asv_bench/benchmarks/series_methods.py pandas-2.2.2+dfsg/asv_bench/benchmarks/series_methods.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/series_methods.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/series_methods.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,8 +10,6 @@ date_range, ) -from .pandas_vb_common import tm - class SeriesConstructor: def setup(self): @@ -28,9 +26,6 @@ def time_constructor_no_data(self): Series(data=None, index=self.idx) - def time_constructor_fastpath(self): - Series(self.array, index=self.idx2, name="name", fastpath=True) - class ToFrame: params = [["int64", "datetime64[ns]", "category", "Int64"], [None, "foo"]] @@ -67,7 +62,7 @@ N = 10**6 data = { "int": np.random.randint(1, 10, N), - "datetime": date_range("2000-01-01", freq="S", periods=N), + "datetime": date_range("2000-01-01", freq="s", periods=N), } self.s = Series(data[dtype]) if dtype == "datetime": @@ -95,7 +90,7 @@ def setup(self, dtype): N = 10**6 if dtype == "datetime64[ns]": - data = date_range("2000-01-01", freq="S", periods=N) + data = date_range("2000-01-01", freq="s", periods=N) na_value = NaT elif dtype in ("float64", "Float64"): data = np.random.randn(N) @@ -256,7 +251,7 @@ class Dir: def setup(self): - self.s = Series(index=tm.makeStringIndex(10000)) + self.s = Series(index=Index([f"i-{i}" for i in range(10000)], dtype=object)) def time_dir_strings(self): dir(self.s) @@ -320,7 +315,7 @@ if func == "argmax" and dtype in {"Int64", "boolean"}: # Skip argmax for nullable int since this doesn't work yet (GH-24382) raise NotImplementedError - self.s = Series([1] * N, dtype=dtype) + self.s = Series(np.ones(N), dtype=dtype) self.func = getattr(self.s, func) def time_func(self, func, N, dtype): diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/sparse.py pandas-2.2.2+dfsg/asv_bench/benchmarks/sparse.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/sparse.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/sparse.py 2024-04-10 17:42:52.000000000 +0000 @@ -22,7 +22,7 @@ def setup(self): K = 50 N = 50001 - rng = date_range("1/1/2000", periods=N, freq="T") + rng = date_range("1/1/2000", periods=N, freq="min") self.series = {} for i in range(1, K): data = np.random.randn(N)[:-i] diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/stat_ops.py pandas-2.2.2+dfsg/asv_bench/benchmarks/stat_ops.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/stat_ops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/stat_ops.py 2024-04-10 17:42:52.000000000 +0000 @@ -20,6 +20,39 @@ self.df_func(axis=axis) +class FrameMixedDtypesOps: + params = [ops, [0, 1, None]] + param_names = ["op", "axis"] + + def setup(self, op, axis): + if op in ("sum", "skew", "kurt", "prod", "sem", "var") or ( + (op, axis) + in ( + ("mean", 1), + ("mean", None), + ("median", 1), + ("median", None), + ("std", 1), + ) + ): + # Skipping cases where datetime aggregations are not implemented + raise NotImplementedError + + N = 1_000_000 + df = pd.DataFrame( + { + "f": np.random.normal(0.0, 1.0, N), + "i": np.random.randint(0, N, N), + "ts": pd.date_range(start="1/1/2000", periods=N, freq="h"), + } + ) + + self.df_func = getattr(df, op) + + def time_op(self, op, axis): + self.df_func(axis=axis) + + class FrameMultiIndexOps: params = [ops] param_names = ["op"] diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/strftime.py pandas-2.2.2+dfsg/asv_bench/benchmarks/strftime.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/strftime.py 2023-12-08 14:17:35.000000000 
+0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/strftime.py 2024-04-10 17:42:52.000000000 +0000 @@ -53,7 +53,7 @@ class PeriodStrftime: timeout = 1500 - params = ([1000, 10000], ["D", "H"]) + params = ([1000, 10000], ["D", "h"]) param_names = ["nobs", "freq"] def setup(self, nobs, freq): @@ -67,7 +67,7 @@ self.data.set_index("i", inplace=True) if freq == "D": self.default_fmt = "%Y-%m-%d" - elif freq == "H": + elif freq == "h": self.default_fmt = "%Y-%m-%d %H:00" def time_frame_period_to_str(self, nobs, freq): diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/strings.py pandas-2.2.2+dfsg/asv_bench/benchmarks/strings.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/strings.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/strings.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,12 +6,11 @@ NA, Categorical, DataFrame, + Index, Series, ) from pandas.arrays import StringArray -from .pandas_vb_common import tm - class Dtypes: params = ["str", "string[python]", "string[pyarrow]"] @@ -19,7 +18,9 @@ def setup(self, dtype): try: - self.s = Series(tm.makeStringIndex(10**5), dtype=dtype) + self.s = Series( + Index([f"i-{i}" for i in range(10000)], dtype=object), dtype=dtype + ) except ImportError: raise NotImplementedError @@ -172,7 +173,7 @@ def setup(self, repeats): N = 10**5 - self.s = Series(tm.makeStringIndex(N)) + self.s = Series(Index([f"i-{i}" for i in range(N)], dtype=object)) repeat = {"int": 1, "array": np.random.randint(1, 3, N)} self.values = repeat[repeats] @@ -187,13 +188,20 @@ def setup(self, other_cols, sep, na_rep, na_frac): N = 10**5 mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac]) - self.s = Series(tm.makeStringIndex(N)).where(mask_gen()) + self.s = Series(Index([f"i-{i}" for i in range(N)], dtype=object)).where( + mask_gen() + ) if other_cols == 0: # str.cat self-concatenates only for others=None self.others = None else: self.others = DataFrame( - {i: tm.makeStringIndex(N).where(mask_gen()) for i in range(other_cols)} + { + i: Index([f"i-{i}" for i in range(N)], dtype=object).where( + mask_gen() + ) + for i in range(other_cols) + } ) def time_cat(self, other_cols, sep, na_rep, na_frac): @@ -245,7 +253,8 @@ class Dummies(Dtypes): def setup(self, dtype): super().setup(dtype) - self.s = self.s.str.join("|") + N = len(self.s) // 5 + self.s = self.s[:N].str.join("|") def time_get_dummies(self, dtype): self.s.str.get_dummies("|") @@ -253,7 +262,7 @@ class Encode: def setup(self): - self.ser = Series(tm.makeStringIndex()) + self.ser = Series(Index([f"i-{i}" for i in range(10_000)], dtype=object)) def time_encode_decode(self): self.ser.str.encode("utf-8").str.decode("utf-8") diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/timeseries.py pandas-2.2.2+dfsg/asv_bench/benchmarks/timeseries.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/timeseries.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/timeseries.py 2024-04-10 17:42:52.000000000 +0000 @@ -27,7 +27,7 @@ N = 100000 dtidxes = { "dst": date_range( - start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S" + start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="s" ), "repeated": date_range(start="2000", periods=N / 10, freq="s").repeat(10), "tz_aware": date_range(start="2000", periods=N, freq="s", tz="US/Eastern"), @@ -72,13 +72,13 @@ def setup(self, tz): dst_rng = date_range( - start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S" + start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="s" ) - self.index = 
date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="S") + self.index = date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="s") self.index = self.index.append(dst_rng) self.index = self.index.append(dst_rng) self.index = self.index.append( - date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="S") + date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="s") ) def time_infer_dst(self, tz): @@ -90,7 +90,7 @@ param_names = "tz" def setup(self, tz): - idx = date_range(start="1/1/2000", periods=1000, freq="H", tz=tz) + idx = date_range(start="1/1/2000", periods=1000, freq="h", tz=tz) self.df = DataFrame(np.random.randn(1000, 2), index=idx) def time_reset_datetimeindex(self, tz): @@ -116,7 +116,7 @@ class TimeDatetimeConverter: def setup(self): N = 100000 - self.rng = date_range(start="1/1/2000", periods=N, freq="T") + self.rng = date_range(start="1/1/2000", periods=N, freq="min") def time_convert(self): DatetimeConverter.convert(self.rng, None, None) @@ -129,9 +129,9 @@ def setup(self, time_index): N = 10**6 if time_index is timedelta_range: - self.idx = time_index(start=0, freq="T", periods=N) + self.idx = time_index(start=0, freq="min", periods=N) else: - self.idx = time_index(start="20140101", freq="T", periods=N) + self.idx = time_index(start="20140101", freq="min", periods=N) self.exit = 10000 def time_iter(self, time_index): @@ -149,7 +149,7 @@ param_names = ["method"] def setup(self, method): - rng = date_range(start="20130101", periods=100000, freq="50L") + rng = date_range(start="20130101", periods=100000, freq="50ms") df = DataFrame(np.random.randn(100000, 2), index=rng) self.resample = getattr(df.resample("1s"), method) @@ -163,8 +163,8 @@ def setup(self, index, freq, method): indexes = { - "period": period_range(start="1/1/2000", end="1/1/2001", freq="T"), - "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="T"), + "period": period_range(start="1/1/2000", end="1/1/2001", freq="min"), + "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="min"), } idx = indexes[index] ts = Series(np.random.randn(len(idx)), index=idx) @@ -178,7 +178,7 @@ # GH 7754 def setup(self): rng3 = date_range( - start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000U" + start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000us" ) self.dt_ts = Series(5, rng3, dtype="datetime64[ns]") @@ -255,7 +255,7 @@ class Lookup: def setup(self): N = 1500000 - rng = date_range(start="1/1/2000", periods=N, freq="S") + rng = date_range(start="1/1/2000", periods=N, freq="s") self.ts = Series(1, index=rng) self.lookup_val = rng[N // 2] @@ -270,7 +270,7 @@ def setup(self, tz): N = 100000 - self.series = Series(date_range(start="1/1/2000", periods=N, freq="T", tz=tz)) + self.series = Series(date_range(start="1/1/2000", periods=N, freq="min", tz=tz)) def time_dt_accessor(self, tz): self.series.dt diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/tslibs/period.py pandas-2.2.2+dfsg/asv_bench/benchmarks/tslibs/period.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/tslibs/period.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/tslibs/period.py 2024-04-10 17:42:52.000000000 +0000 @@ -72,7 +72,7 @@ self.per.now(freq) def time_asfreq(self, freq): - self.per.asfreq("A") + self.per.asfreq("Y") def time_str(self, freq): str(self.per) @@ -151,7 +151,11 @@ # tzlocal is cumbersomely slow, so skip to keep runtime in check raise NotImplementedError - arr = np.arange(10, dtype="i8").repeat(size // 10) + # we pick 
2**55 because smaller values end up returning + # -1 from npy_datetimestruct_to_datetime with NPY_FR_Y frequency + # this artificially slows down functions since -1 is also the + # error sentinel + arr = np.arange(2**55, 2**55 + 10, dtype="i8").repeat(size // 10) self.i8values = arr def time_dt64arr_to_periodarr(self, size, freq, tz): diff -Nru pandas-2.1.4+dfsg/asv_bench/benchmarks/tslibs/timestamp.py pandas-2.2.2+dfsg/asv_bench/benchmarks/tslibs/timestamp.py --- pandas-2.1.4+dfsg/asv_bench/benchmarks/tslibs/timestamp.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/asv_bench/benchmarks/tslibs/timestamp.py 2024-04-10 17:42:52.000000000 +0000 @@ -136,10 +136,10 @@ self.ts.to_julian_date() def time_floor(self, tz): - self.ts.floor("5T") + self.ts.floor("5min") def time_ceil(self, tz): - self.ts.ceil("5T") + self.ts.ceil("5min") class TimestampAcrossDst: diff -Nru pandas-2.1.4+dfsg/ci/.condarc pandas-2.2.2+dfsg/ci/.condarc --- pandas-2.1.4+dfsg/ci/.condarc 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/.condarc 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,32 @@ +# https://docs.conda.io/projects/conda/en/latest/configuration.html + +# always_yes (NoneType, bool) +# aliases: yes +# Automatically choose the 'yes' option whenever asked to proceed with a +# conda operation, such as when running `conda install`. +# +always_yes: true + +# remote_connect_timeout_secs (float) +# The number seconds conda will wait for your client to establish a +# connection to a remote url resource. +# +remote_connect_timeout_secs: 30 + +# remote_max_retries (int) +# The maximum number of retries each HTTP connection should attempt. +# +remote_max_retries: 10 + +# remote_backoff_factor (int) +# The factor determines the time HTTP connection should wait for +# attempt. +# +remote_backoff_factor: 3 + +# remote_read_timeout_secs (float) +# Once conda has connected to a remote resource and sent an HTTP +# request, the read timeout is the number of seconds conda will wait for +# the server to send a response. +# +remote_read_timeout_secs: 60.0 diff -Nru pandas-2.1.4+dfsg/ci/code_checks.sh pandas-2.2.2+dfsg/ci/code_checks.sh --- pandas-2.1.4+dfsg/ci/code_checks.sh 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/code_checks.sh 2024-04-10 17:42:52.000000000 +0000 @@ -14,6 +14,8 @@ # $ ./ci/code_checks.sh single-docs # check single-page docs build warning-free # $ ./ci/code_checks.sh notebooks # check execution of documentation notebooks +set -uo pipefail + [[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" || "$1" == "notebooks" ]] || \ { echo "Unknown command $1. 
Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 9999; } @@ -63,16 +65,6 @@ MSG='Partially validate docstrings (EX03)' ; echo $MSG $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \ - pandas.Series.loc \ - pandas.Series.iloc \ - pandas.Series.pop \ - pandas.Series.describe \ - pandas.Series.skew \ - pandas.Series.var \ - pandas.Series.last \ - pandas.Series.tz_convert \ - pandas.Series.tz_localize \ - pandas.Series.dt.month_name \ pandas.Series.dt.day_name \ pandas.Series.str.len \ pandas.Series.cat.set_categories \ @@ -186,9 +178,8 @@ ### SINGLE-PAGE DOCS ### if [[ -z "$CHECK" || "$CHECK" == "single-docs" ]]; then - python doc/make.py --warnings-are-errors --single pandas.Series.value_counts - python doc/make.py --warnings-are-errors --single pandas.Series.str.split - python doc/make.py clean + python doc/make.py --warnings-are-errors --no-browser --single pandas.Series.value_counts + python doc/make.py --warnings-are-errors --no-browser --single pandas.Series.str.split fi exit $RET diff -Nru pandas-2.1.4+dfsg/ci/condarc.yml pandas-2.2.2+dfsg/ci/condarc.yml --- pandas-2.1.4+dfsg/ci/condarc.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/condarc.yml 1970-01-01 00:00:00.000000000 +0000 @@ -1,32 +0,0 @@ -# https://docs.conda.io/projects/conda/en/latest/configuration.html - -# always_yes (NoneType, bool) -# aliases: yes -# Automatically choose the 'yes' option whenever asked to proceed with a -# conda operation, such as when running `conda install`. -# -always_yes: true - -# remote_connect_timeout_secs (float) -# The number seconds conda will wait for your client to establish a -# connection to a remote url resource. -# -remote_connect_timeout_secs: 30 - -# remote_max_retries (int) -# The maximum number of retries each HTTP connection should attempt. -# -remote_max_retries: 10 - -# remote_backoff_factor (int) -# The factor determines the time HTTP connection should wait for -# attempt. -# -remote_backoff_factor: 3 - -# remote_read_timeout_secs (float) -# Once conda has connected to a remote resource and sent an HTTP -# request, the read timeout is the number of seconds conda will wait for -# the server to send a response. 
-# -remote_read_timeout_secs: 60.0 diff -Nru pandas-2.1.4+dfsg/ci/deps/actions-310.yaml pandas-2.2.2+dfsg/ci/deps/actions-310.yaml --- pandas-2.1.4+dfsg/ci/deps/actions-310.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/deps/actions-310.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -14,47 +14,52 @@ - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies - - beautifulsoup4>=4.11.1 - - blosc>=1.21.0 - - bottleneck>=1.3.4 - - fastparquet>=0.8.1 - - fsspec>=2022.05.0 + - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 + - blosc>=1.21.3 + - bottleneck>=1.3.6 + - fastparquet>=2022.12.0 + - fsspec>=2022.11.0 - html5lib>=1.1 - hypothesis>=6.46.1 - - gcsfs>=2022.05.0 + - gcsfs>=2022.11.0 - jinja2>=3.1.2 - - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.8 - - numba>=0.55.2 - - numexpr>=2.8.0 + - lxml>=4.9.2 + - matplotlib>=3.6.3 + - numba>=0.56.4 + - numexpr>=2.8.4 - odfpy>=1.4.1 - - qtpy>=2.2.0 - - openpyxl>=3.0.10 - - pandas-gbq>=0.17.5 - - psycopg2>=2.9.3 - - pyarrow>=7.0.0 + - qtpy>=2.3.0 + - openpyxl>=3.1.0 + - psycopg2>=2.9.6 + - pyarrow>=10.0.1 - pymysql>=1.0.2 - - pyreadstat>=1.1.5 - - pytables>=3.7.0 - - pyxlsb>=1.0.9 - - s3fs>=2022.05.0 - - scipy>=1.8.1 - - sqlalchemy>=1.4.36 - - tabulate>=0.8.10 - - xarray>=2022.03.0 + - pyqt>=5.15.9 + - pyreadstat>=1.2.0 + - pytables>=3.8.0 + - python-calamine>=0.1.7 + - pyxlsb>=1.0.10 + - s3fs>=2022.11.0 + - scipy>=1.10.0 + - sqlalchemy>=2.0.0 + - tabulate>=0.9.0 + - xarray>=2022.12.0 - xlrd>=2.0.1 - - xlsxwriter>=3.0.3 - - zstandard>=0.17.0 + - xlsxwriter>=3.0.5 + - zstandard>=0.19.0 - pip: - - pyqt5>=5.15.6 - - tzdata>=2022.1 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 + - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff -Nru pandas-2.1.4+dfsg/ci/deps/actions-311-downstream_compat.yaml pandas-2.2.2+dfsg/ci/deps/actions-311-downstream_compat.yaml --- pandas-2.1.4+dfsg/ci/deps/actions-311-downstream_compat.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/deps/actions-311-downstream_compat.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -16,45 +16,49 @@ - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies - - beautifulsoup4>=4.11.1 - - blosc>=1.21.0 - - bottleneck>=1.3.4 - - fastparquet>=0.8.1 - - fsspec>=2022.05.0 + - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 + - blosc>=1.21.3 + - bottleneck>=1.3.6 + - fastparquet>=2022.12.0 + - fsspec>=2022.11.0 - html5lib>=1.1 - hypothesis>=6.46.1 - - gcsfs>=2022.05.0 + - gcsfs>=2022.11.0 - jinja2>=3.1.2 - - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.8 - - numba>=0.55.2 - - numexpr>=2.8.0 + - lxml>=4.9.2 + - matplotlib>=3.6.3 + - numba>=0.56.4 + - numexpr>=2.8.4 - odfpy>=1.4.1 - - qtpy>=2.2.0 - - openpyxl>=3.0.10 - - pandas-gbq>=0.17.5 - - psycopg2>=2.9.3 - - pyarrow>=7.0.0 + - qtpy>=2.3.0 + - openpyxl>=3.1.0 + - psycopg2>=2.9.6 + - pyarrow>=10.0.1 - pymysql>=1.0.2 - - pyreadstat>=1.1.5 - - pytables>=3.7.0 - - pyxlsb>=1.0.9 - - s3fs>=2022.05.0 - - scipy>=1.8.1 - - sqlalchemy>=1.4.36 - - tabulate>=0.8.10 - - xarray>=2022.03.0 + - pyqt>=5.15.9 + - pyreadstat>=1.2.0 + - pytables>=3.8.0 + - python-calamine>=0.1.7 + - pyxlsb>=1.0.10 + - s3fs>=2022.11.0 + - scipy>=1.10.0 + - 
sqlalchemy>=2.0.0 + - tabulate>=0.9.0 + - xarray>=2022.12.0 - xlrd>=2.0.1 - - xlsxwriter>=3.0.3 - - zstandard>=0.17.0 + - xlsxwriter>=3.0.5 + - zstandard>=0.19.0 # downstream packages - botocore @@ -70,6 +74,7 @@ - pyyaml - py - pip: + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - dataframe-api-compat>=0.1.7 - - pyqt5>=5.15.6 - - tzdata>=2022.1 + - tzdata>=2022.7 diff -Nru pandas-2.1.4+dfsg/ci/deps/actions-311-numpydev.yaml pandas-2.2.2+dfsg/ci/deps/actions-311-numpydev.yaml --- pandas-2.1.4+dfsg/ci/deps/actions-311-numpydev.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/deps/actions-311-numpydev.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -28,4 +28,4 @@ - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--pre" - "numpy" - - "tzdata>=2022.1" + - "tzdata>=2022.7" diff -Nru pandas-2.1.4+dfsg/ci/deps/actions-311-pyarrownightly.yaml pandas-2.2.2+dfsg/ci/deps/actions-311-pyarrownightly.yaml --- pandas-2.1.4+dfsg/ci/deps/actions-311-pyarrownightly.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/deps/actions-311-pyarrownightly.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -18,12 +18,12 @@ # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz - pip - pip: - - "tzdata>=2022.1" + - "tzdata>=2022.7" - "--extra-index-url https://pypi.fury.io/arrow-nightlies/" - "--prefer-binary" - "--pre" diff -Nru pandas-2.1.4+dfsg/ci/deps/actions-311.yaml pandas-2.2.2+dfsg/ci/deps/actions-311.yaml --- pandas-2.1.4+dfsg/ci/deps/actions-311.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/deps/actions-311.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -14,47 +14,51 @@ - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies - - beautifulsoup4>=4.11.1 - - blosc>=1.21.0 - - bottleneck>=1.3.4 - - fastparquet>=0.8.1 - - fsspec>=2022.05.0 + - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 + - blosc>=1.21.3 + - bottleneck>=1.3.6 + - fastparquet>=2022.12.0 + - fsspec>=2022.11.0 - html5lib>=1.1 - hypothesis>=6.46.1 - - gcsfs>=2022.05.0 + - gcsfs>=2022.11.0 - jinja2>=3.1.2 - - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.8 - - numba>=0.55.2 - - numexpr>=2.8.0 + - lxml>=4.9.2 + - matplotlib>=3.6.3 + - numba>=0.56.4 + - numexpr>=2.8.4 - odfpy>=1.4.1 - - qtpy>=2.2.0 - - openpyxl>=3.0.10 - - pandas-gbq>=0.17.5 - - psycopg2>=2.9.3 - - pyarrow>=7.0.0 + - qtpy>=2.3.0 + - pyqt>=5.15.9 + - openpyxl>=3.1.0 + - psycopg2>=2.9.6 + - pyarrow>=10.0.1 - pymysql>=1.0.2 - - pyreadstat>=1.1.5 - # - pytables>=3.7.0, 3.8.0 is first version that supports 3.11 - - pyxlsb>=1.0.9 - - s3fs>=2022.05.0 - - scipy>=1.8.1 - - sqlalchemy>=1.4.36 - - tabulate>=0.8.10 - - xarray>=2022.03.0 + - pyreadstat>=1.2.0 + - pytables>=3.8.0 + - python-calamine>=0.1.7 + - pyxlsb>=1.0.10 + - s3fs>=2022.11.0 + - scipy>=1.10.0 + - sqlalchemy>=2.0.0 + - tabulate>=0.9.0 + - xarray>=2022.12.0 - xlrd>=2.0.1 - - xlsxwriter>=3.0.3 - - zstandard>=0.17.0 + - xlsxwriter>=3.0.5 + - zstandard>=0.19.0 - pip: - - pyqt5>=5.15.6 - - tzdata>=2022.1 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 + - pytest-localserver>=0.7.1 diff -Nru pandas-2.1.4+dfsg/ci/deps/actions-312.yaml pandas-2.2.2+dfsg/ci/deps/actions-312.yaml --- pandas-2.1.4+dfsg/ci/deps/actions-312.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/deps/actions-312.yaml 
2024-04-10 17:42:52.000000000 +0000 @@ -14,50 +14,52 @@ - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies - - beautifulsoup4>=4.11.1 + - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 - blosc>=1.21.3 - - bottleneck>=1.3.4 - - fastparquet>=0.8.1 - - fsspec>=2022.05.0 + - bottleneck>=1.3.6 + - fastparquet>=2022.12.0 + - fsspec>=2022.11.0 - html5lib>=1.1 - hypothesis>=6.46.1 - - gcsfs>=2022.05.0 + - gcsfs>=2022.11.0 - jinja2>=3.1.2 - - lxml>=4.8.0 - - matplotlib>=3.6.1 + - lxml>=4.9.2 + - matplotlib>=3.6.3 # - numba>=0.56.4 - - numexpr>=2.8.0 + - numexpr>=2.8.4 - odfpy>=1.4.1 - - qtpy>=2.2.0 + - qtpy>=2.3.0 - pyqt>=5.15.9 - - openpyxl>=3.0.10 - - psycopg2>=2.9.3 - - pyarrow>=7.0.0 + - openpyxl>=3.1.0 + - psycopg2>=2.9.6 + - pyarrow>=10.0.1 - pymysql>=1.0.2 - - pyreadstat>=1.1.5 + - pyreadstat>=1.2.0 # - pytables>=3.8.0 - # - python-calamine>=0.1.6 - - pyxlsb>=1.0.9 - - s3fs>=2022.05.0 - - scipy>=1.8.1 - - sqlalchemy>=1.4.36 - - tabulate>=0.8.10 - - xarray>=2022.03.0 + - python-calamine>=0.1.7 + - pyxlsb>=1.0.10 + - s3fs>=2022.11.0 + - scipy>=1.10.0 + - sqlalchemy>=2.0.0 + - tabulate>=0.9.0 + - xarray>=2022.12.0 - xlrd>=2.0.1 - - xlsxwriter>=3.0.3 - - zstandard>=0.17.0 + - xlsxwriter>=3.0.5 + - zstandard>=0.19.0 - pip: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff -Nru pandas-2.1.4+dfsg/ci/deps/actions-39-minimum_versions.yaml pandas-2.2.2+dfsg/ci/deps/actions-39-minimum_versions.yaml --- pandas-2.1.4+dfsg/ci/deps/actions-39-minimum_versions.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/deps/actions-39-minimum_versions.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -17,47 +17,52 @@ - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies - python-dateutil=2.8.2 - - numpy=1.22.4, <2 + - numpy=1.22.4 - pytz=2020.1 # optional dependencies - - beautifulsoup4=4.11.1 - - blosc=1.21.0 - - bottleneck=1.3.4 - - fastparquet=0.8.1 - - fsspec=2022.05.0 + - beautifulsoup4=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 + - blosc=1.21.3 + - bottleneck=1.3.6 + - fastparquet=2022.12.0 + - fsspec=2022.11.0 - html5lib=1.1 - hypothesis=6.46.1 - - gcsfs=2022.05.0 + - gcsfs=2022.11.0 - jinja2=3.1.2 - - lxml=4.8.0 - - matplotlib=3.6.1 - - numba=0.55.2 - - numexpr=2.8.0 + - lxml=4.9.2 + - matplotlib=3.6.3 + - numba=0.56.4 + - numexpr=2.8.4 - odfpy=1.4.1 - - qtpy=2.2.0 - - openpyxl=3.0.10 - - pandas-gbq=0.17.5 - - psycopg2=2.9.3 - - pyarrow=7.0.0 + - qtpy=2.3.0 + - openpyxl=3.1.0 + - psycopg2=2.9.6 + - pyarrow=10.0.1 - pymysql=1.0.2 - - pyreadstat=1.1.5 - - pytables=3.7.0 - - pyxlsb=1.0.9 - - s3fs=2022.05.0 - - scipy=1.8.1 - - sqlalchemy=1.4.36 - - tabulate=0.8.10 - - xarray=2022.03.0 + - pyqt=5.15.9 + - pyreadstat=1.2.0 + - pytables=3.8.0 + - python-calamine=0.1.7 + - pyxlsb=1.0.10 + - s3fs=2022.11.0 + - scipy=1.10.0 + - sqlalchemy=2.0.0 + - tabulate=0.9.0 + - xarray=2022.12.0 - xlrd=2.0.1 - - xlsxwriter=3.0.3 - - zstandard=0.17.0 + - xlsxwriter=3.0.5 + - zstandard=0.19.0 - pip: + - adbc-driver-postgresql==0.8.0 + - adbc-driver-sqlite==0.8.0 - dataframe-api-compat==0.1.7 - - pyqt5==5.15.6 - - tzdata==2022.1 + - tzdata==2022.7 diff -Nru pandas-2.1.4+dfsg/ci/deps/actions-39.yaml pandas-2.2.2+dfsg/ci/deps/actions-39.yaml --- 
pandas-2.1.4+dfsg/ci/deps/actions-39.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/deps/actions-39.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -14,47 +14,52 @@ - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies - - beautifulsoup4>=4.11.1 - - blosc>=1.21.0 - - bottleneck>=1.3.4 - - fastparquet>=0.8.1 - - fsspec>=2022.05.0 + - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 + - blosc>=1.21.3 + - bottleneck>=1.3.6 + - fastparquet>=2022.12.0 + - fsspec>=2022.11.0 - html5lib>=1.1 - hypothesis>=6.46.1 - - gcsfs>=2022.05.0 + - gcsfs>=2022.11.0 - jinja2>=3.1.2 - - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.8 - - numba>=0.55.2 - - numexpr>=2.8.0 + - lxml>=4.9.2 + - matplotlib>=3.6.3 + - numba>=0.56.4 + - numexpr>=2.8.4 - odfpy>=1.4.1 - - qtpy>=2.2.0 - - openpyxl>=3.0.10 - - pandas-gbq>=0.17.5 - - psycopg2>=2.9.3 - - pyarrow>=7.0.0 + - qtpy>=2.3.0 + - openpyxl>=3.1.0 + - psycopg2>=2.9.6 + - pyarrow>=10.0.1 - pymysql>=1.0.2 - - pyreadstat>=1.1.5 - - pytables>=3.7.0 - - pyxlsb>=1.0.9 - - s3fs>=2022.05.0 - - scipy>=1.8.1 - - sqlalchemy>=1.4.36 - - tabulate>=0.8.10 - - xarray>=2022.03.0 + - pyqt>=5.15.9 + - pyreadstat>=1.2.0 + - pytables>=3.8.0 + - python-calamine>=0.1.7 + - pyxlsb>=1.0.10 + - s3fs>=2022.11.0 + - scipy>=1.10.0 + - sqlalchemy>=2.0.0 + - tabulate>=0.9.0 + - xarray>=2022.12.0 - xlrd>=2.0.1 - - xlsxwriter>=3.0.3 - - zstandard>=0.17.0 + - xlsxwriter>=3.0.5 + - zstandard>=0.19.0 - pip: - - pyqt5>=5.15.6 - - tzdata>=2022.1 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 + - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff -Nru pandas-2.1.4+dfsg/ci/deps/actions-pypy-39.yaml pandas-2.2.2+dfsg/ci/deps/actions-pypy-39.yaml --- pandas-2.1.4+dfsg/ci/deps/actions-pypy-39.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/deps/actions-pypy-39.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -20,8 +20,8 @@ - hypothesis>=6.46.1 # required - - numpy<2 + - numpy - python-dateutil - pytz - pip: - - tzdata>=2022.1 + - tzdata>=2022.7 diff -Nru pandas-2.1.4+dfsg/ci/deps/circle-310-arm64.yaml pandas-2.2.2+dfsg/ci/deps/circle-310-arm64.yaml --- pandas-2.1.4+dfsg/ci/deps/circle-310-arm64.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/deps/circle-310-arm64.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -15,43 +15,49 @@ - pytest-cov - pytest-xdist>=2.2.0 - pytest-localserver>=0.7.1 + - pytest-qt>=4.2.0 - boto3 # required dependencies - python-dateutil - - numpy<2 + - numpy - pytz # optional dependencies - - beautifulsoup4>=4.11.1 - - blosc>=1.21.0 - - bottleneck>=1.3.4 - - fastparquet>=0.8.1 - - fsspec>=2022.05.0 + - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 + - blosc>=1.21.3 + - bottleneck>=1.3.6 + - fastparquet>=2022.12.0 + - fsspec>=2022.11.0 - html5lib>=1.1 - hypothesis>=6.46.1 - - gcsfs>=2022.05.0 + - gcsfs>=2022.11.0 - jinja2>=3.1.2 - - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.8 - # test_numba_vs_cython segfaults with numba 0.57 - - numba>=0.55.2, <0.57.0 - - numexpr>=2.8.0 + - lxml>=4.9.2 + - matplotlib>=3.6.3 + - numba>=0.56.4 + - numexpr>=2.8.4 - odfpy>=1.4.1 - - qtpy>=2.2.0 - - openpyxl>=3.0.10 - - pandas-gbq>=0.17.5 - - psycopg2>=2.9.3 - - pyarrow>=7.0.0 + - qtpy>=2.3.0 + - openpyxl>=3.1.0 + - psycopg2>=2.9.6 + - pyarrow>=10.0.1 - pymysql>=1.0.2 - # - pyreadstat>=1.1.5 not 
available on ARM - - pytables>=3.7.0 - - pyxlsb>=1.0.9 - - s3fs>=2022.05.0 - - scipy>=1.8.1 - - sqlalchemy>=1.4.36 - - tabulate>=0.8.10 - - xarray>=2022.03.0 + - pyqt>=5.15.9 + - pyreadstat>=1.2.0 + - pytables>=3.8.0 + - python-calamine>=0.1.7 + - pyxlsb>=1.0.10 + - s3fs>=2022.11.0 + - scipy>=1.10.0 + - sqlalchemy>=2.0.0 + - tabulate>=0.9.0 + - xarray>=2022.12.0 - xlrd>=2.0.1 - - xlsxwriter>=3.0.3 - - zstandard>=0.17.0 + - xlsxwriter>=3.0.5 + - zstandard>=0.19.0 + - pip: + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 diff -Nru pandas-2.1.4+dfsg/ci/meta.yaml pandas-2.2.2+dfsg/ci/meta.yaml --- pandas-2.1.4+dfsg/ci/meta.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/meta.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -38,7 +38,7 @@ - numpy >=1.23.2 # [py>=311] - python-dateutil >=2.8.2 - pytz >=2020.1 - - python-tzdata >=2022.1 + - python-tzdata >=2022.7 test: imports: diff -Nru pandas-2.1.4+dfsg/ci/run_tests.sh pandas-2.2.2+dfsg/ci/run_tests.sh --- pandas-2.1.4+dfsg/ci/run_tests.sh 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/ci/run_tests.sh 2024-04-10 17:42:52.000000000 +0000 @@ -10,8 +10,7 @@ COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -# TODO: Support NEP 50 and remove NPY_PROMOTION_STATE -PYTEST_CMD="NPY_PROMOTION_STATE=legacy MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" diff -Nru pandas-2.1.4+dfsg/debian/changelog pandas-2.2.2+dfsg/debian/changelog --- pandas-2.1.4+dfsg/debian/changelog 2024-04-21 12:50:13.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/changelog 2024-07-07 18:36:37.000000000 +0000 @@ -1,3 +1,36 @@ +pandas (2.2.2+dfsg-4) unstable; urgency=medium + + * Tests: re-enable bottleneck and tabulate (see #1070359, #1070360), + make blosc xfail nonstrict, use pyproject.toml in autopkgtest, + run autopkgtest in CI, be less verbose to fit in the CI log. + * Add transition Breaks. + * Upload to unstable. (Closes: #1069792) + + -- Rebecca N. Palmer Sun, 07 Jul 2024 19:36:37 +0100 + +pandas (2.2.2+dfsg-3) experimental; urgency=medium + + * Tests: add forgotten import. + + -- Rebecca N. Palmer Mon, 06 May 2024 14:47:52 +0100 + +pandas (2.2.2+dfsg-2) experimental; urgency=medium + + * Allow importing pandas._testing without pytest. + * Tests: don't require 32-bit to imply time32. + + -- Rebecca N. Palmer Mon, 06 May 2024 11:29:54 +0100 + +pandas (2.2.2+dfsg-1) experimental; urgency=medium + + * New upstream release. Update copyright, patches, depends. + * Upload to experimental, due to potential breakage (see #1069792). + * Tests: use our test data path, skip too-old dependencies, + mark some tests as requiring optional dependencies, + remove no longer needed patches, clean up afterwards. + + -- Rebecca N. Palmer Sun, 05 May 2024 14:40:45 +0100 + pandas (2.1.4+dfsg-8) unstable; urgency=medium * Re-enable the documentation. 
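Many of the asv_bench and test hunks above track pandas 2.2's switch to lowercase offset aliases (freq="S"/"T"/"H"/"L"/"U" become "s"/"min"/"h"/"ms"/"us", and the annual Period frequency "A" becomes "Y"), plus the replacement of the tm.makeStringIndex helper with a plain object-dtype Index. A minimal sketch of the new spellings, assuming a pandas >= 2.2 environment:

    import pandas as pd

    # lowercase aliases replace the deprecated uppercase ones
    secs = pd.date_range("2000-01-01", periods=3, freq="s")    # was freq="S"
    mins = pd.date_range("2000-01-01", periods=3, freq="min")  # was freq="T"
    hours = pd.date_range("2000-01-01", periods=3, freq="h")   # was freq="H"

    # Timestamp rounding uses the same spellings
    ts = pd.Timestamp("2000-01-01 00:07:23")
    print(ts.floor("5min"))                                     # was ts.floor("5T")

    # annual Period frequency is spelled "Y" rather than "A"
    per = pd.Period("2000", freq="Y")

    # the benchmarks now build an object-dtype Index directly instead of
    # calling the old tm.makeStringIndex helper
    idx = pd.Index([f"i-{i}" for i in range(5)], dtype=object)
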
diff -Nru pandas-2.1.4+dfsg/debian/contributors_list.txt pandas-2.2.2+dfsg/debian/contributors_list.txt --- pandas-2.1.4+dfsg/debian/contributors_list.txt 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/contributors_list.txt 2024-04-23 06:22:13.000000000 +0000 @@ -1,42 +1,43 @@ -Current to version 2.1.4 (generated by git shortlog -ns) +Current to version 2.2.2 (generated by git shortlog -ns) There may be multiple entries for the same person if they have used more than one (form of their) name https://github.com/pandas-dev/pandas/graphs/contributors - 4562 jbrockmendel + 4765 jbrockmendel 3130 Wes McKinney 3043 jreback 1690 Jeff Reback - 1315 Joris Van den Bossche - 1078 Matthew Roeschke + 1348 Joris Van den Bossche + 1170 Matthew Roeschke 943 y-p + 867 Patrick Hoefler 758 Tom Augspurger - 698 Patrick Hoefler 630 Chang She 622 Simon Hawkins 607 Phillip Cloud 464 gfyoung 344 sinhrks 327 Adam Klein - 290 Richard Shadrach + 327 Richard Shadrach + 297 William Ayd 278 Terji Petersen - 264 William Ayd - 245 Marc Garcia + 246 Marc Garcia + 217 Thomas Li + 205 Luke Manley + 202 Torsten Wörtwein + 197 Marco Edward Gorelli 196 MomIsBestFriend - 188 Torsten Wörtwein - 183 Thomas Li - 180 Marco Edward Gorelli 178 patrick 168 Daniel Saxton 160 Andy Hayden - 160 Lumberbot (aka Jack) - 152 Luke Manley 148 topper-123 - 144 Fangchen Li + 146 Fangchen Li 128 Jeff Tratner 122 Jeffrey Tratner + 122 Lumberbot (aka Jack) 119 attack68 115 Marco Gorelli - 110 Dea María Léon + 113 Dea María Léon + 113 Natalia Mokeeva 109 Vytautas Jancauskas 102 Matthew Zeitlin 93 Jeremy Schendel @@ -48,7 +49,6 @@ 84 JHM Darbyshire 84 Stephan Hoyer 80 Kevin Sheppard - 77 Natalia Mokeeva 76 Wouter Overmeire 74 Andrew Wieteska 70 jschendel @@ -80,8 +80,10 @@ 28 David Stephens 27 Avinash Pancham 27 Khor Chean Wei + 27 Yao Xiao 26 Gábor Lipták 26 Justin Zheng + 26 Rajat Subhra Mukherjee 26 github-actions[bot] 26 realead 25 Stephen Lin @@ -89,7 +91,6 @@ 25 rockg 24 Erfan Nariman 23 Fabian Haase - 23 Yao Xiao 23 Yuanhao Geng 22 Licht Takeuchi 22 Ming Li @@ -108,12 +109,13 @@ 18 Kerby Shedden 18 Noa Tamir 18 Robert de Vries + 18 Tim Swast + 18 dependabot[bot] 18 pilkibun 18 unutbu 17 Aidan Feldman 17 Anthonios Partheniou 17 Nicolas Bonnotte - 17 Tim Swast 17 Uwe L. Korn 16 Jiaxiang 16 Matt Roeschke @@ -130,6 +132,7 @@ 14 Dr. Irv 14 Gregory Rome 14 Maximilian Roos + 14 Pandas Development Team 14 Roy Hyunjin Han 14 Spencer Lyon 14 dataxerik @@ -146,11 +149,12 @@ 12 Bharat Raghunathan 12 Luca Beltrame 12 PKEuS - 12 Pandas Development Team + 12 Quang Nguyễn 12 Richard T. Guy 12 Thomas Smith 12 Tommy 12 Wenhuan + 12 Xiao Yuan 11 Andrew Rosenfeld 11 Daniel Himmelstein 11 Joe Jevnik @@ -158,13 +162,13 @@ 11 Keith Hughitt 11 anmyachev 11 proost + 11 rmhowe425 11 terrytangyuan 10 Aaron Critchley 10 Alexander Ponomaroff 10 Andrew Hawyrluk 10 Anjali2019 10 Dale Jung - 10 Daniel Isaac 10 DriesS 10 Eric Leung 10 Felix Dulys @@ -178,13 +182,14 @@ 10 Shahar Naveh 10 cbertinato 10 luke396 - 10 rmhowe425 9 3vts 9 Damien Garaud + 9 Daniel Isaac 9 Deepyaman Datta 9 Guillaume Gay 9 Jack Goldsmith 9 Jacob Schaer + 9 Mateusz Sokół 9 Michael Marino 9 Shao Yang Hong 9 Tobias Pitters @@ -193,6 +198,7 @@ 9 Viktor Kerkez 9 cleconte987 9 poloso + 9 rebecca-palmer 8 Adam J. 
Stewart 8 Aly Sivji 8 Brandon Bradley @@ -205,16 +211,19 @@ 8 Mike Kelly 8 Oleh Kozynets 8 OlivierLuG + 8 Parthi 8 RaisaDZ 8 Shawn Heide 8 Sylvain Marié + 8 Thomas A Caswell 8 Thomas Grainger 8 Varun 8 Vyom Pathak 8 himanshu awasthi 8 iasoon 8 jnmclarty - 8 rebecca-palmer + 8 smij720 + 8 taytzehao 8 tpaxman 7 Benedikt Sauer 7 Chuanzhu Xu @@ -229,26 +238,25 @@ 7 Johannes Mueller 7 Kernc 7 Kostya Farber + 7 Matthias Bussonnier 7 Michael Mueller 7 Nirav - 7 Quang Nguyễn - 7 Rajat Subhra Mukherjee 7 Robin to Roxel 7 Rushabh Vasani 7 Samesh Lakhotia + 7 Stefanie Molin 7 Stijn Van Hoey 7 Takafumi Arakaki - 7 Thomas A Caswell + 7 William Andrea 7 Winterflower - 7 Xiao Yuan 7 Younggun Kim 7 Yury Mikhaylov 7 abonte - 7 dependabot[bot] 7 dsm054 7 ftrihardjo 7 nipunreddevil 7 nrebena + 7 rohanjain101 7 scls19fr 7 themien 7 uzzell @@ -265,7 +273,9 @@ 6 FactorizeD 6 Garrett-R 6 Graham Inggs + 6 Guillaume Lemaitre 6 HH-MWB + 6 Isaac Virshup 6 Jeff Carey 6 Kashif Khan 6 Kelsey Jordahl @@ -277,7 +287,6 @@ 6 Neil Parley 6 Nicholaus E. Halecky 6 Nick Crews - 6 Parthi 6 Piotr Jucha 6 Roman Yurchak 6 Rouz Azari @@ -291,7 +300,6 @@ 6 Vincent Arel-Bundock 6 WANG Aiyong 6 Wes Turner - 6 William Andrea 6 ZhuBaohe 6 agijsberts 6 agraboso @@ -307,9 +315,9 @@ 6 saehuihwang 6 seljaks 6 smartvinnetou - 6 taytzehao 6 timmie 6 tmnhat2001 + 5 AG 5 Adam Obeng 5 Addison Lynch 5 Ajay Saxena @@ -328,9 +336,8 @@ 5 Galuh Sahid 5 Gjelt 5 Guilherme Beltramini - 5 Guillaume Lemaitre - 5 Isaac Virshup 5 Janelle Zoutkamp + 5 Jay 5 Joeperdefloep 5 Jon Mease 5 Joris Vankerschaver @@ -339,11 +346,11 @@ 5 Ksenia 5 Lars Buitinck 5 LeakedMemory + 5 Linus Sommer 5 Loïc Estève 5 Luis Pinto 5 Marc Abramowitz 5 Mark Wiebe - 5 Matthias Bussonnier 5 Max Chen 5 Michael Hsieh 5 Nicholas Musolino @@ -353,15 +360,16 @@ 5 Philip 5 Prabakaran Kumaresshan 5 Prakhar Pandey + 5 Ralf Gommers 5 Sangwoong Yoon 5 Scott E Lasley 5 Shantanu - 5 Stefanie Molin 5 Sumanau Sareen 5 Tao He 5 Tarbo Fukazawa 5 Taylor Packard 5 Ted Petrou + 5 Tim Hoffmann 5 Tomaz Berisa 5 TrigonaMinima 5 Troels Nielsen @@ -376,6 +384,7 @@ 5 donK23 5 fathomer 5 kylekeppler + 5 pre-commit-ci[bot] 5 pv8493013j 5 tshauck 5 yui-knk @@ -409,6 +418,7 @@ 4 Danil Iashchenko 4 Dave Hughes 4 David Adrián Cañones Castellano + 4 Dmitriy 4 Dražen Lučanin 4 Ekaterina 4 Erdi @@ -422,8 +432,8 @@ 4 JMBurley 4 Jacques Kvam 4 James Myatt - 4 Jay 4 JennaVergeynst + 4 Jessica Greene 4 Jev Kuznetsov 4 Jiang Yue 4 Jim Crist @@ -431,6 +441,7 @@ 4 John Zwinck 4 Jonathan Shreckengost 4 Junya Hayashi + 4 Kevin 4 Kevin Stone 4 Krishna 4 Kyle Meyer @@ -445,6 +456,7 @@ 4 Matt Wittmann 4 Matthew Gilbert 4 Max van Deursen + 4 Michael Tiemann 4 Nathan Abel 4 Nathan Goldbaum 4 Nicholas Ver Halen @@ -453,8 +465,10 @@ 4 Olivier Grisel 4 Oluokun Adedayo 4 Paul Ivanov + 4 Philippe THOMY 4 Qbiwan 4 Ram Rachum + 4 Rob 4 Robert Gieseke 4 Roger Thomas 4 Samuel Sinayoko @@ -465,7 +479,6 @@ 4 Srinivas Reddy Thatiparthy (శ్రీనివాస్ రెడ్డి తాటిపర్తి) 4 Stephen Rauch 4 Stéphan Taljaard - 4 Tim Hoffmann 4 Trent Hauck 4 Tyler Reddy 4 Varun Shrivastava @@ -496,8 +509,8 @@ 4 vangorade 4 waitingkuo 4 wcwagner - 3 AG 3 Aaditya Panikath + 3 Abdullah Ihsan Secer 3 Abhijeet Krishnan 3 Adam Hooper 3 Aleksey Bilogur @@ -506,6 +519,7 @@ 3 Alexander Buchkovsky 3 Alfonso MHC 3 Alp Arıbal + 3 Amanda Bizzinotto 3 Andreas Winkler 3 Andrew Wood 3 Angelos Evripiotis @@ -534,6 +548,7 @@ 3 Dave Hirschfeld 3 Dave Willmer 3 David Krych + 3 David Poznik 3 Devin Petersohn 3 Dillon Niederhut 3 DimiGrammatikakis @@ -591,6 +606,7 @@ 3 Josh Klein 3 Josiah 
Baker 3 José Duarte + 3 José Lucas Mayer 3 Julia Signell 3 Justin Essert 3 Kamil Trocewicz @@ -603,6 +619,7 @@ 3 Kim, KwonHyun 3 Krishna Chivukula 3 Kyle Barron + 3 Lawrence Mitchell 3 Leonardus Chen 3 Liam3851 3 Louis Huynh @@ -613,6 +630,7 @@ 3 Martin Durant 3 Martina Oefelein 3 Mateusz + 3 Matheus Felipe 3 Mathis Felardos 3 Matt Braymer-Hayes 3 Matteo Santamaria @@ -620,7 +638,6 @@ 3 Matus Valo 3 Max Bolingbroke 3 Maximiliano Greco - 3 Michael Tiemann 3 Mike Kutzma 3 Miroslav Šedivý 3 Mitar @@ -644,7 +661,6 @@ 3 Prithvijit 3 Pulkit Maloo 3 Rahul Sathanapalli - 3 Ralf Gommers 3 Randy Carnevale 3 Ray Bell 3 Riccardo Magliocchetti @@ -656,6 +672,7 @@ 3 Ryan 3 Safia Abdalla 3 Sahil Dua + 3 Sai-Suraj-27 3 Salahuddin 3 Sanjith Chockan 3 Sarthak Vineet Kumar @@ -700,6 +717,7 @@ 3 arredond 3 bang128 3 beanan + 3 caneff 3 davidshinn 3 dengemann 3 deponovo @@ -719,13 +737,15 @@ 3 nathalier 3 neelmraman 3 ogiaquino + 3 omar-elbaz + 3 paulreece 3 prossahl + 3 raj-thapa 3 rekcahpassyla 3 ri938 3 shawnbrown 3 shteken 3 shubham11941140 - 3 smij720 3 stonebig 3 tehunter 3 thatneat @@ -756,7 +776,6 @@ 2 Aleš Erjavec 2 Allen Downey 2 Allison Kwan - 2 Amanda Bizzinotto 2 Andrew Burrows 2 Andrew Chen 2 Andy @@ -781,6 +800,7 @@ 2 Bhuvana KA 2 Bill Letson 2 Bobae Kim + 2 Boyd Kane 2 Bryan Cutler 2 Camilo Cota 2 Carol Willing @@ -802,6 +822,7 @@ 2 CloseChoice 2 Cody 2 Compro Prasad + 2 Damian Kula 2 Dan Ringwalt 2 Daniel Grady 2 Daniel Hrisca @@ -823,6 +844,7 @@ 2 Digres45 2 Dom 2 Doran Deluz + 2 Doug Davis 2 Douglas Hanley 2 Douglas Rudd 2 Dražen Lučanin @@ -901,7 +923,6 @@ 2 Jeff Knupp 2 Jeff Mellen 2 Jeffrey Gerard - 2 Jessica Greene 2 Jessica M 2 Jethro Cao 2 Jiawei Zhang @@ -910,6 +931,7 @@ 2 Joao Victor Martinelli 2 Joaq Almirante 2 Joe Bradish + 2 JohannaTrost 2 John David Reaver 2 John G Evans 2 John Liekezer @@ -939,7 +961,6 @@ 2 Katrin Leinweber 2 Kee Chong Tan 2 Kenny Huynh - 2 Kevin 2 Kevin Anderson 2 Kevin Bowey 2 Kevin Jan Anker @@ -950,10 +971,10 @@ 2 Kyle Prestel 2 LJ 2 Lars Lien Ankile - 2 Lawrence Mitchell 2 Leif Johnson 2 Leif Walsh 2 Leo Razoumov + 2 Linus 2 Liwei Cai 2 Lorenzo Vainigli 2 Luca Scarabello @@ -972,7 +993,6 @@ 2 Martin Fleischmann 2 Martina G. 
Vilas 2 Mason Gallo - 2 Mateusz Sokół 2 Mats Maiwald 2 Matt Maybeno 2 Matt Richards @@ -985,6 +1005,7 @@ 2 Michael Schatzow 2 Michael W Schatzow 2 Michael Wang + 2 Michał Górny 2 Miguel 2 Mike Phung 2 Min RK @@ -1015,6 +1036,7 @@ 2 Paddy Mullen 2 Pankaj Pandey 2 Paolo Lammens + 2 Paras Gupta 2 Patrick Cando 2 Patrick O'Keeffe 2 Paul Lee @@ -1032,6 +1054,7 @@ 2 Prerana Chakraborty 2 Punitvara 2 Rafal Skolasinski + 2 Raghav 2 Rahul Chauhan 2 Rajib Mitra 2 RaphSku @@ -1064,6 +1087,7 @@ 2 Scott Sanderson 2 Scott Talbert 2 Seb + 2 Sebastian Berg 2 Sebastián Vanrell 2 Shadi Akiki 2 Shahul Hameed @@ -1134,6 +1158,7 @@ 2 Yuecheng Wu 2 Yutaro Ikeda 2 Yvan Gatete + 2 Yves Delley 2 Zach Angell 2 Zak Kohler 2 adatasetaday @@ -1144,6 +1169,7 @@ 2 bjonen 2 bolkedebruin 2 broessli + 2 ccccjone 2 cgangwar11 2 charalampos papaloizou 2 charlogazzo @@ -1179,6 +1205,7 @@ 2 jaimefrio 2 jakirkham 2 jeschwar + 2 jfadia 2 jlamborn324 2 jmholzer 2 jmorris0x0 @@ -1211,7 +1238,6 @@ 2 nsuresh 2 nullptr 2 ohad83 - 2 omar-elbaz 2 ottiP 2 paradox-lab 2 pedrooa @@ -1219,7 +1245,6 @@ 2 priyankjain 2 ptype 2 qudade - 2 raj-thapa 2 reidy-p 2 rjfs 2 rlukevie @@ -1267,12 +1292,14 @@ 1 Aadharsh-Acharya 1 Aadhi Manivannan 1 Aaron Barber + 1 Aaron Rahman 1 Aaron Schumacher 1 Aaron Toth 1 Aashish KC 1 Abbie Popa 1 AbdealiJK 1 AbdulMAbdi + 1 Abhijit Deo 1 Abhiraj Hinge 1 Abhishek R 1 Abo7atm @@ -1293,10 +1320,12 @@ 1 Aditya Agarwal 1 Aditya Anulekh 1 Adrian + 1 Adrian D'Alessandro 1 Adrian Liaw 1 Adrien Emery 1 Adrien RUAULT 1 Agustín Herranz + 1 Ahmad Mustafa Anis 1 Aidos Kanapyanov 1 Aivengoe 1 Ajitesh Singh @@ -1353,6 +1382,7 @@ 1 Amanda Dsouza 1 Amay Patel 1 Amim Knabben + 1 Amith KK 1 Amol 1 Amol Agrawal 1 Amol K @@ -1388,6 +1418,7 @@ 1 Angela Seo 1 AnglinaBhambra 1 Anh Le + 1 Aniket Patil 1 Aniket uttam 1 Anil Kumar Pallekonda 1 Anirudh Hegde @@ -1400,6 +1431,7 @@ 1 Antoine Viscardi 1 Anton Lodder 1 Antonio Andraues Jr + 1 Antonio Fonseca 1 Antonio Gutierrez 1 Antonio Linde 1 Antonio Molina @@ -1417,6 +1449,7 @@ 1 Arkadeep Adhikari 1 ArnaudChanoine 1 Arno Veenstra + 1 Artur Barseghyan 1 Arun12121 1 Arunim Samudra 1 Arushi Sharma @@ -1450,6 +1483,7 @@ 1 Ben 1 Ben Auffarth 1 Ben Forbes + 1 Ben Greiner 1 Ben James 1 Ben Mangold 1 Ben Nelson @@ -1472,6 +1506,7 @@ 1 Bibek Jha 1 BielStela 1 Bijay Regmi + 1 Bill Blum 1 Bill Chambers 1 Bishwas 1 Bjorn Arneson @@ -1481,7 +1516,6 @@ 1 Bogdan Pilyavets 1 Boris Lau 1 BorisVerk - 1 Boyd Kane 1 Bradley Dice 1 Brandon Rhodes 1 Brayan Alexander Muñoz B @@ -1579,12 +1613,12 @@ 1 Da Wang 1 DaCoEx 1 Damian Barabonkov - 1 Damian Kula 1 Damini Satya 1 Damodara Puddu 1 Dan Davison 1 Dan Dixey 1 Dan Hendry + 1 Dan King 1 Dan Moore 1 DanBasson 1 Daniel Chen @@ -1598,6 +1632,8 @@ 1 Daniel Sakuma 1 Daniel Shapiro 1 Daniel Siladji + 1 Daniel Weindl + 1 Daniele Nicolodi 1 Danilo Horta 1 Darcy Meyer 1 Darin Plutchok @@ -1617,7 +1653,6 @@ 1 David Liu 1 David Lutz 1 David Polo - 1 David Poznik 1 David Rasch 1 David Read 1 David Rouquet @@ -1627,6 +1662,7 @@ 1 David Sanders 1 David Schlachter 1 David Seifert + 1 David Toneian 1 David Wales 1 David Wolever 1 David Zaslavsky @@ -1635,6 +1671,7 @@ 1 DavidRosen 1 Dean 1 Dean Langsam + 1 Deepak George 1 Deepak Pandey 1 Deepak Sirohiwal 1 Deepan Das @@ -1663,6 +1700,7 @@ 1 Dominik Berger 1 Dominik Kutra 1 Dominik Stanczak + 1 Dominique Garmier 1 Donald Curtis 1 DorAmram 1 Doris Lee @@ -1677,6 +1715,7 @@ 1 Drew Heenan 1 Drew Levitt 1 Drewrey Lupton + 1 Dukastlik 1 Dustin K 1 Dylan Dmitri Gray 1 Dylan Percy @@ -1689,6 +1728,7 @@ 1 Ege Özgüroğlu 1 Ehsan Azarnasab 1 
Ekaterina Borovikova + 1 Elahe Sharifi 1 Eli Dourado 1 Eli Schwartz 1 Eli Treuherz @@ -1704,6 +1744,7 @@ 1 Eric Brassell 1 Eric Goddard 1 Eric Groszman + 1 Eric Han 1 Eric Kisslinger 1 Eric O. LEBIGOT (EOL) 1 Eric Stein @@ -1759,6 +1800,7 @@ 1 Francesco Romandini 1 Francesco Truzzi 1 Francis + 1 Francisco Alfaro 1 Francois Dion 1 Frank Cleary 1 Frank Hoang @@ -1777,6 +1819,7 @@ 1 Gabriel Moreira 1 Gabriel Reid 1 Gabriel de Maeztu + 1 Gadea Autric 1 Gaibo Zhang 1 Gaurav Chauhan 1 Gaétan Ramet @@ -1819,6 +1862,7 @@ 1 Gustavo Vargas 1 Gyeongjae Choi 1 HHest + 1 Hadi Abdi Khojasteh 1 HagaiHargil 1 Haleemur Ali 1 Hamidreza Sanaee @@ -1832,6 +1876,7 @@ 1 Hassan Shamim 1 Hatem Nassrat 1 Hatim Zahid + 1 Hedeer El Showk 1 Heidi 1 Hendrik Makait 1 Henry @@ -1866,7 +1911,9 @@ 1 Inevitable-Marzipan 1 Isaac Schwabacher 1 Isaac Slavitt + 1 Issam 1 IsvenC + 1 Itayazolay 1 Iulius Curt 1 Iva Koevska 1 Iva Laginja @@ -1876,7 +1923,9 @@ 1 Iván Vallés Pérez 1 JElfner 1 JJ + 1 Jaca 1 Jack Greisman + 1 Jack McIvor 1 Jackie Leng 1 Jacob Austin 1 Jacob Buckheit @@ -1900,6 +1949,7 @@ 1 James McBride 1 James Moro 1 James Santucci + 1 James Spencer 1 James Winegar 1 Jan F-F 1 Jan Müller @@ -1946,6 +1996,7 @@ 1 Jimmy Woo 1 Jinli Xiao 1 Jinyang Zhou + 1 Jirka Borovec 1 Joachim Wagner 1 Joan Martin Miralles 1 Joanna Ge @@ -1961,6 +2012,7 @@ 1 John 1 John Bencina 1 John Bodley + 1 John C 1 John Cant 1 John Evans 1 John Fremlin @@ -2015,6 +2067,7 @@ 1 Jovixe 1 Joy Bhalla 1 Jozef Brandys + 1 João Andrade 1 João Meirelles 1 João Veiga 1 Julia Aoun @@ -2046,6 +2099,7 @@ 1 Kabiir Krishna 1 Kacawi 1 Kadatatlu Kishore + 1 Kai Mühlbauer 1 Kai Priester 1 Kamal Kamalaldin 1 Kamil Sindi @@ -2057,12 +2111,14 @@ 1 Karrie Kehoe 1 Karthik Velayutham 1 Kassandra Keeton + 1 Katharina Tielking, MD 1 Katherine Surta 1 Katherine Younglove 1 Katie Atkinson 1 Kaushal Rohit 1 Kavya9986 1 Kazuki Igeta + 1 Kazuto Haruguchi 1 Keiron Pizzey 1 Keith Kraus 1 Keith Webber @@ -2126,6 +2182,7 @@ 1 Lorenzo Bolla 1 Lorenzo Cestaro 1 Lorenzo Stella + 1 Louis-Émile Robitaille 1 Loïc Séguin-C 1 Luca Donini 1 Lucas Damo @@ -2150,8 +2207,10 @@ 1 Maciej Kos 1 Madhuri Palanivelu 1 Madhuri Patil + 1 Maggie Liu 1 Magnus Jöud 1 Mahdi Ben Jelloul + 1 MainHanzo 1 Makarov Andrey 1 Malcolm 1 Malgorzata Turzanska @@ -2187,6 +2246,7 @@ 1 Martin Grigorov 1 Martin Jones 1 Martin Journois + 1 Martin Šícho 1 Marty Rudolf 1 Marvin 1 Marvin John Walter @@ -2229,6 +2289,7 @@ 1 Max Mikhaylov 1 MaxU 1 Maxim Kupfer + 1 Maxwell Bileschi 1 Mayank Asthana 1 Mayank Bisht 1 Mayank Chaudhary @@ -2259,7 +2320,6 @@ 1 Michael Terry 1 Michael Waskom 1 Michael-J-Ward - 1 Michał Górny 1 Michel de Ruiter 1 Michelangelo D'Agostino 1 Michiel Stock @@ -2293,6 +2353,7 @@ 1 Morisa Manzella 1 Moritz Münst 1 Moritz Schreiber + 1 Moritz Schubert 1 Morten Canth Hels 1 Morton Fox 1 Moussa Taifi @@ -2301,6 +2362,7 @@ 1 MusTheDataGuy 1 Mykola Golubyev 1 NJOKU OKECHUKWU VALENTINE + 1 NNLNR 1 Nagesh Kumar C 1 Nanda H Krishna 1 Nate Armstrong @@ -2331,6 +2393,7 @@ 1 Nikolay Boev 1 Nikoleta Glynatsi 1 Nikos Karagiannakis + 1 Nils Müller-Wendt 1 Nima Sarang 1 Nipun Sadvilkar 1 Nis Martensen @@ -2373,8 +2436,10 @@ 1 Paul Masurel 1 Paul McCarthy 1 Paul Mestemaker + 1 Paul Pellissier 1 Paul Reiners 1 Paul Siegel + 1 Paul Uhlenbruck 1 Paul van Mulbregt 1 Paula 1 Paulo Roberto de Oliveira Castro @@ -2432,7 +2497,6 @@ 1 Radoslaw Lemiec 1 Rafael Jaimes III 1 Rafif - 1 Raghav 1 Rahul Gaikwad 1 Rahul Siloniya 1 RahulHP @@ -2442,6 +2506,7 @@ 1 Rajiv Bharadwaj 1 Rakshit Naidu 1 Ralph Bean + 1 Randolf Scholz 1 Rebecca 
Chen 1 Red 1 Redonnet Louis @@ -2488,6 +2553,7 @@ 1 Ruizhe Deng 1 Rupert Thompson 1 Russell Smith + 1 Ryan Gibson 1 Ryan Grout 1 Ryan Hendrickson 1 Ryan Joyce @@ -2525,6 +2591,7 @@ 1 Sangmin Park 1 Sanjiv Lobo 1 Santosh Kumar + 1 Sara Bonati 1 Sarah Bird 1 Sarah Masud 1 SarahJessica @@ -2541,7 +2608,6 @@ 1 Sean Chan 1 Sean M. Law 1 Sebastiaan Vermeulen - 1 Sebastian Berg 1 Sebastian Gsänger 1 Sebastian Pölsterl 1 Sebastian Roll @@ -2552,6 +2618,7 @@ 1 Sergei Ivko 1 Sergey 1 Sergey Kopylov + 1 Sergey Zakharov 1 Sergio Pascual 1 Shaghayegh 1 Shannon Wang @@ -2571,6 +2638,7 @@ 1 Shubhankar Lohani 1 Shudong Yang 1 Shyam Saladi + 1 Shyamala Venkatakrishnan 1 SiYoungOh 1 Siddhesh Poyarekar 1 Sidharthan Nair @@ -2598,6 +2666,7 @@ 1 Spencer Carrucciu 1 Spencer Clark 1 SplashDance + 1 StEmGeo 1 Stan West 1 Stefan Mejlgaard 1 Stefan van der Walt @@ -2649,13 +2718,16 @@ 1 Terry Santegoeds 1 TheDerivator 1 Thiago Cordeiro da Fonseca + 1 Thiago Gariani 1 Thiago Serafim 1 ThibTrip 1 Thomas + 1 Thomas Guillet 1 Thomas H 1 Thomas Heavey 1 Thomas Kastl 1 Thomas Kluiters + 1 Thomas Lazarus 1 Thomas Lentali 1 Thomas Vranken 1 Thomas Wiecki @@ -2689,6 +2761,7 @@ 1 Tomáš Chvátal 1 Tong Shen 1 Tony Hirst + 1 Toro 1 Toroi 1 TraverseTowner 1 Travis @@ -2722,10 +2795,12 @@ 1 Vikram Shirgur 1 Vikramaditya Gaonkar 1 Vikramjeet Das + 1 Ville Aikas 1 Vince W 1 Vincent Davis 1 Vinicius Akira 1 Vinicius Akira Imaizumi + 1 Vinita Parasrampuria 1 Vinícius Figueiredo 1 Vipin Kumar 1 Vishwak Srinivasan @@ -2738,6 +2813,7 @@ 1 Vladimir Podolskiy 1 Vladislav 1 VomV + 1 Vyas Ramasubramani 1 Vyom Jain 1 Víctor Moron Tejero 1 W.R @@ -2793,7 +2869,6 @@ 1 Yuval Langer 1 Yuya Takashina 1 Yvan Cywan - 1 Yves Delley 1 Zach Breger 1 Zach Brookler 1 Zach Dwiel @@ -2804,9 +2879,11 @@ 1 Zbyszek Królikowski 1 Zeb Nicholls 1 Zeke + 1 Zemux1613 1 Zhengfei Wang 1 ZhihuiChen0903 1 Zhiyi Wu + 1 Ziad Kermadi 1 Zihao Zhao 1 aaron315 1 abaldenko @@ -2844,6 +2921,7 @@ 1 anonmouse1 1 anton-d 1 aptalca + 1 aram-cinnamon 1 araraonline 1 arnaudlegout 1 asharma13524 @@ -2894,13 +2972,16 @@ 1 chernrick 1 chinggg 1 chinskiy + 1 chris-caballero 1 chromy 1 claudiobertoldi 1 claws 1 cmazzullo 1 cmmck 1 cnguyen-03 + 1 cobalt 1 code-review-doctor + 1 color455nm 1 conmai 1 cr3 1 cruzzoe @@ -2920,6 +3001,7 @@ 1 davidmvalente 1 davidovitch 1 daydreamt + 1 denisrei 1 dequadras 1 derestle-htwg 1 dgram0 @@ -3047,6 +3129,7 @@ 1 kaustuv deolal 1 kdiether 1 kevx82 + 1 kgmuzungu 1 kiwirob 1 kjford 1 klonuo @@ -3101,8 +3184,10 @@ 1 mck619 1 mcocdawc 1 mdeboc + 1 mecopur 1 mgilbert 1 mglasder + 1 mhb143 1 miguelmorin 1 mikebailey 1 milosz-martynow @@ -3112,10 +3197,12 @@ 1 moaraccounts 1 monicaBee 1 monosans + 1 morotti 1 morrme 1 mpuels 1 mrastgoo 1 mschmohl + 1 mvirts 1 mwaskom 1 na2 1 naveenkaushik2504 @@ -3138,7 +3225,6 @@ 1 parchd-1 1 parkdj1 1 paul-mannino - 1 paulreece 1 pbreach 1 peadarcoyle 1 penelopeysm @@ -3151,7 +3237,6 @@ 1 potap75 1 pqzx 1 pratyushsharan - 1 pre-commit-ci[bot] 1 pvanhauw 1 quantumalaviya 1 raanasn @@ -3198,6 +3283,7 @@ 1 segatrade 1 sfoo 1 shaido987 + 1 shiersansi 1 shourya5 1 sideeye 1 silentquasar @@ -3248,6 +3334,7 @@ 1 tom-alcorn 1 tomascassidy 1 tomrod + 1 torext 1 tpanza 1 trevorkask 1 tsinggggg @@ -3275,6 +3362,7 @@ 1 winlu 1 xgdgsc 1 xinrong-databricks + 1 xzmeng 1 yehia67 1 yelite 1 yhaque1213 @@ -3302,7 +3390,7 @@ 1 김동현 (Daniel Donghyun Kim) Debian packaging - 545 Rebecca N. Palmer + 608 Rebecca N. 
Palmer 323 Yaroslav Halchenko 49 Mo Zhou 38 Andreas Tille diff -Nru pandas-2.1.4+dfsg/debian/control pandas-2.2.2+dfsg/debian/control --- pandas-2.1.4+dfsg/debian/control 2024-04-21 12:25:29.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/control 2024-05-29 21:10:37.000000000 +0000 @@ -10,48 +10,49 @@ faketime, locales-all, python3-all-dev, - cython3 (>= 0.29.33~), -# we don't have this version python3-blosc (>= 1.21.0~) , - python3-bottleneck (>= 1.3.4~) , - python3-bs4 (>= 4.11.1~) , + cython3 (>= 3.0.5~), +# we don't have this version python3-blosc (>= 1.21.3~) , + python3-bottleneck (>= 1.3.6~) , + python3-bs4 (>= 4.11.2~) , python3-dask (>= 2023.2.0~) , python3-dateutil, - python3-fsspec (>= 2022.05.0~) , + python3-fsspec (>= 2022.11.0~) , python3-html5lib (>= 1.1~) , python3-hypothesis (>= 6.46.1~) , python3-jinja2 (>= 3.1.2~) , - python3-lxml (>= 4.8.0~) , - python3-matplotlib (>= 3.6.1~) [!ia64 !loong64 !powerpc !sparc64 !x32] , + python3-lxml (>= 4.9.2~) , + python3-matplotlib (>= 3.6.3~) [!ia64 !loong64 !powerpc !sparc64 !x32] , # numba has a history of bugs on non-x86, e.g. #1033907 # architectures here are the ones on which to treat numba-related failures as RC - see also debian/tests/control -# temporarily disabled as numba is not in testing #1033907 python3-numba (>= 0.55.2~) [amd64 i386 ppc64el] , - python3-numexpr (>= 2.8.0~) , +# temporarily disabled as numba is not in testing #1033907 python3-numba (>= 0.56.4~) [amd64 i386 ppc64el] , + python3-numexpr (>= 2.8.4~) , python3-numpy (>= 1:1.23.2~), python3-odf (>= 1.4.1~) , - python3-openpyxl (>= 3.0.10~) , -# doesn't seem to work in this test environment python3-psycopg2 (>= 2.9.3~) , + python3-openpyxl (>= 3.1.0~) , +# doesn't seem to work in this test environment python3-psycopg2 (>= 2.9.6~) , python3-py , # doesn't seem to work in this test environment python3-pymysql (>= 1.0.2~) , - python3-pyreadstat (>= 1.1.5~) [!ia64 !loong64 !sparc64 !x32] , + python3-pyqt5 (>= 5.15.9~) , + python3-pyreadstat (>= 1.2.0~) [!ia64 !loong64 !sparc64 !x32] , python3-pytest (>= 7.3.2~) , python3-pytest-asyncio (>= 0.17~) , python3-pytest-forked , python3-pytest-localserver , python3-pytest-xdist (>= 2.2.0~) , + python3-pytestqt (>= 4.2.0~) , # we don't have python3-pyxlsb - python3-scipy (>= 1.8.1~), + python3-scipy (>= 1.10.0~), python3-setuptools (>= 51~), -# we don't have this version python3-snappy (>= 0.6.0~) , - python3-sqlalchemy (>= 1.4.36~) [!ia64 !hppa !sh4 !x32] , +# we don't have this version python3-sqlalchemy (>= 2.0.0~) [!ia64 !hppa !sh4 !x32] , # python3-tables is now little-endian only, and also unavailable on some ports - python3-tables (>= 3.7.0~) [!s390x !hppa !powerpc !ppc64 !sparc64 !hurd-any !alpha] , -# we don't have this version python3-tabulate (>= 0.8.10~) , + python3-tables (>= 3.8.0~) [!s390x !hppa !powerpc !ppc64 !sparc64 !hurd-any !alpha] , + python3-tabulate (>= 0.9.0~) , python3-tk , - python3-tz (>= 2022.1~) , -# we don't have this version python3-xlrd (>= 2.0.1~) , -# we don't have this version python3-xlsxwriter (>= 3.0.3~) , + python3-tz (>= 2022.7~) , + python3-xlrd (>= 2.0.1~) , + python3-xlsxwriter (>= 3.0.5~) , python3-versioneer, - python3-zstandard (>= 0.17.0~) , + python3-zstandard (>= 0.19.0~) , sphinx-common, # for tests/examples that use old-style timezone names tzdata-legacy , @@ -63,8 +64,10 @@ python3-sphinx-design , python3-pydata-sphinx-theme , python3-ipykernel , - python3-notebook (>= 6.0.3~) , - python3-nbconvert (>= 6.4.5~) , +# we don't have this version python3-notebook (>= 7.0.6~) , 
+# we don't have this version python3-nbconvert (>= 7.11.0~) , + python3-notebook , + python3-nbconvert , python3-nbsphinx , python3-numpydoc , python3-pygments , @@ -86,7 +89,7 @@ python3-seaborn , python3-sqlalchemy , python3-statsmodels , - python3-xarray (>= 2022.03.0~) , + python3-xarray (>= 2022.12.0~) , Standards-Version: 4.7.0 Vcs-Browser: https://salsa.debian.org/science-team/pandas Vcs-Git: https://salsa.debian.org/science-team/pandas.git @@ -127,10 +130,7 @@ # 1.1 -> 1.3 API breaks, see #999415 python3-cfgrib (<= 0.9.9-1), python3-joypy (<= 0.2.2-2), -# 1.4 -> 1.5 API breaks, see #1022571 - python3-statsmodels (<< 0.13.5~), # 1.5 -> 2.1 API breaks, #1043240 - augur (<= 24.0.0-1), cnvkit (<< 0.9.10~), python3-altair (<< 5.0.1~), python3-anndata (<= 0.8.0-4), @@ -143,7 +143,6 @@ python3-esda (<= 2.5.1-1), python3-feather-format (<< 0.3.1+dfsg1-8~), python3-hypothesis (<< 6.83.1~), -# broken tests but probably not broken actual package python3-influxdb (<= 5.3.1-5), python3-jsonpickle (<< 3.0.2+dfsg-1~), python3-mirtop (<< 0.4.25-5~), python3-nanoget (<< 1.19.3~), @@ -165,7 +164,11 @@ q2-quality-control (<= 2022.11.1-2), q2-taxa (<= 2023.9.0+dfsg-1), q2-types (<= 2023.9.0-1), - q2templates (<= 2023.9.0+ds-1) + q2templates (<= 2023.9.0+ds-1), +# 2.1 -> 2.2 API breaks, #1069792 + augur (<< 24.4.0-1~), + python3-influxdb (<< 5.3.2-1~), + python3-statsmodels (<< 0.14.2~), Description: data structures for "relational" or "labeled" data pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with "relational" or diff -Nru pandas-2.1.4+dfsg/debian/copyright pandas-2.2.2+dfsg/debian/copyright --- pandas-2.1.4+dfsg/debian/copyright 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/copyright 2024-04-23 17:35:52.000000000 +0000 @@ -5,29 +5,29 @@ Files-Excluded: doc/source/user_guide/cookbook.rst .gitignore .gitattributes -Comment: cookbook has around 100 Stack Overflow snippets (CC-BY-SA with possibly inadequate attribution), the other omissions are for non-copyright potential breakage +Comment: cookbook has around 100 Stack Overflow snippets (CC-BY-SA with possibly inadequate attribution), the other omissions are for non-copyright potential breakage; the generic pandas upstream copyright in * is repeated for other files modified by them (without bothering to keep track of exact years) because the format defines the initial * as applying only to files not listed elsewhere Files: * -Copyright: 2008-2023 AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors +Copyright: 2008-2024 AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: BSD-3 Comment: Lumped together as notices disagree on which ones are which years. See contributors_list.txt for a full list of commit authors. 
The original source contains some CC-BY-SA snippets by Stack Overflow users (https://stackoverflow.com/users/1114/jeremy-banks, https://stackoverflow.com/users/387251/oliver, https://stackoverflow.com/users/3297752/noah-motion, https://stackoverflow.com/users/925671/bill, https://stackoverflow.com/users/1082349/foobar, https://stackoverflow.com/users/3089209/crantila, https://stackoverflow.com/users/2375855/ojdo, https://stackoverflow.com/users/487339/dsm, https://stackoverflow.com/users/2677943/swenzel), but these may be too small to be copyrightable, and the less trivial ones are patched out in this package Files: doc/sphinxext/announce.py Copyright: 2001-2017 Enthought, Inc. and SciPy Developers. - 2017-2021 Lambda Foundry, Inc. and PyData Development Team + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: BSD-3 Comment: it is possible that other code was also taken from Scipy Files: pandas/_libs/include/pandas/portable.h -Copyright: 2005-2014 Rich Felker and contributors - 2008-2019, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team +Copyright: 2005-2020 Rich Felker and contributors (see LICENSES/MUSL_LICENSE for list) + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: Expat and BSD-3 -Origin: musl (partly) -Comment: this might or might not be from a public domain part of musl +Origin: musl src/ctype (partly) Files: pandas/_libs/include/pandas/skiplist.h Copyright: 2009, Raymond Hettinger - 2011-2020 Wes McKinney and PyData Development Team + 2016(?) Wes McKinney + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: Expat and BSD-3 Origin: http://code.activestate.com/recipes/576930/ Comment: it is a Cython code "inspired" by the original Python code by Raymond @@ -37,7 +37,7 @@ Copyright: 2002 Michael Ringgaard 2011-2012 Warren Weckesser 2001-2012 Python Software Foundation and Python contributors - 2012-2022 Lambda Foundry, Inc. and PyData Development Team + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: Python and BSD-3 Origin: csv (Python standard library), github.com/WarrenWeckesser/textreader @@ -49,11 +49,12 @@ pandas/_libs/src/vendored/ujson/* Copyright: 1988-1993 The Regents of the University of California 1994 Sun Microsystems, Inc. - 2007 Nick Galbreath + 2005-2007 Nick Galbreath + 2014 Electronic Arts Inc. 2011-2013 ESN Social Software AB and Jonas Tarnstrom - 2012-2022 Lambda Foundry, Inc. and PyData Development Team + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: BSD-3 and Expat -Origin: ultrajson +Origin: ultrajson, modp_ascii and TCL Files: pandas/_libs/tslibs/parsing.pyx Copyright: 2003-2011 - Gustavo Niemeyer @@ -61,46 +62,48 @@ 2014-2016 - Yaron de Leeuw 2015-2017 - Paul Ganssle 2015-2017 - dateutil contributors (see AUTHORS file) - 2008-2022, AQR Capital Management, LLC, Lambda Foundry, Inc. 
and PyData Development Team + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: BSD-3 Origin: dateutil (partly) Files: pandas/_libs/include/pandas/vendored/numpy/* pandas/_libs/src/vendored/numpy/* -Copyright: 2005-2013, NumPy Developers +Copyright: 2005-?, NumPy Developers License: BSD-3 Origin: numpy -Comment: Listed as derived from Numpy 1.7 +Comment: the included license says 2005-2023, but this was updated without changing the actual code (https://github.com/pandas-dev/pandas/pull/54743) and the files say derived from Numpy 1.7 (2013) Files: pandas/_libs/window/aggregations.pyx pandas/tests/window/test_rolling.py -Copyright: 2010-2012 Archipel Asset Management AB - 2011-2022 Lambda Foundry, Inc. and PyData Development Team +Copyright: 2010-2019 Keith Goodman + 2019 Bottleneck Developers + 2010-2012 Archipel Asset Management AB + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: BSD-3 Origin: bottleneck (partly) Comment: Original was BSD-2, but BSD-2 and BSD-3 = BSD-3 Files: pandas/compat/* Copyright: 2010-2013 Benjamin Peterson - 2012-2022 Lambda Foundry, Inc. and PyData Development Team + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: Expat and BSD-3 Origin: six Files: pandas/core/accessor.py Copyright: 2014-2018 xarray developers - 2018-2021 Lambda Foundry, Inc. and PyData Development Team + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: Apache-2.0 and BSD-3 Origin: xarray (partly) Files: pandas/io/clipboard/* Copyright: 2010-2017 Albert Sweigart and Pyperclip contributors - 2016-2022 Lambda Foundry, Inc. and PyData Development Team + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: BSD-3 Origin: Pyperclip Files: pandas/io/sas/sas7bdat.py -Copyright: 2015 Jared Hobbs - 2016-2022 Lambda Foundry, Inc. and PyData Development Team +Copyright: 2015-2019 Jared Hobbs + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors Origin: https://bitbucket.org/jaredhobbs/sas7bdat License: Expat @@ -126,19 +129,19 @@ Files: scripts/no_bool_in_generic.py Copyright: 2017 Anthony Sottile - 2021 Lambda Foundry, Inc. and PyData Development Team + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: Expat and BSD-3 Origin: pyupgrade (partly) Files: setup.py Copyright: 2009-2012, Brian Granger, Min Ragan-Kelley (from pyzmq) 2004 Infrae (from lxml) - 2008-2022, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team + 20xx-20xx AQR Capital Management, LLC, Lambda Foundry, Inc., PyData Development Team, Open source contributors License: BSD-3 Files: debian/* Copyright: 2011-2018, Yaroslav Halchenko - 2019-2023, Debian Science Team + 2019-2024, Debian Science Team License: BSD-3 License: BSD-2 diff -Nru pandas-2.1.4+dfsg/debian/patches/1068104_time64.patch pandas-2.2.2+dfsg/debian/patches/1068104_time64.patch --- pandas-2.1.4+dfsg/debian/patches/1068104_time64.patch 2024-04-05 06:33:08.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/1068104_time64.patch 2024-05-06 13:47:40.000000000 +0000 @@ -2,29 +2,88 @@ Debian armhf/armel (but not i386) are now time64 -Author: Graham Inggs +Author: Graham Inggs, Rebecca N. 
Palmer Bug-Debian: https://bugs.debian.org/1068104 Forwarded: no ---- a/pandas/tests/indexes/datetimes/test_ops.py -+++ b/pandas/tests/indexes/datetimes/test_ops.py -@@ -35,7 +35,7 @@ class TestDatetimeIndexOps: - tz = tz_naive_fixture - if freq == "A" and not IS64 and isinstance(tz, tzlocal): - request.node.add_marker( -- pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038") -+ pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038", strict=False) - ) +--- a/pandas/tests/indexes/datetimes/methods/test_resolution.py ++++ b/pandas/tests/indexes/datetimes/methods/test_resolution.py +@@ -24,7 +24,7 @@ def test_dti_resolution(request, tz_naiv + tz = tz_naive_fixture + if freq == "YE" and not IS64 and isinstance(tz, tzlocal): + request.applymarker( +- pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038") ++ pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038", strict=False) + ) - idx = date_range(start="2013-04-01", periods=30, freq=freq, tz=tz) + idx = date_range(start="2013-04-01", periods=30, freq=freq, tz=tz) --- a/pandas/tests/tseries/offsets/test_common.py +++ b/pandas/tests/tseries/offsets/test_common.py @@ -143,7 +143,7 @@ def test_apply_out_of_range(request, tz_ # If we hit OutOfBoundsDatetime on non-64 bit machines # we'll drop out of the try clause before the next test - request.node.add_marker( + request.applymarker( - pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038") + pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038", strict=False) ) elif ( isinstance(tz, tzlocal) +--- a/pandas/tests/tools/test_to_timedelta.py ++++ b/pandas/tests/tools/test_to_timedelta.py +@@ -244,7 +244,7 @@ class TestTimedeltas: + actual = to_timedelta([val]) + assert actual[0]._value == np.timedelta64("NaT").astype("int64") + +- @pytest.mark.xfail(not IS64, reason="Floating point error") ++ @pytest.mark.xfail(not IS64, reason="Floating point error", strict=False) + def test_to_timedelta_float(self): + # https://github.com/pandas-dev/pandas/issues/25077 + arr = np.arange(0, 1, 1e-6)[-10:] +--- a/pandas/tests/io/sas/test_sas7bdat.py ++++ b/pandas/tests/io/sas/test_sas7bdat.py +@@ -15,6 +15,9 @@ import pandas as pd + import pandas._testing as tm + + from pandas.io.sas.sas7bdat import SAS7BDATReader ++import platform ++import re ++is_platform_x86_32 = bool(re.match("i.?86|x86", platform.uname()[4])) and not IS64 + + + @pytest.fixture +@@ -202,7 +205,7 @@ def test_date_time(datapath): + res = df0["DateTimeHi"].astype("M8[us]").dt.round("ms") + df0["DateTimeHi"] = res.astype("M8[ms]") + +- if not IS64: ++ if is_platform_x86_32: + # No good reason for this, just what we get on the CI + df0.loc[0, "DateTimeHi"] += np.timedelta64(1, "ms") + df0.loc[[2, 3], "DateTimeHi"] -= np.timedelta64(1, "ms") +@@ -297,7 +300,7 @@ def test_max_sas_date(datapath): + columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"], + ) + +- if not IS64: ++ if is_platform_x86_32: + # No good reason for this, just what we get on the CI + expected.loc[:, "dt_as_dt"] -= np.timedelta64(1, "ms") + +@@ -340,7 +343,7 @@ def test_max_sas_date_iterator(datapath) + columns=col_order, + ), + ] +- if not IS64: ++ if is_platform_x86_32: + # No good reason for this, just what we get on the CI + expected[0].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms") + expected[1].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms") +@@ -371,7 +374,7 @@ def test_null_date(datapath): + ), + }, + ) +- if not IS64: ++ if is_platform_x86_32: + # No good reason for this, just what we 
get on the CI + expected.loc[0, "datetimecol"] -= np.timedelta64(1, "ms") + tm.assert_frame_equal(df, expected) diff -Nru pandas-2.1.4+dfsg/debian/patches/1068422_ignore_dask_tests.patch pandas-2.2.2+dfsg/debian/patches/1068422_ignore_dask_tests.patch --- pandas-2.1.4+dfsg/debian/patches/1068422_ignore_dask_tests.patch 2024-04-05 06:33:08.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/1068422_ignore_dask_tests.patch 2024-04-23 18:15:48.000000000 +0000 @@ -9,7 +9,7 @@ --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py -@@ -33,6 +33,7 @@ def df(): +@@ -31,6 +31,7 @@ def df(): return DataFrame({"A": [1, 2, 3]}) @@ -17,7 +17,7 @@ def test_dask(df): try: from multiprocessing.pool import ThreadPool -@@ -55,6 +56,7 @@ def test_dask(df): +@@ -53,6 +54,7 @@ def test_dask(df): pd.set_option("compute.use_numexpr", olduse) @@ -25,7 +25,7 @@ def test_dask_ufunc(): # dask sets "compute.use_numexpr" to False, so catch the current value # and ensure to reset it afterwards to avoid impacting other tests -@@ -74,6 +76,7 @@ def test_dask_ufunc(): +@@ -72,6 +74,7 @@ def test_dask_ufunc(): pd.set_option("compute.use_numexpr", olduse) diff -Nru pandas-2.1.4+dfsg/debian/patches/2p1_openpyxl_errors.patch pandas-2.2.2+dfsg/debian/patches/2p1_openpyxl_errors.patch --- pandas-2.1.4+dfsg/debian/patches/2p1_openpyxl_errors.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/2p1_openpyxl_errors.patch 2024-05-04 20:19:28.000000000 +0000 @@ -5,25 +5,25 @@ --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py -@@ -115,13 +115,13 @@ def test_engine_kwargs_append_invalid(ex +@@ -124,13 +124,14 @@ def test_engine_kwargs_append_invalid(ex DataFrame(["good"]).to_excel(writer, sheet_name="Sheet2") --@pytest.mark.parametrize("data_only, expected", [(True, 0), (False, "=1+1")]) -+@pytest.mark.parametrize("data_only, expected", [(True, None), (False, "=1+1")]) ++@td.skip_if_no("xlsxwriter") + @pytest.mark.parametrize("data_only, expected", [(True, 0), (False, "=1+1")]) def test_engine_kwargs_append_data_only(ext, data_only, expected): # GH 43445 # tests whether the data_only engine_kwarg actually works well for # openpyxl's load_workbook with tm.ensure_clean(ext) as f: - DataFrame(["=1+1"]).to_excel(f) -+ DataFrame(["=1+1"]).to_excel(f, engine="openpyxl") ++ DataFrame(["=1+1"]).to_excel(f, engine="xlsxwriter") # with openpyxl here, data_only=True gives None/np.nan not 0 with ExcelWriter( f, engine="openpyxl", mode="a", engine_kwargs={"data_only": data_only} ) as writer: --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py -@@ -1308,6 +1308,8 @@ class TestExcelWriterEngineTests: +@@ -1420,6 +1420,8 @@ class TestExcelWriterEngineTests: assert isinstance(writer, _XlsxWriter) else: assert isinstance(writer, klass) diff -Nru pandas-2.1.4+dfsg/debian/patches/accept_system_tzdata.patch pandas-2.2.2+dfsg/debian/patches/accept_system_tzdata.patch --- pandas-2.1.4+dfsg/debian/patches/accept_system_tzdata.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/accept_system_tzdata.patch 2024-04-23 18:07:52.000000000 +0000 @@ -12,18 +12,18 @@ return False - # Warn if tzdata is too old, even if there is a system tzdata to alert - # users about the mismatch between local/system tzdata -- import_optional_dependency("tzdata", errors="warn", min_version="2022.1") +- import_optional_dependency("tzdata", errors="warn", min_version="2022.7") return tz is utc_zoneinfo --- a/pyproject.toml +++ 
b/pyproject.toml -@@ -33,8 +33,7 @@ dependencies = [ - "numpy>=1.23.2,<2; python_version=='3.11'", - "numpy>=1.26.0,<2; python_version>='3.12'", +@@ -31,8 +31,7 @@ dependencies = [ + "numpy>=1.23.2; python_version=='3.11'", + "numpy>=1.26.0; python_version>='3.12'", "python-dateutil>=2.8.2", - "pytz>=2020.1", -- "tzdata>=2022.1" +- "tzdata>=2022.7" + "pytz>=2020.1" ] classifiers = [ diff -Nru pandas-2.1.4+dfsg/debian/patches/add_missing_importorskip.patch pandas-2.2.2+dfsg/debian/patches/add_missing_importorskip.patch --- pandas-2.1.4+dfsg/debian/patches/add_missing_importorskip.patch 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/add_missing_importorskip.patch 2024-05-06 10:04:42.000000000 +0000 @@ -0,0 +1,55 @@ +Description: Skip tests when dependencies are missing + +Author: Rebecca N. Palmer +Forwarded: no + +--- a/pandas/tests/io/parser/conftest.py ++++ b/pandas/tests/io/parser/conftest.py +@@ -12,6 +12,7 @@ from pandas import ( + read_table, + ) + import pandas._testing as tm ++import pandas.util._test_decorators as td + + + class BaseParser: +@@ -118,7 +119,7 @@ _pyarrowParser = PyArrowParser + + _py_parsers_only = [_pythonParser] + _c_parsers_only = [_cParserHighMemory, _cParserLowMemory] +-_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=pytest.mark.single_cpu)] ++_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=[pytest.mark.single_cpu, td.skip_if_no("pyarrow")])] + + _all_parsers = [*_c_parsers_only, *_py_parsers_only, *_pyarrow_parsers_only] + +@@ -182,8 +183,8 @@ def _get_all_parser_float_precision_comb + parser = parser.values[0] + for precision in parser.float_precision_choices: + # Re-wrap in pytest.param for pyarrow +- mark = pytest.mark.single_cpu if parser.engine == "pyarrow" else () +- param = pytest.param((parser(), precision), marks=mark) ++ marks = [pytest.mark.single_cpu, td.skip_if_no("pyarrow")] if parser.engine == "pyarrow" else () ++ param = pytest.param((parser(), precision), marks=marks) + params.append(param) + ids.append(f"{parser_id}-{precision}") + +--- a/pandas/tests/io/formats/style/test_bar.py ++++ b/pandas/tests/io/formats/style/test_bar.py +@@ -347,6 +347,7 @@ def test_styler_bar_with_NA_values(): + assert expected_substring in html_output2 + + ++@td.skip_if_no("pyarrow") + def test_style_bar_with_pyarrow_NA_values(): + data = """name,age,test1,test2,teacher + Adam,15,95.0,80,Ashby +--- a/pandas/tests/series/test_api.py ++++ b/pandas/tests/series/test_api.py +@@ -171,6 +171,7 @@ class TestSeriesMisc: + def test_inspect_getmembers(self): + # GH38782 + td.versioned_importorskip("jinja2") ++ td.versioned_importorskip("pyarrow") + ser = Series(dtype=object) + msg = "Series._data is deprecated" + with tm.assert_produces_warning( diff -Nru pandas-2.1.4+dfsg/debian/patches/allow_no_openpyxl.patch pandas-2.2.2+dfsg/debian/patches/allow_no_openpyxl.patch --- pandas-2.1.4+dfsg/debian/patches/allow_no_openpyxl.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/allow_no_openpyxl.patch 2024-04-23 18:09:06.000000000 +0000 @@ -8,7 +8,7 @@ --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py -@@ -1645,12 +1645,14 @@ class TestExcelFileRead: +@@ -1671,12 +1671,14 @@ class TestExcelFileRead: expected = pd.read_excel("test1" + read_ext, engine=engine) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/debian/patches/armel_ignore_nonwarning.patch pandas-2.2.2+dfsg/debian/patches/armel_ignore_nonwarning.patch --- 
pandas-2.1.4+dfsg/debian/patches/armel_ignore_nonwarning.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/armel_ignore_nonwarning.patch 2024-04-23 18:02:23.000000000 +0000 @@ -27,33 +27,9 @@ # float_frame fixture is defined in conftest.py, so we don't check the # stacklevel as otherwise the test would fail. result = getattr(float_frame, how)(op) ---- a/pandas/tests/apply/test_invalid_arg.py -+++ b/pandas/tests/apply/test_invalid_arg.py -@@ -323,6 +323,12 @@ def test_transform_wont_agg_frame(axis, - with pytest.raises(ValueError, match=msg): - float_frame.transform(func, axis=axis) - -+# armel numpy currently doesn't have the invalid log/sqrt warning (see 1.4.3-1 build log, -+# possibly the same underlying issue as statsmodels https://bugs.debian.org/956882) -+# using nullcontext() instead of warn=None to not start failing if this ever gets fixed -+import subprocess -+import contextlib -+debian_arch = subprocess.run(["dpkg","--print-architecture"],capture_output=True).stdout - - @pytest.mark.parametrize("func", [["min", "max"], ["sqrt", "max"]]) - def test_transform_wont_agg_series(string_series, func): -@@ -333,7 +339,7 @@ def test_transform_wont_agg_series(strin - warn = RuntimeWarning if func[0] == "sqrt" else None - warn_msg = "invalid value encountered in sqrt" - with pytest.raises(ValueError, match=msg): -- with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False): -+ with (contextlib.nullcontext() if (debian_arch==b'armel\n') else tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False)): - string_series.transform(func) - - --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py -@@ -56,6 +56,12 @@ def test_delim_whitespace_custom_termina +@@ -63,6 +63,12 @@ def test_delim_whitespace_custom_termina expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) tm.assert_frame_equal(df, expected) @@ -66,7 +42,7 @@ def test_dtype_and_names_error(c_parser_only): # see gh-8833: passing both dtype and names -@@ -91,7 +97,7 @@ nan 2 +@@ -98,7 +104,7 @@ nan 2 # fallback casting, but not castable warning = RuntimeWarning if np_version_gte1p24 else None with pytest.raises(ValueError, match="cannot safely convert"): diff -Nru pandas-2.1.4+dfsg/debian/patches/blosc_nonstrict_xfail.patch pandas-2.2.2+dfsg/debian/patches/blosc_nonstrict_xfail.patch --- pandas-2.1.4+dfsg/debian/patches/blosc_nonstrict_xfail.patch 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/blosc_nonstrict_xfail.patch 2024-05-18 10:53:22.000000000 +0000 @@ -0,0 +1,19 @@ +Description: Use nonstrict xfail + +Upstream strict-xfailed this instead of changing the expected message, +which doesn't work here because it only fails in build, not autopkgtest + +Author: Rebecca N. 
Palmer +Forwarded: no + +--- a/pandas/tests/io/pytables/test_file_handling.py ++++ b/pandas/tests/io/pytables/test_file_handling.py +@@ -270,7 +270,7 @@ def test_complibs(tmp_path, lvl, lib, re + # GH14478 + if PY311 and is_platform_linux() and lib == "blosc2" and lvl != 0: + request.applymarker( +- pytest.mark.xfail(reason=f"Fails for {lib} on Linux and PY > 3.11") ++ pytest.mark.xfail(reason=f"Fails for {lib} on Linux and PY > 3.11", strict=False) + ) + df = DataFrame( + np.ones((30, 4)), columns=list("ABCD"), index=np.arange(30).astype(np.str_) diff -Nru pandas-2.1.4+dfsg/debian/patches/deb_disable_analytics.patch pandas-2.2.2+dfsg/debian/patches/deb_disable_analytics.patch --- pandas-2.1.4+dfsg/debian/patches/deb_disable_analytics.patch 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/deb_disable_analytics.patch 2024-04-23 17:44:44.000000000 +0000 @@ -0,0 +1,88 @@ +Description: Avoid privacy breach by analytics + +Author: Yaroslav Halchenko , + Andreas Tille , + Rebecca N. Palmer +Forwarded: not-needed + +--- a/pandas/tests/io/data/html/spam.html ++++ b/pandas/tests/io/data/html/spam.html +@@ -27,45 +27,9 @@ + + + +- +- ++ + + + +@@ -794,4 +758,4 @@ handler: function() {this.cancel();}, +   Software v.1.2.2 + + +- +\ No newline at end of file ++ +--- a/web/pandas/_templates/layout.html ++++ b/web/pandas/_templates/layout.html +@@ -1,7 +1,6 @@ + + + +- + pandas - Python Data Analysis Library + + +--- a/doc/source/conf.py ++++ b/doc/source/conf.py +@@ -243,10 +243,6 @@ html_theme_options = { + "footer_start": ["pandas_footer", "sphinx-version"], + "github_url": "https://github.com/pandas-dev/pandas", + "twitter_url": "https://twitter.com/pandas_dev", +- "analytics": { +- "plausible_analytics_domain": "pandas.pydata.org", +- "plausible_analytics_url": "https://views.scientific-python.org/js/script.js", +- }, + "logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"}, + "navbar_align": "left", + "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"], diff -Nru pandas-2.1.4+dfsg/debian/patches/deb_disable_googleanalytics.patch pandas-2.2.2+dfsg/debian/patches/deb_disable_googleanalytics.patch --- pandas-2.1.4+dfsg/debian/patches/deb_disable_googleanalytics.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/deb_disable_googleanalytics.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,91 +0,0 @@ -Description: Avoid privacy breach by Google Analytics - -Author: Yaroslav Halchenko , - Andreas Tille , - Rebecca N. 
Palmer -Forwarded: not-needed - ---- a/pandas/tests/io/data/html/spam.html -+++ b/pandas/tests/io/data/html/spam.html -@@ -27,45 +27,9 @@ - - - -- -- -+ - - - -@@ -794,4 +758,4 @@ handler: function() {this.cancel();}, -   Software v.1.2.2 - - -- -\ No newline at end of file -+ ---- a/web/pandas/_templates/layout.html -+++ b/web/pandas/_templates/layout.html -@@ -1,13 +1,6 @@ - - - -- -- - pandas - Python Data Analysis Library - - ---- a/doc/source/conf.py -+++ b/doc/source/conf.py -@@ -240,7 +240,6 @@ html_theme_options = { - "footer_start": ["pandas_footer", "sphinx-version"], - "github_url": "https://github.com/pandas-dev/pandas", - "twitter_url": "https://twitter.com/pandas_dev", -- "analytics": {"google_analytics_id": "G-5RE31C1RNW"}, - "logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"}, - "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"], - "switcher": { diff -Nru pandas-2.1.4+dfsg/debian/patches/deb_doc_donotoverride_PYTHONPATH.patch pandas-2.2.2+dfsg/debian/patches/deb_doc_donotoverride_PYTHONPATH.patch --- pandas-2.1.4+dfsg/debian/patches/deb_doc_donotoverride_PYTHONPATH.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/deb_doc_donotoverride_PYTHONPATH.patch 2024-04-23 17:39:05.000000000 +0000 @@ -7,7 +7,7 @@ --- a/doc/make.py +++ b/doc/make.py -@@ -353,8 +353,9 @@ def main(): +@@ -368,8 +368,9 @@ def main(): # external libraries (namely Sphinx) to compile this module and resolve # the import of `python_path` correctly. The latter is used to resolve # the import within the module, injecting it into the global namespace diff -Nru pandas-2.1.4+dfsg/debian/patches/deb_nonversioneer_version.patch pandas-2.2.2+dfsg/debian/patches/deb_nonversioneer_version.patch --- pandas-2.1.4+dfsg/debian/patches/deb_nonversioneer_version.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/deb_nonversioneer_version.patch 2024-04-23 17:39:05.000000000 +0000 @@ -7,7 +7,7 @@ --- a/pandas/__init__.py +++ b/pandas/__init__.py -@@ -183,12 +183,7 @@ try: +@@ -186,12 +186,7 @@ try: _built_with_meson = True except ImportError: @@ -19,8 +19,8 @@ - del get_versions, v + from .__version import version as __version__ - - # module level doc-string + # GH#55043 - deprecation of the data_manager option + if "PANDAS_DATA_MANAGER" in os.environ: --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -193,8 +193,6 @@ class TestPDApi(Base): diff -Nru pandas-2.1.4+dfsg/debian/patches/find_test_data.patch pandas-2.2.2+dfsg/debian/patches/find_test_data.patch --- pandas-2.1.4+dfsg/debian/patches/find_test_data.patch 2024-02-01 07:52:29.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/find_test_data.patch 2024-06-01 17:59:40.000000000 +0000 @@ -3,15 +3,12 @@ We don't ship these in the package, but do want to run the tests that use them -tests_path() is removed completely because it is unclear whether it -should point to the tests code or the directory above the test data - Author: Rebecca N. 
Palmer Forwarded: https://github.com/pandas-dev/pandas/issues/54907 --- a/pandas/conftest.py +++ b/pandas/conftest.py -@@ -35,6 +35,7 @@ from typing import ( +@@ -34,6 +34,7 @@ from typing import ( TYPE_CHECKING, Callable, ) @@ -19,7 +16,7 @@ from dateutil.tz import ( tzlocal, -@@ -107,6 +108,7 @@ def pytest_addoption(parser) -> None: +@@ -114,6 +115,7 @@ def pytest_addoption(parser) -> None: action="store_false", help="Don't fail if a test is skipped for missing data file.", ) @@ -27,31 +24,16 @@ def ignore_doctest_warning(item: pytest.Item, path: str, message: str) -> None: -@@ -1169,17 +1171,15 @@ def strict_data_files(pytestconfig): - - - @pytest.fixture --def tests_path() -> Path: -- return Path(__file__).parent / "tests" -+def tests_io_data_path(pytestconfig) -> Path: -+ BASE_PATH = pytestconfig.getoption("--deb-data-root-dir", default=None) -+ if BASE_PATH is None: -+ BASE_PATH = os.path.join(os.path.dirname(__file__), "tests") -+ return Path(BASE_PATH) / "io" / "data" +@@ -1098,7 +1100,7 @@ def strict_data_files(pytestconfig): @pytest.fixture --def tests_io_data_path(tests_path) -> Path: -- return tests_path / "io" / "data" -- -- --@pytest.fixture -def datapath(strict_data_files: str) -> Callable[..., str]: +def datapath(strict_data_files: str, pytestconfig) -> Callable[..., str]: """ Get the path to a data file. -@@ -1197,7 +1197,9 @@ def datapath(strict_data_files: str) -> +@@ -1116,7 +1118,9 @@ def datapath(strict_data_files: str) -> ValueError If the path doesn't exist and the --no-strict-data-files option is not set. """ @@ -64,7 +46,7 @@ path = os.path.join(BASE_PATH, *args) --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py -@@ -32,6 +32,7 @@ def test_datapath_missing(datapath): +@@ -35,6 +35,7 @@ def test_datapath_missing(datapath): datapath("not_a_file") @@ -74,7 +56,7 @@ --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py -@@ -117,7 +117,7 @@ def test_pickles(datapath): +@@ -115,7 +115,7 @@ def test_pickles(datapath): pytest.skip("known failure on non-little endian") # For loop for compat with --strict-data-files @@ -83,7 +65,7 @@ legacy_pickle = datapath(legacy_pickle) data = pd.read_pickle(legacy_pickle) -@@ -574,7 +574,7 @@ def test_pickle_big_dataframe_compressio +@@ -627,7 +627,7 @@ def test_pickle_big_dataframe_compressio def test_pickle_frame_v124_unpickle_130(datapath): # GH#42345 DataFrame created in 1.2.x, unpickle in 1.3.x path = datapath( @@ -107,14 +89,26 @@ result = file.read() assert "{% include html_style_tpl %}" in result assert "{% include html_table_tpl %}" in result +--- a/pandas/tests/io/xml/conftest.py ++++ b/pandas/tests/io/xml/conftest.py +@@ -4,8 +4,8 @@ import pytest + + + @pytest.fixture +-def xml_data_path(): +- return Path(__file__).parent.parent / "data" / "xml" ++def xml_data_path(datapath): ++ return Path(datapath("io", "data", "xml")) + + + @pytest.fixture --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py -@@ -486,13 +486,14 @@ def test_empty_string_etree(val): +@@ -487,13 +487,13 @@ def test_empty_string_etree(val): read_xml(BytesIO(val), parser="etree") -def test_wrong_file_path(parser): -+@pytest.mark.xfail(reason="broken by etree changes", strict=False) +def test_wrong_file_path(parser, datapath): msg = ( "Passing literal xml to 'read_xml' is deprecated and " @@ -126,7 +120,7 @@ with pytest.raises( FutureWarning, -@@ -1357,17 +1358,16 @@ def test_stylesheet_with_etree(kml_cta_r +@@ -1358,17 +1358,16 @@ def test_stylesheet_with_etree(kml_cta_r 
@pytest.mark.parametrize("val", ["", b""]) diff -Nru pandas-2.1.4+dfsg/debian/patches/fix_overly_arch_specific_xfails.patch pandas-2.2.2+dfsg/debian/patches/fix_overly_arch_specific_xfails.patch --- pandas-2.1.4+dfsg/debian/patches/fix_overly_arch_specific_xfails.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/fix_overly_arch_specific_xfails.patch 2024-04-23 17:56:27.000000000 +0000 @@ -11,16 +11,15 @@ --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py -@@ -17,7 +17,7 @@ import tarfile +@@ -17,6 +17,7 @@ import tarfile import numpy as np import pytest --from pandas.compat import is_ci_environment -+from pandas.compat import is_ci_environment, IS64 ++from pandas.compat import IS64 from pandas.compat.numpy import np_version_gte1p24 - from pandas.errors import ParserError - import pandas.util._test_decorators as td -@@ -27,6 +27,9 @@ from pandas import ( + from pandas.errors import ( + ParserError, +@@ -29,6 +30,9 @@ from pandas import ( concat, ) import pandas._testing as tm @@ -30,7 +29,7 @@ @pytest.mark.parametrize( -@@ -654,11 +657,13 @@ def test_float_precision_options(c_parse +@@ -633,11 +637,13 @@ def test_float_precision_options(c_parse tm.assert_frame_equal(df, df2) @@ -62,7 +61,7 @@ from pandas import ( DataFrame, -@@ -1101,7 +1105,8 @@ def test_rolling_sem(frame_or_series): +@@ -1176,7 +1180,8 @@ def test_rolling_sem(frame_or_series): @pytest.mark.xfail( diff -Nru pandas-2.1.4+dfsg/debian/patches/fix_random_seeds.patch pandas-2.2.2+dfsg/debian/patches/fix_random_seeds.patch --- pandas-2.1.4+dfsg/debian/patches/fix_random_seeds.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/fix_random_seeds.patch 2024-04-23 17:48:19.000000000 +0000 @@ -35,7 +35,7 @@ --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py -@@ -600,6 +600,7 @@ def boxplot_frame_groupby( +@@ -604,6 +604,7 @@ def boxplot_frame_groupby( .. plot:: :context: close-figs @@ -43,7 +43,7 @@ >>> import itertools >>> tuples = [t for t in itertools.product(range(1000), range(4))] >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) -@@ -1329,6 +1330,7 @@ class PlotAccessor(PandasObject): +@@ -1328,6 +1329,7 @@ class PlotAccessor(PandasObject): .. plot:: :context: close-figs @@ -51,15 +51,15 @@ >>> data = np.random.randn(25, 4) >>> df = pd.DataFrame(data, columns=list('ABCD')) >>> ax = df.plot.box() -@@ -1393,6 +1395,7 @@ class PlotAccessor(PandasObject): +@@ -1392,6 +1394,7 @@ class PlotAccessor(PandasObject): .. plot:: :context: close-figs + >>> np.random.seed(1234) - >>> df = pd.DataFrame( - ... np.random.randint(1, 7, 6000), - ... columns = ['one']) -@@ -1814,6 +1817,7 @@ class PlotAccessor(PandasObject): + >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) + >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) + >>> ax = df.plot.hist(bins=12, alpha=0.5) +@@ -1811,6 +1814,7 @@ class PlotAccessor(PandasObject): .. plot:: :context: close-figs @@ -76,7 +76,7 @@ + >>> np.random.seed(1234) + >>> random.seed(1234) # for reproducibility >>> s = pd.Series(np.random.uniform(size=100)) - >>> pd.plotting.bootstrap_plot(s) + >>> pd.plotting.bootstrap_plot(s) # doctest: +SKIP
@@ -597,6 +599,7 @@ def autocorrelation_plot(series: Series, .. plot:: diff -Nru pandas-2.1.4+dfsg/debian/patches/hurd_compat.patch pandas-2.2.2+dfsg/debian/patches/hurd_compat.patch --- pandas-2.1.4+dfsg/debian/patches/hurd_compat.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/hurd_compat.patch 2024-04-23 17:55:00.000000000 +0000 @@ -9,7 +9,7 @@ --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py -@@ -83,7 +83,7 @@ def test_nonexistent_path(all_parsers): +@@ -100,7 +100,7 @@ def test_nonexistent_path(all_parsers): parser = all_parsers path = f"{uuid.uuid4()}.csv" @@ -18,7 +18,7 @@ with pytest.raises(FileNotFoundError, match=msg) as e: parser.read_csv(path) assert path == e.value.filename -@@ -94,7 +94,7 @@ def test_no_permission(all_parsers): +@@ -111,7 +111,7 @@ def test_no_permission(all_parsers): # GH 23784 parser = all_parsers @@ -29,14 +29,14 @@ --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py -@@ -53,7 +53,7 @@ def test_too_many_exponent_digits(all_pa +@@ -67,7 +67,7 @@ def test_too_many_exponent_digits(all_pa data = f"data\n10E{exp}" result = parser.read_csv(StringIO(data), float_precision=precision) if precision == "round_trip": - if exp == 999999999999999999 and is_platform_linux(): + if exp == 999999999999999999: mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") - request.node.add_marker(mark) + request.applymarker(mark) --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -57,7 +57,7 @@ import pytest --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py -@@ -33,6 +33,12 @@ def df(): +@@ -31,6 +31,12 @@ def df(): def test_dask(df): @@ -72,7 +72,7 @@ olduse = pd.get_option("compute.use_numexpr") --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py -@@ -195,16 +195,16 @@ Look,a snake,🐍""" +@@ -200,16 +200,16 @@ Look,a snake,🐍""" path = os.path.join(HERE, "data", "does_not_exist." + fn_ext) msg1 = rf"File (b')?.+does_not_exist\.{fn_ext}'? does not exist" @@ -93,7 +93,7 @@ ) msg8 = rf"Failed to open local file.+does_not_exist\.{fn_ext}" -@@ -265,16 +265,16 @@ Look,a snake,🐍""" +@@ -270,16 +270,16 @@ Look,a snake,🐍""" monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x)) msg1 = rf"File (b')?.+does_not_exist\.{fn_ext}'? 
does not exist" @@ -114,7 +114,7 @@ ) msg8 = rf"Failed to open local file.+does_not_exist\.{fn_ext}" -@@ -578,7 +578,7 @@ def test_bad_encdoing_errors(): +@@ -607,7 +607,7 @@ def test_bad_encdoing_errors(): def test_errno_attribute(): # GH 13872 diff -Nru pandas-2.1.4+dfsg/debian/patches/ignore_ipython_exceptions.patch pandas-2.2.2+dfsg/debian/patches/ignore_ipython_exceptions.patch --- pandas-2.1.4+dfsg/debian/patches/ignore_ipython_exceptions.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/ignore_ipython_exceptions.patch 2024-04-23 18:07:53.000000000 +0000 @@ -8,7 +8,7 @@ --- a/doc/source/conf.py +++ b/doc/source/conf.py -@@ -460,6 +460,7 @@ extlinks = { +@@ -463,6 +463,7 @@ extlinks = { } diff -Nru pandas-2.1.4+dfsg/debian/patches/ignore_python3p12_deprecations.patch pandas-2.2.2+dfsg/debian/patches/ignore_python3p12_deprecations.patch --- pandas-2.1.4+dfsg/debian/patches/ignore_python3p12_deprecations.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/ignore_python3p12_deprecations.patch 2024-04-23 18:09:07.000000000 +0000 @@ -5,7 +5,7 @@ --- a/pyproject.toml +++ b/pyproject.toml -@@ -506,6 +506,8 @@ filterwarnings = [ +@@ -515,6 +515,8 @@ filterwarnings = [ "ignore:distutils Version classes are deprecated:DeprecationWarning:fsspec", # Can be removed once https://github.com/numpy/numpy/pull/24794 is merged "ignore:.*In the future `np.long` will be defined as.*:FutureWarning", @@ -16,7 +16,7 @@ markers = [ --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py -@@ -554,11 +554,11 @@ class TestEval: +@@ -569,11 +569,11 @@ class TestEval: assert pd.eval("-1", parser=parser, engine=engine) == -1 assert pd.eval("+1", parser=parser, engine=engine) == +1 with tm.assert_produces_warning( diff -Nru pandas-2.1.4+dfsg/debian/patches/no_sphinx_toggleprompt.patch pandas-2.2.2+dfsg/debian/patches/no_sphinx_toggleprompt.patch --- pandas-2.1.4+dfsg/debian/patches/no_sphinx_toggleprompt.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/no_sphinx_toggleprompt.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -Description: Don't try to use a Sphinx extension we don't have - -Author: Rebecca N. 
Palmer -Forwarded: not-needed - ---- a/doc/source/conf.py -+++ b/doc/source/conf.py -@@ -57,7 +57,6 @@ extensions = [ - "numpydoc", - "sphinx_copybutton", - "sphinx_design", -- "sphinx_toggleprompt", - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", - "sphinx.ext.coverage", diff -Nru pandas-2.1.4+dfsg/debian/patches/numba_fail_32bit.patch pandas-2.2.2+dfsg/debian/patches/numba_fail_32bit.patch --- pandas-2.1.4+dfsg/debian/patches/numba_fail_32bit.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/numba_fail_32bit.patch 2024-04-23 17:55:00.000000000 +0000 @@ -8,7 +8,7 @@ --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py -@@ -7,6 +7,11 @@ from pandas.core.groupby.base import ( +@@ -11,6 +11,11 @@ from pandas.core.groupby.base import ( reduction_kernels, transformation_kernels, ) @@ -20,7 +20,7 @@ @pytest.fixture(params=[True, False]) -@@ -169,7 +174,22 @@ def groupby_func(request): +@@ -153,7 +158,22 @@ def groupby_func(request): return request.param @@ -85,12 +85,11 @@ return request.param --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py -@@ -5,7 +5,13 @@ from pandas.compat import ( - is_ci_environment, - is_platform_mac, - is_platform_windows, -+ IS64, - ) +@@ -1,6 +1,12 @@ + import numpy as np + import pytest + ++from pandas.compat import IS64 +try: + from numba.core.errors import UnsupportedParforsError, TypingError +except ImportError: # numba not installed @@ -99,7 +98,7 @@ from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td -@@ -199,6 +205,12 @@ class TestEngine: +@@ -186,6 +192,12 @@ class TestEngine: expected = DataFrame({"value": [2.0, 2.0, 2.0]}) tm.assert_frame_equal(result, expected) @@ -112,7 +111,7 @@ def test_dont_cache_engine_kwargs(self): # If the user passes a different set of engine_kwargs don't return the same # jitted function -@@ -339,6 +351,12 @@ class TestTableMethod: +@@ -326,6 +338,12 @@ class TestTableMethod: f, engine="numba", raw=True ) @@ -125,7 +124,7 @@ def test_table_method_rolling_methods( self, axis, -@@ -421,6 +439,12 @@ class TestTableMethod: +@@ -408,6 +426,12 @@ class TestTableMethod: ) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/debian/patches/numba_warn_nonx86.patch pandas-2.2.2+dfsg/debian/patches/numba_warn_nonx86.patch --- pandas-2.1.4+dfsg/debian/patches/numba_warn_nonx86.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/numba_warn_nonx86.patch 2024-04-23 18:07:52.000000000 +0000 @@ -19,9 +19,9 @@ from pandas.util._exceptions import find_stack_level -@@ -120,6 +123,8 @@ def import_optional_dependency( +@@ -123,6 +126,8 @@ def import_optional_dependency( + is ``'warn'`` or ``'ignore'``. 
""" - assert errors in {"warn", "raise", "ignore"} + if name=='numba' and warn_numba_platform: + warnings.warn(warn_numba_platform) diff -Nru pandas-2.1.4+dfsg/debian/patches/pytables_python3p12.patch pandas-2.2.2+dfsg/debian/patches/pytables_python3p12.patch --- pandas-2.1.4+dfsg/debian/patches/pytables_python3p12.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/pytables_python3p12.patch 2024-04-23 18:09:06.000000000 +0000 @@ -11,7 +11,7 @@ --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py -@@ -279,7 +279,8 @@ def test_append_all_nans(setup_path): +@@ -286,7 +286,8 @@ def test_append_all_nans(setup_path): tm.assert_frame_equal(store["df2"], df, check_index_type=True) @@ -23,7 +23,7 @@ # column oriented --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py -@@ -164,6 +164,8 @@ def test_select(setup_path): +@@ -168,6 +168,8 @@ def test_select(setup_path): tm.assert_frame_equal(expected, result) @@ -32,17 +32,17 @@ def test_select_dtypes(setup_path): with ensure_clean_store(setup_path) as store: # with a Timestamp data column (GH #2637) -@@ -563,6 +565,7 @@ def test_select_iterator_many_empty_fram +@@ -607,6 +609,7 @@ def test_select_iterator_many_empty_fram assert len(results) == 0 +@pytest.mark.xfail(condition=PY312, reason="python3.12 https://bugs.debian.org/1055801",raises=TypeError,strict=False) def test_frame_select(setup_path): - df = tm.makeTimeDataFrame() - + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py -@@ -763,7 +763,8 @@ def test_start_stop_fixed(setup_path): +@@ -884,7 +884,8 @@ def test_start_stop_fixed(setup_path): df.iloc[8:10, -2] = np.nan diff -Nru pandas-2.1.4+dfsg/debian/patches/remove_ccbysa_snippets.patch pandas-2.2.2+dfsg/debian/patches/remove_ccbysa_snippets.patch --- pandas-2.1.4+dfsg/debian/patches/remove_ccbysa_snippets.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/remove_ccbysa_snippets.patch 2024-04-23 20:50:04.000000000 +0000 @@ -43,7 +43,7 @@ - cookbook --- a/pandas/io/sql.py +++ b/pandas/io/sql.py -@@ -2053,14 +2053,14 @@ def _get_valid_sqlite_name(name: object) +@@ -2465,14 +2465,14 @@ def _get_valid_sqlite_name(name: object) # Replace all " with "". # Wrap the entire thing in double quotes. 
@@ -65,7 +65,7 @@ class SQLiteTable(SQLTable): --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py -@@ -980,28 +980,6 @@ def test_groupby_empty_with_category(): +@@ -988,28 +988,6 @@ def test_groupby_empty_with_category(): tm.assert_series_equal(result, expected) @@ -94,9 +94,47 @@ @pytest.mark.parametrize("ordered", [True, False]) def test_sort2(sort, ordered): # dataframe groupby sort was being ignored # GH 8868 +--- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py ++++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +@@ -13,35 +13,6 @@ from pandas import ( + import pandas._testing as tm + + +-def test_detect_chained_assignment(using_copy_on_write, warn_copy_on_write): +- # Inplace ops, originally from: +- # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug +- a = [12, 23] +- b = [123, None] +- c = [1234, 2345] +- d = [12345, 23456] +- tuples = [("eyes", "left"), ("eyes", "right"), ("ears", "left"), ("ears", "right")] +- events = { +- ("eyes", "left"): a, +- ("eyes", "right"): b, +- ("ears", "left"): c, +- ("ears", "right"): d, +- } +- multiind = MultiIndex.from_tuples(tuples, names=["part", "side"]) +- zed = DataFrame(events, index=["a", "b"], columns=multiind) +- +- if using_copy_on_write: +- with tm.raises_chained_assignment_error(): +- zed["eyes"]["right"].fillna(value=555, inplace=True) +- elif warn_copy_on_write: +- with tm.assert_produces_warning(None): +- zed["eyes"]["right"].fillna(value=555, inplace=True) +- else: +- msg = "A value is trying to be set on a copy of a slice from a DataFrame" +- with pytest.raises(SettingWithCopyError, match=msg): +- with tm.assert_produces_warning(None): +- zed["eyes"]["right"].fillna(value=555, inplace=True) +- + + @td.skip_array_manager_invalid_test # with ArrayManager df.loc[0] is not a view + def test_cache_updating(using_copy_on_write, warn_copy_on_write): --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py -@@ -155,36 +155,6 @@ class TestMultiIndexSetItem: +@@ -154,36 +154,7 @@ class TestMultiIndexSetItem: with pytest.raises(TypeError, match=msg): df.loc["bar"] *= 2 @@ -118,9 +156,9 @@ - df_orig.index = MultiIndex.from_tuples( - df_orig.index, names=["Sit", "Com", "Type"] - ) -- + - expected = df_orig.copy() -- expected.iloc[[0, 2, 3]] *= 2 +- expected.iloc[[0, 1, 3]] *= 2 - - idx = pd.IndexSlice - df = df_orig.copy() @@ -133,9 +171,39 @@ def test_multiindex_assignment(self): # GH3777 part 2 +--- a/pandas/tests/indexing/test_chaining_and_caching.py ++++ b/pandas/tests/indexing/test_chaining_and_caching.py +@@ -429,27 +429,6 @@ class TestChaining: + df["column1"] = df["column1"] + "c" + str(df) + +- @pytest.mark.arm_slow +- def test_detect_chained_assignment_undefined_column( +- self, using_copy_on_write, warn_copy_on_write +- ): +- # from SO: +- # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc +- df = DataFrame(np.arange(0, 9), columns=["count"]) +- df["group"] = "b" +- df_original = df.copy() +- +- if using_copy_on_write: +- with tm.raises_chained_assignment_error(): +- df.iloc[0:5]["group"] = "a" +- tm.assert_frame_equal(df, df_original) +- elif warn_copy_on_write: +- with tm.raises_chained_assignment_error(): +- df.iloc[0:5]["group"] = "a" +- else: +- with pytest.raises(SettingWithCopyError, match=msg): +- with tm.raises_chained_assignment_error(): +- df.iloc[0:5]["group"] = "a" + + 
@pytest.mark.arm_slow + def test_detect_chained_assignment_changing_dtype( --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py -@@ -349,24 +349,6 @@ def test_trailing_delimiters(all_parsers +@@ -381,23 +381,6 @@ def test_trailing_delimiters(all_parsers tm.assert_frame_equal(result, expected) @@ -145,7 +213,7 @@ - data = '''SEARCH_TERM,ACTUAL_URL -"bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" --"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa: E501 +-"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' - - parser = all_parsers - result = parser.read_csv( @@ -156,66 +224,6 @@ - - tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) - -- - @xfail_pyarrow + def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 ---- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py -+++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py -@@ -12,32 +12,6 @@ from pandas import ( - import pandas._testing as tm - - --def test_detect_chained_assignment(using_copy_on_write): -- # Inplace ops, originally from: -- # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug -- a = [12, 23] -- b = [123, None] -- c = [1234, 2345] -- d = [12345, 23456] -- tuples = [("eyes", "left"), ("eyes", "right"), ("ears", "left"), ("ears", "right")] -- events = { -- ("eyes", "left"): a, -- ("eyes", "right"): b, -- ("ears", "left"): c, -- ("ears", "right"): d, -- } -- multiind = MultiIndex.from_tuples(tuples, names=["part", "side"]) -- zed = DataFrame(events, index=["a", "b"], columns=multiind) -- -- if using_copy_on_write: -- with tm.raises_chained_assignment_error(): -- zed["eyes"]["right"].fillna(value=555, inplace=True) -- else: -- msg = "A value is trying to be set on a copy of a slice from a DataFrame" -- with pytest.raises(SettingWithCopyError, match=msg): -- zed["eyes"]["right"].fillna(value=555, inplace=True) -- -- - @td.skip_array_manager_invalid_test # with ArrayManager df.loc[0] is not a view - def test_cache_updating(using_copy_on_write): - # 5216 ---- a/pandas/tests/indexing/test_chaining_and_caching.py -+++ b/pandas/tests/indexing/test_chaining_and_caching.py -@@ -422,21 +422,6 @@ class TestChaining: - str(df) - - @pytest.mark.arm_slow -- def test_detect_chained_assignment_undefined_column(self, using_copy_on_write): -- # from SO: -- # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc -- df = DataFrame(np.arange(0, 9), columns=["count"]) -- df["group"] = "b" -- df_original = df.copy() -- -- if using_copy_on_write: -- with tm.raises_chained_assignment_error(): -- df.iloc[0:5]["group"] = "a" -- tm.assert_frame_equal(df, df_original) -- else: -- with pytest.raises(SettingWithCopyError, match=msg): -- df.iloc[0:5]["group"] = "a" -- - @pytest.mark.arm_slow - def test_detect_chained_assignment_changing_dtype( - self, using_array_manager, using_copy_on_write diff -Nru 
pandas-2.1.4+dfsg/debian/patches/series pandas-2.2.2+dfsg/debian/patches/series --- pandas-2.1.4+dfsg/debian/patches/series 2024-04-05 06:30:11.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/series 2024-05-18 10:49:17.000000000 +0000 @@ -1,7 +1,7 @@ deb_nonversioneer_version.patch deb_doc_donotoverride_PYTHONPATH.patch xfail_tests_nonintel_io.patch -deb_disable_googleanalytics.patch +deb_disable_analytics.patch mathjax-path.patch use_system_intersphinx.patch contributor_list_not_in_tarball.patch @@ -15,10 +15,8 @@ fix_overly_arch_specific_xfails.patch tests_dont_assume_64bit.patch armel_ignore_nonwarning.patch -no_sphinx_toggleprompt.patch sphinx_old_pydata_theme.patch 1029251_ignore_rounding_error.patch -sphinx7_compat.patch tests_dont_assume_endian.patch accept_system_tzdata.patch numba_warn_nonx86.patch @@ -31,8 +29,9 @@ 2p1_openpyxl_errors.patch pytables_python3p12.patch ignore_python3p12_deprecations.patch -test_complibs_blosc2.patch sum_loosen_test_tolerance.patch -xarray2024_compat.patch 1068104_time64.patch 1068422_ignore_dask_tests.patch +versioned_importorskip.patch +add_missing_importorskip.patch +blosc_nonstrict_xfail.patch diff -Nru pandas-2.1.4+dfsg/debian/patches/skip_test_missing_required_dependency.patch pandas-2.2.2+dfsg/debian/patches/skip_test_missing_required_dependency.patch --- pandas-2.1.4+dfsg/debian/patches/skip_test_missing_required_dependency.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/skip_test_missing_required_dependency.patch 2024-04-23 17:55:00.000000000 +0000 @@ -5,7 +5,7 @@ --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py -@@ -199,6 +199,7 @@ def test_yaml_dump(df): +@@ -193,6 +193,7 @@ def test_yaml_dump(df): @pytest.mark.single_cpu diff -Nru pandas-2.1.4+dfsg/debian/patches/sphinx7_compat.patch pandas-2.2.2+dfsg/debian/patches/sphinx7_compat.patch --- pandas-2.1.4+dfsg/debian/patches/sphinx7_compat.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/sphinx7_compat.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -Description: Allow building docs with Sphinx 7 - -Author: Rebecca N. Palmer -Bug-Debian: https://bugs.debian.org/1042672 -Forwarded: https://github.com/pandas-dev/pandas/pull/54653 - ---- a/doc/source/conf.py -+++ b/doc/source/conf.py -@@ -162,7 +162,7 @@ master_doc = "index" - # General information about the project. 
- project = "pandas" - # We have our custom "pandas_footer.html" template, using copyright for the current year --copyright = f"{datetime.now().year}" -+copyright = f"{datetime.now().year} " - - # The version info for the project you're documenting, acts as replacement for - # |version| and |release|, also used in various other places throughout the ---- a/doc/_templates/pandas_footer.html -+++ b/doc/_templates/pandas_footer.html -@@ -1,3 +1,3 @@ - diff -Nru pandas-2.1.4+dfsg/debian/patches/sphinx_old_pydata_theme.patch pandas-2.2.2+dfsg/debian/patches/sphinx_old_pydata_theme.patch --- pandas-2.1.4+dfsg/debian/patches/sphinx_old_pydata_theme.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/sphinx_old_pydata_theme.patch 2024-04-23 18:05:30.000000000 +0000 @@ -4,16 +4,18 @@ --- a/doc/source/conf.py +++ b/doc/source/conf.py -@@ -241,12 +241,7 @@ html_theme_options = { +@@ -245,14 +245,7 @@ html_theme_options = { "footer_start": ["pandas_footer", "sphinx-version"], "github_url": "https://github.com/pandas-dev/pandas", "twitter_url": "https://twitter.com/pandas_dev", - "logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"}, +- "navbar_align": "left", - "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"], - "switcher": { - "json_url": "https://pandas.pydata.org/versions.json", - "version_match": switcher_version, - }, +- "show_version_warning_banner": True, + "navbar_end": ["navbar-icon-links"], "icon_links": [ { diff -Nru pandas-2.1.4+dfsg/debian/patches/sum_loosen_test_tolerance.patch pandas-2.2.2+dfsg/debian/patches/sum_loosen_test_tolerance.patch --- pandas-2.1.4+dfsg/debian/patches/sum_loosen_test_tolerance.patch 2024-01-30 20:20:54.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/sum_loosen_test_tolerance.patch 2024-04-23 18:09:07.000000000 +0000 @@ -8,7 +8,7 @@ --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py -@@ -258,7 +258,7 @@ class TestDataFrameAnalytics: +@@ -344,7 +344,7 @@ class TestDataFrameAnalytics: np.sum, mixed_float_frame.astype("float32"), check_dtype=False, diff -Nru pandas-2.1.4+dfsg/debian/patches/test_complibs_blosc2.patch pandas-2.2.2+dfsg/debian/patches/test_complibs_blosc2.patch --- pandas-2.1.4+dfsg/debian/patches/test_complibs_blosc2.patch 2024-01-30 20:20:04.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/test_complibs_blosc2.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,19 +0,0 @@ -Description: Allow blosc2 to report the type used - -e.g. the bug had blosc2:blosclz - -Author: Rebecca N. 
Palmer -Bug-Debian: https://bugs.debian.org/1061043 -Forwarded: no - upstream 55524 xfailed some but not all cases - ---- a/pandas/tests/io/pytables/test_file_handling.py -+++ b/pandas/tests/io/pytables/test_file_handling.py -@@ -268,7 +268,7 @@ def test_complibs(tmp_path, lvl, lib): - if lvl == 0: - assert node.filters.complib is None - else: -- assert node.filters.complib == lib -+ assert ((node.filters.complib == lib) or (lib=='blosc2' and node.filters.complib.startswith('blosc2:'))) - - - @pytest.mark.xfail(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False,run=False) diff -Nru pandas-2.1.4+dfsg/debian/patches/tests_dont_assume_64bit.patch pandas-2.2.2+dfsg/debian/patches/tests_dont_assume_64bit.patch --- pandas-2.1.4+dfsg/debian/patches/tests_dont_assume_64bit.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/tests_dont_assume_64bit.patch 2024-04-23 18:00:44.000000000 +0000 @@ -4,80 +4,9 @@ Bug-Debian: partly https://bugs.debian.org/1026351 Forwarded: no ---- a/pandas/core/arrays/_ranges.py -+++ b/pandas/core/arrays/_ranges.py -@@ -50,12 +50,12 @@ def generate_regular_range( - ndarray[np.int64] - Representing the given resolution. - """ -- istart = start._value if start is not None else None -- iend = end._value if end is not None else None -+ istart = int(start._value) if start is not None else None -+ iend = int(end._value) if end is not None else None - freq.nanos # raises if non-fixed frequency - td = Timedelta(freq) -- b: int | np.int64 | np.uint64 -- e: int | np.int64 | np.uint64 -+ b: int -+ e: int - try: - td = td.as_unit( # pyright: ignore[reportGeneralTypeIssues] - unit, round_ok=False -@@ -98,7 +98,7 @@ def generate_regular_range( - - def _generate_range_overflow_safe( - endpoint: int, periods: int, stride: int, side: str = "start" --) -> np.int64 | np.uint64: -+) -> int: - """ - Calculate the second endpoint for passing to np.arange, checking - to avoid an integer overflow. Catch OverflowError and re-raise -@@ -117,7 +117,7 @@ def _generate_range_overflow_safe( - - Returns - ------- -- other_end : np.int64 | np.uint64 -+ other_end : int - - Raises - ------ -@@ -159,13 +159,13 @@ def _generate_range_overflow_safe( - remaining = periods - mid_periods - assert 0 < remaining < periods, (remaining, periods, endpoint, stride) - -- midpoint = int(_generate_range_overflow_safe(endpoint, mid_periods, stride, side)) -+ midpoint = _generate_range_overflow_safe(endpoint, mid_periods, stride, side) - return _generate_range_overflow_safe(midpoint, remaining, stride, side) - - - def _generate_range_overflow_safe_signed( - endpoint: int, periods: int, stride: int, side: str --) -> np.int64 | np.uint64: -+) -> int: - """ - A special case for _generate_range_overflow_safe where `periods * stride` - can be calculated without overflowing int64 bounds. 
-@@ -183,7 +183,7 @@ def _generate_range_overflow_safe_signed - # Putting this into a DatetimeArray/TimedeltaArray - # would incorrectly be interpreted as NaT - raise OverflowError -- return result -+ return int(result) - except (FloatingPointError, OverflowError): - # with endpoint negative and addend positive we risk - # FloatingPointError; with reversed signed we risk OverflowError -@@ -202,7 +202,7 @@ def _generate_range_overflow_safe_signed - i64max = np.uint64(i8max) - assert uresult > i64max - if uresult <= i64max + np.uint64(stride): -- return uresult -+ return int(uresult) - - raise OutOfBoundsDatetime( - f"Cannot generate range with {side}={endpoint} and periods={periods}" --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py -@@ -5,6 +5,7 @@ import re +@@ -6,6 +6,7 @@ import re import numpy as np import pytest @@ -85,7 +14,7 @@ from pandas.errors import ( PerformanceWarning, SpecificationError, -@@ -2545,6 +2546,7 @@ def test_groupby_series_with_tuple_name( +@@ -2618,6 +2619,7 @@ def test_groupby_series_with_tuple_name( tm.assert_series_equal(result, expected) @@ -93,7 +22,7 @@ @pytest.mark.parametrize( "func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])] ) -@@ -2557,6 +2559,7 @@ def test_groupby_numerical_stability_sum +@@ -2630,6 +2632,7 @@ def test_groupby_numerical_stability_sum tm.assert_frame_equal(result, expected) @@ -103,15 +32,15 @@ data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15] --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py -@@ -8,6 +8,7 @@ import pytest - from pandas.compat import ( - is_ci_environment, - is_platform_windows, -+ IS64, - ) +@@ -4,6 +4,7 @@ from itertools import product + + import numpy as np + import pytest ++from pandas.compat import IS64 from pandas import ( -@@ -223,6 +224,7 @@ class TestMerge: + NA, +@@ -218,6 +219,7 @@ class TestMerge: assert result.name is None @pytest.mark.slow @@ -119,7 +48,7 @@ @pytest.mark.parametrize("how", ["left", "right", "outer", "inner"]) def test_int64_overflow_how_merge(self, left_right, how): left, right = left_right -@@ -233,6 +235,7 @@ class TestMerge: +@@ -228,6 +230,7 @@ class TestMerge: tm.assert_frame_equal(out, merge(left, right, how=how, sort=True)) @pytest.mark.slow @@ -127,7 +56,7 @@ def test_int64_overflow_sort_false_order(self, left_right): left, right = left_right -@@ -244,6 +247,7 @@ class TestMerge: +@@ -239,6 +242,7 @@ class TestMerge: tm.assert_frame_equal(right, out[right.columns.tolist()]) @pytest.mark.slow @@ -137,7 +66,7 @@ def test_int64_overflow_one_to_many_none_match(self, how, sort): --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py -@@ -21,6 +21,7 @@ from pandas import ( +@@ -20,6 +20,7 @@ from pandas import ( ) import pandas._testing as tm from pandas.core.reshape import reshape as reshape_lib @@ -145,7 +74,7 @@ @pytest.fixture(params=[True, False]) -@@ -2092,6 +2093,7 @@ Thu,Lunch,Yes,51.51,17""" +@@ -2175,6 +2176,7 @@ class TestStackUnstackMultiLevel: tm.assert_frame_equal(recons, df) @pytest.mark.slow @@ -155,15 +84,15 @@ # GH 26314: Change ValueError to PerformanceWarning --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py -@@ -26,6 +26,7 @@ import pandas._testing as tm - from pandas.api.types import CategoricalDtype as CDT +@@ -28,6 +28,7 @@ import pandas._testing as tm + from pandas.api.types import CategoricalDtype from pandas.core.reshape import reshape as reshape_lib from pandas.core.reshape.pivot import pivot_table +from 
pandas.compat import IS64 @pytest.fixture(params=[True, False]) -@@ -2059,6 +2060,7 @@ class TestPivotTable: +@@ -2092,6 +2093,7 @@ class TestPivotTable: tm.assert_frame_equal(result, expected) @pytest.mark.slow @@ -181,7 +110,7 @@ from pandas.core.arrays import ( DatetimeArray, TimedeltaArray, -@@ -236,6 +237,11 @@ def test_missing_required_dependency(): +@@ -230,6 +231,11 @@ def test_missing_required_dependency(): assert name in output diff -Nru pandas-2.1.4+dfsg/debian/patches/tests_dont_assume_endian.patch pandas-2.2.2+dfsg/debian/patches/tests_dont_assume_endian.patch --- pandas-2.1.4+dfsg/debian/patches/tests_dont_assume_endian.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/tests_dont_assume_endian.patch 2024-04-23 18:06:26.000000000 +0000 @@ -5,7 +5,7 @@ --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py -@@ -20,7 +20,7 @@ from pandas import ( +@@ -23,7 +23,7 @@ from pandas import ( import pandas._testing as tm from pandas.core.arrays import IntervalArray import pandas.core.common as com @@ -14,7 +14,7 @@ @pytest.fixture(params=[None, "foo"]) def name(request): -@@ -41,12 +41,12 @@ class ConstructorTests: +@@ -44,12 +44,12 @@ class ConstructorTests: (Index(np.arange(-10, 11, dtype=np.int64)), np.int64), (Index(np.arange(10, 31, dtype=np.uint64)), np.uint64), (Index(np.arange(20, 30, 0.5), dtype=np.float64), np.float64), diff -Nru pandas-2.1.4+dfsg/debian/patches/unbreak_clean.patch pandas-2.2.2+dfsg/debian/patches/unbreak_clean.patch --- pandas-2.1.4+dfsg/debian/patches/unbreak_clean.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/unbreak_clean.patch 2024-04-23 18:07:53.000000000 +0000 @@ -20,5 +20,5 @@ - self._clean_trees.append(d for d in ("build", "dist") if os.path.exists(d)) + self._clean_trees.extend([d for d in ("build", "dist") if os.path.exists(d)]) - def finalize_options(self): + def finalize_options(self) -> None: pass diff -Nru pandas-2.1.4+dfsg/debian/patches/use_system_intersphinx.patch pandas-2.2.2+dfsg/debian/patches/use_system_intersphinx.patch --- pandas-2.1.4+dfsg/debian/patches/use_system_intersphinx.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/use_system_intersphinx.patch 2024-04-23 17:46:00.000000000 +0000 @@ -6,19 +6,17 @@ --- a/doc/source/conf.py +++ b/doc/source/conf.py -@@ -451,12 +451,12 @@ latex_documents = [ +@@ -456,11 +456,11 @@ latex_documents = [ if include_api: intersphinx_mapping = { "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None), - "matplotlib": ("https://matplotlib.org/stable/", None), - "numpy": ("https://numpy.org/doc/stable/", None), -- "pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None), - "py": ("https://pylib.readthedocs.io/en/latest/", None), - "python": ("https://docs.python.org/3/", None), - "scipy": ("https://docs.scipy.org/doc/scipy/", None), + "matplotlib": ("https://matplotlib.org/stable/", None), # no longer has a -doc in Debian "/usr/share/doc/python-matplotlib-doc/html/objects.inv" + "numpy": ("https://numpy.org/doc/stable/", None), # no longer has a -doc in Debian "/usr/share/doc/python-numpy-doc/html/objects.inv" -+ "pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None), # not in Debian + "py": ("https://pylib.readthedocs.io/en/latest/", None), # no -doc in Debian + "python": ("https://docs.python.org/3/", "/usr/share/doc/python3-doc/html/objects.inv"), + "scipy": ("https://docs.scipy.org/doc/scipy/", 
("/usr/share/doc/python-scipy-doc/html/objects.inv","/usr/share/doc/python-scipy/html/objects.inv")), diff -Nru pandas-2.1.4+dfsg/debian/patches/versioned_importorskip.patch pandas-2.2.2+dfsg/debian/patches/versioned_importorskip.patch --- pandas-2.1.4+dfsg/debian/patches/versioned_importorskip.patch 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/versioned_importorskip.patch 2024-05-06 10:04:42.000000000 +0000 @@ -0,0 +1,5935 @@ +Description: Avoid failing when a dependency is too old + +(some of them are pulled in by other dependencies, so +just not including them in d/control doesn't stop them being installed) + +Most of the content of this patch was generated by the following Python code: + +import pathlib +import re + +basedir = pathlib.Path.cwd() +if not (basedir / 'pandas/tests').exists(): + raise FileNotFoundError('must be run from the pandas root') +for source_file in basedir.glob('pandas/**/*.py'): + with open(source_file, 'r') as fd: + source_text = fd.read() + if 'pytest.importorskip' in source_text: + source_text = re.sub(r'pytest\.importorskip(.*)minversion', r'td.versioned_importorskip\1min_version', source_text) + source_text = re.sub(r'pytest\.importorskip', r'td.versioned_importorskip', source_text) + if '_test_decorators as td' not in source_text: + # add the import if it isn't already present + source_text, count = re.subn(r'^(import pandas|from pandas.*import)',r'import pandas.util._test_decorators as td\n\1', source_text, count=1, flags=re.MULTILINE) + if count != 1: + raise KeyError("failed to add import") + with open(source_file, 'w') as fd: + fd.write(source_text) + +Author: Rebecca N. Palmer +Forwarded: no + +--- a/pandas/_testing/_io.py ++++ b/pandas/_testing/_io.py +@@ -107,9 +107,9 @@ def round_trip_localpath(writer, reader, + pandas object + The original object that was serialized and then re-read. + """ +- import pytest ++ import pandas.util._test_decorators as td + +- LocalPath = pytest.importorskip("py.path").local ++ LocalPath = td.versioned_importorskip("py.path").local + if path is None: + path = "___localpath___" + with ensure_clean(path) as path: +--- a/pandas/conftest.py ++++ b/pandas/conftest.py +@@ -1801,7 +1801,7 @@ def ip(): + + Will raise a skip if IPython is not installed. + """ +- pytest.importorskip("IPython", minversion="6.0.0") ++ td.versioned_importorskip("IPython", min_version="6.0.0") + from IPython.core.interactiveshell import InteractiveShell + + # GH#35711 make sure sqlite history file handle is not leaked +@@ -1818,7 +1818,7 @@ def spmatrix(request): + """ + Yields scipy sparse matrix classes. 
+ """ +- sparse = pytest.importorskip("scipy.sparse") ++ sparse = td.versioned_importorskip("scipy.sparse") + + return getattr(sparse, request.param + "_matrix") + +--- a/pandas/tests/apply/test_frame_apply.py ++++ b/pandas/tests/apply/test_frame_apply.py +@@ -4,6 +4,7 @@ import warnings + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.core.dtypes.dtypes import CategoricalDtype + + import pandas as pd +@@ -35,7 +36,7 @@ def int_frame_const_col(): + @pytest.fixture(params=["python", pytest.param("numba", marks=pytest.mark.single_cpu)]) + def engine(request): + if request.param == "numba": +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + return request.param + + +--- a/pandas/tests/apply/test_numba.py ++++ b/pandas/tests/apply/test_numba.py +@@ -26,7 +26,7 @@ def test_numba_vs_python_noop(float_fram + + def test_numba_vs_python_string_index(): + # GH#56189 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame( + 1, + index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), +--- a/pandas/tests/arrays/categorical/test_warnings.py ++++ b/pandas/tests/arrays/categorical/test_warnings.py +@@ -1,12 +1,13 @@ + import pytest + ++import pandas.util._test_decorators as td + import pandas._testing as tm + + + class TestCategoricalWarnings: + def test_tab_complete_warning(self, ip): + # https://github.com/pandas-dev/pandas/issues/16409 +- pytest.importorskip("IPython", minversion="6.0.0") ++ td.versioned_importorskip("IPython", min_version="6.0.0") + from IPython.core.completer import provisionalcompleter + + code = "import pandas as pd; c = pd.Categorical([])" +--- a/pandas/tests/arrays/datetimes/test_constructors.py ++++ b/pandas/tests/arrays/datetimes/test_constructors.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._libs import iNaT + + from pandas.core.dtypes.dtypes import DatetimeTZDtype +@@ -226,7 +227,7 @@ COARSE_TO_FINE_SAFE = [123, None, -123] + def test_from_arrow_with_different_units_and_timezones_with( + pa_unit, pd_unit, pa_tz, pd_tz, data + ): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + pa_type = pa.timestamp(pa_unit, tz=pa_tz) + arr = pa.array(data, type=pa_type) +@@ -253,7 +254,7 @@ def test_from_arrow_with_different_units + ], + ) + def test_from_arrow_from_empty(unit, tz): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + data = [] + arr = pa.array(data) +@@ -269,7 +270,7 @@ def test_from_arrow_from_empty(unit, tz) + + + def test_from_arrow_from_integers(): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + data = [0, 123456789, None, 2**63 - 1, iNaT, -123456789] + arr = pa.array(data) +--- a/pandas/tests/arrays/interval/test_interval_pyarrow.py ++++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py +@@ -1,13 +1,14 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + import pandas._testing as tm + from pandas.core.arrays import IntervalArray + + + def test_arrow_extension_type(): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + +@@ -23,7 +24,7 @@ def test_arrow_extension_type(): + + + def test_arrow_array(): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + from 
pandas.core.arrays.arrow.extension_types import ArrowIntervalType + +@@ -52,7 +53,7 @@ def test_arrow_array(): + + + def test_arrow_array_missing(): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + +@@ -89,7 +90,7 @@ def test_arrow_array_missing(): + ids=["float", "datetime64[ns]"], + ) + def test_arrow_table_roundtrip(breaks): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + +@@ -125,7 +126,7 @@ def test_arrow_table_roundtrip(breaks): + ids=["float", "datetime64[ns]"], + ) + def test_arrow_table_roundtrip_without_metadata(breaks): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None +@@ -145,7 +146,7 @@ def test_from_arrow_from_raw_struct_arra + # in case pyarrow lost the Interval extension type (eg on parquet roundtrip + # with datetime64[ns] subtype, see GH-45881), still allow conversion + # from arrow to IntervalArray +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + arr = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}]) + dtype = pd.IntervalDtype(np.dtype("int64"), closed="neither") +--- a/pandas/tests/arrays/masked/test_arrow_compat.py ++++ b/pandas/tests/arrays/masked/test_arrow_compat.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + import pandas._testing as tm + +@@ -8,7 +9,7 @@ pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ) + +-pa = pytest.importorskip("pyarrow") ++pa = td.versioned_importorskip("pyarrow") + + from pandas.core.arrays.arrow._arrow_utils import pyarrow_array_to_numpy_and_mask + +--- a/pandas/tests/arrays/period/test_arrow_compat.py ++++ b/pandas/tests/arrays/period/test_arrow_compat.py +@@ -1,5 +1,6 @@ + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat.pyarrow import pa_version_under10p1 + + from pandas.core.dtypes.dtypes import PeriodDtype +@@ -16,7 +17,7 @@ pytestmark = pytest.mark.filterwarnings( + ) + + +-pa = pytest.importorskip("pyarrow") ++pa = td.versioned_importorskip("pyarrow") + + + def test_arrow_extension_type(): +--- a/pandas/tests/arrays/sparse/test_accessor.py ++++ b/pandas/tests/arrays/sparse/test_accessor.py +@@ -3,6 +3,7 @@ import string + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import SparseDtype + import pandas._testing as tm +@@ -26,7 +27,7 @@ class TestSeriesAccessor: + assert result == expected + + def test_from_coo(self): +- scipy_sparse = pytest.importorskip("scipy.sparse") ++ scipy_sparse = td.versioned_importorskip("scipy.sparse") + + row = [0, 3, 1, 0] + col = [0, 3, 1, 2] +@@ -64,7 +65,7 @@ class TestSeriesAccessor: + def test_to_coo( + self, sort_labels, expected_rows, expected_cols, expected_values_pos + ): +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + values = SparseArray([0, np.nan, 1, 0, None, 3], fill_value=0) + index = pd.MultiIndex.from_tuples( +@@ -107,7 +108,7 @@ class TestFrameAccessor: + @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])]) + @pytest.mark.parametrize("dtype", ["float64", "int64"]) + def test_from_spmatrix(self, 
format, labels, dtype): +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item()) + +@@ -120,7 +121,7 @@ class TestFrameAccessor: + + @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) + def test_from_spmatrix_including_explicit_zero(self, format): +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + mat = sp_sparse.random(10, 2, density=0.5, format=format) + mat.data[0] = 0 +@@ -134,7 +135,7 @@ class TestFrameAccessor: + [["a", "b"], pd.MultiIndex.from_product([["A"], ["a", "b"]]), ["a", "a"]], + ) + def test_from_spmatrix_columns(self, columns): +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + dtype = SparseDtype("float64", 0.0) + +@@ -147,7 +148,7 @@ class TestFrameAccessor: + "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)] + ) + def test_to_coo(self, colnames): +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + df = pd.DataFrame( + {colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]" +@@ -158,7 +159,7 @@ class TestFrameAccessor: + + @pytest.mark.parametrize("fill_value", [1, np.nan]) + def test_to_coo_nonzero_fill_val_raises(self, fill_value): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = pd.DataFrame( + { + "A": SparseArray( +@@ -174,7 +175,7 @@ class TestFrameAccessor: + + def test_to_coo_midx_categorical(self): + # GH#50996 +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + midx = pd.MultiIndex.from_arrays( + [ +@@ -219,7 +220,7 @@ class TestFrameAccessor: + @pytest.mark.parametrize("dtype", ["int64", "float64"]) + @pytest.mark.parametrize("dense_index", [True, False]) + def test_series_from_coo(self, dtype, dense_index): +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + A = sp_sparse.eye(3, format="coo", dtype=dtype) + result = pd.Series.sparse.from_coo(A, dense_index=dense_index) +@@ -239,7 +240,7 @@ class TestFrameAccessor: + + def test_series_from_coo_incorrect_format_raises(self): + # gh-26554 +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + m = sp_sparse.csr_matrix(np.array([[0, 1], [0, 0]])) + with pytest.raises( +--- a/pandas/tests/arrays/sparse/test_constructors.py ++++ b/pandas/tests/arrays/sparse/test_constructors.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._libs.sparse import IntIndex + + import pandas as pd +@@ -188,7 +189,7 @@ class TestConstructors: + @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) + @pytest.mark.parametrize("size", [0, 10]) + def test_from_spmatrix(self, size, format): +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + mat = sp_sparse.random(size, 1, density=0.5, format=format) + result = SparseArray.from_spmatrix(mat) +@@ -199,7 +200,7 @@ class TestConstructors: + + @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) + def test_from_spmatrix_including_explicit_zero(self, format): +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + mat = sp_sparse.random(10, 1, density=0.5, 
format=format) + mat.data[0] = 0 +@@ -210,7 +211,7 @@ class TestConstructors: + tm.assert_numpy_array_equal(result, expected) + + def test_from_spmatrix_raises(self): +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + mat = sp_sparse.eye(5, 4, format="csc") + +--- a/pandas/tests/arrays/string_/test_string.py ++++ b/pandas/tests/arrays/string_/test_string.py +@@ -7,6 +7,7 @@ import operator + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat.pyarrow import pa_version_under12p0 + + from pandas.core.dtypes.common import is_dtype_equal +@@ -486,7 +487,7 @@ def test_fillna_args(dtype, arrow_string + + def test_arrow_array(dtype): + # protocol added in 0.15.0 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + import pyarrow.compute as pc + + data = pd.array(["a", "b", "c"], dtype=dtype) +@@ -502,7 +503,7 @@ def test_arrow_array(dtype): + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") + def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): + # roundtrip possible from arrow 1.0.0 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + if using_infer_string and string_storage2 != "pyarrow_numpy": + request.applymarker( +@@ -532,7 +533,7 @@ def test_arrow_load_from_zero_chunks( + dtype, string_storage2, request, using_infer_string + ): + # GH-41040 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + if using_infer_string and string_storage2 != "pyarrow_numpy": + request.applymarker( +--- a/pandas/tests/arrays/string_/test_string_arrow.py ++++ b/pandas/tests/arrays/string_/test_string_arrow.py +@@ -19,7 +19,7 @@ from pandas.core.arrays.string_arrow imp + + + def test_eq_all_na(): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + a = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow")) + result = a == a + expected = pd.array([pd.NA, pd.NA], dtype="boolean[pyarrow]") +@@ -48,7 +48,7 @@ def test_config_bad_storage_raises(): + @pytest.mark.parametrize("chunked", [True, False]) + @pytest.mark.parametrize("array", ["numpy", "pyarrow"]) + def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + array = pa if array in arrow_string_storage else np + +@@ -69,7 +69,7 @@ def test_constructor_not_string_type_rai + + @pytest.mark.parametrize("chunked", [True, False]) + def test_constructor_not_string_type_value_dictionary_raises(chunked): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + arr = pa.array([1, 2, 3], pa.dictionary(pa.int32(), pa.int32())) + if chunked: +@@ -87,7 +87,7 @@ def test_constructor_not_string_type_val + ) + @pytest.mark.parametrize("chunked", [True, False]) + def test_constructor_valid_string_type_value_dictionary(chunked): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode() + if chunked: +@@ -99,14 +99,14 @@ def test_constructor_valid_string_type_v + + def test_constructor_from_list(): + # GH#27673 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + result = pd.Series(["E"], dtype=StringDtype(storage="pyarrow")) + assert isinstance(result.dtype, StringDtype) + assert result.dtype.storage == "pyarrow" + + + def 
test_from_sequence_wrong_dtype_raises(using_infer_string): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") + +@@ -199,7 +199,7 @@ def test_pyarrow_not_installed_raises(): + ], + ) + def test_setitem(multiple_chunks, key, value, expected): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + result = pa.array(list("abcde")) + expected = pa.array(expected) +@@ -216,7 +216,7 @@ def test_setitem(multiple_chunks, key, v + + + def test_setitem_invalid_indexer_raises(): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + arr = ArrowStringArray(pa.array(list("abcde"))) + +@@ -242,7 +242,7 @@ def test_setitem_invalid_indexer_raises( + @pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) + def test_pickle_roundtrip(dtype): + # GH 42600 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + expected = pd.Series(range(10), dtype=dtype) + expected_sliced = expected.head(2) + full_pickled = pickle.dumps(expected) +@@ -259,7 +259,7 @@ def test_pickle_roundtrip(dtype): + + def test_string_dtype_error_message(): + # GH#55051 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." + with pytest.raises(ValueError, match=msg): + StringDtype("bla") +--- a/pandas/tests/computation/test_compat.py ++++ b/pandas/tests/computation/test_compat.py +@@ -1,5 +1,6 @@ + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat._optional import VERSIONS + + import pandas as pd +@@ -13,7 +14,7 @@ def test_compat(): + + from pandas.core.computation.check import NUMEXPR_INSTALLED + +- ne = pytest.importorskip("numexpr") ++ ne = td.versioned_importorskip("numexpr") + + ver = ne.__version__ + if Version(ver) < Version(VERSIONS["numexpr"]): +@@ -26,7 +27,7 @@ def test_compat(): + @pytest.mark.parametrize("parser", expr.PARSERS) + def test_invalid_numexpr_version(engine, parser): + if engine == "numexpr": +- pytest.importorskip("numexpr") ++ td.versioned_importorskip("numexpr") + a, b = 1, 2 # noqa: F841 + res = pd.eval("a + b", engine=engine, parser=parser) + assert res == 3 +--- a/pandas/tests/copy_view/test_astype.py ++++ b/pandas/tests/copy_view/test_astype.py +@@ -45,7 +45,7 @@ def test_astype_single_dtype(using_copy_ + @pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"]) + def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): + if new_dtype == "int64[pyarrow]": +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) + df_orig = df.copy() + df2 = df.astype(new_dtype) +@@ -70,7 +70,7 @@ def test_astype_avoids_copy(using_copy_o + @pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"]) + def test_astype_different_target_dtype(using_copy_on_write, dtype): + if dtype == "int32[pyarrow]": +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + df2 = df.astype(dtype) +@@ -198,7 +198,7 @@ def test_astype_different_timezones_diff + + + def test_astype_arrow_timestamp(using_copy_on_write): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame( + { + "a": [ +--- a/pandas/tests/dtypes/test_common.py ++++ 
b/pandas/tests/dtypes/test_common.py +@@ -214,7 +214,7 @@ def test_is_sparse(check_scipy): + + + def test_is_scipy_sparse(): +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + assert com.is_scipy_sparse(sp_sparse.bsr_matrix([1, 2, 3])) + +--- a/pandas/tests/dtypes/test_inference.py ++++ b/pandas/tests/dtypes/test_inference.py +@@ -28,6 +28,7 @@ import numpy as np + import pytest + import pytz + ++import pandas.util._test_decorators as td + from pandas._libs import ( + lib, + missing as libmissing, +@@ -1984,7 +1985,7 @@ def test_nan_to_nat_conversions(): + + @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") + def test_is_scipy_sparse(spmatrix): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + assert is_scipy_sparse(spmatrix([[0, 1]])) + assert not is_scipy_sparse(np.array([1])) + +--- a/pandas/tests/extension/test_arrow.py ++++ b/pandas/tests/extension/test_arrow.py +@@ -62,7 +62,7 @@ from pandas.api.types import ( + ) + from pandas.tests.extension import base + +-pa = pytest.importorskip("pyarrow") ++pa = td.versioned_importorskip("pyarrow") + + from pandas.core.arrays.arrow.array import ArrowExtensionArray + from pandas.core.arrays.arrow.extension_types import ArrowPeriodType +--- a/pandas/tests/extension/test_string.py ++++ b/pandas/tests/extension/test_string.py +@@ -21,6 +21,7 @@ from typing import cast + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + import pandas._testing as tm + from pandas.api.types import is_string_dtype +@@ -35,7 +36,7 @@ def maybe_split_array(arr, chunked): + elif arr.dtype.storage != "pyarrow": + return arr + +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + arrow_array = arr._pa_array + split = len(arrow_array) // 2 +--- a/pandas/tests/frame/indexing/test_indexing.py ++++ b/pandas/tests/frame/indexing/test_indexing.py +@@ -1945,7 +1945,7 @@ def test_adding_new_conditional_column() + ) + def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: + # https://github.com/pandas-dev/pandas/issues/56204 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + with pd.option_context("future.infer_string", infer_string): +@@ -1958,7 +1958,7 @@ def test_adding_new_conditional_column_w + + def test_add_new_column_infer_string(): + # GH#55366 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame({"x": [1]}) + with pd.option_context("future.infer_string", True): + df.loc[df["x"] == 1, "y"] = "1" +--- a/pandas/tests/frame/indexing/test_setitem.py ++++ b/pandas/tests/frame/indexing/test_setitem.py +@@ -760,7 +760,7 @@ class TestDataFrameSetItem: + + def test_setitem_string_option_object_index(self): + # GH#55638 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame({"a": [1, 2]}) + with pd.option_context("future.infer_string", True): + df["b"] = Index(["a", "b"], dtype=object) +--- a/pandas/tests/frame/methods/test_astype.py ++++ b/pandas/tests/frame/methods/test_astype.py +@@ -893,7 +893,7 @@ def test_frame_astype_no_copy(): + @pytest.mark.parametrize("dtype", ["int64", "Int64"]) + def test_astype_copies(dtype): + # GH#50984 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) + result = df.astype("int64[pyarrow]", copy=True) + df.iloc[0, 0] = 100 +--- 
a/pandas/tests/frame/methods/test_convert_dtypes.py ++++ b/pandas/tests/frame/methods/test_convert_dtypes.py +@@ -3,6 +3,7 @@ import datetime + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + import pandas._testing as tm + +@@ -49,7 +50,7 @@ class TestConvertDtypes: + assert result.columns.name == "cols" + + def test_pyarrow_dtype_backend(self): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), +@@ -105,13 +106,13 @@ class TestConvertDtypes: + tm.assert_frame_equal(result, expected) + + def test_pyarrow_dtype_backend_already_pyarrow(self): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + expected = pd.DataFrame([1, 2, 3], dtype="int64[pyarrow]") + result = expected.convert_dtypes(dtype_backend="pyarrow") + tm.assert_frame_equal(result, expected) + + def test_pyarrow_dtype_backend_from_pandas_nullable(self): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + df = pd.DataFrame( + { + "a": pd.Series([1, 2, None], dtype="Int32"), +@@ -135,7 +136,7 @@ class TestConvertDtypes: + + def test_pyarrow_dtype_empty_object(self): + # GH 50970 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + expected = pd.DataFrame(columns=[0]) + result = expected.convert_dtypes(dtype_backend="pyarrow") + tm.assert_frame_equal(result, expected) +@@ -152,7 +153,7 @@ class TestConvertDtypes: + + def test_pyarrow_backend_no_conversion(self): + # GH#52872 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"}) + expected = df.copy() + result = df.convert_dtypes( +@@ -166,7 +167,7 @@ class TestConvertDtypes: + + def test_convert_dtypes_pyarrow_to_np_nullable(self): + # GH 53648 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + ser = pd.DataFrame(range(2), dtype="int32[pyarrow]") + result = ser.convert_dtypes(dtype_backend="numpy_nullable") + expected = pd.DataFrame(range(2), dtype="Int32") +@@ -174,7 +175,7 @@ class TestConvertDtypes: + + def test_convert_dtypes_pyarrow_timestamp(self): + # GH 54191 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + ser = pd.Series(pd.date_range("2020-01-01", "2020-01-02", freq="1min")) + expected = ser.astype("timestamp[ms][pyarrow]") + result = expected.convert_dtypes(dtype_backend="pyarrow") +--- a/pandas/tests/frame/methods/test_cov_corr.py ++++ b/pandas/tests/frame/methods/test_cov_corr.py +@@ -105,7 +105,7 @@ class TestDataFrameCorr: + + @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"]) + def test_corr_scipy_method(self, float_frame, method): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + float_frame.loc[float_frame.index[:5], "A"] = np.nan + float_frame.loc[float_frame.index[5:10], "B"] = np.nan + float_frame.loc[float_frame.index[:10], "A"] = float_frame["A"][10:20].copy() +@@ -126,7 +126,7 @@ class TestDataFrameCorr: + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) + def test_corr_nooverlap(self, meth): + # nothing in common +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame( + { + "A": [1, 1.5, 1, np.nan, np.nan, np.nan], +@@ -159,7 +159,7 @@ class TestDataFrameCorr: + # when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it need to be properly handled +- 
pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame({"a": [True, False], "b": [1, 0]}) + + expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) +@@ -201,7 +201,7 @@ class TestDataFrameCorr: + @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"]) + def test_corr_nullable_integer(self, nullable_column, other_column, method): + # https://github.com/pandas-dev/pandas/issues/33803 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + data = DataFrame({"a": nullable_column, "b": other_column}) + result = data.corr(method=method) + expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) +@@ -250,7 +250,7 @@ class TestDataFrameCorr: + + @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"]) + def test_corr_min_periods_greater_than_length(self, method): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame({"A": [1, 2], "B": [1, 2]}) + result = df.corr(method=method, min_periods=3) + expected = DataFrame( +@@ -264,7 +264,7 @@ class TestDataFrameCorr: + # when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it need to be properly handled +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]}) + expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) + if numeric_only: +@@ -433,7 +433,7 @@ class TestDataFrameCorrWith: + + def test_corrwith_spearman(self): + # GH#21925 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).random(size=(100, 3))) + result = df.corrwith(df**2, method="spearman") + expected = Series(np.ones(len(result))) +@@ -441,7 +441,7 @@ class TestDataFrameCorrWith: + + def test_corrwith_kendall(self): + # GH#21925 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).random(size=(100, 3))) + result = df.corrwith(df**2, method="kendall") + expected = Series(np.ones(len(result))) +@@ -449,7 +449,7 @@ class TestDataFrameCorrWith: + + def test_corrwith_spearman_with_tied_data(self): + # GH#48826 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df1 = DataFrame( + { + "A": [1, np.nan, 7, 8], +--- a/pandas/tests/frame/methods/test_describe.py ++++ b/pandas/tests/frame/methods/test_describe.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + Categorical, +@@ -398,7 +399,7 @@ class TestDataFrameDescribe: + + def test_describe_exclude_pa_dtype(self): + # GH#52570 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + df = DataFrame( + { + "a": Series([1, 2, 3], dtype=pd.ArrowDtype(pa.int8())), +--- a/pandas/tests/frame/methods/test_dot.py ++++ b/pandas/tests/frame/methods/test_dot.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + Series, +@@ -144,7 +145,7 @@ class TestDataFrameDot(DotSharedTests): + [("Float32", "Float64"), ("Int16", "Int32"), ("float[pyarrow]", "double[pyarrow]")], + ) + def test_arrow_dtype(dtype, exp_dtype): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + + cols = ["a", "b"] + df_a = DataFrame([[1, 2], [3, 4], [5, 6]], columns=cols, dtype="int32") +--- a/pandas/tests/frame/methods/test_info.py ++++ 
b/pandas/tests/frame/methods/test_info.py +@@ -7,6 +7,7 @@ import textwrap + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat import ( + IS64, + PYPY, +@@ -527,7 +528,7 @@ def test_memory_usage_empty_no_warning() + @pytest.mark.single_cpu + def test_info_compute_numba(): + # GH#51922 +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + df = DataFrame([[1, 2], [3, 4]]) + + with option_context("compute.use_numba", True): +--- a/pandas/tests/frame/methods/test_interpolate.py ++++ b/pandas/tests/frame/methods/test_interpolate.py +@@ -213,7 +213,7 @@ class TestDataFrameInterpolate: + df.interpolate(method="values") + + def test_interp_various(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) +@@ -252,7 +252,7 @@ class TestDataFrameInterpolate: + tm.assert_frame_equal(result, expected, check_dtype=False) + + def test_interp_alt_scipy(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) +@@ -541,7 +541,7 @@ class TestDataFrameInterpolate: + ) + def test_interpolate_arrow(self, dtype): + # GH#55347 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype + "[pyarrow]") + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="float64[pyarrow]") +--- a/pandas/tests/frame/methods/test_join.py ++++ b/pandas/tests/frame/methods/test_join.py +@@ -3,6 +3,7 @@ from datetime import datetime + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.errors import MergeError + + import pandas as pd +@@ -163,7 +164,7 @@ def test_join_on_single_col_dup_on_right + # GH 46622 + # Dups on right allowed by one_to_many constraint + if dtype == "string[pyarrow]": +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + left_no_dup = left_no_dup.astype(dtype) + right_w_dups.index = right_w_dups.index.astype(dtype) + left_no_dup.join( +--- a/pandas/tests/frame/methods/test_rank.py ++++ b/pandas/tests/frame/methods/test_rank.py +@@ -6,6 +6,7 @@ from datetime import ( + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._libs.algos import ( + Infinity, + NegInfinity, +@@ -39,7 +40,7 @@ class TestRank: + return request.param + + def test_rank(self, float_frame): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + float_frame.loc[::2, "A"] = np.nan + float_frame.loc[::3, "B"] = np.nan +@@ -143,7 +144,7 @@ class TestRank: + float_string_frame.rank(axis=1) + + def test_rank_na_option(self, float_frame): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + float_frame.loc[::2, "A"] = np.nan + float_frame.loc[::3, "B"] = np.nan +@@ -227,7 +228,7 @@ class TestRank: + @pytest.mark.parametrize("ax", [0, 1]) + @pytest.mark.parametrize("m", ["average", "min", "max", "first", "dense"]) + def test_rank_methods_frame(self, ax, m): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + xs = np.random.default_rng(2).integers(0, 21, (100, 26)) + xs = (xs - 10.0) / 10.0 +@@ -503,7 +504,7 @@ class TestRank: + ) + def test_rank_string_dtype(self, dtype, 
exp_dtype): + # GH#55362 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + result = obj.rank(method="first") + expected = Series([1, 2, None, 3], dtype=exp_dtype) +--- a/pandas/tests/frame/test_api.py ++++ b/pandas/tests/frame/test_api.py +@@ -5,6 +5,7 @@ import pydoc + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._config import using_pyarrow_string_dtype + from pandas._config.config import option_context + +@@ -288,7 +289,7 @@ class TestDataFrameMisc: + + def test_tab_complete_warning(self, ip, frame_or_series): + # GH 16409 +- pytest.importorskip("IPython", minversion="6.0.0") ++ td.versioned_importorskip("IPython", min_version="6.0.0") + from IPython.core.completer import provisionalcompleter + + if frame_or_series is DataFrame: +@@ -383,7 +384,7 @@ class TestDataFrameMisc: + + def test_inspect_getmembers(self): + # GH38740 +- pytest.importorskip("jinja2") ++ td.versioned_importorskip("jinja2") + df = DataFrame() + msg = "DataFrame._data is deprecated" + with tm.assert_produces_warning( +--- a/pandas/tests/frame/test_arrow_interface.py ++++ b/pandas/tests/frame/test_arrow_interface.py +@@ -6,7 +6,7 @@ import pandas.util._test_decorators as t + + import pandas as pd + +-pa = pytest.importorskip("pyarrow") ++pa = td.versioned_importorskip("pyarrow") + + + @td.skip_if_no("pyarrow", min_version="14.0") +--- a/pandas/tests/frame/test_constructors.py ++++ b/pandas/tests/frame/test_constructors.py +@@ -2704,7 +2704,7 @@ class TestDataFrameConstructors: + + def test_frame_string_inference(self): + # GH#54430 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + expected = DataFrame( + {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) +@@ -2739,7 +2739,7 @@ class TestDataFrameConstructors: + + def test_frame_string_inference_array_string_dtype(self): + # GH#54496 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + expected = DataFrame( + {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) +@@ -2764,7 +2764,7 @@ class TestDataFrameConstructors: + + def test_frame_string_inference_block_dim(self): + # GH#55363 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) + assert df._mgr.blocks[0].ndim == 2 +@@ -2852,7 +2852,7 @@ class TestDataFrameConstructorIndexInfer + ) + def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type): + # GH 53617 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + cols = pd.arrays.ArrowExtensionArray( + pa.array(col_vals, type=pa.dictionary(pa.int8(), getattr(pa, col_type)())) + ) +--- a/pandas/tests/frame/test_query_eval.py ++++ b/pandas/tests/frame/test_query_eval.py +@@ -1384,7 +1384,7 @@ class TestDataFrameQueryBacktickQuoting: + @pytest.mark.parametrize("dtype", ["int64", "Int64", "int64[pyarrow]"]) + def test_query_ea_dtypes(self, dtype): + if dtype == "int64[pyarrow]": +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + # GH#50261 + df = DataFrame({"a": Series([1, 2], dtype=dtype)}) + ref = {2} # noqa: F841 +@@ -1402,7 +1402,7 @@ class TestDataFrameQueryBacktickQuoting: + if engine == "numexpr" and not NUMEXPR_INSTALLED: + pytest.skip("numexpr not installed") + if dtype == 
"int64[pyarrow]": +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame( + {"A": Series([1, 1, 2], dtype="Int64"), "B": Series([1, 2, 2], dtype=dtype)} + ) +--- a/pandas/tests/frame/test_reductions.py ++++ b/pandas/tests/frame/test_reductions.py +@@ -369,7 +369,7 @@ class TestDataFrameAnalytics: + ) + + def test_stat_op_calc_skew_kurtosis(self, float_frame_with_na): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + def skewness(x): + if len(x) < 3: +@@ -1162,7 +1162,7 @@ class TestDataFrameAnalytics: + + def test_idxmax_arrow_types(self): + # GH#55368 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + + df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1]}, dtype="int64[pyarrow]") + result = df.idxmax() +@@ -2020,7 +2020,7 @@ def test_reduction_axis_none_returns_sca + result = getattr(df, method)(axis=None, numeric_only=numeric_only) + np_arr = df.to_numpy(dtype=np.float64) + if method in {"skew", "kurt"}: +- comp_mod = pytest.importorskip("scipy.stats") ++ comp_mod = td.versioned_importorskip("scipy.stats") + if method == "kurt": + method = "kurtosis" + expected = getattr(comp_mod, method)(np_arr, bias=False, axis=None) +--- a/pandas/tests/frame/test_repr.py ++++ b/pandas/tests/frame/test_repr.py +@@ -7,6 +7,7 @@ from io import StringIO + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._config import using_pyarrow_string_dtype + + from pandas import ( +@@ -287,7 +288,7 @@ NaT 4""" + assert "StringCol" in repr(df) + + def test_latex_repr(self): +- pytest.importorskip("jinja2") ++ td.versioned_importorskip("jinja2") + expected = r"""\begin{tabular}{llll} + \toprule + & 0 & 1 & 2 \\ +@@ -475,7 +476,7 @@ NaT 4""" + + def test_repr_ea_columns(self, any_string_dtype): + # GH#54797 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame({"long_column_name": [1, 2, 3], "col2": [4, 5, 6]}) + df.columns = df.columns.astype(any_string_dtype) + expected = """ long_column_name col2 +--- a/pandas/tests/frame/test_subclass.py ++++ b/pandas/tests/frame/test_subclass.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + DataFrame, +@@ -669,7 +670,7 @@ class TestDataFrameSubclassing: + assert isinstance(result, tm.SubclassedSeries) + + def test_corrwith(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + index = ["a", "b", "c", "d", "e"] + columns = ["one", "two", "three", "four"] + df1 = tm.SubclassedDataFrame( +--- a/pandas/tests/frame/test_ufunc.py ++++ b/pandas/tests/frame/test_ufunc.py +@@ -4,6 +4,7 @@ import re + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + import pandas._testing as tm + from pandas.api.types import is_extension_array_dtype +@@ -250,7 +251,7 @@ def test_alignment_deprecation_many_inpu + # https://github.com/pandas-dev/pandas/issues/39184 + # test that the deprecation also works with > 2 inputs -> using a numba + # written ufunc for this because numpy itself doesn't have such ufuncs +- numba = pytest.importorskip("numba") ++ numba = td.versioned_importorskip("numba") + + @numba.vectorize([numba.float64(numba.float64, numba.float64, numba.float64)]) + def my_ufunc(x, y, z): +--- a/pandas/tests/generic/test_finalize.py ++++ b/pandas/tests/generic/test_finalize.py +@@ -7,6 +7,7 @@ import re + import numpy as np + 
import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + import pandas._testing as tm + +@@ -443,7 +444,7 @@ def test_finalize_last(data): + + @not_implemented_mark + def test_finalize_called_eval_numexpr(): +- pytest.importorskip("numexpr") ++ td.versioned_importorskip("numexpr") + df = pd.DataFrame({"A": [1, 2]}) + df.attrs["A"] = 1 + result = df.eval("A + 1", engine="numexpr") +--- a/pandas/tests/generic/test_to_xarray.py ++++ b/pandas/tests/generic/test_to_xarray.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + Categorical, + DataFrame, +@@ -10,7 +11,7 @@ from pandas import ( + ) + import pandas._testing as tm + +-pytest.importorskip("xarray") ++td.versioned_importorskip("xarray") + + + class TestDataFrameToXArray: +--- a/pandas/tests/groupby/aggregate/test_numba.py ++++ b/pandas/tests/groupby/aggregate/test_numba.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.errors import NumbaUtilError + + from pandas import ( +@@ -22,7 +23,7 @@ pytestmark = pytest.mark.single_cpu + + + def test_correct_function_signature(): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def incorrect_function(x): + return sum(x) * 2.7 +@@ -39,7 +40,7 @@ def test_correct_function_signature(): + + + def test_check_nopython_kwargs(): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def incorrect_function(values, index): + return sum(values) * 2.7 +@@ -61,7 +62,7 @@ def test_check_nopython_kwargs(): + @pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) + @pytest.mark.parametrize("as_index", [True, False]) + def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def func_numba(values, index): + return np.mean(values) * 2.7 +@@ -92,7 +93,7 @@ def test_numba_vs_cython(jit, pandas_obj + @pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) + def test_cache(jit, pandas_obj, nogil, parallel, nopython): + # Test that the functions are cached correctly if we switch functions +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def func_1(values, index): + return np.mean(values) - 3.4 +@@ -130,7 +131,7 @@ def test_cache(jit, pandas_obj, nogil, p + + + def test_use_global_config(): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def func_1(values, index): + return np.mean(values) - 3.4 +@@ -155,7 +156,7 @@ def test_use_global_config(): + ], + ) + def test_multifunc_numba_vs_cython_frame(agg_kwargs): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + data = DataFrame( + { + 0: ["a", "a", "b", "b", "a"], +@@ -190,7 +191,7 @@ def test_multifunc_numba_vs_cython_frame + ], + ) + def test_multifunc_numba_udf_frame(agg_kwargs, expected_func): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + data = DataFrame( + { + 0: ["a", "a", "b", "b", "a"], +@@ -212,7 +213,7 @@ def test_multifunc_numba_udf_frame(agg_k + [{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}], + ) + def test_multifunc_numba_vs_cython_series(agg_kwargs): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + labels = ["a", "a", "b", "b", "a"] + data = Series([1.0, 2.0, 3.0, 4.0, 5.0]) + grouped = data.groupby(labels) +@@ -265,7 +266,7 @@ def test_multifunc_numba_vs_cython_serie + strict=False, 
+ ) + def test_multifunc_numba_kwarg_propagation(data, agg_kwargs): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + labels = ["a", "a", "b", "b", "a"] + grouped = data.groupby(labels) + result = grouped.agg(**agg_kwargs, engine="numba", engine_kwargs={"parallel": True}) +@@ -278,7 +279,7 @@ def test_multifunc_numba_kwarg_propagati + + def test_args_not_cached(): + # GH 41647 +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def sum_last(values, index, n): + return values[-n:].sum() +@@ -296,7 +297,7 @@ def test_args_not_cached(): + + def test_index_data_correctly_passed(): + # GH 43133 +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def f(values, index): + return np.mean(index) +@@ -312,7 +313,7 @@ def test_index_data_correctly_passed(): + def test_engine_kwargs_not_cached(): + # If the user passes a different set of engine_kwargs don't return the same + # jitted function +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + nogil = True + parallel = False + nopython = True +@@ -339,7 +340,7 @@ def test_engine_kwargs_not_cached(): + + @pytest.mark.filterwarnings("ignore") + def test_multiindex_one_key(nogil, parallel, nopython): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def numba_func(values, index): + return 1 +@@ -354,7 +355,7 @@ def test_multiindex_one_key(nogil, paral + + + def test_multiindex_multi_key_not_supported(nogil, parallel, nopython): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def numba_func(values, index): + return 1 +@@ -368,7 +369,7 @@ def test_multiindex_multi_key_not_suppor + + + def test_multilabel_numba_vs_cython(numba_supported_reductions): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + reduction, kwargs = numba_supported_reductions + df = DataFrame( + { +@@ -389,7 +390,7 @@ def test_multilabel_numba_vs_cython(numb + + + def test_multilabel_udf_numba_vs_cython(): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], +--- a/pandas/tests/groupby/test_counting.py ++++ b/pandas/tests/groupby/test_counting.py +@@ -4,6 +4,7 @@ from string import ascii_lowercase + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + Index, +@@ -385,7 +386,7 @@ def test_count_uses_size_on_exception(): + + def test_count_arrow_string_array(any_string_dtype): + # GH#54751 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame( + {"a": [1, 2, 3], "b": Series(["a", "b", "a"], dtype=any_string_dtype)} + ) +--- a/pandas/tests/groupby/test_groupby.py ++++ b/pandas/tests/groupby/test_groupby.py +@@ -2596,7 +2596,7 @@ def test_groupby_column_index_name_lost( + def test_groupby_duplicate_columns(infer_string): + # GH: 31735 + if infer_string: +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame( + {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]} + ).astype(object) +--- a/pandas/tests/groupby/test_numba.py ++++ b/pandas/tests/groupby/test_numba.py +@@ -1,5 +1,6 @@ + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + Series, +@@ -9,7 +10,7 @@ import pandas._testing as tm + + pytestmark = pytest.mark.single_cpu + +-pytest.importorskip("numba") ++td.versioned_importorskip("numba") + + + 
@pytest.mark.filterwarnings("ignore") +--- a/pandas/tests/groupby/test_reductions.py ++++ b/pandas/tests/groupby/test_reductions.py +@@ -701,7 +701,7 @@ def test_groupby_min_max_categorical(fun + @pytest.mark.parametrize("func", ["min", "max"]) + def test_min_empty_string_dtype(func): + # GH#55619 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] + result = getattr(df.groupby("a"), func)() +--- a/pandas/tests/groupby/test_timegrouper.py ++++ b/pandas/tests/groupby/test_timegrouper.py +@@ -10,6 +10,7 @@ import numpy as np + import pytest + import pytz + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + DataFrame, +@@ -944,7 +945,7 @@ class TestGroupBy: + def test_groupby_agg_numba_timegrouper_with_nat( + self, groupby_with_truncated_bingrouper + ): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + # See discussion in GH#43487 + gb = groupby_with_truncated_bingrouper +--- a/pandas/tests/groupby/transform/test_numba.py ++++ b/pandas/tests/groupby/transform/test_numba.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.errors import NumbaUtilError + + from pandas import ( +@@ -14,7 +15,7 @@ pytestmark = pytest.mark.single_cpu + + + def test_correct_function_signature(): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def incorrect_function(x): + return x + 1 +@@ -31,7 +32,7 @@ def test_correct_function_signature(): + + + def test_check_nopython_kwargs(): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def incorrect_function(values, index): + return values + 1 +@@ -53,7 +54,7 @@ def test_check_nopython_kwargs(): + @pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) + @pytest.mark.parametrize("as_index", [True, False]) + def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def func(values, index): + return values + 1 +@@ -84,7 +85,7 @@ def test_numba_vs_cython(jit, pandas_obj + @pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"]) + def test_cache(jit, pandas_obj, nogil, parallel, nopython): + # Test that the functions are cached correctly if we switch functions +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def func_1(values, index): + return values + 1 +@@ -121,7 +122,7 @@ def test_cache(jit, pandas_obj, nogil, p + + + def test_use_global_config(): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def func_1(values, index): + return values + 1 +@@ -141,7 +142,7 @@ def test_use_global_config(): + "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}] + ) + def test_string_cython_vs_numba(agg_func, numba_supported_reductions): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + agg_func, kwargs = numba_supported_reductions + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] +@@ -159,7 +160,7 @@ def test_string_cython_vs_numba(agg_func + + def test_args_not_cached(): + # GH 41647 +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def sum_last(values, index, n): + return values[-n:].sum() +@@ -177,7 +178,7 @@ def test_args_not_cached(): + + def test_index_data_correctly_passed(): + # GH 43133 +- 
pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def f(values, index): + return index - 1 +@@ -191,7 +192,7 @@ def test_index_data_correctly_passed(): + def test_engine_kwargs_not_cached(): + # If the user passes a different set of engine_kwargs don't return the same + # jitted function +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + nogil = True + parallel = False + nopython = True +@@ -218,7 +219,7 @@ def test_engine_kwargs_not_cached(): + + @pytest.mark.filterwarnings("ignore") + def test_multiindex_one_key(nogil, parallel, nopython): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def numba_func(values, index): + return 1 +@@ -233,7 +234,7 @@ def test_multiindex_one_key(nogil, paral + + + def test_multiindex_multi_key_not_supported(nogil, parallel, nopython): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + + def numba_func(values, index): + return 1 +@@ -247,7 +248,7 @@ def test_multiindex_multi_key_not_suppor + + + def test_multilabel_numba_vs_cython(numba_supported_reductions): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + reduction, kwargs = numba_supported_reductions + df = DataFrame( + { +@@ -264,7 +265,7 @@ def test_multilabel_numba_vs_cython(numb + + + def test_multilabel_udf_numba_vs_cython(): +- pytest.importorskip("numba") ++ td.versioned_importorskip("numba") + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], +--- a/pandas/tests/indexes/base_class/test_constructors.py ++++ b/pandas/tests/indexes/base_class/test_constructors.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + Index, +@@ -47,7 +48,7 @@ class TestIndexConstructor: + + def test_index_string_inference(self): + # GH#54430 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + expected = Index(["a", "b"], dtype=dtype) + with pd.option_context("future.infer_string", True): +--- a/pandas/tests/indexes/base_class/test_reshape.py ++++ b/pandas/tests/indexes/base_class/test_reshape.py +@@ -4,6 +4,7 @@ Tests for ndarray-like method on the bas + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import Index + import pandas._testing as tm + +@@ -58,7 +59,7 @@ class TestReshape: + + def test_insert_none_into_string_numpy(self): + # GH#55365 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + result = index.insert(-1, None) + expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") +--- a/pandas/tests/indexes/multi/test_constructors.py ++++ b/pandas/tests/indexes/multi/test_constructors.py +@@ -7,6 +7,7 @@ import itertools + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike + + import pandas as pd +@@ -648,7 +649,7 @@ def test_from_frame(): + + def test_from_frame_missing_values_multiIndex(): + # GH 39984 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + df = pd.DataFrame( + { +--- a/pandas/tests/indexes/numeric/test_indexing.py ++++ b/pandas/tests/indexes/numeric/test_indexing.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.errors import InvalidIndexError + + 
from pandas import ( +@@ -385,7 +386,7 @@ class TestGetIndexer: + def test_get_indexer_masked_na_boolean(self, dtype): + # GH#39133 + if dtype == "bool[pyarrow]": +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + idx = Index([True, False, NA], dtype=dtype) + result = idx.get_loc(False) + assert result == 1 +@@ -393,7 +394,7 @@ class TestGetIndexer: + assert result == 2 + + def test_get_indexer_arrow_dictionary_target(self): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + target = Index( + ArrowExtensionArray( + pa.array([1, 2], type=pa.dictionary(pa.int8(), pa.int8())) +--- a/pandas/tests/indexes/test_base.py ++++ b/pandas/tests/indexes/test_base.py +@@ -1285,7 +1285,7 @@ class TestIndex: + + def test_tab_complete_warning(self, ip): + # https://github.com/pandas-dev/pandas/issues/16409 +- pytest.importorskip("IPython", minversion="6.0.0") ++ td.versioned_importorskip("IPython", min_version="6.0.0") + from IPython.core.completer import provisionalcompleter + + code = "import pandas as pd; idx = pd.Index([1, 2])" +--- a/pandas/tests/indexing/test_datetime.py ++++ b/pandas/tests/indexing/test_datetime.py +@@ -2,6 +2,7 @@ import re + + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + DataFrame, +@@ -174,7 +175,7 @@ class TestDatetimeIndex: + + def test_getitem_pyarrow_index(self, frame_or_series): + # GH 53644 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + obj = frame_or_series( + range(5), + index=date_range("2020", freq="D", periods=5).astype( +--- a/pandas/tests/indexing/test_loc.py ++++ b/pandas/tests/indexing/test_loc.py +@@ -1308,7 +1308,7 @@ class TestLocBaseIndependent: + @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) + @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) + def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + spmatrix_t = getattr(sp_sparse, spmatrix_t) + +@@ -1337,7 +1337,7 @@ class TestLocBaseIndependent: + + def test_loc_getitem_sparse_frame(self): + # GH34687 +- sp_sparse = pytest.importorskip("scipy.sparse") ++ sp_sparse = td.versioned_importorskip("scipy.sparse") + + df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5)) + result = df.loc[range(2)] +@@ -3078,7 +3078,7 @@ def test_loc_periodindex_3_levels(): + + def test_loc_setitem_pyarrow_strings(): + # GH#52319 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame( + { + "strings": Series(["A", "B", "C"], dtype="string[pyarrow]"), +--- a/pandas/tests/interchange/test_impl.py ++++ b/pandas/tests/interchange/test_impl.py +@@ -6,6 +6,7 @@ from datetime import ( + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._libs.tslibs import iNaT + from pandas.compat import ( + is_ci_environment, +@@ -67,7 +68,7 @@ def test_categorical_dtype(data, data_ca + + def test_categorical_pyarrow(): + # GH 49889 +- pa = pytest.importorskip("pyarrow", "11.0.0") ++ pa = td.versioned_importorskip("pyarrow", "11.0.0") + + arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"] + table = pa.table({"weekday": pa.array(arr).dictionary_encode()}) +@@ -82,7 +83,7 @@ def test_categorical_pyarrow(): + + def test_empty_categorical_pyarrow(): + # https://github.com/pandas-dev/pandas/issues/53077 +- pa = 
pytest.importorskip("pyarrow", "11.0.0") ++ pa = td.versioned_importorskip("pyarrow", "11.0.0") + + arr = [None] + table = pa.table({"arr": pa.array(arr, "float64").dictionary_encode()}) +@@ -94,7 +95,7 @@ def test_empty_categorical_pyarrow(): + + def test_large_string_pyarrow(): + # GH 52795 +- pa = pytest.importorskip("pyarrow", "11.0.0") ++ pa = td.versioned_importorskip("pyarrow", "11.0.0") + + arr = ["Mon", "Tue"] + table = pa.table({"weekday": pa.array(arr, "large_string")}) +@@ -120,7 +121,7 @@ def test_large_string_pyarrow(): + ) + def test_bitmasks_pyarrow(offset, length, expected_values): + # GH 52795 +- pa = pytest.importorskip("pyarrow", "11.0.0") ++ pa = td.versioned_importorskip("pyarrow", "11.0.0") + + arr = [3.3, None, 2.1] + table = pa.table({"arr": arr}).slice(offset, length) +@@ -282,7 +283,7 @@ def test_categorical_to_numpy_dlpack(): + @pytest.mark.parametrize("data", [{}, {"a": []}]) + def test_empty_pyarrow(data): + # GH 53155 +- pytest.importorskip("pyarrow", "11.0.0") ++ td.versioned_importorskip("pyarrow", "11.0.0") + from pyarrow.interchange import from_dataframe as pa_from_dataframe + + expected = pd.DataFrame(data) +@@ -292,7 +293,7 @@ def test_empty_pyarrow(data): + + + def test_multi_chunk_pyarrow() -> None: +- pa = pytest.importorskip("pyarrow", "11.0.0") ++ pa = td.versioned_importorskip("pyarrow", "11.0.0") + n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + names = ["n_legs"] + table = pa.table([n_legs], names=names) +@@ -305,7 +306,7 @@ def test_multi_chunk_pyarrow() -> None: + + + def test_multi_chunk_column() -> None: +- pytest.importorskip("pyarrow", "11.0.0") ++ td.versioned_importorskip("pyarrow", "11.0.0") + ser = pd.Series([1, 2, None], dtype="Int64[pyarrow]") + df = pd.concat([ser, ser], ignore_index=True).to_frame("a") + df_orig = df.copy() +@@ -327,7 +328,7 @@ def test_multi_chunk_column() -> None: + + def test_timestamp_ns_pyarrow(): + # GH 56712 +- pytest.importorskip("pyarrow", "11.0.0") ++ td.versioned_importorskip("pyarrow", "11.0.0") + timestamp_args = { + "year": 2000, + "month": 1, +@@ -362,7 +363,7 @@ def test_datetimetzdtype(tz, unit): + + def test_interchange_from_non_pandas_tz_aware(request): + # GH 54239, 54287 +- pa = pytest.importorskip("pyarrow", "11.0.0") ++ pa = td.versioned_importorskip("pyarrow", "11.0.0") + import pyarrow.compute as pc + + if is_platform_windows() and is_ci_environment(): +@@ -420,7 +421,7 @@ def test_empty_string_column(): + + def test_large_string(): + # GH#56702 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + expected = pd.DataFrame({"a": ["x"]}, dtype="object") +@@ -500,7 +501,7 @@ def test_pandas_nullable_with_missing_va + ) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + # https://github.com/pandas-dev/pandas/issues/57664 +- pa = pytest.importorskip("pyarrow", "11.0.0") ++ pa = td.versioned_importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": +@@ -562,7 +563,7 @@ def test_pandas_nullable_without_missing + data: list, dtype: str, expected_dtype: str + ) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 +- pa = pytest.importorskip("pyarrow", "11.0.0") ++ pa = td.versioned_importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": +@@ -578,7 +579,7 @@ def 
test_pandas_nullable_without_missing + + def test_string_validity_buffer() -> None: + # https://github.com/pandas-dev/pandas/issues/57761 +- pytest.importorskip("pyarrow", "11.0.0") ++ td.versioned_importorskip("pyarrow", "11.0.0") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"] + assert result is None +@@ -586,7 +587,7 @@ def test_string_validity_buffer() -> Non + + def test_string_validity_buffer_no_missing() -> None: + # https://github.com/pandas-dev/pandas/issues/57762 +- pytest.importorskip("pyarrow", "11.0.0") ++ td.versioned_importorskip("pyarrow", "11.0.0") + df = pd.DataFrame({"a": ["x", None]}, dtype="large_string[pyarrow]") + validity = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"] + assert validity is not None +--- a/pandas/tests/interchange/test_utils.py ++++ b/pandas/tests/interchange/test_utils.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas.core.interchange.utils import dtype_to_arrow_c_fmt + +@@ -78,7 +79,7 @@ def test_dtype_to_arrow_c_fmt(pandas_dty + ) + def test_dtype_to_arrow_c_fmt_arrowdtype(pa_dtype, args_kwargs, c_string): + # GH 52323 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + if not args_kwargs: + pa_type = getattr(pa, pa_dtype)() + elif isinstance(args_kwargs, tuple): +--- a/pandas/tests/io/conftest.py ++++ b/pandas/tests/io/conftest.py +@@ -58,8 +58,8 @@ def s3_base(worker_id, monkeypatch): + Sets up moto server in separate process locally + Return url for motoserver/moto CI service + """ +- pytest.importorskip("s3fs") +- pytest.importorskip("boto3") ++ td.versioned_importorskip("s3fs") ++ td.versioned_importorskip("boto3") + + # temporary workaround as moto fails for botocore >= 1.11 otherwise, + # see https://github.com/spulec/moto/issues/1924 & 1952 +@@ -80,9 +80,9 @@ def s3_base(worker_id, monkeypatch): + # set in .github/workflows/unit-tests.yml + yield "http://localhost:5000" + else: +- requests = pytest.importorskip("requests") +- pytest.importorskip("moto") +- pytest.importorskip("flask") # server mode needs flask too ++ requests = td.versioned_importorskip("requests") ++ td.versioned_importorskip("moto") ++ td.versioned_importorskip("flask") # server mode needs flask too + + # Launching moto in server mode, i.e., as a separate process + # with an S3 endpoint on localhost +--- a/pandas/tests/io/excel/test_odf.py ++++ b/pandas/tests/io/excel/test_odf.py +@@ -3,12 +3,13 @@ import functools + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat import is_platform_windows + + import pandas as pd + import pandas._testing as tm + +-pytest.importorskip("odf") ++td.versioned_importorskip("odf") + + if is_platform_windows(): + pytestmark = pytest.mark.single_cpu +--- a/pandas/tests/io/excel/test_odswriter.py ++++ b/pandas/tests/io/excel/test_odswriter.py +@@ -6,6 +6,7 @@ import re + + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat import is_platform_windows + + import pandas as pd +@@ -13,7 +14,7 @@ import pandas._testing as tm + + from pandas.io.excel import ExcelWriter + +-odf = pytest.importorskip("odf") ++odf = td.versioned_importorskip("odf") + + if is_platform_windows(): + pytestmark = pytest.mark.single_cpu +--- a/pandas/tests/io/excel/test_openpyxl.py ++++ b/pandas/tests/io/excel/test_openpyxl.py +@@ -5,6 +5,7 @@ import re + 
import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat import is_platform_windows + + import pandas as pd +@@ -17,7 +18,7 @@ from pandas.io.excel import ( + ) + from pandas.io.excel._openpyxl import OpenpyxlReader + +-openpyxl = pytest.importorskip("openpyxl") ++openpyxl = td.versioned_importorskip("openpyxl") + + if is_platform_windows(): + pytestmark = pytest.mark.single_cpu +--- a/pandas/tests/io/excel/test_readers.py ++++ b/pandas/tests/io/excel/test_readers.py +@@ -667,7 +667,7 @@ class TestReaders: + if read_ext in (".xlsb", ".xls"): + pytest.skip(f"No engine for filetype: '{read_ext}'") + +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + with pd.option_context("mode.string_storage", string_storage): + df = DataFrame( +--- a/pandas/tests/io/excel/test_style.py ++++ b/pandas/tests/io/excel/test_style.py +@@ -16,7 +16,7 @@ import pandas._testing as tm + from pandas.io.excel import ExcelWriter + from pandas.io.formats.excel import ExcelFormatter + +-pytest.importorskip("jinja2") ++td.versioned_importorskip("jinja2") + # jinja2 is currently required for Styler.__init__(). Technically Styler.to_excel + # could compute styles and render to excel without jinja2, since there is no + # 'template' file, but this needs the import error to delayed until render time. +@@ -41,14 +41,14 @@ def assert_equal_cell_styles(cell1, cell + ) + def test_styler_to_excel_unstyled(engine): + # compare DataFrame.to_excel and Styler.to_excel when no styles applied +- pytest.importorskip(engine) ++ td.versioned_importorskip(engine) + df = DataFrame(np.random.default_rng(2).standard_normal((2, 2))) + with tm.ensure_clean(".xlsx") as path: + with ExcelWriter(path, engine=engine) as writer: + df.to_excel(writer, sheet_name="dataframe") + df.style.to_excel(writer, sheet_name="unstyled") + +- openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl ++ openpyxl = td.versioned_importorskip("openpyxl") # test loading only with openpyxl + with contextlib.closing(openpyxl.load_workbook(path)) as wb: + for col1, col2 in zip(wb["dataframe"].columns, wb["unstyled"].columns): + assert len(col1) == len(col2) +@@ -133,7 +133,7 @@ shared_style_params = [ + ) + @pytest.mark.parametrize("css, attrs, expected", shared_style_params) + def test_styler_to_excel_basic(engine, css, attrs, expected): +- pytest.importorskip(engine) ++ td.versioned_importorskip(engine) + df = DataFrame(np.random.default_rng(2).standard_normal((1, 1))) + styler = df.style.map(lambda x: css) + +@@ -142,7 +142,7 @@ def test_styler_to_excel_basic(engine, c + df.to_excel(writer, sheet_name="dataframe") + styler.to_excel(writer, sheet_name="styled") + +- openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl ++ openpyxl = td.versioned_importorskip("openpyxl") # test loading only with openpyxl + with contextlib.closing(openpyxl.load_workbook(path)) as wb: + # test unstyled data cell does not have expected styles + # test styled cell has expected styles +@@ -164,7 +164,7 @@ def test_styler_to_excel_basic(engine, c + ) + @pytest.mark.parametrize("css, attrs, expected", shared_style_params) + def test_styler_to_excel_basic_indexes(engine, css, attrs, expected): +- pytest.importorskip(engine) ++ td.versioned_importorskip(engine) + df = DataFrame(np.random.default_rng(2).standard_normal((1, 1))) + + styler = df.style +@@ -181,7 +181,7 @@ def test_styler_to_excel_basic_indexes(e + null_styler.to_excel(writer, sheet_name="null_styled") + 
styler.to_excel(writer, sheet_name="styled") + +- openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl ++ openpyxl = td.versioned_importorskip("openpyxl") # test loading only with openpyxl + with contextlib.closing(openpyxl.load_workbook(path)) as wb: + # test null styled index cells does not have expected styles + # test styled cell has expected styles +@@ -233,7 +233,7 @@ def test_styler_to_excel_border_style(en + attrs = ["border", "left", "style"] + expected = border_style + +- pytest.importorskip(engine) ++ td.versioned_importorskip(engine) + df = DataFrame(np.random.default_rng(2).standard_normal((1, 1))) + styler = df.style.map(lambda x: css) + +@@ -242,7 +242,7 @@ def test_styler_to_excel_border_style(en + df.to_excel(writer, sheet_name="dataframe") + styler.to_excel(writer, sheet_name="styled") + +- openpyxl = pytest.importorskip("openpyxl") # test loading only with openpyxl ++ openpyxl = td.versioned_importorskip("openpyxl") # test loading only with openpyxl + with contextlib.closing(openpyxl.load_workbook(path)) as wb: + # test unstyled data cell does not have expected styles + # test styled cell has expected styles +@@ -259,7 +259,7 @@ def test_styler_to_excel_border_style(en + + + def test_styler_custom_converter(): +- openpyxl = pytest.importorskip("openpyxl") ++ openpyxl = td.versioned_importorskip("openpyxl") + + def custom_converter(css): + return {"font": {"color": {"rgb": "111222"}}} +--- a/pandas/tests/io/excel/test_xlrd.py ++++ b/pandas/tests/io/excel/test_xlrd.py +@@ -3,6 +3,7 @@ import io + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat import is_platform_windows + + import pandas as pd +@@ -11,7 +12,7 @@ import pandas._testing as tm + from pandas.io.excel import ExcelFile + from pandas.io.excel._base import inspect_excel_format + +-xlrd = pytest.importorskip("xlrd") ++xlrd = td.versioned_importorskip("xlrd") + + if is_platform_windows(): + pytestmark = pytest.mark.single_cpu +--- a/pandas/tests/io/excel/test_xlsxwriter.py ++++ b/pandas/tests/io/excel/test_xlsxwriter.py +@@ -2,6 +2,7 @@ import contextlib + + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat import is_platform_windows + + from pandas import DataFrame +@@ -9,7 +10,7 @@ import pandas._testing as tm + + from pandas.io.excel import ExcelWriter + +-xlsxwriter = pytest.importorskip("xlsxwriter") ++xlsxwriter = td.versioned_importorskip("xlsxwriter") + + if is_platform_windows(): + pytestmark = pytest.mark.single_cpu +@@ -23,7 +24,7 @@ def ext(): + def test_column_format(ext): + # Test that column formats are applied to cells. Test for issue #9167. + # Applicable to xlsxwriter only. 
+- openpyxl = pytest.importorskip("openpyxl") ++ openpyxl = td.versioned_importorskip("openpyxl") + + with tm.ensure_clean(ext) as path: + frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]}) +--- a/pandas/tests/io/formats/style/test_bar.py ++++ b/pandas/tests/io/formats/style/test_bar.py +@@ -3,13 +3,14 @@ import io + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + NA, + DataFrame, + read_csv, + ) + +-pytest.importorskip("jinja2") ++td.versioned_importorskip("jinja2") + + + def bar_grad(a=None, b=None, c=None, d=None): +--- a/pandas/tests/io/formats/style/test_exceptions.py ++++ b/pandas/tests/io/formats/style/test_exceptions.py +@@ -1,6 +1,7 @@ + import pytest + +-jinja2 = pytest.importorskip("jinja2") ++import pandas.util._test_decorators as td ++jinja2 = td.versioned_importorskip("jinja2") + + from pandas import ( + DataFrame, +--- a/pandas/tests/io/formats/style/test_format.py ++++ b/pandas/tests/io/formats/style/test_format.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + NA, + DataFrame, +@@ -11,7 +12,7 @@ from pandas import ( + option_context, + ) + +-pytest.importorskip("jinja2") ++td.versioned_importorskip("jinja2") + from pandas.io.formats.style import Styler + from pandas.io.formats.style_render import _str_escape + +--- a/pandas/tests/io/formats/style/test_highlight.py ++++ b/pandas/tests/io/formats/style/test_highlight.py +@@ -1,13 +1,14 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + NA, + DataFrame, + IndexSlice, + ) + +-pytest.importorskip("jinja2") ++td.versioned_importorskip("jinja2") + + from pandas.io.formats.style import Styler + +--- a/pandas/tests/io/formats/style/test_html.py ++++ b/pandas/tests/io/formats/style/test_html.py +@@ -6,13 +6,14 @@ from textwrap import ( + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + MultiIndex, + option_context, + ) + +-jinja2 = pytest.importorskip("jinja2") ++jinja2 = td.versioned_importorskip("jinja2") + from pandas.io.formats.style import Styler + + +--- a/pandas/tests/io/formats/style/test_matplotlib.py ++++ b/pandas/tests/io/formats/style/test_matplotlib.py +@@ -3,14 +3,15 @@ import gc + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + IndexSlice, + Series, + ) + +-pytest.importorskip("matplotlib") +-pytest.importorskip("jinja2") ++td.versioned_importorskip("matplotlib") ++td.versioned_importorskip("jinja2") + + import matplotlib as mpl + +@@ -23,9 +24,9 @@ def mpl_cleanup(): + # 1) Resets units registry + # 2) Resets rc_context + # 3) Closes all figures +- mpl = pytest.importorskip("matplotlib") +- mpl_units = pytest.importorskip("matplotlib.units") +- plt = pytest.importorskip("matplotlib.pyplot") ++ mpl = td.versioned_importorskip("matplotlib") ++ mpl_units = td.versioned_importorskip("matplotlib.units") ++ plt = td.versioned_importorskip("matplotlib.pyplot") + orig_units_registry = mpl_units.registry.copy() + with mpl.rc_context(): + mpl.use("template") +--- a/pandas/tests/io/formats/style/test_non_unique.py ++++ b/pandas/tests/io/formats/style/test_non_unique.py +@@ -2,12 +2,13 @@ from textwrap import dedent + + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + IndexSlice, + ) + +-pytest.importorskip("jinja2") 
++td.versioned_importorskip("jinja2") + + from pandas.io.formats.style import Styler + +--- a/pandas/tests/io/formats/style/test_style.py ++++ b/pandas/tests/io/formats/style/test_style.py +@@ -16,7 +16,7 @@ from pandas import ( + import pandas._testing as tm + import pandas.util._test_decorators as td + +-jinja2 = pytest.importorskip("jinja2") ++jinja2 = td.versioned_importorskip("jinja2") + from pandas.io.formats.style import ( # isort:skip + Styler, + ) +--- a/pandas/tests/io/formats/style/test_to_latex.py ++++ b/pandas/tests/io/formats/style/test_to_latex.py +@@ -3,6 +3,7 @@ from textwrap import dedent + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + MultiIndex, +@@ -10,7 +11,7 @@ from pandas import ( + option_context, + ) + +-pytest.importorskip("jinja2") ++td.versioned_importorskip("jinja2") + from pandas.io.formats.style import Styler + from pandas.io.formats.style_render import ( + _parse_latex_cell_styles, +--- a/pandas/tests/io/formats/style/test_to_string.py ++++ b/pandas/tests/io/formats/style/test_to_string.py +@@ -2,12 +2,13 @@ from textwrap import dedent + + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + Series, + ) + +-pytest.importorskip("jinja2") ++td.versioned_importorskip("jinja2") + from pandas.io.formats.style import Styler + + +--- a/pandas/tests/io/formats/style/test_tooltip.py ++++ b/pandas/tests/io/formats/style/test_tooltip.py +@@ -1,9 +1,10 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import DataFrame + +-pytest.importorskip("jinja2") ++td.versioned_importorskip("jinja2") + from pandas.io.formats.style import Styler + + +--- a/pandas/tests/io/formats/test_format.py ++++ b/pandas/tests/io/formats/test_format.py +@@ -11,6 +11,7 @@ from shutil import get_terminal_size + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._config import using_pyarrow_string_dtype + + import pandas as pd +@@ -2268,7 +2269,7 @@ def test_filepath_or_buffer_arg( + ): + df = DataFrame([data]) + if method in ["to_latex"]: # uses styler implementation +- pytest.importorskip("jinja2") ++ td.versioned_importorskip("jinja2") + + if filepath_or_buffer_id not in ["string", "pathlike"] and encoding is not None: + with pytest.raises( +@@ -2287,7 +2288,7 @@ def test_filepath_or_buffer_arg( + @pytest.mark.parametrize("method", ["to_string", "to_html", "to_latex"]) + def test_filepath_or_buffer_bad_arg_raises(float_frame, method): + if method in ["to_latex"]: # uses styler implementation +- pytest.importorskip("jinja2") ++ td.versioned_importorskip("jinja2") + msg = "buf is not a file name and it has no write method" + with pytest.raises(TypeError, match=msg): + getattr(float_frame, method)(buf=object()) +--- a/pandas/tests/io/formats/test_to_excel.py ++++ b/pandas/tests/io/formats/test_to_excel.py +@@ -6,6 +6,7 @@ import string + + import pytest + ++import pandas.util._test_decorators as td + from pandas.errors import CSSWarning + + import pandas._testing as tm +@@ -336,7 +337,7 @@ def tests_css_named_colors_valid(): + + + def test_css_named_colors_from_mpl_present(): +- mpl_colors = pytest.importorskip("matplotlib.colors") ++ mpl_colors = td.versioned_importorskip("matplotlib.colors") + + pd_colors = CSSToExcelConverter.NAMED_COLORS + for name, color in mpl_colors.CSS4_COLORS.items(): +--- a/pandas/tests/io/formats/test_to_latex.py ++++ b/pandas/tests/io/formats/test_to_latex.py 
+@@ -4,6 +4,7 @@ from textwrap import dedent + + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + DataFrame, +@@ -11,7 +12,7 @@ from pandas import ( + ) + import pandas._testing as tm + +-pytest.importorskip("jinja2") ++td.versioned_importorskip("jinja2") + + + def _dedent(string): +--- a/pandas/tests/io/formats/test_to_markdown.py ++++ b/pandas/tests/io/formats/test_to_markdown.py +@@ -5,10 +5,11 @@ from io import ( + + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + import pandas._testing as tm + +-pytest.importorskip("tabulate") ++td.versioned_importorskip("tabulate") + + + def test_simple(): +--- a/pandas/tests/io/formats/test_to_string.py ++++ b/pandas/tests/io/formats/test_to_string.py +@@ -10,6 +10,7 @@ from textwrap import dedent + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._config import using_pyarrow_string_dtype + + from pandas import ( +@@ -748,7 +749,7 @@ class TestDataFrameToString: + + def test_to_string_string_dtype(self): + # GH#50099 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame( + {"x": ["foo", "bar", "baz"], "y": ["a", "b", "c"], "z": [1, 2, 3]} + ) +--- a/pandas/tests/io/json/test_pandas.py ++++ b/pandas/tests/io/json/test_pandas.py +@@ -2034,7 +2034,7 @@ class TestPandasContainer: + self, string_storage, dtype_backend, orient, using_infer_string + ): + # GH#50750 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + df = DataFrame( + { + "a": Series([1, np.nan, 3], dtype="Int64"), +@@ -2056,7 +2056,7 @@ class TestPandasContainer: + string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) + + elif dtype_backend == "pyarrow": +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) +@@ -2103,7 +2103,7 @@ class TestPandasContainer: + @pytest.mark.parametrize("orient", ["split", "records", "index"]) + def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): + # GH#50750 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + ser = Series([1, np.nan, 3], dtype="Int64") + + out = ser.to_json(orient=orient) +@@ -2147,7 +2147,7 @@ def test_pyarrow_engine_lines_false(): + + + def test_json_roundtrip_string_inference(orient): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame( + [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] + ) +--- a/pandas/tests/io/json/test_readlines.py ++++ b/pandas/tests/io/json/test_readlines.py +@@ -5,6 +5,7 @@ from pathlib import Path + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + DataFrame, +@@ -28,7 +29,7 @@ def lines_json_df(): + @pytest.fixture(params=["ujson", "pyarrow"]) + def engine(request): + if request.param == "pyarrow": +- pytest.importorskip("pyarrow.json") ++ td.versioned_importorskip("pyarrow.json") + return request.param + + +--- a/pandas/tests/io/parser/conftest.py ++++ b/pandas/tests/io/parser/conftest.py +@@ -4,6 +4,7 @@ import os + + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat._optional import VERSIONS + + from pandas import ( +@@ -135,7 +136,7 @@ def all_parsers(request): + """ + parser = request.param() + if 
parser.engine == "pyarrow": +- pytest.importorskip("pyarrow", VERSIONS["pyarrow"]) ++ td.versioned_importorskip("pyarrow", VERSIONS["pyarrow"]) + # Try finding a way to disable threads all together + # for more stable CI runs + import pyarrow +--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py ++++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +@@ -8,6 +8,7 @@ from io import StringIO + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.errors import ParserWarning + + import pandas as pd +@@ -460,7 +461,7 @@ def test_dtype_backend_and_dtype(all_par + + def test_dtype_backend_string(all_parsers, string_storage): + # GH#36712 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + with pd.option_context("mode.string_storage", string_storage): + parser = all_parsers +@@ -503,7 +504,7 @@ def test_dtype_backend_ea_dtype_specifie + + def test_dtype_backend_pyarrow(all_parsers, request): + # GH#36712 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + parser = all_parsers + + data = """a,b,c,d,e,f,g,h,i,j +@@ -556,7 +557,7 @@ def test_ea_int_avoid_overflow(all_parse + + def test_string_inference(all_parsers): + # GH#54430 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + + data = """a,b +@@ -577,7 +578,7 @@ y,2 + @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) + def test_string_inference_object_dtype(all_parsers, dtype): + # GH#56047 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + + data = """a,b + x,a +--- a/pandas/tests/io/parser/test_concatenate_chunks.py ++++ b/pandas/tests/io/parser/test_concatenate_chunks.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.errors import DtypeWarning + + import pandas._testing as tm +@@ -11,7 +12,7 @@ from pandas.io.parsers.c_parser_wrapper + + def test_concatenate_chunks_pyarrow(): + # GH#51876 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + chunks = [ + {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}, + {0: ArrowExtensionArray(pa.array([1, 2]))}, +@@ -23,7 +24,7 @@ def test_concatenate_chunks_pyarrow(): + + def test_concatenate_chunks_pyarrow_strings(): + # GH#51876 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + chunks = [ + {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}, + {0: ArrowExtensionArray(pa.array(["a", "b"]))}, +--- a/pandas/tests/io/parser/test_network.py ++++ b/pandas/tests/io/parser/test_network.py +@@ -80,7 +80,7 @@ class TestS3: + def test_parse_public_s3_bucket(self, s3_public_bucket_with_data, tips_df, s3so): + # more of an integration test due to the not-public contents portion + # can probably mock this though. 
+- pytest.importorskip("s3fs") ++ td.versioned_importorskip("s3fs") + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv" + ext, +@@ -93,7 +93,7 @@ class TestS3: + + def test_parse_private_s3_bucket(self, s3_private_bucket_with_data, tips_df, s3so): + # Read public file from bucket with not-public contents +- pytest.importorskip("s3fs") ++ td.versioned_importorskip("s3fs") + df = read_csv( + f"s3://{s3_private_bucket_with_data.name}/tips.csv", storage_options=s3so + ) +@@ -258,7 +258,7 @@ class TestS3: + def test_write_s3_parquet_fails(self, tips_df, s3so): + # GH 27679 + # Attempting to write to an invalid S3 path should raise +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + import botocore + + # GH 34087 +@@ -318,7 +318,7 @@ class TestS3: + self, s3_public_bucket_with_data, feather_file, s3so + ): + # GH 29055 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + expected = read_feather(feather_file) + res = read_feather( + f"s3://{s3_public_bucket_with_data.name}/simple_dataset.feather", +--- a/pandas/tests/io/parser/test_python_parser_only.py ++++ b/pandas/tests/io/parser/test_python_parser_only.py +@@ -17,6 +17,7 @@ from typing import TYPE_CHECKING + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.errors import ( + ParserError, + ParserWarning, +@@ -167,7 +168,7 @@ def test_decompression_regex_sep(python_ + data = data.replace(b",", b"::") + expected = parser.read_csv(csv1) + +- module = pytest.importorskip(compression) ++ module = td.versioned_importorskip(compression) + klass = getattr(module, klass) + + with tm.ensure_clean() as path: +--- a/pandas/tests/io/parser/test_read_fwf.py ++++ b/pandas/tests/io/parser/test_read_fwf.py +@@ -14,6 +14,7 @@ from pathlib import Path + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.errors import EmptyDataError + + import pandas as pd +@@ -972,13 +973,13 @@ def test_dtype_backend(string_storage, d + arr = StringArray(np.array(["a", "b"], dtype=np.object_)) + arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_)) + elif dtype_backend == "pyarrow": +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + arr = ArrowExtensionArray(pa.array(["a", "b"])) + arr_na = ArrowExtensionArray(pa.array([None, "a"])) + else: +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + arr = ArrowStringArray(pa.array(["a", "b"])) + arr_na = ArrowStringArray(pa.array([None, "a"])) + +@@ -1002,7 +1003,7 @@ def test_dtype_backend(string_storage, d + } + ) + if dtype_backend == "pyarrow": +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + expected = DataFrame( +--- a/pandas/tests/io/parser/test_upcast.py ++++ b/pandas/tests/io/parser/test_upcast.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._libs.parsers import ( + _maybe_upcast, + na_values, +@@ -87,7 +88,7 @@ def test_maybe_upcaste_all_nan(): + @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) + def test_maybe_upcast_object(val, string_storage): + # GH#36712 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + with pd.option_context("mode.string_storage", string_storage): + arr = 
np.array(["a", "b", val], dtype=np.object_) +--- a/pandas/tests/io/pytables/common.py ++++ b/pandas/tests/io/pytables/common.py +@@ -5,9 +5,10 @@ import tempfile + + import pytest + ++import pandas.util._test_decorators as td + from pandas.io.pytables import HDFStore + +-tables = pytest.importorskip("tables") ++tables = td.versioned_importorskip("tables") + # set these parameters so we don't have file sharing + tables.parameters.MAX_NUMEXPR_THREADS = 1 + tables.parameters.MAX_BLOSC_THREADS = 1 +--- a/pandas/tests/io/pytables/test_append.py ++++ b/pandas/tests/io/pytables/test_append.py +@@ -29,7 +29,7 @@ is_crashing_arch=bool((platform.uname()[ + + pytestmark = pytest.mark.single_cpu + +-tables = pytest.importorskip("tables") ++tables = td.versioned_importorskip("tables") + + + @pytest.mark.filterwarnings("ignore::tables.NaturalNameWarning") +--- a/pandas/tests/io/pytables/test_compat.py ++++ b/pandas/tests/io/pytables/test_compat.py +@@ -1,9 +1,10 @@ + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + import pandas._testing as tm + +-tables = pytest.importorskip("tables") ++tables = td.versioned_importorskip("tables") + + + @pytest.fixture +--- a/pandas/tests/io/pytables/test_read.py ++++ b/pandas/tests/io/pytables/test_read.py +@@ -401,7 +401,7 @@ def test_read_py2_hdf_file_in_py3(datapa + + def test_read_infer_string(tmp_path, setup_path): + # GH#54431 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame({"a": ["a", "b", None]}) + path = tmp_path / setup_path + df.to_hdf(path, key="data", format="table") +--- a/pandas/tests/io/pytables/test_round_trip.py ++++ b/pandas/tests/io/pytables/test_round_trip.py +@@ -565,7 +565,7 @@ def test_round_trip_equals(tmp_path, set + + def test_infer_string_columns(tmp_path, setup_path): + # GH# +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + path = tmp_path / setup_path + with pd.option_context("future.infer_string", True): + df = DataFrame(1, columns=list("ABCD"), index=list(range(10))).set_index( +--- a/pandas/tests/io/pytables/test_store.py ++++ b/pandas/tests/io/pytables/test_store.py +@@ -7,6 +7,7 @@ import time + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + DataFrame, +@@ -37,7 +38,7 @@ is_crashing_arch=bool((platform.uname()[ + + pytestmark = pytest.mark.single_cpu + +-tables = pytest.importorskip("tables") ++tables = td.versioned_importorskip("tables") + + + def test_context(setup_path): +--- a/pandas/tests/io/pytables/test_subclass.py ++++ b/pandas/tests/io/pytables/test_subclass.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + Series, +@@ -12,7 +13,7 @@ from pandas.io.pytables import ( + read_hdf, + ) + +-pytest.importorskip("tables") ++td.versioned_importorskip("tables") + + + class TestHDFStoreSubclass: +--- a/pandas/tests/io/test_clipboard.py ++++ b/pandas/tests/io/test_clipboard.py +@@ -3,6 +3,7 @@ from textwrap import dedent + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.errors import ( + PyperclipException, + PyperclipWindowsException, +@@ -353,14 +354,14 @@ class TestClipboard: + ): + # GH#50502 + if string_storage == "pyarrow" or dtype_backend == "pyarrow": +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + if string_storage == "python": + string_array = 
StringArray(np.array(["x", "y"], dtype=np.object_)) + string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + + elif dtype_backend == "pyarrow" and engine != "c": +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["x", "y"])) +--- a/pandas/tests/io/test_common.py ++++ b/pandas/tests/io/test_common.py +@@ -100,7 +100,7 @@ bar2,12,13,14,15 + + def test_stringify_file_and_path_like(self): + # GH 38125: do not stringify file objects that are also path-like +- fsspec = pytest.importorskip("fsspec") ++ fsspec = td.versioned_importorskip("fsspec") + with tm.ensure_clean() as path: + with fsspec.open(f"file://{path}", mode="wb") as fsspec_obj: + assert fsspec_obj == icom.stringify_path(fsspec_obj) +@@ -153,7 +153,7 @@ Look,a snake,🐍""" + + # Test that pyarrow can handle a file opened with get_handle + def test_get_handle_pyarrow_compat(self): +- pa_csv = pytest.importorskip("pyarrow.csv") ++ pa_csv = td.versioned_importorskip("pyarrow.csv") + + # Test latin1, ucs-2, and ucs-4 chars + data = """a,b,c +@@ -196,7 +196,7 @@ Look,a snake,🐍""" + ], + ) + def test_read_non_existent(self, reader, module, error_class, fn_ext): +- pytest.importorskip(module) ++ td.versioned_importorskip(module) + + path = os.path.join(HERE, "data", "does_not_exist." + fn_ext) + msg1 = rf"File (b')?.+does_not_exist\.{fn_ext}'? does not exist" +@@ -234,7 +234,7 @@ Look,a snake,🐍""" + ) + # NOTE: Missing parent directory for pd.DataFrame.to_hdf is handled by PyTables + def test_write_missing_parent_directory(self, method, module, error_class, fn_ext): +- pytest.importorskip(module) ++ td.versioned_importorskip(module) + + dummy_frame = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]}) + +@@ -264,7 +264,7 @@ Look,a snake,🐍""" + def test_read_expands_user_home_dir( + self, reader, module, error_class, fn_ext, monkeypatch + ): +- pytest.importorskip(module) ++ td.versioned_importorskip(module) + + path = os.path.join("~", "does_not_exist." 
+ fn_ext) + monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x)) +@@ -321,7 +321,7 @@ Look,a snake,🐍""" + ], + ) + def test_read_fspath_all(self, reader, module, path, datapath): +- pytest.importorskip(module) ++ td.versioned_importorskip(module) + path = datapath(*path) + + mypath = CustomFSPath(path) +@@ -349,13 +349,13 @@ Look,a snake,🐍""" + ) + def test_write_fspath_all(self, writer_name, writer_kwargs, module): + if writer_name in ["to_latex"]: # uses Styler implementation +- pytest.importorskip("jinja2") ++ td.versioned_importorskip("jinja2") + p1 = tm.ensure_clean("string") + p2 = tm.ensure_clean("fspath") + df = pd.DataFrame({"A": [1, 2]}) + + with p1 as string, p2 as fspath: +- pytest.importorskip(module) ++ td.versioned_importorskip(module) + mypath = CustomFSPath(fspath) + writer = getattr(df, writer_name) + +@@ -377,7 +377,7 @@ Look,a snake,🐍""" + # Same test as write_fspath_all, except HDF5 files aren't + # necessarily byte-for-byte identical for a given dataframe, so we'll + # have to read and compare equality +- pytest.importorskip("tables") ++ td.versioned_importorskip("tables") + + df = pd.DataFrame({"A": [1, 2]}) + p1 = tm.ensure_clean("string") +--- a/pandas/tests/io/test_feather.py ++++ b/pandas/tests/io/test_feather.py +@@ -2,6 +2,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + import pandas._testing as tm + from pandas.core.arrays import ( +@@ -15,7 +16,7 @@ pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ) + +-pa = pytest.importorskip("pyarrow") ++pa = td.versioned_importorskip("pyarrow") + + + @pytest.mark.single_cpu +--- a/pandas/tests/io/test_fsspec.py ++++ b/pandas/tests/io/test_fsspec.py +@@ -25,7 +25,7 @@ pytestmark = pytest.mark.filterwarnings( + + @pytest.fixture + def fsspectest(): +- pytest.importorskip("fsspec") ++ td.versioned_importorskip("fsspec") + from fsspec import register_implementation + from fsspec.implementations.memory import MemoryFileSystem + from fsspec.registry import _registry as registry +@@ -59,7 +59,7 @@ def df1(): + + @pytest.fixture + def cleared_fs(): +- fsspec = pytest.importorskip("fsspec") ++ fsspec = td.versioned_importorskip("fsspec") + + memfs = fsspec.filesystem("memory") + yield memfs +@@ -99,7 +99,7 @@ def test_to_csv(cleared_fs, df1): + + + def test_to_excel(cleared_fs, df1): +- pytest.importorskip("openpyxl") ++ td.versioned_importorskip("openpyxl") + ext = "xlsx" + path = f"memory://test/test.{ext}" + df1.to_excel(path, index=True) +@@ -111,7 +111,7 @@ def test_to_excel(cleared_fs, df1): + + @pytest.mark.parametrize("binary_mode", [False, True]) + def test_to_csv_fsspec_object(cleared_fs, binary_mode, df1): +- fsspec = pytest.importorskip("fsspec") ++ fsspec = td.versioned_importorskip("fsspec") + + path = "memory://test/test.csv" + mode = "wb" if binary_mode else "w" +@@ -153,7 +153,7 @@ def test_read_table_options(fsspectest): + + + def test_excel_options(fsspectest): +- pytest.importorskip("openpyxl") ++ td.versioned_importorskip("openpyxl") + extension = "xlsx" + + df = DataFrame({"a": [0]}) +@@ -168,7 +168,7 @@ def test_excel_options(fsspectest): + + def test_to_parquet_new_file(cleared_fs, df1): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" +- pytest.importorskip("fastparquet") ++ td.versioned_importorskip("fastparquet") + + df1.to_parquet( + "memory://test/test.csv", index=True, engine="fastparquet", compression=None +@@ -177,7 +177,7 @@ 
def test_to_parquet_new_file(cleared_fs, + + def test_arrowparquet_options(fsspectest): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame({"a": [0]}) + df.to_parquet( + "testmem://test/test.csv", +@@ -197,7 +197,7 @@ def test_arrowparquet_options(fsspectest + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet + def test_fastparquet_options(fsspectest): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" +- pytest.importorskip("fastparquet") ++ td.versioned_importorskip("fastparquet") + + df = DataFrame({"a": [0]}) + df.to_parquet( +@@ -217,7 +217,7 @@ def test_fastparquet_options(fsspectest) + + @pytest.mark.single_cpu + def test_from_s3_csv(s3_public_bucket_with_data, tips_file, s3so): +- pytest.importorskip("s3fs") ++ td.versioned_importorskip("s3fs") + tm.assert_equal( + read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv", storage_options=s3so +@@ -242,7 +242,7 @@ def test_from_s3_csv(s3_public_bucket_wi + @pytest.mark.single_cpu + @pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"]) + def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so): +- pytest.importorskip("s3fs") ++ td.versioned_importorskip("s3fs") + tm.assert_equal( + read_csv( + f"{protocol}://{s3_public_bucket_with_data.name}/tips.csv", +@@ -255,8 +255,8 @@ def test_s3_protocols(s3_public_bucket_w + @pytest.mark.single_cpu + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet + def test_s3_parquet(s3_public_bucket, s3so, df1): +- pytest.importorskip("fastparquet") +- pytest.importorskip("s3fs") ++ td.versioned_importorskip("fastparquet") ++ td.versioned_importorskip("s3fs") + + fn = f"s3://{s3_public_bucket.name}/test.parquet" + df1.to_parquet( +@@ -274,7 +274,7 @@ def test_not_present_exception(): + + + def test_feather_options(fsspectest): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame({"a": [0]}) + df.to_feather("testmem://mockfile", storage_options={"test": "feather_write"}) + assert fsspectest.test[0] == "feather_write" +@@ -321,7 +321,7 @@ def test_stata_options(fsspectest): + + + def test_markdown_options(fsspectest): +- pytest.importorskip("tabulate") ++ td.versioned_importorskip("tabulate") + df = DataFrame({"a": [0]}) + df.to_markdown("testmem://mockfile", storage_options={"test": "md_write"}) + assert fsspectest.test[0] == "md_write" +@@ -329,7 +329,7 @@ def test_markdown_options(fsspectest): + + + def test_non_fsspec_options(): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + with pytest.raises(ValueError, match="storage_options"): + read_csv("localfile", storage_options={"a": True}) + with pytest.raises(ValueError, match="storage_options"): +--- a/pandas/tests/io/test_gcs.py ++++ b/pandas/tests/io/test_gcs.py +@@ -27,8 +27,8 @@ pytestmark = pytest.mark.filterwarnings( + @pytest.fixture + def gcs_buffer(): + """Emulate GCS using a binary buffer.""" +- pytest.importorskip("gcsfs") +- fsspec = pytest.importorskip("fsspec") ++ td.versioned_importorskip("gcsfs") ++ fsspec = td.versioned_importorskip("fsspec") + + gcs_buffer = BytesIO() + gcs_buffer.close = lambda: True +@@ -81,8 +81,8 @@ def test_to_read_gcs(gcs_buffer, format, + df1.to_json(path) + df2 = read_json(path, convert_dates=["dt"]) + elif format == "parquet": +- pytest.importorskip("pyarrow") +- pa_fs = pytest.importorskip("pyarrow.fs") ++ 
td.versioned_importorskip("pyarrow") ++ pa_fs = td.versioned_importorskip("pyarrow.fs") + + class MockFileSystem(pa_fs.FileSystem): + @staticmethod +@@ -98,7 +98,7 @@ def test_to_read_gcs(gcs_buffer, format, + captured = capsys.readouterr() + assert captured.out == "Using pyarrow filesystem\nUsing pyarrow filesystem\n" + elif format == "markdown": +- pytest.importorskip("tabulate") ++ td.versioned_importorskip("tabulate") + df1.to_markdown(path) + df2 = df1 + +@@ -187,8 +187,8 @@ def test_to_csv_compression_encoding_gcs + + def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" +- pytest.importorskip("fastparquet") +- pytest.importorskip("gcsfs") ++ td.versioned_importorskip("fastparquet") ++ td.versioned_importorskip("gcsfs") + + from fsspec import AbstractFileSystem + +--- a/pandas/tests/io/test_html.py ++++ b/pandas/tests/io/test_html.py +@@ -71,8 +71,8 @@ def assert_framelist_equal(list1, list2, + + + def test_bs4_version_fails(monkeypatch, datapath): +- bs4 = pytest.importorskip("bs4") +- pytest.importorskip("html5lib") ++ bs4 = td.versioned_importorskip("bs4") ++ td.versioned_importorskip("html5lib") + + monkeypatch.setattr(bs4, "__version__", "4.2") + with pytest.raises(ImportError, match="Pandas requires version"): +@@ -89,9 +89,9 @@ def test_invalid_flavor(): + + + def test_same_ordering(datapath): +- pytest.importorskip("bs4") +- pytest.importorskip("lxml") +- pytest.importorskip("html5lib") ++ td.versioned_importorskip("bs4") ++ td.versioned_importorskip("lxml") ++ td.versioned_importorskip("html5lib") + + filename = datapath("io", "data", "html", "valid_markup.html") + dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"]) +@@ -184,13 +184,13 @@ class TestReadHtml: + string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) + string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) + else: +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + string_array = ArrowStringArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowStringArray(pa.array(["a", "b", None])) + +--- a/pandas/tests/io/test_http_headers.py ++++ b/pandas/tests/io/test_http_headers.py +@@ -161,7 +161,7 @@ def test_to_parquet_to_disk_with_storage + "Auth": "other_custom", + } + +- pytest.importorskip(engine) ++ td.versioned_importorskip(engine) + + true_df = pd.DataFrame({"column_name": ["column_value"]}) + msg = ( +--- a/pandas/tests/io/test_orc.py ++++ b/pandas/tests/io/test_orc.py +@@ -8,12 +8,13 @@ import pathlib + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import read_orc + import pandas._testing as tm + from pandas.core.arrays import StringArray + +-pytest.importorskip("pyarrow.orc") ++td.versioned_importorskip("pyarrow.orc") + + import pyarrow as pa + +@@ -248,7 +249,7 @@ def test_orc_reader_snappy_compressed(di + def test_orc_roundtrip_file(dirpath): + # GH44554 + # PyArrow gained ORC write support with the current argument order +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + + data = { + "boolean1": np.array([False, True], dtype="bool"), +@@ -273,7 +274,7 @@ def 
test_orc_roundtrip_file(dirpath): + def test_orc_roundtrip_bytesio(): + # GH44554 + # PyArrow gained ORC write support with the current argument order +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + + data = { + "boolean1": np.array([False, True], dtype="bool"), +@@ -297,7 +298,7 @@ def test_orc_roundtrip_bytesio(): + def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported): + # GH44554 + # PyArrow gained ORC write support with the current argument order +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + + msg = "The dtype of one or more columns is not supported yet." + with pytest.raises(NotImplementedError, match=msg): +@@ -305,7 +306,7 @@ def test_orc_writer_dtypes_not_supported + + + def test_orc_dtype_backend_pyarrow(): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = pd.DataFrame( + { + "string": list("abc"), +@@ -341,7 +342,7 @@ def test_orc_dtype_backend_pyarrow(): + + def test_orc_dtype_backend_numpy_nullable(): + # GH#50503 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = pd.DataFrame( + { + "string": list("abc"), +--- a/pandas/tests/io/test_parquet.py ++++ b/pandas/tests/io/test_parquet.py +@@ -8,6 +8,7 @@ import pathlib + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._config import using_copy_on_write + from pandas._config.config import _get_option + +@@ -389,7 +390,7 @@ class Base: + @pytest.mark.single_cpu + def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): + if engine != "auto": +- pytest.importorskip(engine) ++ td.versioned_importorskip(engine) + with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: + httpserver.serve_content(content=f.read()) + df = read_parquet(httpserver.url) +@@ -615,7 +616,7 @@ class TestBasic(Base): + check_round_trip(df, engine) + + def test_dtype_backend(self, engine, request): +- pq = pytest.importorskip("pyarrow.parquet") ++ pq = td.versioned_importorskip("pyarrow.parquet") + + if engine == "fastparquet": + # We are manually disabling fastparquet's +@@ -803,7 +804,7 @@ class TestParquetPyArrow(Base): + + @pytest.mark.single_cpu + def test_s3_roundtrip_explicit_fs(self, df_compat, s3_public_bucket, pa, s3so): +- s3fs = pytest.importorskip("s3fs") ++ s3fs = td.versioned_importorskip("s3fs") + s3 = s3fs.S3FileSystem(**s3so) + kw = {"filesystem": s3} + check_round_trip( +@@ -837,7 +838,7 @@ class TestParquetPyArrow(Base): + def test_s3_roundtrip_for_dir( + self, df_compat, s3_public_bucket, pa, partition_col, s3so + ): +- pytest.importorskip("s3fs") ++ td.versioned_importorskip("s3fs") + # GH #26388 + expected_df = df_compat.copy() + +@@ -866,14 +867,14 @@ class TestParquetPyArrow(Base): + ) + + def test_read_file_like_obj_support(self, df_compat): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + buffer = BytesIO() + df_compat.to_parquet(buffer) + df_from_buf = read_parquet(buffer) + tm.assert_frame_equal(df_compat, df_from_buf) + + def test_expand_user(self, df_compat, monkeypatch): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + monkeypatch.setenv("HOME", "TestingUser") + monkeypatch.setenv("USERPROFILE", "TestingUser") + with pytest.raises(OSError, match=r".*TestingUser.*"): +@@ -928,7 +929,7 @@ class TestParquetPyArrow(Base): + def test_additional_extension_arrays(self, pa): + # test additional ExtensionArrays that are supported through the + # __arrow_array__ 
protocol +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype="Int64"), +@@ -943,7 +944,7 @@ class TestParquetPyArrow(Base): + + def test_pyarrow_backed_string_array(self, pa, string_storage): + # test ArrowStringArray supported through the __arrow_array__ protocol +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) + with pd.option_context("string_storage", string_storage): + check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) +@@ -951,7 +952,7 @@ class TestParquetPyArrow(Base): + def test_additional_extension_types(self, pa): + # test additional ExtensionArrays that are supported through the + # __arrow_array__ protocol + by defining a custom ExtensionType +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = pd.DataFrame( + { + "c": pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)]), +@@ -996,7 +997,7 @@ class TestParquetPyArrow(Base): + + def test_filter_row_groups(self, pa): + # https://github.com/pandas-dev/pandas/issues/26551 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = pd.DataFrame({"a": list(range(3))}) + with tm.ensure_clean() as path: + df.to_parquet(path, engine=pa) +@@ -1346,7 +1347,7 @@ class TestParquetFastParquet(Base): + tm.assert_frame_equal(result, df) + + def test_filesystem_notimplemented(self): +- pytest.importorskip("fastparquet") ++ td.versioned_importorskip("fastparquet") + df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) + with tm.ensure_clean() as path: + with pytest.raises( +@@ -1362,7 +1363,7 @@ class TestParquetFastParquet(Base): + read_parquet(path, engine="fastparquet", filesystem="foo") + + def test_invalid_filesystem(self): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) + with tm.ensure_clean() as path: + with pytest.raises( +@@ -1378,7 +1379,7 @@ class TestParquetFastParquet(Base): + read_parquet(path, engine="pyarrow", filesystem="foo") + + def test_unsupported_pa_filesystem_storage_options(self): +- pa_fs = pytest.importorskip("pyarrow.fs") ++ pa_fs = td.versioned_importorskip("pyarrow.fs") + df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) + with tm.ensure_clean() as path: + with pytest.raises( +--- a/pandas/tests/io/test_pickle.py ++++ b/pandas/tests/io/test_pickle.py +@@ -499,7 +499,7 @@ def test_pickle_generalurl_read(monkeypa + + + def test_pickle_fsspec_roundtrip(): +- pytest.importorskip("fsspec") ++ td.versioned_importorskip("fsspec") + with tm.ensure_clean(): + mockurl = "memory://mockfile" + df = DataFrame( +--- a/pandas/tests/io/test_s3.py ++++ b/pandas/tests/io/test_s3.py +@@ -2,13 +2,14 @@ from io import BytesIO + + import pytest + ++import pandas.util._test_decorators as td + from pandas import read_csv + + + def test_streaming_s3_objects(): + # GH17135 + # botocore gained iteration support in 1.10.47, can now be used in read_* +- pytest.importorskip("botocore", minversion="1.10.47") ++ td.versioned_importorskip("botocore", min_version="1.10.47") + from botocore.response import StreamingBody + + data = [b"foo,bar,baz\n1,2,3\n4,5,6\n", b"just,the,header\n"] +@@ -20,7 +21,7 @@ def test_streaming_s3_objects(): + @pytest.mark.single_cpu + def test_read_without_creds_from_pub_bucket(s3_public_bucket_with_data, s3so): + # GH 34626 +- pytest.importorskip("s3fs") ++ 
td.versioned_importorskip("s3fs") + result = read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv", + nrows=3, +@@ -33,7 +34,7 @@ def test_read_without_creds_from_pub_buc + def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, s3so): + # Ensure we can read from a public bucket with credentials + # GH 34626 +- pytest.importorskip("s3fs") ++ td.versioned_importorskip("s3fs") + df = read_csv( + f"s3://{s3_public_bucket_with_data.name}/tips.csv", + nrows=5, +--- a/pandas/tests/io/test_spss.py ++++ b/pandas/tests/io/test_spss.py +@@ -4,11 +4,12 @@ from pathlib import Path + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + import pandas._testing as tm + from pandas.util.version import Version + +-pyreadstat = pytest.importorskip("pyreadstat") ++pyreadstat = td.versioned_importorskip("pyreadstat") + + + # TODO(CoW) - detection of chained assignment in cython +@@ -101,7 +102,7 @@ def test_spss_umlauts_dtype_backend(data + expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}, dtype="Int64") + + if dtype_backend == "pyarrow": +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + from pandas.arrays import ArrowExtensionArray + +--- a/pandas/tests/io/test_sql.py ++++ b/pandas/tests/io/test_sql.py +@@ -601,8 +601,8 @@ def drop_view( + + @pytest.fixture + def mysql_pymysql_engine(): +- sqlalchemy = pytest.importorskip("sqlalchemy") +- pymysql = pytest.importorskip("pymysql") ++ sqlalchemy = td.versioned_importorskip("sqlalchemy") ++ pymysql = td.versioned_importorskip("pymysql") + engine = sqlalchemy.create_engine( + "mysql+pymysql://root@localhost:3306/pandas", + connect_args={"client_flag": pymysql.constants.CLIENT.MULTI_STATEMENTS}, +@@ -649,8 +649,8 @@ def mysql_pymysql_conn_types(mysql_pymys + + @pytest.fixture + def postgresql_psycopg2_engine(): +- sqlalchemy = pytest.importorskip("sqlalchemy") +- pytest.importorskip("psycopg2") ++ sqlalchemy = td.versioned_importorskip("sqlalchemy") ++ td.versioned_importorskip("psycopg2") + engine = sqlalchemy.create_engine( + "postgresql+psycopg2://postgres:postgres@localhost:5432/pandas", + poolclass=sqlalchemy.pool.NullPool, +@@ -684,7 +684,7 @@ def postgresql_psycopg2_conn(postgresql_ + + @pytest.fixture + def postgresql_adbc_conn(): +- pytest.importorskip("adbc_driver_postgresql") ++ td.versioned_importorskip("adbc_driver_postgresql") + from adbc_driver_postgresql import dbapi + + uri = "postgresql://postgres:postgres@localhost:5432/pandas" +@@ -747,14 +747,14 @@ def postgresql_psycopg2_conn_types(postg + + @pytest.fixture + def sqlite_str(): +- pytest.importorskip("sqlalchemy") ++ td.versioned_importorskip("sqlalchemy") + with tm.ensure_clean() as name: + yield f"sqlite:///{name}" + + + @pytest.fixture + def sqlite_engine(sqlite_str): +- sqlalchemy = pytest.importorskip("sqlalchemy") ++ sqlalchemy = td.versioned_importorskip("sqlalchemy") + engine = sqlalchemy.create_engine(sqlite_str, poolclass=sqlalchemy.pool.NullPool) + yield engine + for view in get_all_views(engine): +@@ -772,7 +772,7 @@ def sqlite_conn(sqlite_engine): + + @pytest.fixture + def sqlite_str_iris(sqlite_str, iris_path): +- sqlalchemy = pytest.importorskip("sqlalchemy") ++ sqlalchemy = td.versioned_importorskip("sqlalchemy") + engine = sqlalchemy.create_engine(sqlite_str) + create_and_load_iris(engine, iris_path) + create_and_load_iris_view(engine) +@@ -795,7 +795,7 @@ def sqlite_conn_iris(sqlite_engine_iris) + + @pytest.fixture + def sqlite_str_types(sqlite_str, 
types_data): +- sqlalchemy = pytest.importorskip("sqlalchemy") ++ sqlalchemy = td.versioned_importorskip("sqlalchemy") + engine = sqlalchemy.create_engine(sqlite_str) + create_and_load_types(engine, types_data, "sqlite") + engine.dispose() +@@ -816,7 +816,7 @@ def sqlite_conn_types(sqlite_engine_type + + @pytest.fixture + def sqlite_adbc_conn(): +- pytest.importorskip("adbc_driver_sqlite") ++ td.versioned_importorskip("adbc_driver_sqlite") + from adbc_driver_sqlite import dbapi + + with tm.ensure_clean() as name: +@@ -1001,7 +1001,7 @@ def test_dataframe_to_sql_empty(conn, te + @pytest.mark.parametrize("conn", all_connectable) + def test_dataframe_to_sql_arrow_dtypes(conn, request): + # GH 52046 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame( + { + "int": pd.array([1], dtype="int8[pyarrow]"), +@@ -1035,7 +1035,7 @@ def test_dataframe_to_sql_arrow_dtypes(c + @pytest.mark.parametrize("conn", all_connectable) + def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): + # GH 52046 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame( + { + "datetime": pd.array( +@@ -2508,7 +2508,7 @@ def test_sqlalchemy_integer_overload_map + + @pytest.mark.parametrize("conn", all_connectable) + def test_database_uri_string(conn, request, test_frame1): +- pytest.importorskip("sqlalchemy") ++ td.versioned_importorskip("sqlalchemy") + conn = request.getfixturevalue(conn) + # Test read_sql and .to_sql method with a database URI (GH10654) + # db_uri = 'sqlite:///:memory:' # raises +@@ -2530,7 +2530,7 @@ def test_database_uri_string(conn, reque + @td.skip_if_installed("pg8000") + @pytest.mark.parametrize("conn", all_connectable) + def test_pg8000_sqlalchemy_passthrough_error(conn, request): +- pytest.importorskip("sqlalchemy") ++ td.versioned_importorskip("sqlalchemy") + conn = request.getfixturevalue(conn) + # using driver that will not be installed on CI to trigger error + # in sqlalchemy.create_engine -> test passing of this error to user +@@ -3407,7 +3407,7 @@ def test_to_sql_with_negative_npinf(conn + # The input {"foo": [-np.inf], "infe0": ["bar"]} does not raise any error + # for pymysql version >= 0.10 + # TODO(GH#36465): remove this version check after GH 36465 is fixed +- pymysql = pytest.importorskip("pymysql") ++ pymysql = td.versioned_importorskip("pymysql") + + if Version(pymysql.__version__) < Version("1.0.3") and "infe0" in df.columns: + mark = pytest.mark.xfail(reason="GH 36465") +@@ -3522,7 +3522,7 @@ def test_options_auto(conn, request, tes + + + def test_options_get_engine(): +- pytest.importorskip("sqlalchemy") ++ td.versioned_importorskip("sqlalchemy") + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + + with pd.option_context("io.sql.engine", "sqlalchemy"): +@@ -3674,14 +3674,14 @@ def dtype_backend_expected(): + string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) + + elif dtype_backend == "pyarrow": +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment] + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment] + + else: +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + string_array = ArrowStringArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowStringArray(pa.array(["a", "b", None])) + +@@ -3698,7 +3698,7 
@@ def dtype_backend_expected(): + } + ) + if dtype_backend == "pyarrow": +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + from pandas.arrays import ArrowExtensionArray + +@@ -3843,7 +3843,7 @@ def test_row_object_is_named_tuple(sqlit + def test_read_sql_string_inference(sqlite_engine): + conn = sqlite_engine + # GH#54430 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + table = "test" + df = DataFrame({"a": ["x", "y"]}) + df.to_sql(table, con=conn, index=False, if_exists="replace") +--- a/pandas/tests/io/test_stata.py ++++ b/pandas/tests/io/test_stata.py +@@ -2045,11 +2045,11 @@ def test_compression(compression, versio + with bz2.open(path, "rb") as comp: + fp = io.BytesIO(comp.read()) + elif compression == "zstd": +- zstd = pytest.importorskip("zstandard") ++ zstd = td.versioned_importorskip("zstandard") + with zstd.open(path, "rb") as comp: + fp = io.BytesIO(comp.read()) + elif compression == "xz": +- lzma = pytest.importorskip("lzma") ++ lzma = td.versioned_importorskip("lzma") + with lzma.open(path, "rb") as comp: + fp = io.BytesIO(comp.read()) + elif compression is None: +--- a/pandas/tests/io/xml/test_to_xml.py ++++ b/pandas/tests/io/xml/test_to_xml.py +@@ -867,7 +867,7 @@ def test_encoding_option_str(xml_baby_na + + + def test_correct_encoding_file(xml_baby_names): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + df_file = read_xml(xml_baby_names, encoding="ISO-8859-1", parser="lxml") + + with tm.ensure_clean("test.xml") as path: +@@ -876,7 +876,7 @@ def test_correct_encoding_file(xml_baby_ + + @pytest.mark.parametrize("encoding", ["UTF-8", "UTF-16", "ISO-8859-1"]) + def test_wrong_encoding_option_lxml(xml_baby_names, parser, encoding): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + df_file = read_xml(xml_baby_names, encoding="ISO-8859-1", parser="lxml") + + with tm.ensure_clean("test.xml") as path: +@@ -892,7 +892,7 @@ def test_misspelled_encoding(parser, geo + + + def test_xml_declaration_pretty_print(geom_df): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + expected = """\ + + +@@ -1005,7 +1005,7 @@ xsl_expected = """\ + + + def test_stylesheet_file_like(xsl_row_field_output, mode, geom_df): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + with open( + xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None + ) as f: +@@ -1015,7 +1015,7 @@ def test_stylesheet_file_like(xsl_row_fi + def test_stylesheet_io(xsl_row_field_output, mode, geom_df): + # note: By default the bodies of untyped functions are not checked, + # consider using --check-untyped-defs +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + xsl_obj: BytesIO | StringIO # type: ignore[annotation-unchecked] + + with open( +@@ -1032,7 +1032,7 @@ def test_stylesheet_io(xsl_row_field_out + + + def test_stylesheet_buffered_reader(xsl_row_field_output, mode, geom_df): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + with open( + xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None + ) as f: +@@ -1044,7 +1044,7 @@ def test_stylesheet_buffered_reader(xsl_ + + + def test_stylesheet_wrong_path(geom_df): +- lxml_etree = pytest.importorskip("lxml.etree") ++ lxml_etree = td.versioned_importorskip("lxml.etree") + + xsl = os.path.join("data", "xml", "row_field_output.xslt") + +@@ -1057,7 +1057,7 @@ def test_stylesheet_wrong_path(geom_df): + + @pytest.mark.parametrize("val", ["", b""]) + def 
test_empty_string_stylesheet(val, geom_df): +- lxml_etree = pytest.importorskip("lxml.etree") ++ lxml_etree = td.versioned_importorskip("lxml.etree") + + msg = "|".join( + [ +@@ -1073,7 +1073,7 @@ def test_empty_string_stylesheet(val, ge + + + def test_incorrect_xsl_syntax(geom_df): +- lxml_etree = pytest.importorskip("lxml.etree") ++ lxml_etree = td.versioned_importorskip("lxml.etree") + + xsl = """\ + +@@ -1103,7 +1103,7 @@ def test_incorrect_xsl_syntax(geom_df): + + + def test_incorrect_xsl_eval(geom_df): +- lxml_etree = pytest.importorskip("lxml.etree") ++ lxml_etree = td.versioned_importorskip("lxml.etree") + + xsl = """\ + +@@ -1131,7 +1131,7 @@ def test_incorrect_xsl_eval(geom_df): + + + def test_incorrect_xsl_apply(geom_df): +- lxml_etree = pytest.importorskip("lxml.etree") ++ lxml_etree = td.versioned_importorskip("lxml.etree") + + xsl = """\ + +@@ -1169,7 +1169,7 @@ def test_stylesheet_with_etree(geom_df): + + + def test_style_to_csv(geom_df): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + xsl = """\ + + +@@ -1198,7 +1198,7 @@ def test_style_to_csv(geom_df): + + + def test_style_to_string(geom_df): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + xsl = """\ + + +@@ -1232,7 +1232,7 @@ def test_style_to_string(geom_df): + + + def test_style_to_json(geom_df): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + xsl = """\ + + +@@ -1363,8 +1363,8 @@ def test_unsuported_compression(parser, + + @pytest.mark.single_cpu + def test_s3_permission_output(parser, s3_public_bucket, geom_df): +- s3fs = pytest.importorskip("s3fs") +- pytest.importorskip("lxml") ++ s3fs = td.versioned_importorskip("s3fs") ++ td.versioned_importorskip("lxml") + + with tm.external_error_raised((PermissionError, FileNotFoundError)): + fs = s3fs.S3FileSystem(anon=True) +--- a/pandas/tests/io/xml/test_xml.py ++++ b/pandas/tests/io/xml/test_xml.py +@@ -249,7 +249,7 @@ df_kml = DataFrame( + + def test_literal_xml_deprecation(): + # GH 53809 +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + msg = ( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. 
To read from a " +@@ -289,7 +289,7 @@ def read_xml_iterparse_comp(comp_path, c + + + def test_parser_consistency_file(xml_books): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + df_file_lxml = read_xml(xml_books, parser="lxml") + df_file_etree = read_xml(xml_books, parser="etree") + +@@ -462,7 +462,7 @@ def test_file_handle_close(xml_books, pa + + @pytest.mark.parametrize("val", ["", b""]) + def test_empty_string_lxml(val): +- lxml_etree = pytest.importorskip("lxml.etree") ++ lxml_etree = td.versioned_importorskip("lxml.etree") + + msg = "|".join( + [ +@@ -505,7 +505,7 @@ def test_wrong_file_path(parser, datapat + @pytest.mark.network + @pytest.mark.single_cpu + def test_url(httpserver, xml_file): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + with open(xml_file, encoding="utf-8") as f: + httpserver.serve_content(content=f.read()) + df_url = read_xml(httpserver.url, xpath=".//book[count(*)=4]") +@@ -587,7 +587,7 @@ def test_whitespace(parser): + + + def test_empty_xpath_lxml(xml_books): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + with pytest.raises(ValueError, match=("xpath does not return any nodes")): + read_xml(xml_books, xpath=".//python", parser="lxml") + +@@ -600,7 +600,7 @@ def test_bad_xpath_etree(xml_books): + + + def test_bad_xpath_lxml(xml_books): +- lxml_etree = pytest.importorskip("lxml.etree") ++ lxml_etree = td.versioned_importorskip("lxml.etree") + + with pytest.raises(lxml_etree.XPathEvalError, match=("Invalid expression")): + read_xml(xml_books, xpath=".//[book]", parser="lxml") +@@ -659,7 +659,7 @@ def test_prefix_namespace(parser): + + + def test_consistency_default_namespace(): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + df_lxml = read_xml( + StringIO(xml_default_nmsp), + xpath=".//ns:row", +@@ -678,7 +678,7 @@ def test_consistency_default_namespace() + + + def test_consistency_prefix_namespace(): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + df_lxml = read_xml( + StringIO(xml_prefix_nmsp), + xpath=".//doc:row", +@@ -710,7 +710,7 @@ def test_missing_prefix_definition_etree + + + def test_missing_prefix_definition_lxml(kml_cta_rail_lines): +- lxml_etree = pytest.importorskip("lxml.etree") ++ lxml_etree = td.versioned_importorskip("lxml.etree") + + with pytest.raises(lxml_etree.XPathEvalError, match=("Undefined namespace prefix")): + read_xml(kml_cta_rail_lines, xpath=".//kml:Placemark", parser="lxml") +@@ -718,7 +718,7 @@ def test_missing_prefix_definition_lxml( + + @pytest.mark.parametrize("key", ["", None]) + def test_none_namespace_prefix(key): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + with pytest.raises( + TypeError, match=("empty namespace prefix is not supported in XPath") + ): +@@ -831,7 +831,7 @@ def test_empty_elems_only(parser): + + + def test_attribute_centric_xml(): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + xml = """\ + + +@@ -1061,7 +1061,7 @@ def test_ascii_encoding(xml_baby_names, + + + def test_parser_consistency_with_encoding(xml_baby_names): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + df_xpath_lxml = read_xml(xml_baby_names, parser="lxml", encoding="ISO-8859-1") + df_xpath_etree = read_xml(xml_baby_names, parser="etree", encoding="iso-8859-1") + +@@ -1084,7 +1084,7 @@ def test_parser_consistency_with_encodin + + + def test_wrong_encoding_for_lxml(): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + # GH#45133 + data = """ + +@@ 
-1131,7 +1131,7 @@ def test_wrong_parser(xml_books): + + + def test_stylesheet_file(kml_cta_rail_lines, xsl_flatten_doc): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + df_style = read_xml( + kml_cta_rail_lines, + xpath=".//k:Placemark", +@@ -1158,7 +1158,7 @@ def test_stylesheet_file(kml_cta_rail_li + + + def test_stylesheet_file_like(kml_cta_rail_lines, xsl_flatten_doc, mode): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + with open(xsl_flatten_doc, mode, encoding="utf-8" if mode == "r" else None) as f: + df_style = read_xml( + kml_cta_rail_lines, +@@ -1173,7 +1173,7 @@ def test_stylesheet_file_like(kml_cta_ra + def test_stylesheet_io(kml_cta_rail_lines, xsl_flatten_doc, mode): + # note: By default the bodies of untyped functions are not checked, + # consider using --check-untyped-defs +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + xsl_obj: BytesIO | StringIO # type: ignore[annotation-unchecked] + + with open(xsl_flatten_doc, mode, encoding="utf-8" if mode == "r" else None) as f: +@@ -1193,7 +1193,7 @@ def test_stylesheet_io(kml_cta_rail_line + + + def test_stylesheet_buffered_reader(kml_cta_rail_lines, xsl_flatten_doc, mode): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + with open(xsl_flatten_doc, mode, encoding="utf-8" if mode == "r" else None) as f: + xsl_obj = f.read() + +@@ -1208,7 +1208,7 @@ def test_stylesheet_buffered_reader(kml_ + + + def test_style_charset(): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + xml = "<中文標籤>12" + + xsl = """\ +@@ -1237,7 +1237,7 @@ def test_style_charset(): + + + def test_not_stylesheet(kml_cta_rail_lines, xml_books): +- lxml_etree = pytest.importorskip("lxml.etree") ++ lxml_etree = td.versioned_importorskip("lxml.etree") + + with pytest.raises( + lxml_etree.XSLTParseError, match=("document is not a stylesheet") +@@ -1246,7 +1246,7 @@ def test_not_stylesheet(kml_cta_rail_lin + + + def test_incorrect_xsl_syntax(kml_cta_rail_lines): +- lxml_etree = pytest.importorskip("lxml.etree") ++ lxml_etree = td.versioned_importorskip("lxml.etree") + + xsl = """\ + +@@ -1321,7 +1321,7 @@ def test_incorrect_xsl_apply(kml_cta_rai + + + def test_wrong_stylesheet(kml_cta_rail_lines, xml_data_path): +- xml_etree = pytest.importorskip("lxml.etree") ++ xml_etree = td.versioned_importorskip("lxml.etree") + + xsl = xml_data_path / "flatten.xsl" + +@@ -1335,7 +1335,7 @@ def test_wrong_stylesheet(kml_cta_rail_l + def test_stylesheet_file_close(kml_cta_rail_lines, xsl_flatten_doc, mode): + # note: By default the bodies of untyped functions are not checked, + # consider using --check-untyped-defs +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + xsl_obj: BytesIO | StringIO # type: ignore[annotation-unchecked] + + with open(xsl_flatten_doc, mode, encoding="utf-8" if mode == "r" else None) as f: +@@ -1350,7 +1350,7 @@ def test_stylesheet_file_close(kml_cta_r + + + def test_stylesheet_with_etree(kml_cta_rail_lines, xsl_flatten_doc): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + with pytest.raises( + ValueError, match=("To use stylesheet, you need lxml installed") + ): +@@ -1359,7 +1359,7 @@ def test_stylesheet_with_etree(kml_cta_r + + @pytest.mark.parametrize("val", ["", b""]) + def test_empty_stylesheet(val, datapath): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + msg = ( + "Passing literal xml to 'read_xml' is deprecated and " + "will be removed in a future version. 
To read from a " +@@ -1662,7 +1662,7 @@ def test_empty_data(xml_books, parser): + + + def test_online_stylesheet(): +- pytest.importorskip("lxml") ++ td.versioned_importorskip("lxml") + xml = """\ + + +@@ -1993,8 +1993,8 @@ def test_unsuported_compression(parser): + @pytest.mark.network + @pytest.mark.single_cpu + def test_s3_parser_consistency(s3_public_bucket_with_data, s3so): +- pytest.importorskip("s3fs") +- pytest.importorskip("lxml") ++ td.versioned_importorskip("s3fs") ++ td.versioned_importorskip("lxml") + s3 = f"s3://{s3_public_bucket_with_data.name}/books.xml" + + df_lxml = read_xml(s3, parser="lxml", storage_options=s3so) +@@ -2035,7 +2035,7 @@ def test_read_xml_nullable_dtypes( + """ + + if using_infer_string: +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + string_array = ArrowStringArrayNumpySemantics(pa.array(["x", "y"])) + string_array_na = ArrowStringArrayNumpySemantics(pa.array(["x", None])) + +@@ -2044,14 +2044,14 @@ def test_read_xml_nullable_dtypes( + string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + + elif dtype_backend == "pyarrow": +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["x", "y"])) + string_array_na = ArrowExtensionArray(pa.array(["x", None])) + + else: +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + string_array = ArrowStringArray(pa.array(["x", "y"])) + string_array_na = ArrowStringArray(pa.array(["x", None])) + +@@ -2073,7 +2073,7 @@ def test_read_xml_nullable_dtypes( + ) + + if dtype_backend == "pyarrow": +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + expected = DataFrame( +--- a/pandas/tests/plotting/conftest.py ++++ b/pandas/tests/plotting/conftest.py +@@ -3,6 +3,7 @@ import gc + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + to_datetime, +@@ -15,9 +16,9 @@ def mpl_cleanup(): + # 1) Resets units registry + # 2) Resets rc_context + # 3) Closes all figures +- mpl = pytest.importorskip("matplotlib") +- mpl_units = pytest.importorskip("matplotlib.units") +- plt = pytest.importorskip("matplotlib.pyplot") ++ mpl = td.versioned_importorskip("matplotlib") ++ mpl_units = td.versioned_importorskip("matplotlib.units") ++ plt = td.versioned_importorskip("matplotlib.pyplot") + orig_units_registry = mpl_units.registry.copy() + with mpl.rc_context(): + mpl.use("template") +--- a/pandas/tests/plotting/frame/test_frame.py ++++ b/pandas/tests/plotting/frame/test_frame.py +@@ -47,8 +47,8 @@ from pandas.tests.plotting.common import + + from pandas.io.formats.printing import pprint_thing + +-mpl = pytest.importorskip("matplotlib") +-plt = pytest.importorskip("matplotlib.pyplot") ++mpl = td.versioned_importorskip("matplotlib") ++plt = td.versioned_importorskip("matplotlib.pyplot") + + + class TestDataFramePlots: +@@ -1118,7 +1118,7 @@ class TestDataFramePlots: + _check_box_return_type(result, return_type) + + def test_kde_df(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).standard_normal((100, 4))) + ax = _check_plot_works(df.plot, kind="kde") + expected = [pprint_thing(c) for c in df.columns] +@@ -1126,13 +1126,13 @@ class TestDataFramePlots: + _check_ticks_props(ax, xrot=0) + + def test_kde_df_rot(self): +- 
pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) + ax = df.plot(kind="kde", rot=20, fontsize=5) + _check_ticks_props(ax, xrot=20, xlabelsize=5, ylabelsize=5) + + def test_kde_df_subplots(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) + axes = _check_plot_works( + df.plot, +@@ -1143,13 +1143,13 @@ class TestDataFramePlots: + _check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + def test_kde_df_logy(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) + axes = df.plot(kind="kde", logy=True, subplots=True) + _check_ax_scales(axes, yaxis="log") + + def test_kde_missing_vals(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).uniform(size=(100, 4))) + df.loc[0, 0] = np.nan + _check_plot_works(df.plot, kind="kde") +@@ -1446,14 +1446,14 @@ class TestDataFramePlots: + + @pytest.mark.parametrize("kind", plotting.PlotAccessor._common_kinds) + def test_kind_both_ways(self, kind): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame({"x": [1, 2, 3]}) + df.plot(kind=kind) + getattr(df.plot, kind)() + + @pytest.mark.parametrize("kind", ["scatter", "hexbin"]) + def test_kind_both_ways_x_y(self, kind): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame({"x": [1, 2, 3]}) + df.plot("x", "x", kind=kind) + getattr(df.plot, kind)("x", "x") +@@ -2099,7 +2099,7 @@ class TestDataFramePlots: + @pytest.mark.parametrize("kind", plotting.PlotAccessor._all_kinds) + def test_memory_leak(self, kind): + """Check that every plot type gets properly collected.""" +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + args = {} + if kind in ["hexbin", "scatter", "pie"]: + df = DataFrame( +@@ -2426,7 +2426,7 @@ class TestDataFramePlots: + "kind", ("line", "bar", "barh", "hist", "kde", "density", "area", "pie") + ) + def test_group_subplot(self, kind): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + d = { + "a": np.arange(10), + "b": np.arange(10) + 1, +--- a/pandas/tests/plotting/frame/test_frame_color.py ++++ b/pandas/tests/plotting/frame/test_frame_color.py +@@ -4,6 +4,7 @@ import re + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import DataFrame + import pandas._testing as tm +@@ -14,9 +15,9 @@ from pandas.tests.plotting.common import + ) + from pandas.util.version import Version + +-mpl = pytest.importorskip("matplotlib") +-plt = pytest.importorskip("matplotlib.pyplot") +-cm = pytest.importorskip("matplotlib.cm") ++mpl = td.versioned_importorskip("matplotlib") ++plt = td.versioned_importorskip("matplotlib.pyplot") ++cm = td.versioned_importorskip("matplotlib.cm") + + + def _check_colors_box(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): +@@ -446,7 +447,7 @@ class TestDataFrameColor: + _check_colors(ax.patches[::10], facecolors=["green"] * 5) + + def test_kde_colors(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + custom_colors = "rgcby" + df = DataFrame(np.random.default_rng(2).random((5, 5))) + +@@ -455,14 +456,14 @@ class TestDataFrameColor: + + @pytest.mark.parametrize("colormap", ["jet", cm.jet]) + def test_kde_colors_cmap(self, colormap): +- 
pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).standard_normal((5, 5))) + ax = df.plot.kde(colormap=colormap) + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + _check_colors(ax.get_lines(), linecolors=rgba_colors) + + def test_kde_colors_and_styles_subplots(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + default_colors = _unpack_cycler(mpl.pyplot.rcParams) + + df = DataFrame(np.random.default_rng(2).standard_normal((5, 5))) +@@ -473,14 +474,14 @@ class TestDataFrameColor: + + @pytest.mark.parametrize("colormap", ["k", "red"]) + def test_kde_colors_and_styles_subplots_single_col_str(self, colormap): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).standard_normal((5, 5))) + axes = df.plot(kind="kde", color=colormap, subplots=True) + for ax in axes: + _check_colors(ax.get_lines(), linecolors=[colormap]) + + def test_kde_colors_and_styles_subplots_custom_color(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).standard_normal((5, 5))) + custom_colors = "rgcby" + axes = df.plot(kind="kde", color=custom_colors, subplots=True) +@@ -489,7 +490,7 @@ class TestDataFrameColor: + + @pytest.mark.parametrize("colormap", ["jet", cm.jet]) + def test_kde_colors_and_styles_subplots_cmap(self, colormap): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).standard_normal((5, 5))) + rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] + axes = df.plot(kind="kde", colormap=colormap, subplots=True) +@@ -497,7 +498,7 @@ class TestDataFrameColor: + _check_colors(ax.get_lines(), linecolors=[c]) + + def test_kde_colors_and_styles_subplots_single_col(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).standard_normal((5, 5))) + # make color a list if plotting one column frame + # handles cases like df.plot(color='DodgerBlue') +@@ -505,7 +506,7 @@ class TestDataFrameColor: + _check_colors(axes[0].lines, linecolors=["DodgerBlue"]) + + def test_kde_colors_and_styles_subplots_single_char(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).standard_normal((5, 5))) + # list of styles + # single character style +@@ -514,7 +515,7 @@ class TestDataFrameColor: + _check_colors(ax.get_lines(), linecolors=["r"]) + + def test_kde_colors_and_styles_subplots_list(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).standard_normal((5, 5))) + # list of styles + styles = list("rgcby") +--- a/pandas/tests/plotting/frame/test_frame_groupby.py ++++ b/pandas/tests/plotting/frame/test_frame_groupby.py +@@ -2,10 +2,11 @@ + + import pytest + ++import pandas.util._test_decorators as td + from pandas import DataFrame + from pandas.tests.plotting.common import _check_visible + +-pytest.importorskip("matplotlib") ++td.versioned_importorskip("matplotlib") + + + class TestDataFramePlotsGroupby: +--- a/pandas/tests/plotting/frame/test_frame_legend.py ++++ b/pandas/tests/plotting/frame/test_frame_legend.py +@@ -14,7 +14,7 @@ from pandas.tests.plotting.common import + ) + from pandas.util.version import Version + +-mpl = pytest.importorskip("matplotlib") ++mpl = td.versioned_importorskip("matplotlib") + + + class TestFrameLegend: +@@ -61,7 +61,7 @@ 
class TestFrameLegend: + + @pytest.mark.parametrize("kind", ["line", "bar", "barh", "kde", "area", "hist"]) + def test_df_legend_labels(self, kind): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).random((3, 3)), columns=["a", "b", "c"]) + df2 = DataFrame( + np.random.default_rng(2).random((3, 3)), columns=["d", "e", "f"] +@@ -87,7 +87,7 @@ class TestFrameLegend: + _check_legend_labels(ax, labels=expected) + + def test_df_legend_labels_secondary_y(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame(np.random.default_rng(2).random((3, 3)), columns=["a", "b", "c"]) + df2 = DataFrame( + np.random.default_rng(2).random((3, 3)), columns=["d", "e", "f"] +@@ -105,7 +105,7 @@ class TestFrameLegend: + + def test_df_legend_labels_time_series(self): + # Time Series +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + ind = date_range("1/1/2014", periods=3) + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), +@@ -131,7 +131,7 @@ class TestFrameLegend: + + def test_df_legend_labels_time_series_scatter(self): + # Time Series +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + ind = date_range("1/1/2014", periods=3) + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), +@@ -157,7 +157,7 @@ class TestFrameLegend: + _check_legend_labels(ax, labels=["data1", "data3"]) + + def test_df_legend_labels_time_series_no_mutate(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + ind = date_range("1/1/2014", periods=3) + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), +--- a/pandas/tests/plotting/frame/test_frame_subplots.py ++++ b/pandas/tests/plotting/frame/test_frame_subplots.py +@@ -6,6 +6,7 @@ import numpy as np + from numpy.testing import assert_array_almost_equal_nulp + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat import is_platform_linux + from pandas.compat.numpy import np_version_gte1p24 + +@@ -27,8 +28,8 @@ from pandas.tests.plotting.common import + + from pandas.io.formats.printing import pprint_thing + +-mpl = pytest.importorskip("matplotlib") +-plt = pytest.importorskip("matplotlib.pyplot") ++mpl = td.versioned_importorskip("matplotlib") ++plt = td.versioned_importorskip("matplotlib.pyplot") + + + class TestDataFramePlotsSubplots: +--- a/pandas/tests/plotting/frame/test_hist_box_by.py ++++ b/pandas/tests/plotting/frame/test_hist_box_by.py +@@ -3,6 +3,7 @@ import re + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import DataFrame + import pandas._testing as tm + from pandas.tests.plotting.common import ( +@@ -12,7 +13,7 @@ from pandas.tests.plotting.common import + get_y_axis, + ) + +-pytest.importorskip("matplotlib") ++td.versioned_importorskip("matplotlib") + + + @pytest.fixture +--- a/pandas/tests/plotting/test_boxplot_method.py ++++ b/pandas/tests/plotting/test_boxplot_method.py +@@ -6,6 +6,7 @@ import string + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + MultiIndex, +@@ -25,8 +26,8 @@ from pandas.tests.plotting.common import + + from pandas.io.formats.printing import pprint_thing + +-mpl = pytest.importorskip("matplotlib") +-plt = pytest.importorskip("matplotlib.pyplot") ++mpl = td.versioned_importorskip("matplotlib") ++plt = td.versioned_importorskip("matplotlib.pyplot") + + + def _check_ax_limits(col, ax): +--- 
a/pandas/tests/plotting/test_common.py ++++ b/pandas/tests/plotting/test_common.py +@@ -1,5 +1,6 @@ + import pytest + ++import pandas.util._test_decorators as td + from pandas import DataFrame + from pandas.tests.plotting.common import ( + _check_plot_works, +@@ -7,7 +8,7 @@ from pandas.tests.plotting.common import + _gen_two_subplots, + ) + +-plt = pytest.importorskip("matplotlib.pyplot") ++plt = td.versioned_importorskip("matplotlib.pyplot") + + + class TestCommon: +--- a/pandas/tests/plotting/test_converter.py ++++ b/pandas/tests/plotting/test_converter.py +@@ -8,6 +8,7 @@ import sys + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas._config.config as cf + + from pandas._libs.tslibs import to_offset +@@ -41,8 +42,8 @@ except ImportError: + # causing an improper skip + pass + +-pytest.importorskip("matplotlib.pyplot") +-dates = pytest.importorskip("matplotlib.dates") ++td.versioned_importorskip("matplotlib.pyplot") ++dates = td.versioned_importorskip("matplotlib.dates") + + + @pytest.mark.single_cpu +@@ -79,7 +80,7 @@ class TestRegistration: + assert subprocess.check_call(call) == 0 + + def test_registering_no_warning(self): +- plt = pytest.importorskip("matplotlib.pyplot") ++ plt = td.versioned_importorskip("matplotlib.pyplot") + s = Series(range(12), index=date_range("2017", periods=12)) + _, ax = plt.subplots() + +@@ -89,7 +90,7 @@ class TestRegistration: + plt.close() + + def test_pandas_plots_register(self): +- plt = pytest.importorskip("matplotlib.pyplot") ++ plt = td.versioned_importorskip("matplotlib.pyplot") + s = Series(range(12), index=date_range("2017", periods=12)) + # Set to the "warn" state, in case this isn't the first test run + with tm.assert_produces_warning(None) as w: +@@ -101,7 +102,7 @@ class TestRegistration: + plt.close() + + def test_matplotlib_formatters(self): +- units = pytest.importorskip("matplotlib.units") ++ units = td.versioned_importorskip("matplotlib.units") + + # Can't make any assertion about the start state. 
+ # We we check that toggling converters off removes it, and toggling it +@@ -113,9 +114,9 @@ class TestRegistration: + assert Timestamp in units.registry + + def test_option_no_warning(self): +- pytest.importorskip("matplotlib.pyplot") ++ td.versioned_importorskip("matplotlib.pyplot") + ctx = cf.option_context("plotting.matplotlib.register_converters", False) +- plt = pytest.importorskip("matplotlib.pyplot") ++ plt = td.versioned_importorskip("matplotlib.pyplot") + s = Series(range(12), index=date_range("2017", periods=12)) + _, ax = plt.subplots() + +@@ -130,8 +131,8 @@ class TestRegistration: + plt.close() + + def test_registry_resets(self): +- units = pytest.importorskip("matplotlib.units") +- dates = pytest.importorskip("matplotlib.dates") ++ units = td.versioned_importorskip("matplotlib.units") ++ dates = td.versioned_importorskip("matplotlib.dates") + + # make a copy, to reset to + original = dict(units.registry) +--- a/pandas/tests/plotting/test_datetimelike.py ++++ b/pandas/tests/plotting/test_datetimelike.py +@@ -10,6 +10,7 @@ import pickle + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._libs.tslibs import ( + BaseOffset, + to_offset, +@@ -41,7 +42,7 @@ from pandas.tests.plotting.common import + + from pandas.tseries.offsets import WeekOfMonth + +-mpl = pytest.importorskip("matplotlib") ++mpl = td.versioned_importorskip("matplotlib") + + + class TestTSPlot: +@@ -737,7 +738,7 @@ class TestTSPlot: + assert ax.get_yaxis().get_visible() + + def test_secondary_kde(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + ser = Series(np.random.default_rng(2).standard_normal(10)) + fig, ax = mpl.pyplot.subplots() + ax = ser.plot(secondary_y=True, kind="density", ax=ax) +--- a/pandas/tests/plotting/test_groupby.py ++++ b/pandas/tests/plotting/test_groupby.py +@@ -4,6 +4,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + Index, +@@ -14,7 +15,7 @@ from pandas.tests.plotting.common import + _check_legend_labels, + ) + +-pytest.importorskip("matplotlib") ++td.versioned_importorskip("matplotlib") + + + class TestDataFrameGroupByPlots: +--- a/pandas/tests/plotting/test_hist_method.py ++++ b/pandas/tests/plotting/test_hist_method.py +@@ -4,6 +4,7 @@ import re + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + Index, +@@ -25,7 +26,7 @@ from pandas.tests.plotting.common import + get_y_axis, + ) + +-mpl = pytest.importorskip("matplotlib") ++mpl = td.versioned_importorskip("matplotlib") + + + @pytest.fixture +@@ -206,7 +207,7 @@ class TestSeriesPlots: + + @pytest.mark.xfail(reason="Api changed in 3.6.0") + def test_hist_kde(self, ts): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + _, ax = mpl.pyplot.subplots() + ax = ts.plot.hist(logy=True, ax=ax) + _check_ax_scales(ax, yaxis="log") +@@ -217,16 +218,16 @@ class TestSeriesPlots: + _check_text_labels(ylabels, [""] * len(ylabels)) + + def test_hist_kde_plot_works(self, ts): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + _check_plot_works(ts.plot.kde) + + def test_hist_kde_density_works(self, ts): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + _check_plot_works(ts.plot.density) + + @pytest.mark.xfail(reason="Api changed in 3.6.0") + def test_hist_kde_logy(self, ts): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + _, ax = mpl.pyplot.subplots() 
+ ax = ts.plot.kde(logy=True, ax=ax) + _check_ax_scales(ax, yaxis="log") +@@ -236,7 +237,7 @@ class TestSeriesPlots: + _check_text_labels(ylabels, [""] * len(ylabels)) + + def test_hist_kde_color_bins(self, ts): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + _, ax = mpl.pyplot.subplots() + ax = ts.plot.hist(logy=True, bins=10, color="b", ax=ax) + _check_ax_scales(ax, yaxis="log") +@@ -244,7 +245,7 @@ class TestSeriesPlots: + _check_colors(ax.patches, facecolors=["b"] * 10) + + def test_hist_kde_color(self, ts): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + _, ax = mpl.pyplot.subplots() + ax = ts.plot.kde(logy=True, color="r", ax=ax) + _check_ax_scales(ax, yaxis="log") +@@ -631,7 +632,7 @@ class TestDataFramePlots: + + def test_hist_with_nans_and_weights(self): + # GH 48884 +- mpl_patches = pytest.importorskip("matplotlib.patches") ++ mpl_patches = td.versioned_importorskip("matplotlib.patches") + df = DataFrame( + [[np.nan, 0.2, 0.3], [0.4, np.nan, np.nan], [0.7, 0.8, 0.9]], + columns=list("abc"), +--- a/pandas/tests/plotting/test_misc.py ++++ b/pandas/tests/plotting/test_misc.py +@@ -26,9 +26,9 @@ from pandas.tests.plotting.common import + _check_ticks_props, + ) + +-mpl = pytest.importorskip("matplotlib") +-plt = pytest.importorskip("matplotlib.pyplot") +-cm = pytest.importorskip("matplotlib.cm") ++mpl = td.versioned_importorskip("matplotlib") ++plt = td.versioned_importorskip("matplotlib.pyplot") ++cm = td.versioned_importorskip("matplotlib.cm") + + + @pytest.fixture +@@ -148,7 +148,7 @@ class TestSeriesPlots: + class TestDataFramePlots: + @pytest.mark.parametrize("pass_axis", [False, True]) + def test_scatter_matrix_axis(self, pass_axis): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + scatter_matrix = plotting.scatter_matrix + + ax = None +@@ -173,7 +173,7 @@ class TestDataFramePlots: + + @pytest.mark.parametrize("pass_axis", [False, True]) + def test_scatter_matrix_axis_smaller(self, pass_axis): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + scatter_matrix = plotting.scatter_matrix + + ax = None +--- a/pandas/tests/plotting/test_series.py ++++ b/pandas/tests/plotting/test_series.py +@@ -32,8 +32,8 @@ from pandas.tests.plotting.common import + get_y_axis, + ) + +-mpl = pytest.importorskip("matplotlib") +-plt = pytest.importorskip("matplotlib.pyplot") ++mpl = td.versioned_importorskip("matplotlib") ++plt = td.versioned_importorskip("matplotlib.pyplot") + + + @pytest.fixture +@@ -569,16 +569,16 @@ class TestSeriesPlots: + ], + ) + def test_kde_kwargs(self, ts, bw_method, ind): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + _check_plot_works(ts.plot.kde, bw_method=bw_method, ind=ind) + + def test_density_kwargs(self, ts): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + sample_points = np.linspace(-100, 100, 20) + _check_plot_works(ts.plot.density, bw_method=0.5, ind=sample_points) + + def test_kde_kwargs_check_axes(self, ts): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + _, ax = mpl.pyplot.subplots() + sample_points = np.linspace(-100, 100, 20) + ax = ts.plot.kde(logy=True, bw_method=0.5, ind=sample_points, ax=ax) +@@ -586,7 +586,7 @@ class TestSeriesPlots: + _check_text_labels(ax.yaxis.get_label(), "Density") + + def test_kde_missing_vals(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series(np.random.default_rng(2).uniform(size=50)) + s[0] = np.nan + axes = 
_check_plot_works(s.plot.kde) +@@ -609,7 +609,7 @@ class TestSeriesPlots: + plotting.PlotAccessor._common_kinds + plotting.PlotAccessor._series_kinds, + ) + def test_kind_kwarg(self, kind): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series(range(3)) + _, ax = mpl.pyplot.subplots() + s.plot(kind=kind, ax=ax) +@@ -620,7 +620,7 @@ class TestSeriesPlots: + plotting.PlotAccessor._common_kinds + plotting.PlotAccessor._series_kinds, + ) + def test_kind_attr(self, kind): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series(range(3)) + _, ax = mpl.pyplot.subplots() + getattr(s.plot, kind)() +@@ -636,7 +636,7 @@ class TestSeriesPlots: + + @pytest.mark.parametrize("kind", plotting.PlotAccessor._common_kinds) + def test_valid_object_plot(self, kind): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series(range(10), dtype=object) + _check_plot_works(s.plot, kind=kind) + +@@ -750,7 +750,7 @@ class TestSeriesPlots: + @pytest.mark.slow + def test_series_grid_settings(self): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + _check_grid_settings( + Series([1, 2, 3]), + plotting.PlotAccessor._series_kinds + plotting.PlotAccessor._common_kinds, +--- a/pandas/tests/plotting/test_style.py ++++ b/pandas/tests/plotting/test_style.py +@@ -1,8 +1,9 @@ + import pytest + ++import pandas.util._test_decorators as td + from pandas import Series + +-pytest.importorskip("matplotlib") ++td.versioned_importorskip("matplotlib") + from pandas.plotting._matplotlib.style import get_standard_colors + + +--- a/pandas/tests/reductions/test_reductions.py ++++ b/pandas/tests/reductions/test_reductions.py +@@ -7,6 +7,7 @@ from decimal import Decimal + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + Categorical, +@@ -1091,7 +1092,7 @@ class TestSeriesReductions: + + def test_any_all_pyarrow_string(self): + # GH#54591 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + ser = Series(["", "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() +--- a/pandas/tests/reductions/test_stat_reductions.py ++++ b/pandas/tests/reductions/test_stat_reductions.py +@@ -6,6 +6,7 @@ import inspect + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + DataFrame, +@@ -231,7 +232,7 @@ class TestSeriesStatReductions: + assert pd.isna(result) + + def test_skew(self): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + string_series = Series(range(20), dtype=np.float64, name="series") + +@@ -253,7 +254,7 @@ class TestSeriesStatReductions: + assert (df.skew() == 0).all() + + def test_kurt(self): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + string_series = Series(range(20), dtype=np.float64, name="series") + +--- a/pandas/tests/resample/test_datetime_index.py ++++ b/pandas/tests/resample/test_datetime_index.py +@@ -1110,7 +1110,7 @@ def test_resample_dtype_preservation(uni + + + def test_resample_dtype_coercion(unit): +- pytest.importorskip("scipy.interpolate") ++ td.versioned_importorskip("scipy.interpolate") + + # GH 16361 + df = {"a": [1, 3, 1, 4]} +--- a/pandas/tests/reshape/merge/test_merge.py ++++ b/pandas/tests/reshape/merge/test_merge.py +@@ -8,6 +8,7 @@ 
import re + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.core.dtypes.common import ( + is_object_dtype, + is_string_dtype, +@@ -2817,7 +2818,7 @@ def test_merge_ea_and_non_ea(any_numeric + @pytest.mark.parametrize("dtype", ["int64", "int64[pyarrow]"]) + def test_merge_arrow_and_numpy_dtypes(dtype): + # GH#52406 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame({"a": [1, 2]}, dtype=dtype) + df2 = DataFrame({"a": [1, 2]}, dtype="int64[pyarrow]") + result = df.merge(df2) +@@ -2967,7 +2968,7 @@ def test_merge_ea_int_and_float_numpy(): + + def test_merge_arrow_string_index(any_string_dtype): + # GH#54894 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + left = DataFrame({"a": ["a", "b"]}, dtype=any_string_dtype) + right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype)) + result = left.merge(right, left_on="a", right_index=True, how="left") +--- a/pandas/tests/reshape/test_melt.py ++++ b/pandas/tests/reshape/test_melt.py +@@ -3,6 +3,7 @@ import re + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + DataFrame, +@@ -1224,7 +1225,7 @@ class TestWideToLong: + + def test_wide_to_long_pyarrow_string_columns(): + # GH 57066 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + df = DataFrame( + { + "ID": {0: 1}, +--- a/pandas/tests/series/accessors/test_list_accessor.py ++++ b/pandas/tests/series/accessors/test_list_accessor.py +@@ -2,13 +2,14 @@ import re + + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + ArrowDtype, + Series, + ) + import pandas._testing as tm + +-pa = pytest.importorskip("pyarrow") ++pa = td.versioned_importorskip("pyarrow") + + from pandas.compat import pa_version_under11p0 + +--- a/pandas/tests/series/accessors/test_struct_accessor.py ++++ b/pandas/tests/series/accessors/test_struct_accessor.py +@@ -2,6 +2,7 @@ import re + + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat.pyarrow import ( + pa_version_under11p0, + pa_version_under13p0, +@@ -15,8 +16,8 @@ from pandas import ( + ) + import pandas._testing as tm + +-pa = pytest.importorskip("pyarrow") +-pc = pytest.importorskip("pyarrow.compute") ++pa = td.versioned_importorskip("pyarrow") ++pc = td.versioned_importorskip("pyarrow.compute") + + + def test_struct_accessor_dtypes(): +--- a/pandas/tests/series/methods/test_convert_dtypes.py ++++ b/pandas/tests/series/methods/test_convert_dtypes.py +@@ -3,6 +3,7 @@ from itertools import product + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._libs import lib + + import pandas as pd +@@ -291,7 +292,7 @@ class TestSeriesConvertDtypes: + + def test_convert_dtypes_pyarrow_to_np_nullable(self): + # GH 53648 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + ser = pd.Series(range(2), dtype="int32[pyarrow]") + result = ser.convert_dtypes(dtype_backend="numpy_nullable") + expected = pd.Series(range(2), dtype="Int32") +@@ -299,7 +300,7 @@ class TestSeriesConvertDtypes: + + def test_convert_dtypes_pyarrow_null(self): + # GH#55346 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + ser = pd.Series([None, None]) + result = ser.convert_dtypes(dtype_backend="pyarrow") + expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) +--- a/pandas/tests/series/methods/test_cov_corr.py ++++ 
b/pandas/tests/series/methods/test_cov_corr.py +@@ -3,6 +3,7 @@ import math + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + Series, +@@ -58,7 +59,7 @@ class TestSeriesCov: + class TestSeriesCorr: + @pytest.mark.parametrize("dtype", ["float64", "Float64"]) + def test_corr(self, datetime_series, dtype): +- stats = pytest.importorskip("scipy.stats") ++ stats = td.versioned_importorskip("scipy.stats") + + datetime_series = datetime_series.astype(dtype) + +@@ -93,7 +94,7 @@ class TestSeriesCorr: + tm.assert_almost_equal(result, expected) + + def test_corr_rank(self): +- stats = pytest.importorskip("scipy.stats") ++ stats = td.versioned_importorskip("scipy.stats") + + # kendall and spearman + A = Series( +--- a/pandas/tests/series/methods/test_drop_duplicates.py ++++ b/pandas/tests/series/methods/test_drop_duplicates.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + Categorical, +@@ -252,7 +253,7 @@ class TestSeriesDropDuplicates: + tm.assert_series_equal(result, expected) + + def test_duplicated_arrow_dtype(self): +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + ser = Series([True, False, None, False], dtype="bool[pyarrow]") + result = ser.drop_duplicates() + expected = Series([True, False, None], dtype="bool[pyarrow]") +@@ -260,7 +261,7 @@ class TestSeriesDropDuplicates: + + def test_drop_duplicates_arrow_strings(self): + # GH#54904 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + ser = Series(["a", "a"], dtype=pd.ArrowDtype(pa.string())) + result = ser.drop_duplicates() + expecetd = Series(["a"], dtype=pd.ArrowDtype(pa.string())) +--- a/pandas/tests/series/methods/test_explode.py ++++ b/pandas/tests/series/methods/test_explode.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + import pandas._testing as tm + +@@ -146,7 +147,7 @@ def test_explode_scalars_can_ignore_inde + @pytest.mark.parametrize("ignore_index", [True, False]) + def test_explode_pyarrow_list_type(ignore_index): + # GH 53602 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + data = [ + [None, None], +@@ -167,7 +168,7 @@ def test_explode_pyarrow_list_type(ignor + + @pytest.mark.parametrize("ignore_index", [True, False]) + def test_explode_pyarrow_non_list_type(ignore_index): +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + data = [1, 2, 3] + ser = pd.Series(data, dtype=pd.ArrowDtype(pa.int64())) + result = ser.explode(ignore_index=ignore_index) +--- a/pandas/tests/series/methods/test_interpolate.py ++++ b/pandas/tests/series/methods/test_interpolate.py +@@ -118,7 +118,7 @@ class TestSeriesInterpolateData: + non_ts.interpolate(method="time") + + def test_interpolate_cubicspline(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + ser = Series([10, 11, 12, 13]) + + expected = Series( +@@ -133,7 +133,7 @@ class TestSeriesInterpolateData: + tm.assert_series_equal(result, expected) + + def test_interpolate_pchip(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + ser = Series(np.sort(np.random.default_rng(2).uniform(size=100))) + + # interpolate at new_index +@@ -145,7 +145,7 @@ class TestSeriesInterpolateData: + interp_s.loc[49:51] + + def test_interpolate_akima(self): +- 
pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + ser = Series([10, 11, 12, 13]) + + # interpolate at new_index where `der` is zero +@@ -171,7 +171,7 @@ class TestSeriesInterpolateData: + tm.assert_series_equal(interp_s.loc[1:3], expected) + + def test_interpolate_piecewise_polynomial(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + ser = Series([10, 11, 12, 13]) + + expected = Series( +@@ -186,7 +186,7 @@ class TestSeriesInterpolateData: + tm.assert_series_equal(interp_s.loc[1:3], expected) + + def test_interpolate_from_derivatives(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + ser = Series([10, 11, 12, 13]) + + expected = Series( +@@ -276,14 +276,14 @@ class TestSeriesInterpolateData: + tm.assert_series_equal(result, expected) + + def test_interp_quad(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4]) + result = sq.interpolate(method="quadratic") + expected = Series([1.0, 4.0, 9.0, 16.0], index=[1, 2, 3, 4]) + tm.assert_series_equal(result, expected) + + def test_interp_scipy_basic(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series([1, 3, np.nan, 12, np.nan, 25]) + # slinear + expected = Series([1.0, 3.0, 7.5, 12.0, 18.5, 25.0]) +@@ -618,7 +618,7 @@ class TestSeriesInterpolateData: + tm.assert_series_equal(result, expected) + + def test_interp_all_good(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series([1, 2, 3]) + result = s.interpolate(method="polynomial", order=1) + tm.assert_series_equal(result, s) +@@ -645,7 +645,7 @@ class TestSeriesInterpolateData: + s.interpolate(method="polynomial", order=1) + + def test_interp_nonmono_raise(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series([1, np.nan, 3], index=[0, 2, 1]) + msg = "krogh interpolation requires that the index be monotonic" + with pytest.raises(ValueError, match=msg): +@@ -653,7 +653,7 @@ class TestSeriesInterpolateData: + + @pytest.mark.parametrize("method", ["nearest", "pad"]) + def test_interp_datetime64(self, method, tz_naive_fixture): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = Series( + [1, np.nan, 3], index=date_range("1/1/2000", periods=3, tz=tz_naive_fixture) + ) +@@ -699,7 +699,7 @@ class TestSeriesInterpolateData: + @pytest.mark.parametrize("method", ["polynomial", "spline"]) + def test_no_order(self, method): + # see GH-10633, GH-24014 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series([0, 1, np.nan, 3]) + msg = "You must specify the order of the spline or polynomial" + with pytest.raises(ValueError, match=msg): +@@ -707,21 +707,21 @@ class TestSeriesInterpolateData: + + @pytest.mark.parametrize("order", [-1, -1.0, 0, 0.0, np.nan]) + def test_interpolate_spline_invalid_order(self, order): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series([0, 1, np.nan, 3]) + msg = "order needs to be specified and greater than 0" + with pytest.raises(ValueError, match=msg): + s.interpolate(method="spline", order=order) + + def test_spline(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series([1, 2, np.nan, 4, 5, np.nan, 7]) + result = s.interpolate(method="spline", order=1) + expected = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) + tm.assert_series_equal(result, expected) + + def test_spline_extrapolate(self): +- 
pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series([1, 2, 3, 4, np.nan, 6, np.nan]) + result3 = s.interpolate(method="spline", order=1, ext=3) + expected3 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0]) +@@ -732,7 +732,7 @@ class TestSeriesInterpolateData: + tm.assert_series_equal(result1, expected1) + + def test_spline_smooth(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series([1, 2, np.nan, 4, 5.1, np.nan, 7]) + assert ( + s.interpolate(method="spline", order=3, s=0)[5] +@@ -741,7 +741,7 @@ class TestSeriesInterpolateData: + + def test_spline_interpolation(self): + # Explicit cast to float to avoid implicit cast when setting np.nan +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series(np.arange(10) ** 2, dtype="float") + s[np.random.default_rng(2).integers(0, 9, 3)] = np.nan + result1 = s.interpolate(method="spline", order=1) +@@ -801,7 +801,7 @@ class TestSeriesInterpolateData: + + method, kwargs = interp_methods_ind + if method == "pchip": +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + + if method == "linear": + result = df[0].interpolate(**kwargs) +@@ -824,7 +824,7 @@ class TestSeriesInterpolateData: + are tested here. + """ + # gh 21662 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + ind = pd.timedelta_range(start=1, periods=4) + df = pd.DataFrame([0, 1, np.nan, 3], index=ind) + +@@ -861,7 +861,7 @@ class TestSeriesInterpolateData: + + def test_interpolate_fill_value(self): + # GH#54920 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + ser = Series([np.nan, 0, 1, np.nan, 3, np.nan]) + result = ser.interpolate(method="nearest", fill_value=0) + expected = Series([np.nan, 0, 1, 1, 3, 0]) +--- a/pandas/tests/series/methods/test_rank.py ++++ b/pandas/tests/series/methods/test_rank.py +@@ -56,7 +56,7 @@ def dtype(request): + + class TestSeriesRank: + def test_rank(self, datetime_series): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + datetime_series[::2] = np.nan + datetime_series[:10:3] = 4.0 +@@ -269,7 +269,7 @@ class TestSeriesRank: + def test_rank_tie_methods_on_infs_nans( + self, method, na_option, ascending, dtype, na_value, pos_inf, neg_inf + ): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + if dtype == "float64[pyarrow]": + if method == "average": + exp_dtype = "float64[pyarrow]" +@@ -318,7 +318,7 @@ class TestSeriesRank: + ], + ) + def test_rank_methods_series(self, method, op, value): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + xs = np.random.default_rng(2).standard_normal(9) + xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates +--- a/pandas/tests/series/methods/test_reset_index.py ++++ b/pandas/tests/series/methods/test_reset_index.py +@@ -3,6 +3,7 @@ from datetime import datetime + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + DataFrame, +@@ -170,7 +171,7 @@ class TestResetIndex: + + def test_reset_index_drop_infer_string(self): + # GH#56160 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + ser = Series(["a", "b", "c"], dtype=object) + with option_context("future.infer_string", True): + result = ser.reset_index(drop=True) +--- a/pandas/tests/series/test_api.py ++++ b/pandas/tests/series/test_api.py +@@ -4,6 +4,7 @@ import pydoc + import numpy 
as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import ( + DataFrame, +@@ -169,7 +170,7 @@ class TestSeriesMisc: + + def test_inspect_getmembers(self): + # GH38782 +- pytest.importorskip("jinja2") ++ td.versioned_importorskip("jinja2") + ser = Series(dtype=object) + msg = "Series._data is deprecated" + with tm.assert_produces_warning( +--- a/pandas/tests/series/test_constructors.py ++++ b/pandas/tests/series/test_constructors.py +@@ -2094,7 +2094,7 @@ class TestSeriesConstructors: + + def test_series_string_inference(self): + # GH#54430 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + expected = Series(["a", "b"], dtype=dtype) + with pd.option_context("future.infer_string", True): +@@ -2109,7 +2109,7 @@ class TestSeriesConstructors: + @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA]) + def test_series_string_with_na_inference(self, na_value): + # GH#54430 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + expected = Series(["a", na_value], dtype=dtype) + with pd.option_context("future.infer_string", True): +@@ -2118,7 +2118,7 @@ class TestSeriesConstructors: + + def test_series_string_inference_scalar(self): + # GH#54430 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + expected = Series("a", index=[1], dtype="string[pyarrow_numpy]") + with pd.option_context("future.infer_string", True): + ser = Series("a", index=[1]) +@@ -2126,7 +2126,7 @@ class TestSeriesConstructors: + + def test_series_string_inference_array_string_dtype(self): + # GH#54496 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + with pd.option_context("future.infer_string", True): + ser = Series(np.array(["a", "b"])) +@@ -2134,7 +2134,7 @@ class TestSeriesConstructors: + + def test_series_string_inference_storage_definition(self): + # GH#54793 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + expected = Series(["a", "b"], dtype="string[pyarrow_numpy]") + with pd.option_context("future.infer_string", True): + result = Series(["a", "b"], dtype="string") +@@ -2150,7 +2150,7 @@ class TestSeriesConstructors: + + def test_series_string_inference_na_first(self): + # GH#55655 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + expected = Series([pd.NA, "b"], dtype="string[pyarrow_numpy]") + with pd.option_context("future.infer_string", True): + result = Series([pd.NA, "b"]) +--- a/pandas/tests/series/test_formats.py ++++ b/pandas/tests/series/test_formats.py +@@ -6,6 +6,7 @@ from datetime import ( + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._config import using_pyarrow_string_dtype + + import pandas as pd +@@ -227,7 +228,7 @@ class TestSeriesRepr: + repr(ts2).splitlines()[-1] + + def test_latex_repr(self): +- pytest.importorskip("jinja2") # uses Styler implementation ++ td.versioned_importorskip("jinja2") # uses Styler implementation + result = r"""\begin{tabular}{ll} + \toprule + & 0 \\ +--- a/pandas/tests/series/test_logical_ops.py ++++ b/pandas/tests/series/test_logical_ops.py +@@ -4,6 +4,7 @@ import operator + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + Index, +@@ -533,7 +534,7 @@ class TestSeriesLogicalOps: + + def test_pyarrow_numpy_string_invalid(self): + 
# GH#56008 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + ser = Series([False, True]) + ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") + result = ser == ser2 +--- a/pandas/tests/series/test_reductions.py ++++ b/pandas/tests/series/test_reductions.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + import pandas as pd + from pandas import Series + import pandas._testing as tm +@@ -53,7 +54,7 @@ def test_mode_nullable_dtype(any_numeric + + def test_mode_infer_string(): + # GH#56183 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + ser = Series(["a", "b"], dtype=object) + with pd.option_context("future.infer_string", True): + result = ser.mode() +--- a/pandas/tests/strings/test_extract.py ++++ b/pandas/tests/strings/test_extract.py +@@ -4,6 +4,7 @@ import re + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.core.dtypes.dtypes import ArrowDtype + + from pandas import ( +@@ -718,7 +719,7 @@ def test_extractall_same_as_extract_subj + def test_extractall_preserves_dtype(): + # Ensure that when extractall is called on a series with specific dtypes set, that + # the dtype is preserved in the resulting DataFrame's column. +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + result = Series(["abc", "ab"], dtype=ArrowDtype(pa.string())).str.extractall("(ab)") + assert result.dtypes[0] == "string[pyarrow]" +--- a/pandas/tests/test_algos.py ++++ b/pandas/tests/test_algos.py +@@ -4,6 +4,7 @@ import struct + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas._libs import ( + algos as libalgos, + hashtable as ht, +@@ -1789,7 +1790,7 @@ class TestRank: + ], + ) + def test_scipy_compat(self, arr): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + arr = np.array(arr) + +--- a/pandas/tests/test_downstream.py ++++ b/pandas/tests/test_downstream.py +@@ -44,8 +44,8 @@ def test_dask(df): + olduse = pd.get_option("compute.use_numexpr") + + try: +- pytest.importorskip("toolz") +- dd = pytest.importorskip("dask.dataframe") ++ td.versioned_importorskip("toolz") ++ dd = td.versioned_importorskip("dask.dataframe") + + ddf = dd.from_pandas(df, npartitions=3) + assert ddf.A is not None +@@ -61,8 +61,8 @@ def test_dask_ufunc(): + olduse = pd.get_option("compute.use_numexpr") + + try: +- da = pytest.importorskip("dask.array") +- dd = pytest.importorskip("dask.dataframe") ++ da = td.versioned_importorskip("dask.array") ++ dd = td.versioned_importorskip("dask.dataframe") + + s = Series([1.5, 2.3, 3.7, 4.0]) + ds = dd.from_pandas(s, npartitions=2) +@@ -78,7 +78,7 @@ def test_dask_ufunc(): + def test_construct_dask_float_array_int_dtype_match_ndarray(): + # GH#40110 make sure we treat a float-dtype dask array with the same + # rules we would for an ndarray +- dd = pytest.importorskip("dask.dataframe") ++ dd = td.versioned_importorskip("dask.dataframe") + + arr = np.array([1, 2.5, 3]) + darr = dd.from_array(arr) +@@ -102,15 +102,15 @@ def test_construct_dask_float_array_int_ + + + def test_xarray(df): +- pytest.importorskip("xarray") ++ td.versioned_importorskip("xarray") + + assert df.to_xarray() is not None + + + def test_xarray_cftimeindex_nearest(): + # https://github.com/pydata/xarray/issues/3751 +- cftime = pytest.importorskip("cftime") +- xarray = pytest.importorskip("xarray") ++ cftime = td.versioned_importorskip("cftime") 
++ xarray = td.versioned_importorskip("xarray") + + times = xarray.cftime_range("0001", periods=2) + key = cftime.DatetimeGregorian(2000, 1, 1) +@@ -142,7 +142,7 @@ def test_oo_optimized_datetime_index_unp + + + def test_statsmodels(): +- smf = pytest.importorskip("statsmodels.formula.api") ++ smf = td.versioned_importorskip("statsmodels.formula.api") + + df = DataFrame( + {"Lottery": range(5), "Literacy": range(5), "Pop1831": range(100, 105)} +@@ -151,7 +151,7 @@ def test_statsmodels(): + + + def test_scikit_learn(): +- pytest.importorskip("sklearn") ++ td.versioned_importorskip("sklearn") + from sklearn import ( + datasets, + svm, +@@ -164,7 +164,7 @@ def test_scikit_learn(): + + + def test_seaborn(): +- seaborn = pytest.importorskip("seaborn") ++ seaborn = td.versioned_importorskip("seaborn") + tips = DataFrame( + {"day": pd.date_range("2023", freq="D", periods=5), "total_bill": range(5)} + ) +@@ -172,12 +172,12 @@ def test_seaborn(): + + + def test_pandas_datareader(): +- pytest.importorskip("pandas_datareader") ++ td.versioned_importorskip("pandas_datareader") + + + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") + def test_pyarrow(df): +- pyarrow = pytest.importorskip("pyarrow") ++ pyarrow = td.versioned_importorskip("pyarrow") + table = pyarrow.Table.from_pandas(df) + result = table.to_pandas() + tm.assert_frame_equal(result, df) +@@ -185,7 +185,7 @@ def test_pyarrow(df): + + def test_yaml_dump(df): + # GH#42748 +- yaml = pytest.importorskip("yaml") ++ yaml = td.versioned_importorskip("yaml") + + dumped = yaml.dump(df) + +@@ -247,7 +247,7 @@ def test_frame_setitem_dask_array_into_n + olduse = pd.get_option("compute.use_numexpr") + + try: +- da = pytest.importorskip("dask.array") ++ da = td.versioned_importorskip("dask.array") + + dda = da.array([1, 2]) + df = DataFrame({"a": ["a", "b"]}) +@@ -348,7 +348,7 @@ def test_dataframe_consortium() -> None: + Full testing is done at https://github.com/data-apis/dataframe-api-compat, + this is just to check that the entry point works as expected. 
+ """ +- pytest.importorskip("dataframe_api_compat") ++ td.versioned_importorskip("dataframe_api_compat") + df_pd = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = df_pd.__dataframe_consortium_standard__() + result_1 = df.get_column_names() +@@ -362,7 +362,7 @@ def test_dataframe_consortium() -> None: + + def test_xarray_coerce_unit(): + # GH44053 +- xr = pytest.importorskip("xarray") ++ xr = td.versioned_importorskip("xarray") + + arr = xr.DataArray([1, 2, 3]) + result = pd.to_datetime(arr, unit="ns") +--- a/pandas/tests/test_nanops.py ++++ b/pandas/tests/test_nanops.py +@@ -500,7 +500,7 @@ class TestnanopsDataFrame: + + @pytest.mark.parametrize("ddof", range(3)) + def test_nansem(self, ddof, skipna): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + with np.errstate(invalid="ignore"): + self.check_funs( +@@ -559,7 +559,7 @@ class TestnanopsDataFrame: + return result + + def test_nanskew(self, skipna): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + func = partial(self._skew_kurt_wrap, func=sp_stats.skew) + with np.errstate(invalid="ignore"): +@@ -573,7 +573,7 @@ class TestnanopsDataFrame: + ) + + def test_nankurt(self, skipna): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + func1 = partial(sp_stats.kurtosis, fisher=True) + func = partial(self._skew_kurt_wrap, func=func1) +@@ -704,7 +704,7 @@ class TestnanopsDataFrame: + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson") + + def test_nancorr_kendall(self): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + targ0 = sp_stats.kendalltau(self.arr_float_2d, self.arr_float1_2d)[0] + targ1 = sp_stats.kendalltau(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0] +@@ -714,7 +714,7 @@ class TestnanopsDataFrame: + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="kendall") + + def test_nancorr_spearman(self): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + targ0 = sp_stats.spearmanr(self.arr_float_2d, self.arr_float1_2d)[0] + targ1 = sp_stats.spearmanr(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0] +@@ -724,7 +724,7 @@ class TestnanopsDataFrame: + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="spearman") + + def test_invalid_method(self): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] + targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] + msg = "Unknown method 'foo', expected one of 'kendall', 'spearman'" +--- a/pandas/tests/test_optional_dependency.py ++++ b/pandas/tests/test_optional_dependency.py +@@ -3,6 +3,7 @@ import types + + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat._optional import ( + VERSIONS, + import_optional_dependency, +@@ -23,7 +24,7 @@ def test_import_optional(): + + + def test_xlrd_version_fallback(): +- pytest.importorskip("xlrd") ++ td.versioned_importorskip("xlrd") + import_optional_dependency("xlrd") + + +--- a/pandas/tests/tools/test_to_datetime.py ++++ b/pandas/tests/tools/test_to_datetime.py +@@ -1002,7 +1002,7 @@ class TestToDatetime: + @pytest.mark.parametrize("utc", [True, False]) + @pytest.mark.parametrize("tz", [None, "US/Central"]) + def test_to_datetime_arrow(self, tz, utc, arg_class): +- pa = 
pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + + dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz) + dti = arg_class(dti) +@@ -1357,7 +1357,7 @@ class TestToDatetime: + + def test_to_datetime_tz_psycopg2(self, request, cache): + # xref 8260 +- psycopg2_tz = pytest.importorskip("psycopg2.tz") ++ psycopg2_tz = td.versioned_importorskip("psycopg2.tz") + + # misc cases + tz1 = psycopg2_tz.FixedOffsetTimezone(offset=-300, name=None) +@@ -3731,7 +3731,7 @@ def test_ignoring_unknown_tz_deprecated( + + def test_from_numeric_arrow_dtype(any_numeric_ea_dtype): + # GH 52425 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + ser = Series([1, 2], dtype=f"{any_numeric_ea_dtype.lower()}[pyarrow]") + result = to_datetime(ser) + expected = Series([1, 2], dtype="datetime64[ns]") +--- a/pandas/tests/tools/test_to_numeric.py ++++ b/pandas/tests/tools/test_to_numeric.py +@@ -867,7 +867,7 @@ def test_to_numeric_dtype_backend(val, d + def test_to_numeric_dtype_backend_na(val, dtype): + # GH#50505 + if "pyarrow" in dtype: +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + dtype_backend = "pyarrow" + else: + dtype_backend = "numpy_nullable" +@@ -891,7 +891,7 @@ def test_to_numeric_dtype_backend_na(val + def test_to_numeric_dtype_backend_downcasting(val, dtype, downcast): + # GH#50505 + if "pyarrow" in dtype: +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + dtype_backend = "pyarrow" + else: + dtype_backend = "numpy_nullable" +@@ -908,7 +908,7 @@ def test_to_numeric_dtype_backend_downca + def test_to_numeric_dtype_backend_downcasting_uint(smaller, dtype_backend): + # GH#50505 + if dtype_backend == "pyarrow": +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + ser = Series([1, pd.NA], dtype="UInt64") + result = to_numeric(ser, dtype_backend=dtype_backend, downcast="unsigned") + expected = Series([1, pd.NA], dtype=smaller) +@@ -931,7 +931,7 @@ def test_to_numeric_dtype_backend_downca + def test_to_numeric_dtype_backend_already_nullable(dtype): + # GH#50505 + if "pyarrow" in dtype: +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + ser = Series([1, pd.NA], dtype=dtype) + result = to_numeric(ser, dtype_backend="numpy_nullable") + expected = Series([1, pd.NA], dtype=dtype) +@@ -971,7 +971,7 @@ def test_invalid_dtype_backend(): + + def test_coerce_pyarrow_backend(): + # GH 52588 +- pa = pytest.importorskip("pyarrow") ++ pa = td.versioned_importorskip("pyarrow") + ser = Series(list("12x"), dtype=ArrowDtype(pa.string())) + result = to_numeric(ser, errors="coerce", dtype_backend="pyarrow") + expected = Series([1, 2, None], dtype=ArrowDtype(pa.int64())) +--- a/pandas/tests/tools/test_to_timedelta.py ++++ b/pandas/tests/tools/test_to_timedelta.py +@@ -6,6 +6,7 @@ from datetime import ( + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas.compat import IS64 + from pandas.errors import OutOfBoundsTimedelta + +@@ -324,7 +325,7 @@ class TestTimedeltas: + + def test_from_numeric_arrow_dtype(any_numeric_ea_dtype): + # GH 52425 +- pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + ser = Series([1, 2], dtype=f"{any_numeric_ea_dtype.lower()}[pyarrow]") + result = to_timedelta(ser) + expected = Series([1, 2], dtype="timedelta64[ns]") +@@ -334,7 +335,7 @@ def test_from_numeric_arrow_dtype(any_nu + @pytest.mark.parametrize("unit", ["ns", "ms"]) + def test_from_timedelta_arrow_dtype(unit): + # GH 54298 +- 
pytest.importorskip("pyarrow") ++ td.versioned_importorskip("pyarrow") + expected = Series([timedelta(1)], dtype=f"duration[{unit}][pyarrow]") + result = to_timedelta(expected) + tm.assert_series_equal(result, expected) +--- a/pandas/tests/window/test_online.py ++++ b/pandas/tests/window/test_online.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + Series, +@@ -9,7 +10,7 @@ import pandas._testing as tm + + pytestmark = pytest.mark.single_cpu + +-pytest.importorskip("numba") ++td.versioned_importorskip("numba") + + + @pytest.mark.filterwarnings("ignore") +--- a/pandas/tests/window/test_rolling_skew_kurt.py ++++ b/pandas/tests/window/test_rolling_skew_kurt.py +@@ -3,6 +3,7 @@ from functools import partial + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + Series, +@@ -17,7 +18,7 @@ from pandas.tseries import offsets + + @pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]]) + def test_series(series, sp_func, roll_func): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + compare_func = partial(getattr(sp_stats, sp_func), bias=False) + result = getattr(series.rolling(50), roll_func)() +@@ -27,7 +28,7 @@ def test_series(series, sp_func, roll_fu + + @pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]]) + def test_frame(raw, frame, sp_func, roll_func): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + compare_func = partial(getattr(sp_stats, sp_func), bias=False) + result = getattr(frame.rolling(50), roll_func)() +@@ -41,7 +42,7 @@ def test_frame(raw, frame, sp_func, roll + + @pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]]) + def test_time_rule_series(series, sp_func, roll_func): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + compare_func = partial(getattr(sp_stats, sp_func), bias=False) + win = 25 +@@ -56,7 +57,7 @@ def test_time_rule_series(series, sp_fun + + @pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]]) + def test_time_rule_frame(raw, frame, sp_func, roll_func): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + compare_func = partial(getattr(sp_stats, sp_func), bias=False) + win = 25 +@@ -75,7 +76,7 @@ def test_time_rule_frame(raw, frame, sp_ + + @pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]]) + def test_nans(sp_func, roll_func): +- sp_stats = pytest.importorskip("scipy.stats") ++ sp_stats = td.versioned_importorskip("scipy.stats") + + compare_func = partial(getattr(sp_stats, sp_func), bias=False) + obj = Series(np.random.default_rng(2).standard_normal(50)) +--- a/pandas/tests/window/test_win_type.py ++++ b/pandas/tests/window/test_win_type.py +@@ -1,6 +1,7 @@ + import numpy as np + import pytest + ++import pandas.util._test_decorators as td + from pandas import ( + DataFrame, + Series, +@@ -35,7 +36,7 @@ def win_types_special(request): + + def test_constructor(frame_or_series): + # GH 12669 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + c = frame_or_series(range(5)).rolling + + # valid +@@ -47,7 +48,7 @@ def test_constructor(frame_or_series): + @pytest.mark.parametrize("w", [2.0, "foo", 
np.array([2])]) + def test_invalid_constructor(frame_or_series, w): + # not valid +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + c = frame_or_series(range(5)).rolling + with pytest.raises(ValueError, match="min_periods must be an integer"): + c(win_type="boxcar", window=2, min_periods=w) +@@ -57,7 +58,7 @@ def test_invalid_constructor(frame_or_se + + @pytest.mark.parametrize("wt", ["foobar", 1]) + def test_invalid_constructor_wintype(frame_or_series, wt): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + c = frame_or_series(range(5)).rolling + with pytest.raises(ValueError, match="Invalid win_type"): + c(win_type=wt, window=2) +@@ -65,14 +66,14 @@ def test_invalid_constructor_wintype(fra + + def test_constructor_with_win_type(frame_or_series, win_types): + # GH 12669 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + c = frame_or_series(range(5)).rolling + c(win_type=win_types, window=2) + + + @pytest.mark.parametrize("arg", ["median", "kurt", "skew"]) + def test_agg_function_support(arg): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame({"A": np.arange(5)}) + roll = df.rolling(2, win_type="triang") + +@@ -89,7 +90,7 @@ def test_agg_function_support(arg): + + def test_invalid_scipy_arg(): + # This error is raised by scipy +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + msg = r"boxcar\(\) got an unexpected" + with pytest.raises(TypeError, match=msg): + Series(range(3)).rolling(1, win_type="boxcar").mean(foo="bar") +@@ -97,7 +98,7 @@ def test_invalid_scipy_arg(): + + def test_constructor_with_win_type_invalid(frame_or_series): + # GH 13383 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + c = frame_or_series(range(5)).rolling + + msg = "window must be an integer 0 or greater" +@@ -108,7 +109,7 @@ def test_constructor_with_win_type_inval + + def test_window_with_args(step): + # make sure that we are aggregating window functions correctly with arg +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + r = Series(np.random.default_rng(2).standard_normal(100)).rolling( + window=10, min_periods=1, win_type="gaussian", step=step + ) +@@ -130,7 +131,7 @@ def test_window_with_args(step): + + + def test_win_type_with_method_invalid(): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + with pytest.raises( + NotImplementedError, match="'single' is the only supported method type." 
+ ): +@@ -140,7 +141,7 @@ def test_win_type_with_method_invalid(): + @pytest.mark.parametrize("arg", [2000000000, "2s", Timedelta("2s")]) + def test_consistent_win_type_freq(arg): + # GH 15969 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + s = Series(range(1)) + with pytest.raises(ValueError, match="Invalid win_type freq"): + s.rolling(arg, win_type="freq") +@@ -153,7 +154,7 @@ def test_win_type_freq_return_none(): + + + def test_win_type_not_implemented(): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed, step): +@@ -167,7 +168,7 @@ def test_win_type_not_implemented(): + + def test_cmov_mean(step): + # GH 8238 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48]) + result = Series(vals).rolling(5, center=True, step=step).mean() + expected_values = [ +@@ -188,7 +189,7 @@ def test_cmov_mean(step): + + def test_cmov_window(step): + # GH 8238 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48]) + result = Series(vals).rolling(5, win_type="boxcar", center=True, step=step).mean() + expected_values = [ +@@ -210,7 +211,7 @@ def test_cmov_window(step): + def test_cmov_window_corner(step): + # GH 8238 + # all nan +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + vals = Series([np.nan] * 10) + result = vals.rolling(5, center=True, win_type="boxcar", step=step).mean() + assert np.isnan(result).all() +@@ -294,7 +295,7 @@ def test_cmov_window_corner(step): + ) + def test_cmov_window_frame(f, xp, step): + # Gh 8238 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame( + np.array( + [ +@@ -321,7 +322,7 @@ def test_cmov_window_frame(f, xp, step): + + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4, 5]) + def test_cmov_window_na_min_periods(step, min_periods): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + vals = Series(np.random.default_rng(2).standard_normal(10)) + vals[4] = np.nan + vals[8] = np.nan +@@ -335,7 +336,7 @@ def test_cmov_window_na_min_periods(step + + def test_cmov_window_regular(win_types, step): + # GH 8238 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48]) + xps = { + "hamming": [ +@@ -443,7 +444,7 @@ def test_cmov_window_regular(win_types, + + def test_cmov_window_regular_linear_range(win_types, step): + # GH 8238 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + vals = np.array(range(10), dtype=float) + xp = vals.copy() + xp[:2] = np.nan +@@ -456,7 +457,7 @@ def test_cmov_window_regular_linear_rang + + def test_cmov_window_regular_missing_data(win_types, step): + # GH 8238 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan, 10.63, 14.48] + ) +@@ -566,7 +567,7 @@ def test_cmov_window_regular_missing_dat + + def test_cmov_window_special(win_types_special, step): + # GH 8238 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + kwds = { + "kaiser": {"beta": 1.0}, + "gaussian": {"std": 1.0}, +@@ -638,7 +639,7 @@ def test_cmov_window_special(win_types_s + + def test_cmov_window_special_linear_range(win_types_special, 
step): + # GH 8238 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + kwds = { + "kaiser": {"beta": 1.0}, + "gaussian": {"std": 1.0}, +@@ -663,7 +664,7 @@ def test_cmov_window_special_linear_rang + + def test_weighted_var_big_window_no_segfault(win_types, center): + # GitHub Issue #46772 +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + x = Series(0) + result = x.rolling(window=16, center=center, win_type=win_types).var() + expected = Series(np.nan) +@@ -672,7 +673,7 @@ def test_weighted_var_big_window_no_segf + + + def test_rolling_center_axis_1(): +- pytest.importorskip("scipy") ++ td.versioned_importorskip("scipy") + df = DataFrame( + {"a": [1, 1, 0, 0, 0, 1], "b": [1, 0, 0, 1, 0, 0], "c": [1, 0, 0, 1, 0, 1]} + ) +--- a/pandas/util/_test_decorators.py ++++ b/pandas/util/_test_decorators.py +@@ -79,8 +79,8 @@ def skip_if_no(package: str, min_version + + The mark can be used as either a decorator for a test class or to be + applied to parameters in pytest.mark.parametrize calls or parametrized +- fixtures. Use pytest.importorskip if an imported moduled is later needed +- or for test functions. ++ fixtures. Use td.versioned_importorskip if an imported module is later ++ needed or for test functions. + + If the import and version check are unsuccessful, then the test function + (or test case when used in conjunction with parametrization) will be +@@ -171,3 +171,22 @@ skip_copy_on_write_invalid_test = pytest + get_option("mode.copy_on_write") is True, + reason="Test not valid for Copy-on-Write mode", + ) ++ ++def versioned_importorskip(*args, **kwargs): ++ """ ++ (warning - this is currently Debian-specific; the name may change if upstream requests this) ++ ++ Return the requested module, or skip the test if it is ++ not available in a new enough version. ++ ++ Intended as a replacement for pytest.importorskip that ++ defaults to requiring at least pandas' minimum version for that ++ optional dependency, rather than any version. ++ ++ See import_optional_dependency for full parameter documentation.
++ """ ++ try: ++ module = import_optional_dependency(*args, **kwargs) ++ except ImportError as exc: ++ pytest.skip(str(exc), allow_module_level=True) ++ return module diff -Nru pandas-2.1.4+dfsg/debian/patches/xarray2024_compat.patch pandas-2.2.2+dfsg/debian/patches/xarray2024_compat.patch --- pandas-2.1.4+dfsg/debian/patches/xarray2024_compat.patch 2024-03-16 17:11:12.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/xarray2024_compat.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,38 +0,0 @@ -Description: Avoid test-failing xarray warning - -Author: Luke Manley -Origin: upstream https://github.com/pandas-dev/pandas/pull/56949 -Bug-Debian: https://bugs.debian.org/1066801 -Forwarded: not-needed - ---- a/pandas/tests/generic/test_to_xarray.py -+++ b/pandas/tests/generic/test_to_xarray.py -@@ -41,7 +41,7 @@ class TestDataFrameToXArray: - df.index.name = "foo" - df.columns.name = "bar" - result = df.to_xarray() -- assert result.dims["foo"] == 3 -+ assert result.sizes["foo"] == 3 - assert len(result.coords) == 1 - assert len(result.data_vars) == 8 - tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) -@@ -60,7 +60,7 @@ class TestDataFrameToXArray: - - df.index.name = "foo" - result = df[0:0].to_xarray() -- assert result.dims["foo"] == 0 -+ assert result.sizes["foo"] == 0 - assert isinstance(result, Dataset) - - def test_to_xarray_with_multiindex(self, df): -@@ -69,8 +69,8 @@ class TestDataFrameToXArray: - # MultiIndex - df.index = MultiIndex.from_product([["a"], range(3)], names=["one", "two"]) - result = df.to_xarray() -- assert result.dims["one"] == 1 -- assert result.dims["two"] == 3 -+ assert result.sizes["one"] == 1 -+ assert result.sizes["two"] == 3 - assert len(result.coords) == 2 - assert len(result.data_vars) == 8 - tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) diff -Nru pandas-2.1.4+dfsg/debian/patches/xarray_version_workaround.patch pandas-2.2.2+dfsg/debian/patches/xarray_version_workaround.patch --- pandas-2.1.4+dfsg/debian/patches/xarray_version_workaround.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/xarray_version_workaround.patch 2024-04-23 18:09:06.000000000 +0000 @@ -5,12 +5,12 @@ --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py -@@ -46,7 +46,7 @@ VERSIONS = { - "sqlalchemy": "1.4.36", - "tables": "3.7.0", - "tabulate": "0.8.10", -- "xarray": "2022.03.0", +@@ -49,7 +49,7 @@ VERSIONS = { + "sqlalchemy": "2.0.0", + "tables": "3.8.0", + "tabulate": "0.9.0", +- "xarray": "2022.12.0", + "xarray": "999",#Debian xarray 2023.08.0-1 says it's 999 "xlrd": "2.0.1", - "xlsxwriter": "3.0.3", - "zstandard": "0.17.0", + "xlsxwriter": "3.0.5", + "zstandard": "0.19.0", diff -Nru pandas-2.1.4+dfsg/debian/patches/xfail_tests_nonintel_io.patch pandas-2.2.2+dfsg/debian/patches/xfail_tests_nonintel_io.patch --- pandas-2.1.4+dfsg/debian/patches/xfail_tests_nonintel_io.patch 2024-01-28 12:40:53.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/patches/xfail_tests_nonintel_io.patch 2024-04-23 17:39:06.000000000 +0000 @@ -18,7 +18,7 @@ --- a/pyproject.toml +++ b/pyproject.toml -@@ -480,6 +480,7 @@ filterwarnings = [ +@@ -489,6 +489,7 @@ filterwarnings = [ "error:::pandas", "error::ResourceWarning", "error::pytest.PytestUnraisableExceptionWarning", @@ -39,7 +39,7 @@ import numpy as np -@@ -559,6 +563,8 @@ class HDFStore: +@@ -560,6 +564,8 @@ class HDFStore: fletcher32: bool = False, **kwargs, ) -> None: @@ -48,7 +48,7 @@ if "format" in kwargs: raise ValueError("format is not a defined argument for HDFStore") -@@ -780,7 
+786,10 @@ class HDFStore: +@@ -781,7 +787,10 @@ class HDFStore: self._handle.flush() if fsync: with suppress(OSError): @@ -62,7 +62,7 @@ """ --- a/pandas/io/stata.py +++ b/pandas/io/stata.py -@@ -30,6 +30,8 @@ from typing import ( +@@ -29,6 +29,8 @@ from typing import ( cast, ) import warnings @@ -71,7 +71,7 @@ import numpy as np -@@ -976,6 +978,8 @@ class StataParser: +@@ -971,6 +973,8 @@ class StataParser: # NOTE: the byte type seems to be reserved for categorical variables # with a label, but the underlying variable is -127 to 100 # we're going to drop the label and cast to int @@ -82,7 +82,7 @@ + [ --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py -@@ -26,6 +26,10 @@ from pandas.tests.io.pytables.common imp +@@ -28,6 +28,10 @@ from pandas.tests.io.pytables.common imp ensure_clean_store, tables, ) @@ -93,7 +93,7 @@ from pandas.io import pytables from pandas.io.pytables import Term -@@ -267,6 +271,7 @@ def test_complibs(tmp_path, lvl, lib): +@@ -297,6 +301,7 @@ def test_complibs(tmp_path, lvl, lib, re assert node.filters.complib == lib @@ -101,7 +101,7 @@ @pytest.mark.skipif( not is_platform_little_endian(), reason="reason platform is not little endian" ) -@@ -299,6 +304,7 @@ def test_encoding(setup_path): +@@ -329,6 +334,7 @@ def test_encoding(setup_path): ], ) @pytest.mark.parametrize("dtype", ["category", object]) @@ -111,7 +111,7 @@ nan_rep = "" --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py -@@ -21,6 +21,10 @@ from pandas.tests.io.pytables.common imp +@@ -22,6 +22,10 @@ from pandas.tests.io.pytables.common imp _maybe_remove, ensure_clean_store, ) @@ -122,7 +122,7 @@ pytestmark = pytest.mark.single_cpu -@@ -275,6 +279,7 @@ def test_append_all_nans(setup_path): +@@ -282,6 +286,7 @@ def test_append_all_nans(setup_path): tm.assert_frame_equal(store["df2"], df, check_index_type=True) @@ -132,7 +132,7 @@ # column oriented --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py -@@ -29,6 +29,10 @@ from pandas.io.pytables import ( +@@ -30,6 +30,10 @@ from pandas.io.pytables import ( HDFStore, read_hdf, ) @@ -143,7 +143,7 @@ pytestmark = pytest.mark.single_cpu -@@ -759,6 +763,7 @@ def test_start_stop_fixed(setup_path): +@@ -880,6 +884,7 @@ def test_start_stop_fixed(setup_path): df.iloc[8:10, -2] = np.nan @@ -162,23 +162,23 @@ import pandas as pd from pandas import ( -@@ -167,6 +167,7 @@ def test_pytables_native2_read(datapath) - assert isinstance(d1, DataFrame) +@@ -172,6 +172,7 @@ def test_pytables_native2_read(datapath) + assert isinstance(d1, DataFrame) +@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError) def test_legacy_table_fixed_format_read_py2(datapath): # GH 24510 # legacy table with fixed format written in Python 2 -@@ -182,6 +183,7 @@ def test_legacy_table_fixed_format_read_ - tm.assert_frame_equal(expected, result) +@@ -187,6 +188,7 @@ def test_legacy_table_fixed_format_read_ + tm.assert_frame_equal(expected, result) +@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError) def test_legacy_table_fixed_format_read_datetime_py2(datapath): # GH 31750 # legacy table with fixed format and datetime64 column written in Python 2 -@@ -364,6 +366,7 @@ def test_read_hdf_series_mode_r(tmp_path +@@ -370,6 +372,7 @@ def test_read_hdf_series_mode_r(tmp_path 
@pytest.mark.filterwarnings(r"ignore:Period with BDay freq is deprecated:FutureWarning") @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @@ -188,7 +188,7 @@ --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py -@@ -32,6 +32,8 @@ from pandas.io.stata import ( +@@ -34,6 +34,8 @@ from pandas.io.stata import ( read_stata, ) @@ -207,7 +207,7 @@ import pandas as pd from pandas import ( -@@ -308,6 +309,7 @@ def test_store_timezone(setup_path): +@@ -312,6 +313,7 @@ def test_store_timezone(setup_path): tm.assert_frame_equal(result, df) @@ -215,7 +215,7 @@ def test_legacy_datetimetz_object(datapath): # legacy from < 0.17.0 # 8260 -@@ -360,6 +362,7 @@ def test_read_with_where_tz_aware_index( +@@ -364,6 +366,7 @@ def test_read_with_where_tz_aware_index( tm.assert_frame_equal(result, expected) @@ -225,8 +225,8 @@ # Python 3. --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py -@@ -17,7 +17,7 @@ import tempfile - +@@ -18,7 +18,7 @@ import tempfile + import numpy as np import pytest -from pandas.compat import is_platform_windows @@ -234,7 +234,7 @@ import pandas.util._test_decorators as td import pandas as pd -@@ -300,11 +300,11 @@ Look,a snake,🐍""" +@@ -305,11 +305,11 @@ Look,a snake,🐍""" "pyarrow", ("io", "data", "feather", "feather-0_3_1.feather"), ), @@ -250,7 +250,7 @@ (pd.read_json, "os", ("io", "json", "data", "tsframe_v012.json")), --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py -@@ -12,6 +12,7 @@ from typing import ( +@@ -13,6 +13,7 @@ from typing import ( cast, ) import warnings @@ -258,7 +258,7 @@ from pandas.compat import PY311 -@@ -186,6 +187,8 @@ def _assert_caught_no_extra_warnings( +@@ -187,6 +188,8 @@ def _assert_caught_no_extra_warnings( # pyproject.toml errors on EncodingWarnings in pandas # Ignore EncodingWarnings from other libraries continue diff -Nru pandas-2.1.4+dfsg/debian/rules pandas-2.2.2+dfsg/debian/rules --- pandas-2.1.4+dfsg/debian/rules 2024-04-21 12:22:27.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/rules 2024-05-05 13:40:35.000000000 +0000 @@ -38,7 +38,7 @@ # The echo-random-characters is to allow searching for fails that don't show the usual test failure message i.e. crashes/errors # -v makes the log too long for salsa-ci # the grep -q is because trying to test __pycache__ is an error in pytest 8+, #1063959 -export PYBUILD_TEST_ARGS=TEST_SUCCESS=true; cd {build_dir} ; for TEST_SUBSET in {build_dir}/pandas/tests/* ; do echo $$TEST_SUBSET | grep -q -e __pycache__ || PANDAS_CI=1 LC_ALL=C.UTF-8 xvfb-run -a -s "-screen 0 1280x1024x24 -noreset" {interpreter} -m pytest -s -m "$(PYTEST_MARKER)" $(PYTEST_WARN_IGNORE) -k "not test_register_entrypoint and not test_wrong_url and not test_statsmodels" --confcutdir={build_dir}/pandas --deb-data-root-dir={dir}/pandas/tests $$TEST_SUBSET || test $$? = 5 || TEST_SUCCESS=false && echo "rdjoqkol test state = $$TEST_SUCCESS"; done ; rm -rf test-data.xml .pytest_cache ; $$TEST_SUCCESS +export PYBUILD_TEST_ARGS=TEST_SUCCESS=true; cd {build_dir} ; for TEST_SUBSET in {build_dir}/pandas/tests/* ; do echo $$TEST_SUBSET | grep -q -e __pycache__ || PANDAS_CI=1 LC_ALL=C.UTF-8 xvfb-run -a -s "-screen 0 1280x1024x24 -noreset" {interpreter} -m pytest -s -m "$(PYTEST_MARKER)" $(PYTEST_WARN_IGNORE) -k "not test_register_entrypoint and not test_wrong_url and not test_statsmodels" --confcutdir={build_dir}/pandas --deb-data-root-dir={dir}/pandas/tests $$TEST_SUBSET || test $$? 
= 5 || TEST_SUCCESS=false && echo "rdjoqkol test state = $$TEST_SUCCESS"; done ; rm -rf test-data.xml test_stata.dta .pytest_cache ; $$TEST_SUCCESS export PYBUILD_EXT_DESTDIR=debian/python3-pandas-lib export PYBUILD_DESTDIR=debian/python3-pandas diff -Nru pandas-2.1.4+dfsg/debian/salsa-ci.yml pandas-2.2.2+dfsg/debian/salsa-ci.yml --- pandas-2.1.4+dfsg/debian/salsa-ci.yml 2024-03-15 20:44:06.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/salsa-ci.yml 2024-05-18 11:01:51.000000000 +0000 @@ -12,4 +12,4 @@ # tests are long, so run them only in build # shorten the build logs to make room for reporting differences SALSA_CI_REPROTEST_ARGS: "--append-build-command=-Pnocheck --vary=environment.variables+=DEB_BUILD_OPTIONS=terse" - SALSA_CI_DISABLE_AUTOPKGTEST: 1 +# SALSA_CI_DISABLE_AUTOPKGTEST: 1 diff -Nru pandas-2.1.4+dfsg/debian/tests/control pandas-2.2.2+dfsg/debian/tests/control --- pandas-2.1.4+dfsg/debian/tests/control 2024-03-15 07:48:36.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/tests/control 2024-05-30 18:29:59.000000000 +0000 @@ -15,45 +15,47 @@ ca-certificates, locales-all, python3-all, -# we don't have this version python3-blosc (>= 1.21.0~), - python3-bottleneck (>= 1.3.4~), - python3-bs4 (>= 4.11.1~), +# we don't have this version python3-blosc (>= 1.21.3~), + python3-bottleneck (>= 1.3.6~), + python3-bs4 (>= 4.11.2~), python3-dask (>= 2023.2.0~), python3-dateutil, - python3-fsspec (>= 2022.05.0~), + python3-fsspec (>= 2022.11.0~), python3-html5lib (>= 1.1~), python3-hypothesis (>= 6.46.1~), python3-jinja2 (>= 3.1.2~), - python3-lxml (>= 4.8.0~), - python3-matplotlib (>= 3.6.1~) [!ia64 !sh4 !x32], + python3-lxml (>= 4.9.2~), + python3-matplotlib (>= 3.6.3~) [!ia64 !sh4 !x32], # architectures here are the ones on which to treat numba-related failures as RC - see also debian/control -# temporarily disabled as numba is not in testing #1033907 python3-numba (>= 0.55.2~) [amd64 i386 ppc64el], - python3-numexpr (>= 2.8.0~), +# temporarily disabled as numba is not in testing #1033907 python3-numba (>= 0.56.4~) [amd64 i386 ppc64el], + python3-numexpr (>= 2.8.4~), python3-numpy (>= 1:1.23.2~), python3-odf (>= 1.4.1~), - python3-openpyxl (>= 3.0.10~), -# doesn't seem to work in this test environment python3-psycopg2 (>= 2.9.3~), + python3-openpyxl (>= 3.1.0~), +# doesn't seem to work in this test environment python3-psycopg2 (>= 2.9.6~), python3-py, # doesn't seem to work in this test environment python3-pymysql (>= 1.0.2~), - python3-pyreadstat (>= 1.1.5~), - python3-pytest (>= 7.3~), + python3-pyqt5 (>= 5.15.9~), + python3-pyreadstat (>= 1.2.0~), + python3-pytest (>= 7.3.2~), python3-pytest-asyncio (>= 0.17~), python3-pytest-forked, python3-pytest-localserver, python3-pytest-xdist (>= 2.2.0~), - python3-scipy (>= 1.8.1~), + python3-pytestqt (>= 4.2.0~), +# we don't have python3-pyxlsb + python3-scipy (>= 1.10.0~), python3-setuptools (>= 51~), -# we don't have this version python3-snappy (>= 0.6.0~), - python3-sqlalchemy (>= 1.4.36~), +# we don't have this version python3-sqlalchemy (>= 2.0.0~), # python3-tables is now little-endian only, and also unavailable on some ports - python3-tables (>= 3.7.0~) [!s390x !hppa !powerpc !ppc64 !sparc64 !hurd-any !alpha], -# we don't have this version python3-tabulate (>= 0.8.10~), + python3-tables (>= 3.8.0~) [!s390x !hppa !powerpc !ppc64 !sparc64 !hurd-any !alpha], + python3-tabulate (>= 0.9.0~), python3-tk, - python3-tz (>= 2022.1~), - python3-xarray (>= 2022.03.0~), -# we don't have this version python3-xlrd (>= 2.0.1~), -# we don't have this version 
python3-xlsxwriter (>= 3.0.3~), - python3-zstandard (>= 0.17.0~), + python3-tz (>= 2022.7~), + python3-xarray (>= 2022.12.0~), + python3-xlrd (>= 2.0.1~), + python3-xlsxwriter (>= 3.0.5~), + python3-zstandard (>= 0.19.0~), tzdata-legacy, xauth, xvfb, @@ -70,44 +72,47 @@ ca-certificates, locales-all, python3-all, -# we don't have this version python3-blosc (>= 1.21.0~), - python3-bottleneck (>= 1.3.4~), - python3-bs4 (>= 4.11.1~), +# we don't have this version python3-blosc (>= 1.21.3~), + python3-bottleneck (>= 1.3.6~), + python3-bs4 (>= 4.11.2~), python3-dask (>= 2023.2.0~), python3-dateutil, - python3-fsspec (>= 2022.05.0~), + python3-fsspec (>= 2022.11.0~), python3-html5lib (>= 1.1~), python3-hypothesis (>= 6.46.1~), python3-jinja2 (>= 3.1.2~), - python3-lxml (>= 4.8.0~), - python3-matplotlib (>= 3.6.1~) [!ia64 !sh4 !x32], -# temporarily disabled as numba is not in testing #1033907 python3-numba (>= 0.55.2~), - python3-numexpr (>= 2.8.0~), + python3-lxml (>= 4.9.2~), + python3-matplotlib (>= 3.6.3~) [!ia64 !sh4 !x32], +# architectures here are the ones on which to treat numba-related failures as RC - see also debian/control +# temporarily disabled as numba is not in testing #1033907 python3-numba (>= 0.56.4~), + python3-numexpr (>= 2.8.4~), python3-numpy (>= 1:1.23.2~), python3-odf (>= 1.4.1~), - python3-openpyxl (>= 3.0.10~), - python3-psycopg2 (>= 2.9.3~), + python3-openpyxl (>= 3.1.0~), + python3-psycopg2 (>= 2.9.6~), python3-py, python3-pymysql (>= 1.0.2~), - python3-pyreadstat (>= 1.1.5~), - python3-pytest (>= 7.3~), + python3-pyqt5 (>= 5.15.9~), + python3-pyreadstat (>= 1.2.0~), + python3-pytest (>= 7.3.2~), python3-pytest-asyncio (>= 0.17~), python3-pytest-forked, python3-pytest-localserver, python3-pytest-xdist (>= 2.2.0~), - python3-scipy (>= 1.8.1~), + python3-pytestqt (>= 4.2.0~), +# we don't have python3-pyxlsb + python3-scipy (>= 1.10.0~), python3-setuptools (>= 51~), -# we don't have this version python3-snappy (>= 0.6.0~), - python3-sqlalchemy (>= 1.4.36~), +# we don't have this version python3-sqlalchemy (>= 2.0.0~), # python3-tables is now little-endian only, and also unavailable on some ports - python3-tables (>= 3.7.0~) [!s390x !hppa !powerpc !ppc64 !sparc64 !hurd-any !alpha], -# we don't have this version python3-tabulate (>= 0.8.10~), + python3-tables (>= 3.8.0~) [!s390x !hppa !powerpc !ppc64 !sparc64 !hurd-any !alpha], + python3-tabulate (>= 0.9.0~), python3-tk, - python3-tz (>= 2022.1~), - python3-xarray (>= 2022.03.0~), -# we don't have this version python3-xlrd (>= 2.0.1~), -# we don't have this version python3-xlsxwriter (>= 3.0.3~), - python3-zstandard (>= 0.17.0~), + python3-tz (>= 2022.7~), + python3-xarray (>= 2022.12.0~), + python3-xlrd (>= 2.0.1~), + python3-xlsxwriter (>= 3.0.5~), + python3-zstandard (>= 0.19.0~), tzdata-legacy, xauth, xvfb, diff -Nru pandas-2.1.4+dfsg/debian/tests/ignoredtests pandas-2.2.2+dfsg/debian/tests/ignoredtests --- pandas-2.1.4+dfsg/debian/tests/ignoredtests 2024-02-08 19:36:41.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/tests/ignoredtests 2024-05-30 06:29:33.000000000 +0000 @@ -11,11 +11,11 @@ echo "=== $py ===" modpath=$($py -c 'import pandas as pd; print(pd.__path__[0])') echo "tests that use numba (may crash on non-x86) - checked with grep -rl -e numba pandas/tests - -m not slow because there are enough to time out otherwise" - PANDAS_CI=1 LC_ALL=C.UTF-8 xvfb-run --auto-servernum --server-args="-screen 0 1024x768x24" $py -m pytest --forked --tb=long -s -v -m "not slow" --deb-data-root-dir=$sourcetestroot 
--confcutdir=$modpath $modpath/tests/frame/test_ufunc.py $modpath/tests/groupby/test_numba.py $modpath/tests/groupby/test_timegrouper.py $modpath/tests/groupby/transform/test_numba.py $modpath/tests/groupby/aggregate/test_numba.py $modpath/tests/util/test_numba.py $modpath/tests/window $TEST_SUBSET 2>&1 + PANDAS_CI=1 LC_ALL=C.UTF-8 xvfb-run --auto-servernum --server-args="-screen 0 1024x768x24" $py -m pytest --forked --tb=long -s -m "not slow" --deb-data-root-dir=$sourcetestroot --confcutdir=$modpath $modpath/tests/frame/test_ufunc.py $modpath/tests/groupby/test_numba.py $modpath/tests/groupby/test_timegrouper.py $modpath/tests/groupby/transform/test_numba.py $modpath/tests/groupby/aggregate/test_numba.py $modpath/tests/util/test_numba.py $modpath/tests/window $TEST_SUBSET 2>&1 echo "tests with a run=False xfail for hdf5 crashes - see xfail_tests_nonintel_io.patch" - PANDAS_CI=1 LC_ALL=C.UTF-8 xvfb-run --auto-servernum --server-args="-screen 0 1024x768x24" $py -m pytest --forked --runxfail --tb=long -s -v --deb-data-root-dir=$sourcetestroot --confcutdir=$modpath $modpath/tests/io/pytables/test_file_handling.py $modpath/tests/io/pytables/test_append.py $modpath/tests/io/pytables/test_store.py + PANDAS_CI=1 LC_ALL=C.UTF-8 xvfb-run --auto-servernum --server-args="-screen 0 1024x768x24" $py -m pytest --forked --runxfail --tb=long -s --deb-data-root-dir=$sourcetestroot --confcutdir=$modpath $modpath/tests/io/pytables/test_file_handling.py $modpath/tests/io/pytables/test_append.py $modpath/tests/io/pytables/test_store.py echo "pymysql/psycopg2 tests, which do not work in this test environment" - PANDAS_CI=1 LC_ALL=C.UTF-8 xvfb-run --auto-servernum --server-args="-screen 0 1024x768x24" $py -m pytest --forked --tb=long -s -v --deb-data-root-dir=$sourcetestroot --confcutdir=$modpath $modpath/tests/io/test_sql.py $modpath/tests/tools/test_to_datetime.py + PANDAS_CI=1 LC_ALL=C.UTF-8 xvfb-run --auto-servernum --server-args="-screen 0 1024x768x24" $py -m pytest --forked --tb=long -s --deb-data-root-dir=$sourcetestroot --confcutdir=$modpath $modpath/tests/io/test_sql.py $modpath/tests/tools/test_to_datetime.py done # this intentionally doesn't actually fail, as it exists to run known possibly-crashing tests to get a log of whether they still fail true diff -Nru pandas-2.1.4+dfsg/debian/tests/unittests3 pandas-2.2.2+dfsg/debian/tests/unittests3 --- pandas-2.1.4+dfsg/debian/tests/unittests3 2024-03-14 21:37:23.000000000 +0000 +++ pandas-2.2.2+dfsg/debian/tests/unittests3 2024-06-04 22:04:20.000000000 +0000 @@ -5,6 +5,9 @@ arch=$(dpkg --print-architecture) pys="$(py3versions -s 2>/dev/null)" sourcetestroot="$PWD/pandas/tests" +# some tests _require_ the treat-warnings-as-errors set here +# (as they use pytest.raises to catch what would normally be a warning) +tomlfile="$PWD/pyproject.toml" # Debian: Enable "slow" tests on x86 to keep the code coverage. # Ubuntu: Disable "slow" tests on ALL architectures. @@ -31,7 +34,7 @@ modpath=$($py -c 'import pandas as pd; print(pd.__path__[0])') for TEST_SUBSET in $modpath/tests/* ; do echo $TEST_SUBSET | grep -q -e __pycache__ || PANDAS_CI=1 LC_ALL=C.UTF-8 xvfb-run --auto-servernum --server-args="-screen 0 1024x768x24" \ - $py -m pytest --tb=long -s -m "$marker" $PYTEST_WARN_IGNORE --deb-data-root-dir=$sourcetestroot --confcutdir=$modpath $TEST_SUBSET 2>&1 || test $? 
== 5 || TEST_SUCCESS=false && echo "rdjoqkol test state = $TEST_SUCCESS" + $py -m pytest --tb=long -s -m "$marker" $PYTEST_WARN_IGNORE -c $tomlfile --deb-data-root-dir=$sourcetestroot --rootdir=$modpath $TEST_SUBSET 2>&1 || test $? == 5 || TEST_SUCCESS=false && echo "rdjoqkol test state = $TEST_SUCCESS" done done $TEST_SUCCESS diff -Nru pandas-2.1.4+dfsg/doc/_templates/autosummary/class.rst pandas-2.2.2+dfsg/doc/_templates/autosummary/class.rst --- pandas-2.1.4+dfsg/doc/_templates/autosummary/class.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/_templates/autosummary/class.rst 2024-04-10 17:42:52.000000000 +0000 @@ -1,33 +1,32 @@ -{% extends "!autosummary/class.rst" %} +{{ fullname | escape | underline}} -{% block methods %} -{% if methods %} +.. currentmodule:: {{ module }} -.. - HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. - .. autosummary:: - :toctree: - {% for item in all_methods %} - {%- if not item.startswith('_') or item in ['__call__'] %} - {{ name }}.{{ item }} - {%- endif -%} - {%- endfor %} +.. autoclass:: {{ objname }} -{% endif %} -{% endblock %} + {% block methods %} -{% block attributes %} -{% if attributes %} + {% block attributes %} + {% if attributes %} + .. rubric:: {{ _('Attributes') }} -.. - HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. .. autosummary:: - :toctree: - {% for item in all_attributes %} - {%- if not item.startswith('_') %} - {{ name }}.{{ item }} - {%- endif -%} - {%- endfor %} + {% for item in attributes %} + {% if item in members and not item.startswith('_') %} + ~{{ name }}.{{ item }} + {% endif %} + {%- endfor %} + {% endif %} + {% endblock %} + + {% if methods %} + .. rubric:: {{ _('Methods') }} -{% endif %} -{% endblock %} + .. autosummary:: + {% for item in methods %} + {% if item in members and (not item.startswith('_') or item in ['__call__']) %} + ~{{ name }}.{{ item }} + {% endif %} + {%- endfor %} + {% endif %} + {% endblock %} diff -Nru pandas-2.1.4+dfsg/doc/cheatsheet/README.md pandas-2.2.2+dfsg/doc/cheatsheet/README.md --- pandas-2.1.4+dfsg/doc/cheatsheet/README.md 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/cheatsheet/README.md 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,22 @@ +# Pandas Cheat Sheet + +The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013. +To create the PDF version, within Powerpoint, simply do a "Save As" +and pick "PDF" as the format. + +This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf). 
+ +| Topic | PDF | PPT | +|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Pandas_Cheat_Sheet | | | +| Pandas_Cheat_Sheet_JA | | | + + +**Alternative** + +Alternatively, if you want to complement your learning, you can use the Pandas Cheat sheets +developed by [DataCamp](https://www.datacamp.com/) in "PDF", "Google Colab" and "Streamlit" formats. + +| Topic | PDF | Streamlit | Google Colab | +|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Pandas | | | Open In Colab | diff -Nru pandas-2.1.4+dfsg/doc/cheatsheet/README.txt pandas-2.2.2+dfsg/doc/cheatsheet/README.txt --- pandas-2.1.4+dfsg/doc/cheatsheet/README.txt 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/cheatsheet/README.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1,8 +0,0 @@ -The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013. -To create the PDF version, within Powerpoint, simply do a "Save As" -and pick "PDF" as the format. - -This cheat sheet was inspired by the RStudio Data Wrangling Cheatsheet[1], written by Irv Lustig, Princeton Consultants[2]. - -[1]: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf -[2]: https://www.princetonoptimization.com/ diff -Nru pandas-2.1.4+dfsg/doc/make.py pandas-2.2.2+dfsg/doc/make.py --- pandas-2.1.4+dfsg/doc/make.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/make.py 2024-04-10 17:42:52.000000000 +0000 @@ -45,12 +45,14 @@ single_doc=None, verbosity=0, warnings_are_errors=False, + no_browser=False, ) -> None: self.num_jobs = num_jobs self.include_api = include_api self.whatsnew = whatsnew self.verbosity = verbosity self.warnings_are_errors = warnings_are_errors + self.no_browser = no_browser if single_doc: single_doc = self._process_single_doc(single_doc) @@ -100,7 +102,7 @@ ) @staticmethod - def _run_os(*args): + def _run_os(*args) -> None: """ Execute a command as a OS terminal. 
@@ -123,14 +125,14 @@ Parameters ---------- - kind : {'html', 'latex'} + kind : {'html', 'latex', 'linkcheck'} Examples -------- >>> DocBuilder(num_jobs=4)._sphinx_build('html') """ - if kind not in ("html", "latex"): - raise ValueError(f"kind must be html or latex, not {kind}") + if kind not in ("html", "latex", "linkcheck"): + raise ValueError(f"kind must be html, latex or linkcheck, not {kind}") cmd = ["sphinx-build", "-b", kind] if self.num_jobs: @@ -147,7 +149,7 @@ ] return subprocess.call(cmd) - def _open_browser(self, single_doc_html): + def _open_browser(self, single_doc_html) -> None: """ Open a browser tab showing single """ @@ -159,10 +161,10 @@ Open the rst file `page` and extract its title. """ fname = os.path.join(SOURCE_PATH, f"{page}.rst") - option_parser = docutils.frontend.OptionParser( - components=(docutils.parsers.rst.Parser,) + doc = docutils.utils.new_document( + "", + docutils.frontend.get_default_settings(docutils.parsers.rst.Parser), ) - doc = docutils.utils.new_document("", option_parser.get_default_values()) with open(fname, encoding="utf-8") as f: data = f.read() @@ -181,7 +183,7 @@ return title.astext() - def _add_redirects(self): + def _add_redirects(self) -> None: """ Create in the build directory an html file with a redirect, for every row in REDIRECTS_FILE. @@ -235,10 +237,11 @@ if ret_code == 0: if self.single_doc_html is not None: - self._open_browser(self.single_doc_html) + if not self.no_browser: + self._open_browser(self.single_doc_html) else: self._add_redirects() - if self.whatsnew: + if self.whatsnew and not self.no_browser: self._open_browser(os.path.join("whatsnew", "index.html")) return ret_code @@ -269,14 +272,14 @@ return self.latex(force=True) @staticmethod - def clean(): + def clean() -> None: """ Clean documentation generated files. """ shutil.rmtree(BUILD_PATH, ignore_errors=True) shutil.rmtree(os.path.join(SOURCE_PATH, "reference", "api"), ignore_errors=True) - def zip_html(self): + def zip_html(self) -> None: """ Compress HTML documentation into a zip file. """ @@ -288,6 +291,12 @@ os.chdir(dirname) self._run_os("zip", zip_fname, "-r", "-q", *fnames) + def linkcheck(self): + """ + Check for broken links in the documentation. 
+ """ + return self._sphinx_build("linkcheck") + def main(): cmds = [method for method in dir(DocBuilder) if not method.startswith("_")] @@ -343,6 +352,12 @@ action="store_true", help="fail if warnings are raised", ) + argparser.add_argument( + "--no-browser", + help="Don't open browser", + default=False, + action="store_true", + ) args = argparser.parse_args() if args.command not in cmds: @@ -368,6 +383,7 @@ args.single, args.verbosity, args.warnings_are_errors, + args.no_browser, ) return getattr(builder, args.command)() diff -Nru pandas-2.1.4+dfsg/doc/redirects.csv pandas-2.2.2+dfsg/doc/redirects.csv --- pandas-2.1.4+dfsg/doc/redirects.csv 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/redirects.csv 2024-04-10 17:42:52.000000000 +0000 @@ -100,7 +100,6 @@ generated/pandas.api.types.infer_dtype,../reference/api/pandas.api.types.infer_dtype generated/pandas.api.types.is_bool_dtype,../reference/api/pandas.api.types.is_bool_dtype generated/pandas.api.types.is_bool,../reference/api/pandas.api.types.is_bool -generated/pandas.api.types.is_categorical_dtype,../reference/api/pandas.api.types.is_categorical_dtype generated/pandas.api.types.is_complex_dtype,../reference/api/pandas.api.types.is_complex_dtype generated/pandas.api.types.is_complex,../reference/api/pandas.api.types.is_complex generated/pandas.api.types.is_datetime64_any_dtype,../reference/api/pandas.api.types.is_datetime64_any_dtype @@ -127,7 +126,6 @@ generated/pandas.api.types.is_numeric_dtype,../reference/api/pandas.api.types.is_numeric_dtype generated/pandas.api.types.is_object_dtype,../reference/api/pandas.api.types.is_object_dtype generated/pandas.api.types.is_period_dtype,../reference/api/pandas.api.types.is_period_dtype -generated/pandas.api.types.is_period,../reference/api/pandas.api.types.is_period generated/pandas.api.types.is_re_compilable,../reference/api/pandas.api.types.is_re_compilable generated/pandas.api.types.is_re,../reference/api/pandas.api.types.is_re generated/pandas.api.types.is_scalar,../reference/api/pandas.api.types.is_scalar @@ -194,40 +192,39 @@ generated/pandas.core.groupby.DataFrameGroupBy.size,../reference/api/pandas.core.groupby.DataFrameGroupBy.size generated/pandas.core.groupby.DataFrameGroupBy.skew,../reference/api/pandas.core.groupby.DataFrameGroupBy.skew generated/pandas.core.groupby.DataFrameGroupBy.take,../reference/api/pandas.core.groupby.DataFrameGroupBy.take -generated/pandas.core.groupby.GroupBy.agg,../reference/api/pandas.core.groupby.GroupBy.agg -generated/pandas.core.groupby.GroupBy.aggregate,../reference/api/pandas.core.groupby.GroupBy.aggregate -generated/pandas.core.groupby.GroupBy.all,../reference/api/pandas.core.groupby.GroupBy.all -generated/pandas.core.groupby.GroupBy.any,../reference/api/pandas.core.groupby.GroupBy.any -generated/pandas.core.groupby.GroupBy.apply,../reference/api/pandas.core.groupby.GroupBy.apply -generated/pandas.core.groupby.GroupBy.bfill,../reference/api/pandas.core.groupby.GroupBy.bfill -generated/pandas.core.groupby.GroupBy.count,../reference/api/pandas.core.groupby.GroupBy.count -generated/pandas.core.groupby.GroupBy.cumcount,../reference/api/pandas.core.groupby.GroupBy.cumcount -generated/pandas.core.groupby.GroupBy.ffill,../reference/api/pandas.core.groupby.GroupBy.ffill -generated/pandas.core.groupby.GroupBy.first,../reference/api/pandas.core.groupby.GroupBy.first -generated/pandas.core.groupby.GroupBy.get_group,../reference/api/pandas.core.groupby.GroupBy.get_group 
-generated/pandas.core.groupby.GroupBy.groups,../reference/api/pandas.core.groupby.GroupBy.groups -generated/pandas.core.groupby.GroupBy.head,../reference/api/pandas.core.groupby.GroupBy.head -generated/pandas.core.groupby.GroupBy.indices,../reference/api/pandas.core.groupby.GroupBy.indices -generated/pandas.core.groupby.GroupBy.__iter__,../reference/api/pandas.core.groupby.GroupBy.__iter__ -generated/pandas.core.groupby.GroupBy.last,../reference/api/pandas.core.groupby.GroupBy.last -generated/pandas.core.groupby.GroupBy.max,../reference/api/pandas.core.groupby.GroupBy.max -generated/pandas.core.groupby.GroupBy.mean,../reference/api/pandas.core.groupby.GroupBy.mean -generated/pandas.core.groupby.GroupBy.median,../reference/api/pandas.core.groupby.GroupBy.median -generated/pandas.core.groupby.GroupBy.min,../reference/api/pandas.core.groupby.GroupBy.min -generated/pandas.core.groupby.GroupBy.ngroup,../reference/api/pandas.core.groupby.GroupBy.ngroup -generated/pandas.core.groupby.GroupBy.nth,../reference/api/pandas.core.groupby.GroupBy.nth -generated/pandas.core.groupby.GroupBy.ohlc,../reference/api/pandas.core.groupby.GroupBy.ohlc -generated/pandas.core.groupby.GroupBy.pct_change,../reference/api/pandas.core.groupby.GroupBy.pct_change -generated/pandas.core.groupby.GroupBy.pipe,../reference/api/pandas.core.groupby.GroupBy.pipe -generated/pandas.core.groupby.GroupBy.prod,../reference/api/pandas.core.groupby.GroupBy.prod -generated/pandas.core.groupby.GroupBy.rank,../reference/api/pandas.core.groupby.GroupBy.rank -generated/pandas.core.groupby.GroupBy.sem,../reference/api/pandas.core.groupby.GroupBy.sem -generated/pandas.core.groupby.GroupBy.size,../reference/api/pandas.core.groupby.GroupBy.size -generated/pandas.core.groupby.GroupBy.std,../reference/api/pandas.core.groupby.GroupBy.std -generated/pandas.core.groupby.GroupBy.sum,../reference/api/pandas.core.groupby.GroupBy.sum -generated/pandas.core.groupby.GroupBy.tail,../reference/api/pandas.core.groupby.GroupBy.tail -generated/pandas.core.groupby.GroupBy.transform,../reference/api/pandas.core.groupby.GroupBy.transform -generated/pandas.core.groupby.GroupBy.var,../reference/api/pandas.core.groupby.GroupBy.var +generated/pandas.core.groupby.GroupBy.agg,../reference/api/pandas.core.groupby.DataFrameGroupBy.agg +generated/pandas.core.groupby.GroupBy.aggregate,../reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate +generated/pandas.core.groupby.GroupBy.all,../reference/api/pandas.core.groupby.DataFrameGroupBy.all +generated/pandas.core.groupby.GroupBy.any,../reference/api/pandas.core.groupby.DataFrameGroupBy.any +generated/pandas.core.groupby.GroupBy.apply,../reference/api/pandas.core.groupby.DataFrameGroupBy.apply +generated/pandas.core.groupby.GroupBy.bfill,../reference/api/pandas.core.groupby.DataFrameGroupBy.bfill +generated/pandas.core.groupby.GroupBy.count,../reference/api/pandas.core.groupby.DataFrameGroupBy.count +generated/pandas.core.groupby.GroupBy.cumcount,../reference/api/pandas.core.groupby.DataFrameGroupBy.cumcount +generated/pandas.core.groupby.GroupBy.ffill,../reference/api/pandas.core.groupby.DataFrameGroupBy.ffill +generated/pandas.core.groupby.GroupBy.first,../reference/api/pandas.core.groupby.DataFrameGroupBy.first +generated/pandas.core.groupby.GroupBy.get_group,../reference/api/pandas.core.groupby.DataFrameGroupBy.get_group +generated/pandas.core.groupby.GroupBy.groups,../reference/api/pandas.core.groupby.DataFrameGroupBy.groups 
+generated/pandas.core.groupby.GroupBy.head,../reference/api/pandas.core.groupby.DataFrameGroupBy.head +generated/pandas.core.groupby.GroupBy.indices,../reference/api/pandas.core.groupby.DataFrameGroupBy.indices +generated/pandas.core.groupby.GroupBy.__iter__,../reference/api/pandas.core.groupby.DataFrameGroupBy.__iter__ +generated/pandas.core.groupby.GroupBy.last,../reference/api/pandas.core.groupby.DataFrameGroupBy.last +generated/pandas.core.groupby.GroupBy.max,../reference/api/pandas.core.groupby.DataFrameGroupBy.max +generated/pandas.core.groupby.GroupBy.mean,../reference/api/pandas.core.groupby.DataFrameGroupBy.mean +generated/pandas.core.groupby.GroupBy.median,../reference/api/pandas.core.groupby.DataFrameGroupBy.median +generated/pandas.core.groupby.GroupBy.min,../reference/api/pandas.core.groupby.DataFrameGroupBy.min +generated/pandas.core.groupby.GroupBy.ngroup,../reference/api/pandas.core.groupby.DataFrameGroupBy.ngroup +generated/pandas.core.groupby.GroupBy.nth,../reference/api/pandas.core.groupby.DataFrameGroupBy.nth +generated/pandas.core.groupby.GroupBy.ohlc,../reference/api/pandas.core.groupby.DataFrameGroupBy.ohlc +generated/pandas.core.groupby.GroupBy.pct_change,../reference/api/pandas.core.groupby.DataFrameGroupBy.pct_change +generated/pandas.core.groupby.GroupBy.pipe,../reference/api/pandas.core.groupby.DataFrameGroupBy.pipe +generated/pandas.core.groupby.GroupBy.prod,../reference/api/pandas.core.groupby.DataFrameGroupBy.prod +generated/pandas.core.groupby.GroupBy.rank,../reference/api/pandas.core.groupby.DataFrameGroupBy.rank +generated/pandas.core.groupby.GroupBy.size,../reference/api/pandas.core.groupby.DataFrameGroupBy.size +generated/pandas.core.groupby.GroupBy.std,../reference/api/pandas.core.groupby.DataFrameGroupBy.std +generated/pandas.core.groupby.GroupBy.sum,../reference/api/pandas.core.groupby.DataFrameGroupBy.sum +generated/pandas.core.groupby.GroupBy.tail,../reference/api/pandas.core.groupby.DataFrameGroupBy.tail +generated/pandas.core.groupby.GroupBy.transform,../reference/api/pandas.core.groupby.DataFrameGroupBy.transform +generated/pandas.core.groupby.GroupBy.var,../reference/api/pandas.core.groupby.DataFrameGroupBy.var generated/pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing,../reference/api/pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing generated/pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing,../reference/api/pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing generated/pandas.core.groupby.SeriesGroupBy.nlargest,../reference/api/pandas.core.groupby.SeriesGroupBy.nlargest @@ -238,7 +235,7 @@ generated/pandas.core.resample.Resampler.aggregate,../reference/api/pandas.core.resample.Resampler.aggregate generated/pandas.core.resample.Resampler.apply,../reference/api/pandas.core.resample.Resampler.apply generated/pandas.core.resample.Resampler.asfreq,../reference/api/pandas.core.resample.Resampler.asfreq -generated/pandas.core.resample.Resampler.backfill,../reference/api/pandas.core.resample.Resampler.backfill +generated/pandas.core.resample.Resampler.backfill,../reference/api/pandas.core.resample.Resampler.bfill generated/pandas.core.resample.Resampler.bfill,../reference/api/pandas.core.resample.Resampler.bfill generated/pandas.core.resample.Resampler.count,../reference/api/pandas.core.resample.Resampler.count generated/pandas.core.resample.Resampler.ffill,../reference/api/pandas.core.resample.Resampler.ffill @@ -257,7 +254,6 @@ 
generated/pandas.core.resample.Resampler.nearest,../reference/api/pandas.core.resample.Resampler.nearest generated/pandas.core.resample.Resampler.nunique,../reference/api/pandas.core.resample.Resampler.nunique generated/pandas.core.resample.Resampler.ohlc,../reference/api/pandas.core.resample.Resampler.ohlc -generated/pandas.core.resample.Resampler.pad,../reference/api/pandas.core.resample.Resampler.pad generated/pandas.core.resample.Resampler.pipe,../reference/api/pandas.core.resample.Resampler.pipe generated/pandas.core.resample.Resampler.prod,../reference/api/pandas.core.resample.Resampler.prod generated/pandas.core.resample.Resampler.quantile,../reference/api/pandas.core.resample.Resampler.quantile @@ -709,7 +705,7 @@ generated/pandas.Index.symmetric_difference,../reference/api/pandas.Index.symmetric_difference generated/pandas.Index.take,../reference/api/pandas.Index.take generated/pandas.Index.T,../reference/api/pandas.Index.T -generated/pandas.Index.to_flat_index,../reference/api/pandas.Index.to_flat_index +generated/pandas.Index.to_flat_index,../reference/api/pandas.MultiIndex.to_flat_index generated/pandas.Index.to_frame,../reference/api/pandas.Index.to_frame generated/pandas.Index.to_list,../reference/api/pandas.Index.to_list generated/pandas.Index.tolist,../reference/api/pandas.Index.tolist @@ -754,7 +750,8 @@ generated/pandas.interval_range,../reference/api/pandas.interval_range generated/pandas.Interval.right,../reference/api/pandas.Interval.right generated/pandas.io.formats.style.Styler.apply,../reference/api/pandas.io.formats.style.Styler.apply -generated/pandas.io.formats.style.Styler.applymap,../reference/api/pandas.io.formats.style.Styler.applymap +generated/pandas.io.formats.style.Styler.applymap,../reference/api/pandas.io.formats.style.Styler.map +generated/pandas.io.formats.style.Styler.applymap_index,../reference/api/pandas.io.formats.style.Styler.map_index generated/pandas.io.formats.style.Styler.background_gradient,../reference/api/pandas.io.formats.style.Styler.background_gradient generated/pandas.io.formats.style.Styler.bar,../reference/api/pandas.io.formats.style.Styler.bar generated/pandas.io.formats.style.Styler.clear,../reference/api/pandas.io.formats.style.Styler.clear @@ -1385,3 +1382,69 @@ # Cached searches reference/api/pandas.DataFrame.from_csv,pandas.read_csv + +# GroupBy -> DataFrameGroupBy +reference/api/pandas.core.groupby.GroupBy.__iter__,pandas.core.groupby.DataFrameGroupBy.__iter__ +reference/api/pandas.core.groupby.GroupBy.agg,pandas.core.groupby.DataFrameGroupBy.agg +reference/api/pandas.core.groupby.GroupBy.aggregate,pandas.core.groupby.DataFrameGroupBy.aggregate +reference/api/pandas.core.groupby.GroupBy.all,pandas.core.groupby.DataFrameGroupBy.all +reference/api/pandas.core.groupby.GroupBy.any,pandas.core.groupby.DataFrameGroupBy.any +reference/api/pandas.core.groupby.GroupBy.apply,pandas.core.groupby.DataFrameGroupBy.apply +reference/api/pandas.core.groupby.GroupBy.bfill,pandas.core.groupby.DataFrameGroupBy.bfill +reference/api/pandas.core.groupby.GroupBy.count,pandas.core.groupby.DataFrameGroupBy.count +reference/api/pandas.core.groupby.GroupBy.cumcount,pandas.core.groupby.DataFrameGroupBy.cumcount +reference/api/pandas.core.groupby.GroupBy.cummax,pandas.core.groupby.DataFrameGroupBy.cummax +reference/api/pandas.core.groupby.GroupBy.cummin,pandas.core.groupby.DataFrameGroupBy.cummin +reference/api/pandas.core.groupby.GroupBy.cumprod,pandas.core.groupby.DataFrameGroupBy.cumprod 
+reference/api/pandas.core.groupby.GroupBy.cumsum,pandas.core.groupby.DataFrameGroupBy.cumsum +reference/api/pandas.core.groupby.GroupBy.ffill,pandas.core.groupby.DataFrameGroupBy.ffill +reference/api/pandas.core.groupby.GroupBy.first,pandas.core.groupby.DataFrameGroupBy.first +reference/api/pandas.core.groupby.GroupBy.get_group,pandas.core.groupby.DataFrameGroupBy.get_group +reference/api/pandas.core.groupby.GroupBy.groups,pandas.core.groupby.DataFrameGroupBy.groups +reference/api/pandas.core.groupby.GroupBy.head,pandas.core.groupby.DataFrameGroupBy.head +reference/api/pandas.core.groupby.GroupBy.indices,pandas.core.groupby.DataFrameGroupBy.indices +reference/api/pandas.core.groupby.GroupBy.last,pandas.core.groupby.DataFrameGroupBy.last +reference/api/pandas.core.groupby.GroupBy.max,pandas.core.groupby.DataFrameGroupBy.max +reference/api/pandas.core.groupby.GroupBy.mean,pandas.core.groupby.DataFrameGroupBy.mean +reference/api/pandas.core.groupby.GroupBy.median,pandas.core.groupby.DataFrameGroupBy.median +reference/api/pandas.core.groupby.GroupBy.min,pandas.core.groupby.DataFrameGroupBy.min +reference/api/pandas.core.groupby.GroupBy.ngroup,pandas.core.groupby.DataFrameGroupBy.ngroup +reference/api/pandas.core.groupby.GroupBy.nth,pandas.core.groupby.DataFrameGroupBy.nth +reference/api/pandas.core.groupby.GroupBy.ohlc,pandas.core.groupby.DataFrameGroupBy.ohlc +reference/api/pandas.core.groupby.GroupBy.pct_change,pandas.core.groupby.DataFrameGroupBy.pct_change +reference/api/pandas.core.groupby.GroupBy.pipe,pandas.core.groupby.DataFrameGroupBy.pipe +reference/api/pandas.core.groupby.GroupBy.prod,pandas.core.groupby.DataFrameGroupBy.prod +reference/api/pandas.core.groupby.GroupBy.rank,pandas.core.groupby.DataFrameGroupBy.rank +reference/api/pandas.core.groupby.GroupBy.sem,pandas.core.groupby.DataFrameGroupBy.sem +reference/api/pandas.core.groupby.GroupBy.size,pandas.core.groupby.DataFrameGroupBy.size +reference/api/pandas.core.groupby.GroupBy.std,pandas.core.groupby.DataFrameGroupBy.std +reference/api/pandas.core.groupby.GroupBy.sum,pandas.core.groupby.DataFrameGroupBy.sum +reference/api/pandas.core.groupby.GroupBy.tail,pandas.core.groupby.DataFrameGroupBy.tail +reference/api/pandas.core.groupby.GroupBy.transform,pandas.core.groupby.DataFrameGroupBy.transform +reference/api/pandas.core.groupby.GroupBy.var,pandas.core.groupby.DataFrameGroupBy.var + +# Renamed or alias doc page was removed +reference/api/pandas.DataFrame.subtract,pandas.DataFrame.sub +reference/api/pandas.DataFrame.multiply,pandas.DataFrame.mul +reference/api/pandas.DataFrame.divide,pandas.DataFrame.div +reference/api/pandas.Series.subtract,pandas.Series.sub +reference/api/pandas.Series.multiply,pandas.Series.mul +reference/api/pandas.Series.divide,pandas.Series.div +reference/api/pandas.Series.tolist,pandas.Series.to_list +reference/api/pandas.Series.transpose,pandas.Series.T +reference/api/pandas.Index.transpose,pandas.Index.T +reference/api/pandas.Index.notnull,pandas.Index.notna +reference/api/pandas.Index.tolist,pandas.Index.to_list +reference/api/pandas.arrays.PandasArray,pandas.arrays.NumpyExtensionArray +reference/api/pandas.core.groupby.DataFrameGroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill +reference/api/pandas.core.groupby.GroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill +reference/api/pandas.core.resample.Resampler.backfill,pandas.core.resample.Resampler.bfill +reference/api/pandas.io.formats.style.Styler.applymap,pandas.io.formats.style.Styler.map 
+reference/api/pandas.io.formats.style.Styler.applymap_index,pandas.io.formats.style.Styler.map_index + +# EWM -> ExponentialMovingWindow +reference/api/pandas.core.window.ewm.EWM.corr,pandas.core.window.ewm.ExponentialMovingWindow.corr +reference/api/pandas.core.window.ewm.EWM.cov,pandas.core.window.ewm.ExponentialMovingWindow.cov +reference/api/pandas.core.window.ewm.EWM.mean,pandas.core.window.ewm.ExponentialMovingWindow.mean +reference/api/pandas.core.window.ewm.EWM.std,pandas.core.window.ewm.ExponentialMovingWindow.std +reference/api/pandas.core.window.ewm.EWM.var,pandas.core.window.ewm.ExponentialMovingWindow.var diff -Nru pandas-2.1.4+dfsg/doc/scripts/eval_performance.py pandas-2.2.2+dfsg/doc/scripts/eval_performance.py --- pandas-2.1.4+dfsg/doc/scripts/eval_performance.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/scripts/eval_performance.py 2024-04-10 17:42:52.000000000 +0000 @@ -64,7 +64,7 @@ return ev, qu -def plot_perf(df, engines, title, filename=None): +def plot_perf(df, engines, title, filename=None) -> None: from matplotlib.pyplot import figure sns.set() diff -Nru pandas-2.1.4+dfsg/doc/source/conf.py pandas-2.2.2+dfsg/doc/source/conf.py --- pandas-2.1.4+dfsg/doc/source/conf.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/conf.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,6 +14,7 @@ import inspect import logging import os +import re import sys import warnings @@ -57,7 +58,6 @@ "numpydoc", "sphinx_copybutton", "sphinx_design", - "sphinx_toggleprompt", "sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.coverage", @@ -76,7 +76,6 @@ # to ensure that include files (partial pages) aren't built, exclude them # https://github.com/sphinx-doc/sphinx/issues/1965#issuecomment-124732907 "**/includes/**", - "**/api/pandas.Series.dt.rst", ] try: import nbconvert @@ -130,6 +129,8 @@ autodoc_typehints = "none" # numpydoc +numpydoc_show_class_members = False +numpydoc_show_inherited_class_members = False numpydoc_attributes_as_param_list = False # matplotlib plot directive @@ -161,7 +162,7 @@ # General information about the project. project = "pandas" # We have our custom "pandas_footer.html" template, using copyright for the current year -copyright = f"{datetime.now().year}" +copyright = f"{datetime.now().year}," # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -229,24 +230,31 @@ # further. For a list of options available for each theme, see the # documentation. 
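The ``conf.py`` hunk that follows derives the docs version-switcher label from the release string, keeping only ``major.minor`` for stable releases so it matches ``versions.json``. A small illustration of that fall-through (the helper name and sample values here are hypothetical):

.. code-block:: python

   # Illustration only: the label logic added in the hunk below.
   def switcher_label(version: str) -> str:
       if ".dev" in version:
           return "dev"
       if "rc" in version:
           return version.split("rc", maxsplit=1)[0] + " (rc)"
       # stable releases keep only major.minor to match versions.json
       return ".".join(version.split(".")[:2])

   assert switcher_label("2.2.2") == "2.2"
   assert switcher_label("3.0.0.dev0") == "dev"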
-switcher_version = version if ".dev" in version: switcher_version = "dev" elif "rc" in version: switcher_version = version.split("rc", maxsplit=1)[0] + " (rc)" +else: + # only keep major.minor version number to match versions.json + switcher_version = ".".join(version.split(".")[:2]) html_theme_options = { "external_links": [], "footer_start": ["pandas_footer", "sphinx-version"], "github_url": "https://github.com/pandas-dev/pandas", "twitter_url": "https://twitter.com/pandas_dev", - "analytics": {"google_analytics_id": "G-5RE31C1RNW"}, + "analytics": { + "plausible_analytics_domain": "pandas.pydata.org", + "plausible_analytics_url": "https://views.scientific-python.org/js/script.js", + }, "logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"}, + "navbar_align": "left", "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"], "switcher": { "json_url": "https://pandas.pydata.org/versions.json", "version_match": switcher_version, }, + "show_version_warning_banner": True, "icon_links": [ { "name": "Mastodon", @@ -452,7 +460,6 @@ "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None), "matplotlib": ("https://matplotlib.org/stable/", None), "numpy": ("https://numpy.org/doc/stable/", None), - "pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None), "py": ("https://pylib.readthedocs.io/en/latest/", None), "python": ("https://docs.python.org/3/", None), "scipy": ("https://docs.scipy.org/doc/scipy/", None), @@ -499,7 +506,7 @@ # lower than MethodDocumenter so this is not chosen for normal methods priority = 0.6 - def format_signature(self): + def format_signature(self) -> str: # this method gives an error/warning for the accessors, therefore # overriding it (accessor has no arguments) return "" @@ -629,7 +636,7 @@ # based on numpy doc/source/conf.py -def linkcode_resolve(domain, info): +def linkcode_resolve(domain, info) -> str | None: """ Determine the URL corresponding to Python object """ @@ -691,12 +698,12 @@ # remove the docstring of the flags attribute (inherited from numpy ndarray) # because these give doc build errors (see GH issue 5331) -def remove_flags_docstring(app, what, name, obj, options, lines): +def remove_flags_docstring(app, what, name, obj, options, lines) -> None: if what == "attribute" and name.endswith(".flags"): del lines[:] -def process_class_docstrings(app, what, name, obj, options, lines): +def process_class_docstrings(app, what, name, obj, options, lines) -> None: """ For those classes for which we use :: @@ -748,7 +755,7 @@ ] -def process_business_alias_docstrings(app, what, name, obj, options, lines): +def process_business_alias_docstrings(app, what, name, obj, options, lines) -> None: """ Starting with sphinx 3.4, the "autodoc-process-docstring" event also gets called for alias classes. This results in numpydoc adding the @@ -771,7 +778,7 @@ suppress_warnings.append("ref.ref") -def rstjinja(app, docname, source): +def rstjinja(app, docname, source) -> None: """ Render our pages as a jinja template for fancy templating goodness. 
""" @@ -784,7 +791,7 @@ source[0] = rendered -def setup(app): +def setup(app) -> None: app.connect("source-read", rstjinja) app.connect("autodoc-process-docstring", remove_flags_docstring) app.connect("autodoc-process-docstring", process_class_docstrings) @@ -794,3 +801,49 @@ app.add_autodocumenter(AccessorMethodDocumenter) app.add_autodocumenter(AccessorCallableDocumenter) app.add_directive("autosummary", PandasAutosummary) + + +# Ignore list for broken links,found in CI run checks for broken-linkcheck.yml + +linkcheck_ignore = [ + "^http://$", + "^https://$", + *[ + re.escape(link) + for link in [ + "http://scatterci.github.io/pydata/pandas", + "http://specs.frictionlessdata.io/json-table-schema/", + "https://cloud.google.com/bigquery/docs/access-control#roles", + "https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query", + "https://crates.io/crates/calamine", + "https://devguide.python.org/setup/#macos", + "https://en.wikipedia.org/wiki/Imputation_statistics", + "https://en.wikipedia.org/wiki/Imputation_(statistics", + "https://github.com/noatamir/pandas-dev", + "https://github.com/pandas-dev/pandas/blob/main/pandas/plotting/__init__.py#L1", + "https://github.com/pandas-dev/pandas/blob/v0.20.2/pandas/core/generic.py#L568", + "https://github.com/pandas-dev/pandas/blob/v0.20.2/pandas/core/frame.py#L1495", + "https://github.com/pandas-dev/pandas/issues/174151", + "https://gitpod.io/#https://github.com/USERNAME/pandas", + "https://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/", + "https://matplotlib.org/api/axes_api.html#matplotlib.axes.Axes.table", + "https://nipunbatra.github.io/blog/visualisation/2013/05/01/aggregation-timeseries.html", + "https://nbviewer.ipython.org/gist/metakermit/5720498", + "https://numpy.org/doc/stable/user/basics.byteswapping.html", + "https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-8-0", + "https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking", + "https://pandas.pydata.org/pandas-docs/stable/ecosystem.html", + "https://sqlalchemy.readthedocs.io/en/latest/dialects/index.html", + "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a000245912.htm", + "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a000214639.htm", + "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a002283942.htm", + "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a000245965.htm", + "https://support.sas.com/documentation/cdl/en/imlug/66845/HTML/default/viewer.htm#imlug_langref_sect455.htm", + "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a002284668.htm", + "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a002978282.htm", + "https://wesmckinney.com/blog/update-on-upcoming-pandas-v0-10-new-file-parser-other-performance-wins/", + "https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022", + "pandas.zip", + ] + ], +] diff -Nru pandas-2.1.4+dfsg/doc/source/development/contributing.rst pandas-2.2.2+dfsg/doc/source/development/contributing.rst --- pandas-2.1.4+dfsg/doc/source/development/contributing.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/development/contributing.rst 2024-04-10 17:42:52.000000000 +0000 @@ -311,6 +311,7 @@ .. 
code-block:: shell git checkout main + git fetch upstream git merge upstream/main mamba activate pandas-dev mamba env update -f environment.yml --prune @@ -320,6 +321,7 @@ .. code-block:: shell git checkout main + git fetch upstream git merge upstream/main # activate the virtual environment based on your platform python -m pip install --upgrade -r requirements-dev.txt diff -Nru pandas-2.1.4+dfsg/doc/source/development/contributing_codebase.rst pandas-2.2.2+dfsg/doc/source/development/contributing_codebase.rst --- pandas-2.1.4+dfsg/doc/source/development/contributing_codebase.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/development/contributing_codebase.rst 2024-04-10 17:42:52.000000000 +0000 @@ -39,7 +39,7 @@ Additionally, :ref:`Continuous Integration ` will run code formatting checks like ``black``, ``ruff``, -``isort``, and ``cpplint`` and more using `pre-commit hooks `_. +``isort``, and ``clang-format`` and more using `pre-commit hooks `_. Any warnings from these checks will cause the :ref:`Continuous Integration ` to fail; therefore, it is helpful to run the check yourself before submitting code. This can be done by installing ``pre-commit`` (which should already have happened if you followed the instructions @@ -456,6 +456,12 @@ - tests.io + .. note:: + + This includes ``to_string`` but excludes ``__repr__``, which is + tested in ``tests.frame.test_repr`` and ``tests.series.test_repr``. + Other classes often have a ``test_formats`` file. + C) Otherwise This test likely belongs in one of: @@ -528,7 +534,7 @@ is not meant to be captured, use ``pytest.mark.xfail`` It is common to use this method for a test that exhibits buggy behavior or a non-implemented feature. If the failing test has flaky behavior, use the argument ``strict=False``. This -will make it so pytest does not fail if the test happens to pass. +will make it so pytest does not fail if the test happens to pass. Using ``strict=False`` is highly undesirable, please use it only as a last resort. Prefer the decorator ``@pytest.mark.xfail`` and the argument ``pytest.param`` over usage within a test so that the test is appropriately marked during the @@ -540,7 +546,7 @@ def test_xfail(request): mark = pytest.mark.xfail(raises=TypeError, reason="Indicate why here") - request.node.add_marker(mark) + request.applymarker(mark) xfail is not to be used for tests involving failure due to invalid user arguments. For these tests, we need to verify the correct exception type and error message @@ -754,7 +760,7 @@ your installation is probably fine and you can start contributing! Often it is worth running only a subset of tests first around your changes before running the -entire suite (tip: you can use the [pandas-coverage app](https://pandas-coverage-12d2130077bc.herokuapp.com/)) +entire suite (tip: you can use the `pandas-coverage app `_) to find out which tests hit the lines of code you've modified, and then run only those). The easiest way to do this is with:: diff -Nru pandas-2.1.4+dfsg/doc/source/development/contributing_environment.rst pandas-2.2.2+dfsg/doc/source/development/contributing_environment.rst --- pandas-2.1.4+dfsg/doc/source/development/contributing_environment.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/development/contributing_environment.rst 2024-04-10 17:42:52.000000000 +0000 @@ -44,8 +44,9 @@ **macOS** To use the :ref:`mamba `-based compilers, you will need to install the -Developer Tools using ``xcode-select --install``. 
Otherwise -information about compiler installation can be found here: +Developer Tools using ``xcode-select --install``. + +If you prefer to use a different compiler, general information can be found here: https://devguide.python.org/setup/#macos **Linux** @@ -86,12 +87,12 @@ Option 1: using mamba (recommended) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Install `mamba `_ +* Install miniforge to get `mamba `_ * Make sure your mamba is up to date (``mamba update mamba``) +* Create and activate the ``pandas-dev`` mamba environment using the following commands: .. code-block:: none - # Create and activate the build environment mamba env create --file environment.yml mamba activate pandas-dev @@ -214,7 +215,7 @@ The newer build system, invokes the meson backend through pip (via a `PEP 517 `_ build). It automatically uses all available cores on your CPU, and also avoids the need for manual rebuilds by -rebuilding automatically whenever pandas is imported(with an editable install). +rebuilding automatically whenever pandas is imported (with an editable install). For these reasons, you should compile pandas with meson. Because the meson build system is newer, you may find bugs/minor issues as it matures. You can report these bugs @@ -228,6 +229,14 @@ # If you do not want to see this, omit everything after --no-build-isolation python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true +.. note:: + The version number is pulled from the latest repository tag. Be sure to fetch the latest tags from upstream + before building:: + + # set the upstream repository, if not done already, and fetch the latest tags + git remote add upstream https://github.com/pandas-dev/pandas.git + git fetch upstream --tags + **Build options** It is possible to pass options from the pip frontend to the meson backend if you would like to configure your @@ -265,6 +274,8 @@ You will need to repeat this step each time the C extensions change, for example if you modified any file in ``pandas/_libs`` or if you did a fetch and merge from ``upstream/main``. +**Checking the build** + At this point you should be able to import pandas from your locally built version:: $ python @@ -272,6 +283,12 @@ >>> print(pandas.__version__) # note: the exact output may differ 2.0.0.dev0+880.g2b9e661fbb.dirty + +At this point you may want to try +`running the test suite `_. + +**Keeping up to date with the latest build** + When building pandas with meson, importing pandas will automatically trigger a rebuild, even when C/Cython files are modified. By default, no output will be produced by this rebuild (the import will just take longer). If you would like to see meson's output when importing pandas, you can set the environment variable ``MESONPY_EDTIABLE_VERBOSE``. For example, this would be:: diff -Nru pandas-2.1.4+dfsg/doc/source/development/debugging_extensions.rst pandas-2.2.2+dfsg/doc/source/development/debugging_extensions.rst --- pandas-2.1.4+dfsg/doc/source/development/debugging_extensions.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/development/debugging_extensions.rst 2024-04-10 17:42:52.000000000 +0000 @@ -14,19 +14,49 @@ 2. `Fundamental Python Debugging Part 2 - Python Extensions `_ 3. `Fundamental Python Debugging Part 3 - Cython Extensions `_ -Generating debug builds ------------------------ +Debugging locally +----------------- By default building pandas from source will generate a release build. To generate a development build you can type:: pip install -ve . 
--no-build-isolation --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug" +.. note:: + + conda environments update CFLAGS/CPPFLAGS with flags that are geared towards generating releases. If using conda, you may need to set ``CFLAGS="$CFLAGS -O0"`` and ``CPPFLAGS="$CPPFLAGS -O0"`` to ensure optimizations are turned off for debugging + By specifying ``builddir="debug"`` all of the targets will be built and placed in the debug directory relative to the project root. This helps to keep your debug and release artifacts separate; you are of course able to choose a different directory name or omit altogether if you do not care to separate build types. +Using Docker +------------ + +To simplify the debugging process, pandas has created a Docker image with a debug build of Python and the gdb/Cython debuggers pre-installed. You may either ``docker pull pandas/pandas-debug`` to get access to this image or build it from the ``tooling/debug`` folder locallly. + +You can then mount your pandas repository into this image via: + +.. code-block:: sh + + docker run --rm -it -w /data -v ${PWD}:/data pandas/pandas-debug + +Inside the image, you can use meson to build/install pandas and place the build artifacts into a ``debug`` folder using a command as follows: + +.. code-block:: sh + + python -m pip install -ve . --no-build-isolation --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug" + +If planning to use cygdb, the files required by that application are placed within the build folder. So you have to first ``cd`` to the build folder, then start that application. + +.. code-block:: sh + + cd debug + cygdb + +Within the debugger you can use `cygdb commands `_ to navigate cython extensions. + Editor support -------------- -The meson build system generates a `compilation database `_ automatically and places it in the build directory. Many language servers and IDEs can use this information to provide code-completion, go-to-defintion and error checking support as you type. +The meson build system generates a `compilation database `_ automatically and places it in the build directory. Many language servers and IDEs can use this information to provide code-completion, go-to-definition and error checking support as you type. How each language server / IDE chooses to look for the compilation database may vary. When in doubt you may want to create a symlink at the root of the project that points to the compilation database in your build directory. Assuming you used *debug* as your directory name, you can run:: diff -Nru pandas-2.1.4+dfsg/doc/source/development/extending.rst pandas-2.2.2+dfsg/doc/source/development/extending.rst --- pandas-2.1.4+dfsg/doc/source/development/extending.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/development/extending.rst 2024-04-10 17:42:52.000000000 +0000 @@ -99,7 +99,7 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A :class:`pandas.api.extensions.ExtensionDtype` is similar to a ``numpy.dtype`` object. It describes the -data type. Implementors are responsible for a few unique items like the name. +data type. Implementers are responsible for a few unique items like the name. One particularly important item is the ``type`` property. This should be the class that is the scalar type for your data. 
For example, if you were writing an diff -Nru pandas-2.1.4+dfsg/doc/source/development/maintaining.rst pandas-2.2.2+dfsg/doc/source/development/maintaining.rst --- pandas-2.1.4+dfsg/doc/source/development/maintaining.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/development/maintaining.rst 2024-04-10 17:42:52.000000000 +0000 @@ -44,6 +44,9 @@ Issue triage ------------ +Triage is an important first step in addressing issues reported by the community, and even +partial contributions are a great way to help maintain pandas. Only remove the "Needs Triage" +tag once all of the steps below have been completed. Here's a typical workflow for triaging a newly opened issue. @@ -67,9 +70,9 @@ 3. **Is this a duplicate issue?** We have many open issues. If a new issue is clearly a duplicate, label the - new issue as "Duplicate" assign the milestone "No Action", and close the issue - with a link to the original issue. Make sure to still thank the reporter, and - encourage them to chime in on the original issue, and perhaps try to fix it. + new issue as "Duplicate" and close the issue with a link to the original issue. + Make sure to still thank the reporter, and encourage them to chime in on the + original issue, and perhaps try to fix it. If the new issue provides relevant information, such as a better or slightly different example, add it to the original issue as a comment or an edit to @@ -90,6 +93,10 @@ If a reproducible example is provided, but you see a simplification, edit the original post with your simpler reproducible example. + Ensure the issue exists on the main branch and that it has the "Needs Triage" tag + until all steps have been completed. Add a comment to the issue once you have + verified it exists on the main branch, so others know it has been confirmed. + 5. **Is this a clearly defined feature request?** Generally, pandas prefers to discuss and design new features in issues, before @@ -97,8 +104,9 @@ for the new feature. Having them write a full docstring is a good way to pin down specifics. - We'll need a discussion from several pandas maintainers before deciding whether - the proposal is in scope for pandas. + Tag new feature requests with "Needs Discussion", as we'll need a discussion + from several pandas maintainers before deciding whether the proposal is in + scope for pandas. 6. **Is this a usage question?** @@ -117,10 +125,6 @@ If the issue is clearly defined and the fix seems relatively straightforward, label the issue as "Good first issue". - Typically, new issues will be assigned the "Contributions welcome" milestone, - unless it's know that this issue should be addressed in a specific release (say - because it's a large regression). - Once you have completed the above, make sure to remove the "needs triage" label. .. _maintaining.regressions: @@ -445,9 +449,13 @@ git tag -a v1.5.0.dev0 -m "DEV: Start 1.5.0" git push upstream main --follow-tags -3. Build the source distribution (git must be in the tag commit):: +3. Download the source distribution and wheels from the `wheel staging area `_. + Be careful to make sure that no wheels are missing (e.g. due to failed builds). + + Running scripts/download_wheels.sh with the version that you want to download wheels/the sdist for should do the trick. + This script will make a ``dist`` folder inside your clone of pandas and put the downloaded wheels and sdist there:: - ./setup.py sdist --formats=gztar --quiet + scripts/download_wheels.sh 4. 
Create a `new GitHub release `_: @@ -459,23 +467,19 @@ - Set as the latest release: Leave checked, unless releasing a patch release for an older version (e.g. releasing 1.4.5 after 1.5 has been released) -5. The GitHub release will after some hours trigger an +5. Upload wheels to PyPI:: + + twine upload pandas/dist/pandas-*.{whl,tar.gz} --skip-existing + +6. The GitHub release will after some hours trigger an `automated conda-forge PR `_. + (If you don't want to wait, you can open an issue titled ``@conda-forge-admin, please update version`` to trigger the bot.) Merge it once the CI is green, and it will generate the conda-forge packages. + In case a manual PR needs to be done, the version, sha256 and build fields are the ones that usually need to be changed. If anything else in the recipe has changed since the last release, those changes should be available in ``ci/meta.yaml``. -6. Packages for supported versions in PyPI are built automatically from our CI. - Once all packages are build download all wheels from the - `Anaconda repository >`_ - where our CI published them to the ``dist/`` directory in your local pandas copy. - You can use the script ``scripts/download_wheels.sh`` to download all wheels at once. - -7. Upload wheels to PyPI:: - - twine upload pandas/dist/pandas-*.{whl,tar.gz} --skip-existing - Post-Release ```````````` diff -Nru pandas-2.1.4+dfsg/doc/source/getting_started/comparison/comparison_with_sql.rst pandas-2.2.2+dfsg/doc/source/getting_started/comparison/comparison_with_sql.rst --- pandas-2.1.4+dfsg/doc/source/getting_started/comparison/comparison_with_sql.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/getting_started/comparison/comparison_with_sql.rst 2024-04-10 17:42:52.000000000 +0000 @@ -164,16 +164,16 @@ tips.groupby("sex").size() -Notice that in the pandas code we used :meth:`~pandas.core.groupby.DataFrameGroupBy.size` and not -:meth:`~pandas.core.groupby.DataFrameGroupBy.count`. This is because -:meth:`~pandas.core.groupby.DataFrameGroupBy.count` applies the function to each column, returning +Notice that in the pandas code we used :meth:`.DataFrameGroupBy.size` and not +:meth:`.DataFrameGroupBy.count`. This is because +:meth:`.DataFrameGroupBy.count` applies the function to each column, returning the number of ``NOT NULL`` records within each. .. ipython:: python tips.groupby("sex").count() -Alternatively, we could have applied the :meth:`~pandas.core.groupby.DataFrameGroupBy.count` method +Alternatively, we could have applied the :meth:`.DataFrameGroupBy.count` method to an individual column: .. ipython:: python @@ -181,7 +181,7 @@ tips.groupby("sex")["total_bill"].count() Multiple functions can also be applied at once. For instance, say we'd like to see how tip amount -differs by day of the week - :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` allows you to pass a dictionary +differs by day of the week - :meth:`.DataFrameGroupBy.agg` allows you to pass a dictionary to your grouped DataFrame, indicating which functions to apply to specific columns. .. code-block:: sql diff -Nru pandas-2.1.4+dfsg/doc/source/getting_started/install.rst pandas-2.2.2+dfsg/doc/source/getting_started/install.rst --- pandas-2.1.4+dfsg/doc/source/getting_started/install.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/getting_started/install.rst 2024-04-10 17:42:52.000000000 +0000 @@ -21,7 +21,7 @@ Python version support ---------------------- -Officially Python 3.9, 3.10 and 3.11. +Officially Python 3.9, 3.10, 3.11 and 3.12. 
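With the supported interpreter range now extending to 3.12, it can be useful to confirm what the local environment provides before comparing against the dependency tables below; a quick sketch using only standard APIs:

.. code-block:: python

   # Sketch: check the interpreter falls in the documented support window,
   # then report installed pandas and optional-dependency versions.
   import sys

   assert (3, 9) <= sys.version_info[:2] <= (3, 12), sys.version

   import pandas as pd
   pd.show_versions()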
Installing pandas ----------------- @@ -206,7 +206,7 @@ `NumPy `__ 1.22.4 `python-dateutil `__ 2.8.2 `pytz `__ 2020.1 -`tzdata `__ 2022.1 +`tzdata `__ 2022.7 ================================================================ ========================== .. _install.optional_dependencies: @@ -239,9 +239,9 @@ ===================================================== ================== ================== =================================================================================================================================================================================== Dependency Minimum Version pip extra Notes ===================================================== ================== ================== =================================================================================================================================================================================== -`numexpr `__ 2.8.0 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups -`bottleneck `__ 1.3.4 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup. -`numba `__ 0.55.2 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler. +`numexpr `__ 2.8.4 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups +`bottleneck `__ 1.3.6 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup. +`numba `__ 0.56.4 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler. 
===================================================== ================== ================== =================================================================================================================================================================================== Visualization @@ -252,9 +252,9 @@ ========================= ================== ================== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== ================== ============================================================= -matplotlib 3.6.1 plot Plotting library +matplotlib 3.6.3 plot Plotting library Jinja2 3.1.2 output-formatting Conditional formatting with DataFrame.style -tabulate 0.8.10 output-formatting Printing in Markdown-friendly format (see `tabulate`_) +tabulate 0.9.0 output-formatting Printing in Markdown-friendly format (see `tabulate`_) ========================= ================== ================== ============================================================= Computation @@ -265,8 +265,8 @@ ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= -SciPy 1.8.1 computation Miscellaneous statistical functions -xarray 2022.03.0 computation pandas-like API for N-dimensional data +SciPy 1.10.0 computation Miscellaneous statistical functions +xarray 2022.12.0 computation pandas-like API for N-dimensional data ========================= ================== =============== ============================================================= Excel files @@ -278,9 +278,10 @@ Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= xlrd 2.0.1 excel Reading Excel -xlsxwriter 3.0.3 excel Writing Excel -openpyxl 3.0.10 excel Reading / writing for xlsx files -pyxlsb 1.0.9 excel Reading for xlsb files +xlsxwriter 3.0.5 excel Writing Excel +openpyxl 3.1.0 excel Reading / writing for xlsx files +pyxlsb 1.0.10 excel Reading for xlsb files +python-calamine 0.1.7 excel Reading for xls/xlsx/xlsb/ods files ========================= ================== =============== ============================================================= HTML @@ -291,9 +292,9 @@ ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= -BeautifulSoup4 4.11.1 html HTML parser for read_html +BeautifulSoup4 4.11.2 html HTML parser for read_html html5lib 1.1 html HTML parser for read_html -lxml 4.8.0 html HTML parser for read_html +lxml 4.9.2 html HTML parser for read_html ========================= ================== =============== ============================================================= One of the following combinations of libraries is needed to use the @@ -328,22 +329,24 @@ ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= -lxml 4.8.0 xml XML parser for read_xml 
and tree builder for to_xml +lxml 4.9.2 xml XML parser for read_xml and tree builder for to_xml ========================= ================== =============== ============================================================= SQL databases ^^^^^^^^^^^^^ -Installable with ``pip install "pandas[postgresql, mysql, sql-other]"``. +Traditional drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"`` ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= -SQLAlchemy 1.4.36 postgresql, SQL support for databases other than sqlite +SQLAlchemy 2.0.0 postgresql, SQL support for databases other than sqlite mysql, sql-other -psycopg2 2.9.3 postgresql PostgreSQL engine for sqlalchemy +psycopg2 2.9.6 postgresql PostgreSQL engine for sqlalchemy pymysql 1.0.2 mysql MySQL engine for sqlalchemy +adbc-driver-postgresql 0.8.0 postgresql ADBC Driver for PostgreSQL +adbc-driver-sqlite 0.8.0 sql-other ADBC Driver for SQLite ========================= ================== =============== ============================================================= Other data sources @@ -354,12 +357,12 @@ ========================= ================== ================ ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== ================ ============================================================= -PyTables 3.7.0 hdf5 HDF5-based reading / writing -blosc 1.21.0 hdf5 Compression for HDF5; only available on ``conda`` +PyTables 3.8.0 hdf5 HDF5-based reading / writing +blosc 1.21.3 hdf5 Compression for HDF5; only available on ``conda`` zlib hdf5 Compression for HDF5 -fastparquet 0.8.1 - Parquet reading / writing (pyarrow is default) -pyarrow 7.0.0 parquet, feather Parquet, ORC, and feather reading / writing -pyreadstat 1.1.5 spss SPSS files (.sav) reading +fastparquet 2022.12.0 - Parquet reading / writing (pyarrow is default) +pyarrow 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing +pyreadstat 1.2.0 spss SPSS files (.sav) reading odfpy 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing ========================= ================== ================ ============================================================= @@ -379,11 +382,11 @@ ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= -fsspec 2022.05.0 fss, gcp, aws Handling files aside from simple local and HTTP (required +fsspec 2022.11.0 fss, gcp, aws Handling files aside from simple local and HTTP (required dependency of s3fs, gcsfs). 
-gcsfs 2022.05.0 gcp Google Cloud Storage access -pandas-gbq 0.17.5 gcp Google Big Query access -s3fs 2022.05.0 aws Amazon S3 access +gcsfs 2022.11.0 gcp Google Cloud Storage access +pandas-gbq 0.19.0 gcp Google Big Query access +s3fs 2022.11.0 aws Amazon S3 access ========================= ================== =============== ============================================================= Clipboard @@ -394,8 +397,8 @@ ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= -PyQt4/PyQt5 5.15.6 clipboard Clipboard I/O -qtpy 2.2.0 clipboard Clipboard I/O +PyQt4/PyQt5 5.15.9 clipboard Clipboard I/O +qtpy 2.3.0 clipboard Clipboard I/O ========================= ================== =============== ============================================================= .. note:: @@ -412,7 +415,7 @@ ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= -Zstandard 0.17.0 compression Zstandard compression +Zstandard 0.19.0 compression Zstandard compression ========================= ================== =============== ============================================================= Consortium Standard diff -Nru pandas-2.1.4+dfsg/doc/source/getting_started/intro_tutorials/01_table_oriented.rst pandas-2.2.2+dfsg/doc/source/getting_started/intro_tutorials/01_table_oriented.rst --- pandas-2.1.4+dfsg/doc/source/getting_started/intro_tutorials/01_table_oriented.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/getting_started/intro_tutorials/01_table_oriented.rst 2024-04-10 17:42:52.000000000 +0000 @@ -106,9 +106,9 @@ .. note:: - If you are familiar to Python + If you are familiar with Python :ref:`dictionaries `, the selection of a - single column is very similar to selection of dictionary values based on + single column is very similar to the selection of dictionary values based on the key. You can create a ``Series`` from scratch as well: diff -Nru pandas-2.1.4+dfsg/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst pandas-2.2.2+dfsg/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst --- pandas-2.1.4+dfsg/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst 2024-04-10 17:42:52.000000000 +0000 @@ -266,7 +266,7 @@ :: - air_quality.groupby(["parameter", "location"]).mean() + air_quality.groupby(["parameter", "location"])[["value"]].mean() .. raw:: html diff -Nru pandas-2.1.4+dfsg/doc/source/getting_started/intro_tutorials/09_timeseries.rst pandas-2.2.2+dfsg/doc/source/getting_started/intro_tutorials/09_timeseries.rst --- pandas-2.1.4+dfsg/doc/source/getting_started/intro_tutorials/09_timeseries.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/getting_started/intro_tutorials/09_timeseries.rst 2024-04-10 17:42:52.000000000 +0000 @@ -295,7 +295,7 @@ .. 
ipython:: python - monthly_max = no_2.resample("M").max() + monthly_max = no_2.resample("ME").max() monthly_max A very powerful method on time series data with a datetime index, is the diff -Nru pandas-2.1.4+dfsg/doc/source/getting_started/tutorials.rst pandas-2.2.2+dfsg/doc/source/getting_started/tutorials.rst --- pandas-2.1.4+dfsg/doc/source/getting_started/tutorials.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/getting_started/tutorials.rst 2024-04-10 17:42:52.000000000 +0000 @@ -115,7 +115,6 @@ * `Statistical Data Analysis in Python, tutorial videos, by Christopher Fonnesbeck from SciPy 2013 `_ * `Financial analysis in Python, by Thomas Wiecki `_ * `Intro to pandas data structures, by Greg Reda `_ -* `Pandas and Python: Top 10, by Manish Amde `_ * `Pandas DataFrames Tutorial, by Karlijn Willems `_ * `A concise tutorial with real life examples `_ * `430+ Searchable Pandas recipes by Isshin Inada `_ diff -Nru pandas-2.1.4+dfsg/doc/source/reference/arrays.rst pandas-2.2.2+dfsg/doc/source/reference/arrays.rst --- pandas-2.1.4+dfsg/doc/source/reference/arrays.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/reference/arrays.rst 2024-04-10 17:42:52.000000000 +0000 @@ -134,11 +134,6 @@ Timestamp -.. autosummary:: - :toctree: api/ - - NaT - Properties ~~~~~~~~~~ .. autosummary:: @@ -257,11 +252,6 @@ Timedelta -.. autosummary:: - :toctree: api/ - - NaT - Properties ~~~~~~~~~~ .. autosummary:: @@ -465,7 +455,6 @@ UInt16Dtype UInt32Dtype UInt64Dtype - NA .. _api.arrays.float_na: @@ -484,7 +473,6 @@ Float32Dtype Float64Dtype - NA .. _api.arrays.categorical: @@ -621,7 +609,6 @@ :template: autosummary/class_without_autosummary.rst BooleanDtype - NA .. Dtype attributes which are manually listed in their docstrings: including diff -Nru pandas-2.1.4+dfsg/doc/source/reference/extensions.rst pandas-2.2.2+dfsg/doc/source/reference/extensions.rst --- pandas-2.1.4+dfsg/doc/source/reference/extensions.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/reference/extensions.rst 2024-04-10 17:42:52.000000000 +0000 @@ -34,6 +34,7 @@ api.extensions.ExtensionArray._accumulate api.extensions.ExtensionArray._concat_same_type + api.extensions.ExtensionArray._explode api.extensions.ExtensionArray._formatter api.extensions.ExtensionArray._from_factorized api.extensions.ExtensionArray._from_sequence @@ -48,6 +49,7 @@ api.extensions.ExtensionArray.copy api.extensions.ExtensionArray.view api.extensions.ExtensionArray.dropna + api.extensions.ExtensionArray.duplicated api.extensions.ExtensionArray.equals api.extensions.ExtensionArray.factorize api.extensions.ExtensionArray.fillna diff -Nru pandas-2.1.4+dfsg/doc/source/reference/frame.rst pandas-2.2.2+dfsg/doc/source/reference/frame.rst --- pandas-2.1.4+dfsg/doc/source/reference/frame.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/reference/frame.rst 2024-04-10 17:42:52.000000000 +0000 @@ -49,6 +49,7 @@ DataFrame.infer_objects DataFrame.copy DataFrame.bool + DataFrame.to_numpy Indexing, iteration ~~~~~~~~~~~~~~~~~~~ diff -Nru pandas-2.1.4+dfsg/doc/source/reference/general_functions.rst pandas-2.2.2+dfsg/doc/source/reference/general_functions.rst --- pandas-2.1.4+dfsg/doc/source/reference/general_functions.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/reference/general_functions.rst 2024-04-10 17:42:52.000000000 +0000 @@ -73,6 +73,13 @@ eval +Datetime formats +~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + tseries.api.guess_datetime_format + Hashing ~~~~~~~ .. autosummary:: diff -Nru pandas-2.1.4+dfsg/doc/source/reference/index.rst pandas-2.2.2+dfsg/doc/source/reference/index.rst --- pandas-2.1.4+dfsg/doc/source/reference/index.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/reference/index.rst 2024-04-10 17:42:52.000000000 +0000 @@ -53,6 +53,7 @@ options extensions testing + missing_value .. This is to prevent warnings in the doc build. We don't want to encourage .. these methods. diff -Nru pandas-2.1.4+dfsg/doc/source/reference/indexing.rst pandas-2.2.2+dfsg/doc/source/reference/indexing.rst --- pandas-2.1.4+dfsg/doc/source/reference/indexing.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/reference/indexing.rst 2024-04-10 17:42:52.000000000 +0000 @@ -489,3 +489,5 @@ PeriodIndex.asfreq PeriodIndex.strftime PeriodIndex.to_timestamp + PeriodIndex.from_fields + PeriodIndex.from_ordinals diff -Nru pandas-2.1.4+dfsg/doc/source/reference/missing_value.rst pandas-2.2.2+dfsg/doc/source/reference/missing_value.rst --- pandas-2.1.4+dfsg/doc/source/reference/missing_value.rst 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/reference/missing_value.rst 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,24 @@ +{{ header }} + +.. _api.missing_value: + +============== +Missing values +============== +.. currentmodule:: pandas + +NA is the way to represent missing values for nullable dtypes (see below): + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + NA + +NaT is the missing value for timedelta and datetime data (see below): + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + NaT diff -Nru pandas-2.1.4+dfsg/doc/source/reference/series.rst pandas-2.2.2+dfsg/doc/source/reference/series.rst --- pandas-2.1.4+dfsg/doc/source/reference/series.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/reference/series.rst 2024-04-10 17:42:52.000000000 +0000 @@ -177,6 +177,7 @@ :toctree: api/ Series.align + Series.case_when Series.drop Series.droplevel Series.drop_duplicates @@ -273,6 +274,19 @@ These are separate namespaces within :class:`Series` that only apply to specific data types. +.. autosummary:: + :toctree: api/ + :nosignatures: + :template: autosummary/accessor.rst + + Series.str + Series.cat + Series.dt + Series.sparse + DataFrame.sparse + Index.str + + =========================== ================================= Data Type Accessor =========================== ================================= @@ -328,7 +342,6 @@ Series.dt.tz Series.dt.freq Series.dt.unit - Series.dt.normalize Datetime methods ^^^^^^^^^^^^^^^^ @@ -458,22 +471,6 @@ Series.str.isdecimal Series.str.get_dummies -.. - The following is needed to ensure the generated pages are created with the - correct template (otherwise they would be created in the Series/Index class page) - -.. - .. autosummary:: - :toctree: api/ - :template: autosummary/accessor.rst - - Series.str - Series.cat - Series.dt - Series.sparse - DataFrame.sparse - Index.str - .. _api.series.cat: Categorical accessor @@ -528,6 +525,46 @@ Series.sparse.from_coo Series.sparse.to_coo + +.. _api.series.list: + +List accessor +~~~~~~~~~~~~~ + +Arrow list-dtype specific methods and attributes are provided under the +``Series.list`` accessor. + +.. 
autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + + Series.list.flatten + Series.list.len + Series.list.__getitem__ + + +.. _api.series.struct: + +Struct accessor +~~~~~~~~~~~~~~~ + +Arrow struct-dtype specific methods and attributes are provided under the +``Series.struct`` accessor. + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_attribute.rst + + Series.struct.dtypes + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + + Series.struct.field + Series.struct.explode + + .. _api.series.flags: Flags diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/10min.rst pandas-2.2.2+dfsg/doc/source/user_guide/10min.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/10min.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/10min.rst 2024-04-10 17:42:52.000000000 +0000 @@ -451,7 +451,7 @@ Concat ~~~~~~ -pandas provides various facilities for easily combining together :class:`Series`` and +pandas provides various facilities for easily combining together :class:`Series` and :class:`DataFrame` objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations. @@ -525,7 +525,7 @@ df Grouping by a column label, selecting column labels, and then applying the -:meth:`~pandas.core.groupby.DataFrameGroupBy.sum` function to the resulting +:meth:`.DataFrameGroupBy.sum` function to the resulting groups: .. ipython:: python @@ -610,7 +610,7 @@ .. ipython:: python - rng = pd.date_range("1/1/2012", periods=100, freq="S") + rng = pd.date_range("1/1/2012", periods=100, freq="s") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) ts.resample("5Min").sum() diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/advanced.rst pandas-2.2.2+dfsg/doc/source/user_guide/advanced.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/advanced.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/advanced.rst 2024-04-10 17:42:52.000000000 +0000 @@ -976,7 +976,7 @@ pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=4, freq="W") - pd.interval_range(start=pd.Timedelta("0 days"), periods=3, freq="9H") + pd.interval_range(start=pd.Timedelta("0 days"), periods=3, freq="9h") Additionally, the ``closed`` parameter can be used to specify which side(s) the intervals are closed on. Intervals are closed on the right side by default. diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/basics.rst pandas-2.2.2+dfsg/doc/source/user_guide/basics.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/basics.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/basics.rst 2024-04-10 17:42:52.000000000 +0000 @@ -269,7 +269,7 @@ .. 
ipython:: python df2 = df.copy() - df2["three"]["a"] = 1.0 + df2.loc["a", "three"] = 1.0 df df2 df + df2 @@ -2007,7 +2007,7 @@ | | | | | ``'Int64'``, ``'UInt8'``, ``'UInt16'``,| | | | | | ``'UInt32'``, ``'UInt64'`` | +-------------------------------------------------+---------------------------+--------------------+-------------------------------+----------------------------------------+ -| ``nullable float`` | :class:`Float64Dtype`, ...| (none) | :class:`arrays.FloatingArray` | ``'Float32'``, ``'Float64'`` | +| :ref:`nullable float ` | :class:`Float64Dtype`, ...| (none) | :class:`arrays.FloatingArray` | ``'Float32'``, ``'Float64'`` | +-------------------------------------------------+---------------------------+--------------------+-------------------------------+----------------------------------------+ | :ref:`Strings ` | :class:`StringDtype` | :class:`str` | :class:`arrays.StringArray` | ``'string'`` | +-------------------------------------------------+---------------------------+--------------------+-------------------------------+----------------------------------------+ @@ -2261,23 +2261,6 @@ m = ["apple", pd.Timedelta("1day")] pd.to_timedelta(m, errors="coerce") -The ``errors`` parameter has a third option of ``errors='ignore'``, which will simply return the passed in data if it -encounters any errors with the conversion to a desired data type: - -.. ipython:: python - :okwarning: - - import datetime - - m = ["apple", datetime.datetime(2016, 3, 2)] - pd.to_datetime(m, errors="ignore") - - m = ["apple", 2, 3] - pd.to_numeric(m, errors="ignore") - - m = ["apple", pd.Timedelta("1day")] - pd.to_timedelta(m, errors="ignore") - In addition to object conversion, :meth:`~pandas.to_numeric` provides another argument ``downcast``, which gives the option of downcasting the newly (or already) numeric data to a smaller dtype, which can conserve memory: diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/categorical.rst pandas-2.2.2+dfsg/doc/source/user_guide/categorical.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/categorical.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/categorical.rst 2024-04-10 17:42:52.000000000 +0000 @@ -647,7 +647,7 @@ raw_cat = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) df = pd.DataFrame({"A": raw_cat, "B": ["c", "d", "c", "d"], "values": [1, 2, 3, 4]}) - pd.pivot_table(df, values="values", index=["A", "B"]) + pd.pivot_table(df, values="values", index=["A", "B"], observed=False) Data munging ------------ @@ -832,9 +832,6 @@ | category (int) | category (float) | False | float (dtype is inferred) | +-------------------+------------------------+----------------------+-----------------------------+ -See also the section on :ref:`merge dtypes` for notes about -preserving merge dtypes and performance. - .. _categorical.union: Unioning diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/copy_on_write.rst pandas-2.2.2+dfsg/doc/source/user_guide/copy_on_write.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/copy_on_write.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/copy_on_write.rst 2024-04-10 17:42:52.000000000 +0000 @@ -6,11 +6,17 @@ Copy-on-Write (CoW) ******************* +.. note:: + + Copy-on-Write will become the default in pandas 3.0. We recommend + :ref:`turning it on now ` + to benefit from all improvements. + Copy-on-Write was first introduced in version 1.5.0. Starting from version 2.0 most of the optimizations that become possible through CoW are implemented and supported. 
All possible optimizations are supported starting from pandas 2.1. -We expect that CoW will be enabled by default in version 3.0. +CoW will be enabled by default in version 3.0. CoW will lead to more predictable behavior since it is not possible to update more than one object with one statement, e.g. indexing operations or methods won't have side-effects. Additionally, through @@ -20,7 +26,7 @@ ----------------- pandas indexing behavior is tricky to understand. Some operations return views while -other return copies. Depending on the result of the operation, mutation one object +other return copies. Depending on the result of the operation, mutating one object might accidentally mutate another: .. ipython:: python @@ -46,6 +52,105 @@ The following sections will explain what this means and how it impacts existing applications. +.. _copy_on_write.migration_guide: + +Migrating to Copy-on-Write +-------------------------- + +Copy-on-Write will be the default and only mode in pandas 3.0. This means that users +need to migrate their code to be compliant with CoW rules. + +The default mode in pandas will raise warnings for certain cases that will actively +change behavior and thus change user intended behavior. + +We added another mode, e.g. + +.. code-block:: python + + pd.options.mode.copy_on_write = "warn" + +that will warn for every operation that will change behavior with CoW. We expect this mode +to be very noisy, since many cases that we don't expect that they will influence users will +also emit a warning. We recommend checking this mode and analyzing the warnings, but it is +not necessary to address all of these warning. The first two items of the following lists +are the only cases that need to be addressed to make existing code work with CoW. + +The following few items describe the user visible changes: + +**Chained assignment will never work** + +``loc`` should be used as an alternative. Check the +:ref:`chained assignment section ` for more details. + +**Accessing the underlying array of a pandas object will return a read-only view** + + +.. ipython:: python + + ser = pd.Series([1, 2, 3]) + ser.to_numpy() + +This example returns a NumPy array that is a view of the Series object. This view can +be modified and thus also modify the pandas object. This is not compliant with CoW +rules. The returned array is set to non-writeable to protect against this behavior. +Creating a copy of this array allows modification. You can also make the array +writeable again if you don't care about the pandas object anymore. + +See the section about :ref:`read-only NumPy arrays ` +for more details. + +**Only one pandas object is updated at once** + +The following code snippet updates both ``df`` and ``subset`` without CoW: + +.. ipython:: python + + df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + subset = df["foo"] + subset.iloc[0] = 100 + df + +This won't be possible anymore with CoW, since the CoW rules explicitly forbid this. +This includes updating a single column as a :class:`Series` and relying on the change +propagating back to the parent :class:`DataFrame`. +This statement can be rewritten into a single statement with ``loc`` or ``iloc`` if +this behavior is necessary. :meth:`DataFrame.where` is another suitable alternative +for this case. + +Updating a column selected from a :class:`DataFrame` with an inplace method will +also not work anymore. + +.. 
ipython:: python + :okwarning: + + df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + df["foo"].replace(1, 5, inplace=True) + df + +This is another form of chained assignment. This can generally be rewritten in 2 +different forms: + +.. ipython:: python + + df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + df.replace({"foo": {1: 5}}, inplace=True) + df + +A different alternative would be to not use ``inplace``: + +.. ipython:: python + + df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + df["foo"] = df["foo"].replace(1, 5) + df + +**Constructors now copy NumPy arrays by default** + +The Series and DataFrame constructors will now copy NumPy array by default when not +otherwise specified. This was changed to avoid mutating a pandas object when the +NumPy array is changed inplace outside of pandas. You can set ``copy=False`` to +avoid this copy. + Description ----------- @@ -123,6 +228,8 @@ df view +.. _copy_on_write_chained_assignment: + Chained Assignment ------------------ @@ -130,6 +237,7 @@ two subsequent indexing operations, e.g. .. ipython:: python + :okwarning: with pd.option_context("mode.copy_on_write", False): df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) @@ -154,6 +262,8 @@ df.loc[df["bar"] > 5, "foo"] = 100 +.. _copy_on_write_read_only_na: + Read-only NumPy arrays ---------------------- @@ -207,7 +317,7 @@ .. ipython:: python df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df2 = df.reset_index() + df2 = df.reset_index(drop=True) df2.iloc[0, 0] = 100 This creates two objects that share data and thus the setitem operation will trigger a @@ -218,7 +328,7 @@ .. ipython:: python df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df = df.reset_index() + df = df.reset_index(drop=True) df.iloc[0, 0] = 100 No copy is necessary in this example. @@ -238,6 +348,8 @@ These methods return views when Copy-on-Write is enabled, which provides a significant performance improvement compared to the regular execution. +.. _copy_on_write_enabling: + How to enable CoW ----------------- diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/dsintro.rst pandas-2.2.2+dfsg/doc/source/user_guide/dsintro.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/dsintro.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/dsintro.rst 2024-04-10 17:42:52.000000000 +0000 @@ -308,7 +308,7 @@ From dict of ndarrays / lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The ndarrays must all be the same length. If an index is passed, it must +All ndarrays must share the same length. If an index is passed, it must also be the same length as the arrays. If no index is passed, the result will be ``range(n)``, where ``n`` is the array length. diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/enhancingperf.rst pandas-2.2.2+dfsg/doc/source/user_guide/enhancingperf.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/enhancingperf.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/enhancingperf.rst 2024-04-10 17:42:52.000000000 +0000 @@ -453,7 +453,7 @@ :func:`~pandas.eval` is many orders of magnitude slower for smaller expressions or objects than plain Python. A good rule of thumb is to only use :func:`~pandas.eval` when you have a - :class:`~pandas.core.frame.DataFrame` with more than 10,000 rows. + :class:`.DataFrame` with more than 10,000 rows. 
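For context, a minimal sketch of the rule of thumb stated in that note; the frame sizes and column names below are invented for illustration and are not part of the diff:

.. code-block:: python

    import numpy as np
    import pandas as pd

    # Large frame: pd.eval can pay off by avoiding intermediate arrays.
    df = pd.DataFrame(np.random.randn(200_000, 3), columns=list("abc"))
    fast = pd.eval("df.a + df.b * df.c")

    # Small frame: plain pandas arithmetic is usually faster here.
    small = pd.DataFrame(np.random.randn(100, 3), columns=list("abc"))
    plain = small["a"] + small["b"] * small["c"]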
Supported syntax ~~~~~~~~~~~~~~~~ diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/groupby.rst pandas-2.2.2+dfsg/doc/source/user_guide/groupby.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/groupby.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/groupby.rst 2024-04-10 17:42:52.000000000 +0000 @@ -13,10 +13,8 @@ * **Applying** a function to each group independently. * **Combining** the results into a data structure. -Out of these, the split step is the most straightforward. In fact, in many -situations we may wish to split the data set into groups and do something with -those groups. In the apply step, we might wish to do one of the -following: +Out of these, the split step is the most straightforward. In the apply step, we +might wish to do one of the following: * **Aggregation**: compute a summary statistic (or statistics) for each group. Some examples: @@ -53,9 +51,7 @@ function. -Since the set of object instance methods on pandas data structures is generally -rich and expressive, we often simply want to invoke, say, a DataFrame function -on each group. The name GroupBy should be quite familiar to those who have used +The name GroupBy should be quite familiar to those who have used a SQL-based tool (or ``itertools``), in which you can write code like: .. code-block:: sql @@ -65,7 +61,7 @@ GROUP BY Column1, Column2 We aim to make operations like this natural and easy to express using -pandas. We'll address each area of GroupBy functionality then provide some +pandas. We'll address each area of GroupBy functionality, then provide some non-trivial examples / use cases. See the :ref:`cookbook` for some advanced strategies. @@ -134,6 +130,7 @@ .. ipython:: python grouped = df.groupby("A") + grouped = df.groupby("B") grouped = df.groupby(["A", "B"]) .. note:: @@ -170,9 +167,11 @@ .. ipython:: python - lst = [1, 2, 3, 1, 2, 3] + index = [1, 2, 3, 1, 2, 3] - s = pd.Series([1, 2, 3, 10, 20, 30], lst) + s = pd.Series([1, 2, 3, 10, 20, 30], index=index) + + s grouped = s.groupby(level=0) @@ -211,9 +210,9 @@ .. ipython:: python df3 = pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}) - df3.groupby(["X"]).get_group("A") + df3.groupby("X").get_group("A") - df3.groupby(["X"]).get_group("B") + df3.groupby(["X"]).get_group(("B",)) .. _groupby.dropna: @@ -256,8 +255,8 @@ df.groupby("A").groups df.T.groupby(get_letter_type).groups -Calling the standard Python ``len`` function on the GroupBy object just returns -the length of the ``groups`` dict, so it is largely just a convenience: +Calling the standard Python ``len`` function on the GroupBy object returns +the number of groups, which is the same as the length of the ``groups`` dictionary: .. ipython:: python @@ -268,7 +267,7 @@ .. _groupby.tabcompletion: -``GroupBy`` will tab complete column names (and other attributes): +``GroupBy`` will tab complete column names, GroupBy operations, and other attributes: .. ipython:: python @@ -420,6 +419,12 @@ Additionally, this method avoids recomputing the internal grouping information derived from the passed key. +You can also include the grouping columns if you want to operate on them. + +.. ipython:: python + + grouped[["A", "B"]].sum() + .. _groupby.iterating-label: Iterating through groups @@ -452,7 +457,7 @@ ----------------- A single group can be selected using -:meth:`~pandas.core.groupby.DataFrameGroupBy.get_group`: +:meth:`.DataFrameGroupBy.get_group`: .. 
ipython:: python @@ -499,7 +504,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Many common aggregations are built-in to GroupBy objects as methods. Of the methods -listed below, those with a ``*`` do *not* have a Cython-optimized implementation. +listed below, those with a ``*`` do *not* have an efficient, GroupBy-specific, implementation. .. csv-table:: :header: "Method", "Description" @@ -511,8 +516,8 @@ :meth:`~.DataFrameGroupBy.count`;Compute the number of non-NA values in the groups :meth:`~.DataFrameGroupBy.cov` * ;Compute the covariance of the groups :meth:`~.DataFrameGroupBy.first`;Compute the first occurring value in each group - :meth:`~.DataFrameGroupBy.idxmax` *;Compute the index of the maximum value in each group - :meth:`~.DataFrameGroupBy.idxmin` *;Compute the index of the minimum value in each group + :meth:`~.DataFrameGroupBy.idxmax`;Compute the index of the maximum value in each group + :meth:`~.DataFrameGroupBy.idxmin`;Compute the index of the minimum value in each group :meth:`~.DataFrameGroupBy.last`;Compute the last occurring value in each group :meth:`~.DataFrameGroupBy.max`;Compute the maximum value in each group :meth:`~.DataFrameGroupBy.mean`;Compute the mean of each group @@ -535,16 +540,16 @@ df.groupby("A")[["C", "D"]].max() df.groupby(["A", "B"]).mean() -Another simple aggregation example is to compute the size of each group. +Another aggregation example is to compute the size of each group. This is included in GroupBy as the ``size`` method. It returns a Series whose -index are the group names and whose values are the sizes of each group. +index consists of the group names and the values are the sizes of each group. .. ipython:: python grouped = df.groupby(["A", "B"]) grouped.size() -While the :meth:`~.DataFrameGroupBy.describe` method is not itself a reducer, it +While the :meth:`.DataFrameGroupBy.describe` method is not itself a reducer, it can be used to conveniently produce a collection of summary statistics about each of the groups. @@ -553,7 +558,7 @@ grouped.describe() Another aggregation example is to compute the number of unique values of each group. -This is similar to the ``value_counts`` function, except that it only counts the +This is similar to the :meth:`.DataFrameGroupBy.value_counts` function, except that it only counts the number of unique values. .. ipython:: python @@ -566,11 +571,11 @@ .. note:: Aggregation functions **will not** return the groups that you are aggregating over - as named *columns*, when ``as_index=True``, the default. The grouped columns will + as named *columns* when ``as_index=True``, the default. The grouped columns will be the **indices** of the returned object. - Passing ``as_index=False`` **will** return the groups that you are aggregating over, if they are - named **indices** or *columns*. + Passing ``as_index=False`` **will** return the groups that you are aggregating over as + named columns, regardless if they are named **indices** or *columns* in the inputs. .. _groupby.aggregate.agg: @@ -596,7 +601,7 @@ grouped.agg("sum") The result of the aggregation will have the group names as the -new index along the grouped axis. In the case of multiple keys, the result is a +new index. In the case of multiple keys, the result is a :ref:`MultiIndex ` by default. 
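A small made-up frame illustrates the default result shape described above (two keys produce an ``(A, B)`` MultiIndex); the data is invented for the sketch:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1, 1, 2], "C": [10, 20, 30]})
    # Two grouping keys: the aggregated result is indexed by an (A, B) MultiIndex.
    result = df.groupby(["A", "B"]).agg("sum")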
As mentioned above, this can be changed by using the ``as_index`` option: @@ -650,16 +655,17 @@ Applying multiple functions at once ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -With grouped ``Series`` you can also pass a list or dict of functions to do -aggregation with, outputting a DataFrame: +On a grouped ``Series``, you can pass a list or dict of functions to +:meth:`SeriesGroupBy.agg`, outputting a DataFrame: .. ipython:: python grouped = df.groupby("A") grouped["C"].agg(["sum", "mean", "std"]) -On a grouped ``DataFrame``, you can pass a list of functions to apply to each -column, which produces an aggregated result with a hierarchical index: +On a grouped ``DataFrame``, you can pass a list of functions to +:meth:`DataFrameGroupBy.agg` to aggregate each +column, which produces an aggregated result with a hierarchical column index: .. ipython:: python @@ -824,8 +830,7 @@ Built-in transformation methods ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The following methods on GroupBy act as transformations. Of these methods, only -``fillna`` does not have a Cython-optimized implementation. +The following methods on GroupBy act as transformations. .. csv-table:: :header: "Method", "Description" @@ -840,15 +845,14 @@ :meth:`~.DataFrameGroupBy.cumsum`;Compute the cumulative sum within each group :meth:`~.DataFrameGroupBy.diff`;Compute the difference between adjacent values within each group :meth:`~.DataFrameGroupBy.ffill`;Forward fill NA values within each group - :meth:`~.DataFrameGroupBy.fillna`;Fill NA values within each group :meth:`~.DataFrameGroupBy.pct_change`;Compute the percent change between adjacent values within each group :meth:`~.DataFrameGroupBy.rank`;Compute the rank of each value within each group :meth:`~.DataFrameGroupBy.shift`;Shift values up or down within each group In addition, passing any built-in aggregation method as a string to :meth:`~.DataFrameGroupBy.transform` (see the next section) will broadcast the result -across the group, producing a transformed result. If the aggregation method is -Cython-optimized, this will be performant as well. +across the group, producing a transformed result. If the aggregation method has an efficient +implementation, this will be performant as well. .. _groupby.transformation.transform: @@ -890,7 +894,7 @@ the built-in methods. All of the examples in this section can be made more performant by calling - built-in methods instead of using ``transform``. + built-in methods instead of using UDFs. See :ref:`below for examples `. .. versionchanged:: 2.0.0 @@ -921,7 +925,7 @@ We would expect the result to now have mean 0 and standard deviation 1 within -each group, which we can easily check: +each group (up to floating-point error), which we can easily check: .. ipython:: python @@ -995,18 +999,18 @@ .. 
ipython:: python - # ts.groupby(lambda x: x.year).transform( + # result = ts.groupby(lambda x: x.year).transform( # lambda x: (x - x.mean()) / x.std() # ) grouped = ts.groupby(lambda x: x.year) result = (ts - grouped.transform("mean")) / grouped.transform("std") - # ts.groupby(lambda x: x.year).transform(lambda x: x.max() - x.min()) + # result = ts.groupby(lambda x: x.year).transform(lambda x: x.max() - x.min()) grouped = ts.groupby(lambda x: x.year) result = grouped.transform("max") - grouped.transform("min") # grouped = data_df.groupby(key) - # grouped.transform(lambda x: x.fillna(x.mean())) + # result = grouped.transform(lambda x: x.fillna(x.mean())) grouped = data_df.groupby(key) result = data_df.fillna(grouped.transform("mean")) @@ -1053,14 +1057,14 @@ ).set_index("date") df_re - df_re.groupby("group").resample("1D").ffill() + df_re.groupby("group").resample("1D", include_groups=False).ffill() .. _groupby.filter: Filtration ---------- -A filtration is a GroupBy operation the subsets the original grouping object. It +A filtration is a GroupBy operation that subsets the original grouping object. It may either filter out entire groups, part of groups, or both. Filtrations return a filtered version of the calling object, including the grouping columns when provided. In the following example, ``class`` is included in the result. @@ -1085,8 +1089,8 @@ Built-in filtrations ~~~~~~~~~~~~~~~~~~~~ -The following methods on GroupBy act as filtrations. All these methods have a -Cython-optimized implementation. +The following methods on GroupBy act as filtrations. All these methods have an +efficient, GroupBy-specific, implementation. .. csv-table:: :header: "Method", "Description" @@ -1232,13 +1236,13 @@ .. ipython:: python - df.groupby("A", group_keys=True).apply(lambda x: x) + df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False) with .. ipython:: python - df.groupby("A", group_keys=False).apply(lambda x: x) + df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False) Numba Accelerated Routines @@ -1263,8 +1267,8 @@ Other useful features --------------------- -Exclusion of "nuisance" columns -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Exclusion of non-numeric columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Again consider the example DataFrame we've been looking at: @@ -1274,8 +1278,8 @@ Suppose we wish to compute the standard deviation grouped by the ``A`` column. There is a slight problem, namely that we don't care about the data in -column ``B`` because it is not numeric. We refer to these non-numeric columns as -"nuisance" columns. You can avoid nuisance columns by specifying ``numeric_only=True``: +column ``B`` because it is not numeric. You can avoid non-numeric columns by +specifying ``numeric_only=True``: .. ipython:: python @@ -1302,17 +1306,8 @@ ], } ) - - # Decimal columns can be sum'd explicitly by themselves... df_dec.groupby(["id"])[["dec_column"]].sum() - # ...but cannot be combined with standard data types or they will be excluded - df_dec.groupby(["id"])[["int_column", "dec_column"]].sum() - - # Use .agg function to aggregate over standard and "nuisance" data types - # at the same time - df_dec.groupby(["id"]).agg({"int_column": "sum", "dec_column": "sum"}) - .. 
_groupby.observed: Handling of (un)observed Categorical values @@ -1344,35 +1339,55 @@ s = ( pd.Series([1, 1, 1]) - .groupby(pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=False) + .groupby(pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=True) .count() ) s.index.dtype .. _groupby.missing: -NA and NaT group handling -~~~~~~~~~~~~~~~~~~~~~~~~~ +NA group handling +~~~~~~~~~~~~~~~~~ + +By ``NA``, we are referring to any ``NA`` values, including +:class:`NA`, ``NaN``, ``NaT``, and ``None``. If there are any ``NA`` values in the +grouping key, by default these will be excluded. In other words, any +"``NA`` group" will be dropped. You can include NA groups by specifying ``dropna=False``. + +.. ipython:: python + + df = pd.DataFrame({"key": [1.0, 1.0, np.nan, 2.0, np.nan], "A": [1, 2, 3, 4, 5]}) + df + + df.groupby("key", dropna=True).sum() -If there are any NaN or NaT values in the grouping key, these will be -automatically excluded. In other words, there will never be an "NA group" or -"NaT group". This was not the case in older versions of pandas, but users were -generally discarding the NA group anyway (and supporting it was an -implementation headache). + df.groupby("key", dropna=False).sum() Grouping with ordered factors ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Categorical variables represented as instances of pandas's ``Categorical`` class -can be used as group keys. If so, the order of the levels will be preserved: +can be used as group keys. If so, the order of the levels will be preserved. When +``observed=False`` and ``sort=False``, any unobserved categories will be at the +end of the result in order. .. ipython:: python - data = pd.Series(np.random.randn(100)) + days = pd.Categorical( + values=["Wed", "Mon", "Thu", "Mon", "Wed", "Sat"], + categories=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"], + ) + data = pd.DataFrame( + { + "day": days, + "workers": [3, 4, 1, 4, 2, 2], + } + ) + data - factor = pd.qcut(data, [0, 0.25, 0.5, 0.75, 1.0]) + data.groupby("day", observed=False, sort=True).sum() - data.groupby(factor, observed=False).mean() + data.groupby("day", observed=False, sort=False).sum() .. _groupby.specify: @@ -1410,19 +1425,20 @@ .. ipython:: python - df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"])[["Quantity"]].sum() + df.groupby([pd.Grouper(freq="1ME", key="Date"), "Buyer"])[["Quantity"]].sum() When ``freq`` is specified, the object returned by ``pd.Grouper`` will be an -instance of ``pandas.api.typing.TimeGrouper``. You have an ambiguous specification -in that you have a named index and a column that could be potential groupers. +instance of ``pandas.api.typing.TimeGrouper``. When there is a column and index +with the same name, you can use ``key`` to group by the column and ``level`` +to group by the index. .. ipython:: python df = df.set_index("Date") df["Date"] = df.index + pd.offsets.MonthEnd(2) - df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"])[["Quantity"]].sum() + df.groupby([pd.Grouper(freq="6ME", key="Date"), "Buyer"])[["Quantity"]].sum() - df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"])[["Quantity"]].sum() + df.groupby([pd.Grouper(freq="6ME", level="Date"), "Buyer"])[["Quantity"]].sum() Taking the first rows of each group @@ -1525,7 +1541,7 @@ To see the ordering of the groups (as opposed to the order of rows within a group given by ``cumcount``) you can use -:meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`. +:meth:`.DataFrameGroupBy.ngroup`. 
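A tiny invented example of the distinction drawn here between ``cumcount`` (position of a row within its group) and ``ngroup`` (which group a row belongs to):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"A": ["a", "a", "b", "a"]})
    df.groupby("A").cumcount()   # 0, 1, 0, 2 -- row's position within its group
    df.groupby("A").ngroup()     # 0, 0, 1, 0 -- numbering of the groups themselves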
@@ -1607,7 +1623,7 @@ ) df.head(2) -Now, to find prices per store/product, we can simply do: +We now find the prices per store/product. .. ipython:: python @@ -1637,24 +1653,12 @@ Examples -------- -Regrouping by factor -~~~~~~~~~~~~~~~~~~~~ - -Regroup columns of a DataFrame according to their sum, and sum the aggregated ones. - -.. ipython:: python - - df = pd.DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "c": [1, 0, 0], "d": [2, 3, 4]}) - df - dft = df.T - dft.groupby(dft.sum()).sum() - .. _groupby.multicolumn_factorization: Multi-column factorization ~~~~~~~~~~~~~~~~~~~~~~~~~~ -By using :meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`, we can extract +By using :meth:`.DataFrameGroupBy.ngroup`, we can extract information about the groups in a way similar to :func:`factorize` (as described further in the :ref:`reshaping API `) but which applies naturally to multiple columns of mixed type and different @@ -1683,7 +1687,7 @@ In order for resample to work on indices that are non-datetimelike, the following procedure can be utilized. -In the following examples, **df.index // 5** returns a binary array which is used to determine what gets selected for the groupby operation. +In the following examples, **df.index // 5** returns an integer array which is used to determine what gets selected for the groupby operation. .. note:: @@ -1722,7 +1726,7 @@ result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} return pd.Series(result, name="metrics") - result = df.groupby("a").apply(compute_metrics) + result = df.groupby("a").apply(compute_metrics, include_groups=False) result diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/indexing.rst pandas-2.2.2+dfsg/doc/source/user_guide/indexing.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/indexing.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/indexing.rst 2024-04-10 17:42:52.000000000 +0000 @@ -62,6 +62,8 @@ * A boolean array (any ``NA`` values will be treated as ``False``). * A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above). + * A tuple of row (and column) indices whose elements are one of the + above inputs. See more at :ref:`Selection by Label `. @@ -78,6 +80,8 @@ * A boolean array (any ``NA`` values will be treated as ``False``). * A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above). + * A tuple of row (and column) indices whose elements are one of the + above inputs. See more at :ref:`Selection by Position `, :ref:`Advanced Indexing ` and :ref:`Advanced @@ -85,6 +89,12 @@ * ``.loc``, ``.iloc``, and also ``[]`` indexing can accept a ``callable`` as indexer. See more at :ref:`Selection By Callable `. + .. note:: + + Destructuring tuple keys into row (and column) indexes occurs + *before* callables are applied, so you cannot return a tuple from + a callable to index both rows and columns. + Getting values from an object with multi-axes selection uses the following notation (using ``.loc`` as an example, but the following applies to ``.iloc`` as well). Any of the axes accessors may be the null slice ``:``. Axes left out of @@ -450,6 +460,8 @@ * A slice object with ints ``1:7``. * A boolean array. * A ``callable``, see :ref:`Selection By Callable `. +* A tuple of row (and column) indexes, whose elements are one of the + above types. .. 
ipython:: python @@ -553,6 +565,12 @@ ``.loc``, ``.iloc``, and also ``[]`` indexing can accept a ``callable`` as indexer. The ``callable`` must be a function with one argument (the calling Series or DataFrame) that returns valid output for indexing. +.. note:: + + For ``.iloc`` indexing, returning a tuple from the callable is + not supported, since tuple destructuring for row and column indexes + occurs *before* applying callables. + .. ipython:: python df1 = pd.DataFrame(np.random.randn(6, 4), @@ -1709,6 +1727,22 @@ Returning a view versus a copy ------------------------------ +.. warning:: + + :ref:`Copy-on-Write ` + will become the new default in pandas 3.0. This means than chained indexing will + never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary + anymore. + See :ref:`this section ` + for more context. + We recommend turning Copy-on-Write on to leverage the improvements with + + ``` + pd.options.mode.copy_on_write = True + ``` + + even before pandas 3.0 is available. + When setting values in a pandas object, care must be taken to avoid what is called ``chained indexing``. Here is an example. @@ -1747,6 +1781,22 @@ Why does assignment fail when using chained indexing? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. warning:: + + :ref:`Copy-on-Write ` + will become the new default in pandas 3.0. This means than chained indexing will + never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary + anymore. + See :ref:`this section ` + for more context. + We recommend turning Copy-on-Write on to leverage the improvements with + + ``` + pd.options.mode.copy_on_write = True + ``` + + even before pandas 3.0 is available. + The problem in the previous section is just a performance issue. What's up with the ``SettingWithCopy`` warning? We don't **usually** throw warnings around when you do something that might cost a few extra milliseconds! @@ -1803,6 +1853,22 @@ Evaluation order matters ~~~~~~~~~~~~~~~~~~~~~~~~ +.. warning:: + + :ref:`Copy-on-Write ` + will become the new default in pandas 3.0. This means than chained indexing will + never work. As a consequence, the ``SettingWithCopyWarning`` won't be necessary + anymore. + See :ref:`this section ` + for more context. + We recommend turning Copy-on-Write on to leverage the improvements with + + ``` + pd.options.mode.copy_on_write = True + ``` + + even before pandas 3.0 is available. + When you use chained indexing, the order and type of the indexing operation partially determine whether the result is a slice into the original object, or a copy of the slice. @@ -1837,7 +1903,7 @@ :okwarning: :okexcept: - with option_context('mode.chained_assignment','warn'): + with pd.option_context('mode.chained_assignment','warn'): dfb[dfb['a'].str.startswith('o')]['c'] = 42 A chained assignment can also crop up in setting in a mixed dtype frame. @@ -1879,7 +1945,7 @@ :okwarning: :okexcept: - with option_context('mode.chained_assignment','raise'): + with pd.option_context('mode.chained_assignment','raise'): dfd.loc[0]['a'] = 1111 .. warning:: diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/io.rst pandas-2.2.2+dfsg/doc/source/user_guide/io.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/io.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/io.rst 2024-04-10 17:42:52.000000000 +0000 @@ -81,6 +81,9 @@ If this option is set to ``True``, nothing should be passed in for the ``delimiter`` parameter. + .. deprecated: 2.2.0 + Use ``sep="\\s+" instead. 
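A hedged sketch of the replacement that this deprecation note points to; the file name is a placeholder:

.. code-block:: python

    import pandas as pd

    # delim_whitespace=True (deprecated in 2.2.0) ...
    # df = pd.read_csv("data.txt", delim_whitespace=True)
    # ... is equivalent to passing a whitespace regex as the separator:
    df = pd.read_csv("data.txt", sep=r"\s+")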
+ Column and index locations and names ++++++++++++++++++++++++++++++++++++ @@ -836,6 +839,7 @@ column names: .. ipython:: python + :okwarning: data = ( "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" @@ -856,6 +860,7 @@ to retain them via the ``keep_date_col`` keyword: .. ipython:: python + :okwarning: df = pd.read_csv( "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True @@ -871,6 +876,7 @@ You can also use a dict to specify custom name columns: .. ipython:: python + :okwarning: date_spec = {"nominal": [1, 2], "actual": [1, 3]} df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec) @@ -883,6 +889,7 @@ .. ipython:: python + :okwarning: date_spec = {"nominal": [1, 2], "actual": [1, 3]} df = pd.read_csv( @@ -902,6 +909,10 @@ for your data to store datetimes in this format, load times will be significantly faster, ~20x has been observed. +.. deprecated:: 2.2.0 + Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime`` + on the relevant result columns instead. + Date parsing functions ++++++++++++++++++++++ @@ -1490,9 +1501,9 @@ .. ipython:: python - from pandas._testing import makeCustomDataframe as mkdf - - df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) + mi_idx = pd.MultiIndex.from_arrays([[1, 2, 3, 4], list("abcd")], names=list("ab")) + mi_col = pd.MultiIndex.from_arrays([[1, 2], list("ab")], names=list("cd")) + df = pd.DataFrame(np.ones((4, 2)), index=mi_idx, columns=mi_col) df.to_csv("mi.csv") print(open("mi.csv").read()) pd.read_csv("mi.csv", header=[0, 1, 2, 3], index_col=[0, 1]) @@ -1811,8 +1822,8 @@ A ``Series`` or ``DataFrame`` can be converted to a valid JSON string. Use ``to_json`` with optional parameters: -* ``path_or_buf`` : the pathname or buffer to write the output - This can be ``None`` in which case a JSON string is returned +* ``path_or_buf`` : the pathname or buffer to write the output. + This can be ``None`` in which case a JSON string is returned. * ``orient`` : ``Series``: @@ -2332,7 +2343,7 @@ .. ipython:: python - s_per = pd.Series(1, index=pd.period_range("2016", freq="A-DEC", periods=4)) + s_per = pd.Series(1, index=pd.period_range("2016", freq="Y-DEC", periods=4)) build_table_schema(s_per) * Categoricals use the ``any`` type and an ``enum`` constraint listing @@ -2448,7 +2459,7 @@ .. code-block:: ipython - In [320]: "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list" + In [320]: url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list" In [321]: pd.read_html(url) Out[321]: [ Bank NameBank CityCity StateSt ... Acquiring InstitutionAI Closing DateClosing FundFund @@ -2619,7 +2630,7 @@ .. code-block:: python - url_mcc = "https://en.wikipedia.org/wiki/Mobile_country_code" + url_mcc = "https://en.wikipedia.org/wiki/Mobile_country_code?oldid=899173761" dfs = pd.read_html( url_mcc, match="Telekom Albania", @@ -2701,7 +2712,7 @@ .. note:: Not all of the possible options for ``DataFrame.to_html`` are shown here for - brevity's sake. See :func:`~pandas.core.frame.DataFrame.to_html` for the + brevity's sake. See :func:`.DataFrame.to_html` for the full set of options. .. note:: @@ -3453,26 +3464,22 @@ The :func:`~pandas.read_excel` method can read Excel 2007+ (``.xlsx``) files using the ``openpyxl`` Python module. Excel 2003 (``.xls``) files can be read using ``xlrd``. Binary Excel (``.xlsb``) -files can be read using ``pyxlsb``. +files can be read using ``pyxlsb``. All formats can be read +using :ref:`calamine` engine. 
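For illustration, a minimal sketch of the two ways of picking a reader engine mentioned above; the file names are placeholders:

.. code-block:: python

    import pandas as pd

    # engine=None: pandas chooses an engine from the file extension (openpyxl here) ...
    df = pd.read_excel("report.xlsx")
    # ... or request the calamine engine explicitly; it reads all listed formats
    # and, unlike pyxlsb, recognises datetime values in .xlsb files.
    df = pd.read_excel("report.xlsb", engine="calamine")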
The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. See the :ref:`cookbook` for some advanced strategies. -.. warning:: +.. note:: - The `xlrd `__ package is now only for reading - old-style ``.xls`` files. + When ``engine=None``, the following logic will be used to determine the engine: - Before pandas 1.3.0, the default argument ``engine=None`` to :func:`~pandas.read_excel` - would result in using the ``xlrd`` engine in many cases, including new - Excel 2007+ (``.xlsx``) files. pandas will now default to using the - `openpyxl `__ engine. - - It is strongly encouraged to install ``openpyxl`` to read Excel 2007+ - (``.xlsx``) files. - **Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.** - This is no longer supported, switch to using ``openpyxl`` instead. + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. + - Otherwise ``openpyxl`` will be used. .. _io.excel_reader: @@ -3494,6 +3501,9 @@ * For the engine odf, pandas is using :func:`odf.opendocument.load` to read in (``.ods``) files. +* For the engine calamine, pandas is using :func:`python_calamine.load_workbook` + to read in (``.xlsx``), (``.xlsm``), (``.xls``), (``.xlsb``), (``.ods``) files. + .. code-block:: python # Returns a DataFrame @@ -3935,7 +3945,8 @@ using the ``pyxlsb`` module. The semantics and features for reading binary Excel files mostly match what can be done for `Excel files`_ using ``engine='pyxlsb'``. ``pyxlsb`` does not recognize datetime types -in files and will return floats instead. +in files and will return floats instead (you can use :ref:`calamine` +if you need recognize datetime types). .. code-block:: python @@ -3947,6 +3958,20 @@ Currently pandas only supports *reading* binary Excel files. Writing is not implemented. +.. _io.calamine: + +Calamine (Excel and ODS files) +------------------------------ + +The :func:`~pandas.read_excel` method can read Excel file (``.xlsx``, ``.xlsm``, ``.xls``, ``.xlsb``) +and OpenDocument spreadsheets (``.ods``) using the ``python-calamine`` module. +This module is a binding for Rust library `calamine `__ +and is faster than other engines in most cases. The optional dependency 'python-calamine' needs to be installed. + +.. code-block:: python + + # Returns a DataFrame + pd.read_excel("path_to_file.xlsb", engine="calamine") .. _io.clipboard: @@ -4220,7 +4245,7 @@ .. ipython:: python df_tl = pd.DataFrame({"A": list(range(5)), "B": list(range(5))}) - df_tl.to_hdf("store_tl.h5", "table", append=True) + df_tl.to_hdf("store_tl.h5", key="table", append=True) pd.read_hdf("store_tl.h5", "table", where=["index>2"]) .. ipython:: python @@ -4243,12 +4268,12 @@ ) df_with_missing - df_with_missing.to_hdf("file.h5", "df_with_missing", format="table", mode="w") + df_with_missing.to_hdf("file.h5", key="df_with_missing", format="table", mode="w") pd.read_hdf("file.h5", "df_with_missing") df_with_missing.to_hdf( - "file.h5", "df_with_missing", format="table", mode="w", dropna=True + "file.h5", key="df_with_missing", format="table", mode="w", dropna=True ) pd.read_hdf("file.h5", "df_with_missing") @@ -4278,7 +4303,7 @@ .. 
ipython:: python :okexcept: - pd.DataFrame(np.random.randn(10, 2)).to_hdf("test_fixed.h5", "df") + pd.DataFrame(np.random.randn(10, 2)).to_hdf("test_fixed.h5", key="df") pd.read_hdf("test_fixed.h5", "df", where="index>5") .. ipython:: python @@ -5352,7 +5377,6 @@ Write to a parquet file. .. ipython:: python - :okwarning: df.to_parquet("example_pa.parquet", engine="pyarrow") df.to_parquet("example_fp.parquet", engine="fastparquet") @@ -5360,7 +5384,6 @@ Read from a parquet file. .. ipython:: python - :okwarning: result = pd.read_parquet("example_fp.parquet", engine="fastparquet") result = pd.read_parquet("example_pa.parquet", engine="pyarrow") @@ -5370,7 +5393,6 @@ By setting the ``dtype_backend`` argument you can control the default dtypes used for the resulting DataFrame. .. ipython:: python - :okwarning: result = pd.read_parquet("example_pa.parquet", engine="pyarrow", dtype_backend="pyarrow") @@ -5384,7 +5406,6 @@ Read only certain columns of a parquet file. .. ipython:: python - :okwarning: result = pd.read_parquet( "example_fp.parquet", @@ -5413,7 +5434,6 @@ more columns in the output file. Thus, this code: .. ipython:: python - :okwarning: df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) df.to_parquet("test.parquet", engine="pyarrow") @@ -5430,7 +5450,6 @@ :func:`~pandas.DataFrame.to_parquet`: .. ipython:: python - :okwarning: df.to_parquet("test.parquet", index=False) @@ -5453,7 +5472,6 @@ Parquet supports partitioning of data based on the values of one or more columns. .. ipython:: python - :okwarning: df = pd.DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}) df.to_parquet(path="test", engine="pyarrow", partition_cols=["a"], compression=None) @@ -5519,14 +5537,12 @@ Write to an orc file. .. ipython:: python - :okwarning: df.to_orc("example_pa.orc", engine="pyarrow") Read from an orc file. .. ipython:: python - :okwarning: result = pd.read_orc("example_pa.orc") @@ -5555,9 +5571,23 @@ ----------- The :mod:`pandas.io.sql` module provides a collection of query wrappers to both -facilitate data retrieval and to reduce dependency on DB-specific API. Database abstraction -is provided by SQLAlchemy if installed. In addition you will need a driver library for -your database. Examples of such drivers are `psycopg2 `__ +facilitate data retrieval and to reduce dependency on DB-specific API. + +Where available, users may first want to opt for `Apache Arrow ADBC +`_ drivers. These drivers +should provide the best performance, null handling, and type detection. + + .. versionadded:: 2.2.0 + + Added native support for ADBC drivers + +For a full list of ADBC drivers and their development status, see the `ADBC Driver +Implementation Status `_ +documentation. + +Where an ADBC driver is not available or may be missing functionality, +users should opt for installing SQLAlchemy alongside their database driver library. +Examples of such drivers are `psycopg2 `__ for PostgreSQL or `pymysql `__ for MySQL. For `SQLite `__ this is included in Python's standard library by default. @@ -5590,6 +5620,18 @@ engine. You can use a temporary SQLite database where data are stored in "memory". +To connect using an ADBC driver you will want to install the ``adbc_driver_sqlite`` using your +package manager. Once installed, you can use the DBAPI interface provided by the ADBC driver +to connect to your database. + +.. 
code-block:: python + + import adbc_driver_sqlite.dbapi as sqlite_dbapi + + # Create the connection + with sqlite_dbapi.connect("sqlite:///:memory:") as conn: + df = pd.read_sql_table("data", conn) + To connect with SQLAlchemy you use the :func:`create_engine` function to create an engine object from database URI. You only need to create the engine once per database you are connecting to. @@ -5665,9 +5707,74 @@ SQL data types ++++++++++++++ -:func:`~pandas.DataFrame.to_sql` will try to map your data to an appropriate -SQL data type based on the dtype of the data. When you have columns of dtype -``object``, pandas will try to infer the data type. +Ensuring consistent data type management across SQL databases is challenging. +Not every SQL database offers the same types, and even when they do the implementation +of a given type can vary in ways that have subtle effects on how types can be +preserved. + +For the best odds at preserving database types users are advised to use +ADBC drivers when available. The Arrow type system offers a wider array of +types that more closely match database types than the historical pandas/NumPy +type system. To illustrate, note this (non-exhaustive) listing of types +available in different databases and pandas backends: + ++-----------------+-----------------------+----------------+---------+ +|numpy/pandas |arrow |postgres |sqlite | ++=================+=======================+================+=========+ +|int16/Int16 |int16 |SMALLINT |INTEGER | ++-----------------+-----------------------+----------------+---------+ +|int32/Int32 |int32 |INTEGER |INTEGER | ++-----------------+-----------------------+----------------+---------+ +|int64/Int64 |int64 |BIGINT |INTEGER | ++-----------------+-----------------------+----------------+---------+ +|float32 |float32 |REAL |REAL | ++-----------------+-----------------------+----------------+---------+ +|float64 |float64 |DOUBLE PRECISION|REAL | ++-----------------+-----------------------+----------------+---------+ +|object |string |TEXT |TEXT | ++-----------------+-----------------------+----------------+---------+ +|bool |``bool_`` |BOOLEAN | | ++-----------------+-----------------------+----------------+---------+ +|datetime64[ns] |timestamp(us) |TIMESTAMP | | ++-----------------+-----------------------+----------------+---------+ +|datetime64[ns,tz]|timestamp(us,tz) |TIMESTAMPTZ | | ++-----------------+-----------------------+----------------+---------+ +| |date32 |DATE | | ++-----------------+-----------------------+----------------+---------+ +| |month_day_nano_interval|INTERVAL | | ++-----------------+-----------------------+----------------+---------+ +| |binary |BINARY |BLOB | ++-----------------+-----------------------+----------------+---------+ +| |decimal128 |DECIMAL [#f1]_ | | ++-----------------+-----------------------+----------------+---------+ +| |list |ARRAY [#f1]_ | | ++-----------------+-----------------------+----------------+---------+ +| |struct |COMPOSITE TYPE | | +| | | [#f1]_ | | ++-----------------+-----------------------+----------------+---------+ + +.. rubric:: Footnotes + +.. [#f1] Not implemented as of writing, but theoretically possible + +If you are interested in preserving database types as best as possible +throughout the lifecycle of your DataFrame, users are encouraged to +leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql` + +.. 
code-block:: ipython + + # for roundtripping + with pg_dbapi.connect(uri) as conn: + df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow") + +This will prevent your data from being converted to the traditional pandas/NumPy +type system, which often converts SQL types in ways that make them impossible to +round-trip. + +In case an ADBC driver is not available, :func:`~pandas.DataFrame.to_sql` +will try to map your data to an appropriate SQL data type based on the dtype of +the data. When you have columns of dtype ``object``, pandas will try to infer +the data type. You can always override the default type by specifying the desired SQL type of any of the columns by using the ``dtype`` argument. This argument needs a @@ -5686,7 +5793,9 @@ Due to the limited support for timedelta's in the different database flavors, columns with type ``timedelta64`` will be written as integer - values as nanoseconds to the database and a warning will be raised. + values as nanoseconds to the database and a warning will be raised. The only + exception to this is when using the ADBC PostgreSQL driver in which case a + timedelta will be written to the database as an ``INTERVAL`` .. note:: @@ -5701,7 +5810,7 @@ Datetime data types ''''''''''''''''''' -Using SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing +Using ADBC or SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing datetime data that is timezone naive or timezone aware. However, the resulting data stored in the database ultimately depends on the supported data type for datetime data of the database system being used. @@ -5792,7 +5901,7 @@ .. note:: In order to use :func:`~pandas.read_sql_table`, you **must** have the - SQLAlchemy optional dependency installed. + ADBC driver or SQLAlchemy optional dependency installed. .. ipython:: python @@ -5800,7 +5909,8 @@ .. note:: - Note that pandas infers column dtypes from query outputs, and not by looking + ADBC drivers will map database types directly back to arrow types. For other drivers + note that pandas infers column dtypes from query outputs, and not by looking up data types in the physical database schema. For example, assume ``userid`` is an integer column in a table. Then, intuitively, ``select userid ...`` will return integer-valued series, while ``select cast(userid as text) ...`` will @@ -6001,7 +6111,7 @@ Writing to stata format ''''''''''''''''''''''' -The method :func:`~pandas.core.frame.DataFrame.to_stata` will write a DataFrame +The method :func:`.DataFrame.to_stata` will write a DataFrame into a .dta file. The format version of this file is always 115 (Stata 12). .. ipython:: python @@ -6041,7 +6151,7 @@ .. warning:: :class:`~pandas.io.stata.StataWriter` and - :func:`~pandas.core.frame.DataFrame.to_stata` only support fixed width + :func:`.DataFrame.to_stata` only support fixed width strings containing up to 244 characters, a limitation imposed by the version 115 dta file format. Attempting to write *Stata* dta files with strings longer than 244 characters raises a ``ValueError``. 
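A short sketch, with invented values, of the fixed-width string limit described in this warning:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"note": ["fits easily"]})
    df.to_stata("ok.dta")  # strings up to 244 characters are written as fixed width

    too_long = pd.DataFrame({"note": ["x" * 300]})
    # too_long.to_stata("too_long.dta")  # raises ValueError per the limit above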
@@ -6321,7 +6431,7 @@ def test_hdf_fixed_write(df): - df.to_hdf("test_fixed.hdf", "test", mode="w") + df.to_hdf("test_fixed.hdf", key="test", mode="w") def test_hdf_fixed_read(): @@ -6329,7 +6439,7 @@ def test_hdf_fixed_write_compress(df): - df.to_hdf("test_fixed_compress.hdf", "test", mode="w", complib="blosc") + df.to_hdf("test_fixed_compress.hdf", key="test", mode="w", complib="blosc") def test_hdf_fixed_read_compress(): @@ -6337,7 +6447,7 @@ def test_hdf_table_write(df): - df.to_hdf("test_table.hdf", "test", mode="w", format="table") + df.to_hdf("test_table.hdf", key="test", mode="w", format="table") def test_hdf_table_read(): @@ -6346,7 +6456,7 @@ def test_hdf_table_write_compress(df): df.to_hdf( - "test_table_compress.hdf", "test", mode="w", complib="blosc", format="table" + "test_table_compress.hdf", key="test", mode="w", complib="blosc", format="table" ) diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/merging.rst pandas-2.2.2+dfsg/doc/source/user_guide/merging.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/merging.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/merging.rst 2024-04-10 17:42:52.000000000 +0000 @@ -15,27 +15,27 @@ Merge, join, concatenate and compare ************************************ -pandas provides various facilities for easily combining together Series or -DataFrame with various kinds of set logic for the indexes -and relational algebra functionality in the case of join / merge-type -operations. +pandas provides various methods for combining and comparing :class:`Series` or +:class:`DataFrame`. -In addition, pandas also provides utilities to compare two Series or DataFrame -and summarize their differences. +* :func:`~pandas.concat`: Merge multiple :class:`Series` or :class:`DataFrame` objects along a shared index or column +* :meth:`DataFrame.join`: Merge multiple :class:`DataFrame` objects along the columns +* :meth:`DataFrame.combine_first`: Update missing values with non-missing values in the same location +* :func:`~pandas.merge`: Combine two :class:`Series` or :class:`DataFrame` objects with SQL-style joining +* :func:`~pandas.merge_ordered`: Combine two :class:`Series` or :class:`DataFrame` objects along an ordered axis +* :func:`~pandas.merge_asof`: Combine two :class:`Series` or :class:`DataFrame` objects by near instead of exact matching keys +* :meth:`Series.compare` and :meth:`DataFrame.compare`: Show differences in values between two :class:`Series` or :class:`DataFrame` objects .. _merging.concat: -Concatenating objects ---------------------- - -The :func:`~pandas.concat` function (in the main pandas namespace) does all of -the heavy lifting of performing concatenation operations along an axis while -performing optional set logic (union or intersection) of the indexes (if any) on -the other axes. Note that I say "if any" because there is only a single possible -axis of concatenation for Series. +:func:`~pandas.concat` +---------------------- -Before diving into all of the details of ``concat`` and what it can do, here is -a simple example: +The :func:`~pandas.concat` function concatenates an arbitrary amount of +:class:`Series` or :class:`DataFrame` objects along an axis while +performing optional set logic (union or intersection) of the indexes on +the other axes. Like ``numpy.concatenate``, :func:`~pandas.concat` +takes a list or dict of homogeneously-typed objects and concatenates them. .. ipython:: python @@ -71,6 +71,7 @@ frames = [df1, df2, df3] result = pd.concat(frames) + result .. 
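As a rough sketch (frames invented for illustration) of how the ``axis`` argument
changes the direction of the concatenation and which axis the set logic applies to:

.. code-block:: python

   import pandas as pd

   a = pd.DataFrame({"A": [1, 2]}, index=[0, 1])
   b = pd.DataFrame({"A": [3, 4]}, index=[2, 3])

   # axis=0 (default): stack rows; column labels are aligned on the other axis
   pd.concat([a, b])

   # axis=1: place objects side by side; row labels are aligned on the other
   # axis, taking their union by default (join="outer")
   pd.concat([a, b], axis=1)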
ipython:: python :suppress: @@ -79,81 +80,12 @@ p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True); plt.close("all"); -Like its sibling function on ndarrays, ``numpy.concatenate``, ``pandas.concat`` -takes a list or dict of homogeneously-typed objects and concatenates them with -some configurable handling of "what to do with the other axes": - -:: - - pd.concat( - objs, - axis=0, - join="outer", - ignore_index=False, - keys=None, - levels=None, - names=None, - verify_integrity=False, - copy=True, - ) - -* ``objs`` : a sequence or mapping of Series or DataFrame objects. If a - dict is passed, the sorted keys will be used as the ``keys`` argument, unless - it is passed, in which case the values will be selected (see below). Any None - objects will be dropped silently unless they are all None in which case a - ValueError will be raised. -* ``axis`` : {0, 1, ...}, default 0. The axis to concatenate along. -* ``join`` : {'inner', 'outer'}, default 'outer'. How to handle indexes on - other axis(es). Outer for union and inner for intersection. -* ``ignore_index`` : boolean, default False. If True, do not use the index - values on the concatenation axis. The resulting axis will be labeled 0, ..., - n - 1. This is useful if you are concatenating objects where the - concatenation axis does not have meaningful indexing information. Note - the index values on the other axes are still respected in the join. -* ``keys`` : sequence, default None. Construct hierarchical index using the - passed keys as the outermost level. If multiple levels passed, should - contain tuples. -* ``levels`` : list of sequences, default None. Specific levels (unique values) - to use for constructing a MultiIndex. Otherwise they will be inferred from the - keys. -* ``names`` : list, default None. Names for the levels in the resulting - hierarchical index. -* ``verify_integrity`` : boolean, default False. Check whether the new - concatenated axis contains duplicates. This can be very expensive relative - to the actual data concatenation. -* ``copy`` : boolean, default True. If False, do not copy data unnecessarily. - -Without a little bit of context many of these arguments don't make much sense. -Let's revisit the above example. Suppose we wanted to associate specific keys -with each of the pieces of the chopped up DataFrame. We can do this using the -``keys`` argument: - -.. ipython:: python - - result = pd.concat(frames, keys=["x", "y", "z"]) - -.. ipython:: python - :suppress: - - @savefig merging_concat_keys.png - p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True) - plt.close("all"); - -As you can see (if you've read the rest of the documentation), the resulting -object's index has a :ref:`hierarchical index `. This -means that we can now select out each chunk by key: - -.. ipython:: python - - result.loc["y"] - -It's not a stretch to see how this can be very useful. More detail on this -functionality below. - .. note:: - It is worth noting that :func:`~pandas.concat` makes a full copy of the data, and that constantly - reusing this function can create a significant performance hit. If you need - to use the operation over several datasets, use a list comprehension. + + :func:`~pandas.concat` makes a full copy of the data, and iteratively + reusing :func:`~pandas.concat` can create unnecessary copies. Collect all + :class:`DataFrame` or :class:`Series` objects in a list before using + :func:`~pandas.concat`. .. code-block:: python @@ -162,26 +94,20 @@ .. 
note:: - When concatenating DataFrames with named axes, pandas will attempt to preserve + When concatenating :class:`DataFrame` with named axes, pandas will attempt to preserve these index/column names whenever possible. In the case where all inputs share a common name, this name will be assigned to the result. When the input names do not all agree, the result will be unnamed. The same is true for :class:`MultiIndex`, but the logic is applied separately on a level-by-level basis. -Set logic on the other axes -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Joining logic of the resulting axis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When gluing together multiple DataFrames, you have a choice of how to handle -the other axes (other than the one being concatenated). This can be done in -the following two ways: - -* Take the union of them all, ``join='outer'``. This is the default - option as it results in zero information loss. -* Take the intersection, ``join='inner'``. +The ``join`` keyword specifies how to handle axis values that don't exist in the first +:class:`DataFrame`. -Here is an example of each of these methods. First, the default ``join='outer'`` -behavior: +``join='outer'`` takes the union of all axis values .. ipython:: python @@ -194,6 +120,7 @@ index=[2, 3, 6, 7], ) result = pd.concat([df1, df4], axis=1) + result .. ipython:: python @@ -203,11 +130,12 @@ p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False); plt.close("all"); -Here is the same thing with ``join='inner'``: +``join='inner'`` takes the intersection of the axis values .. ipython:: python result = pd.concat([df1, df4], axis=1, join="inner") + result .. ipython:: python :suppress: @@ -216,18 +144,13 @@ p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False); plt.close("all"); -Lastly, suppose we just wanted to reuse the *exact index* from the original -DataFrame: +To perform an effective "left" join using the *exact index* from the original +:class:`DataFrame`, result can be reindexed. .. ipython:: python result = pd.concat([df1, df4], axis=1).reindex(df1.index) - -Similarly, we could index before the concatenation: - -.. ipython:: python - - pd.concat([df1, df4.reindex(df1.index)], axis=1) + result .. ipython:: python :suppress: @@ -240,13 +163,14 @@ Ignoring indexes on the concatenation axis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For ``DataFrame`` objects which don't have a meaningful index, you may wish -to append them and ignore the fact that they may have overlapping indexes. To -do this, use the ``ignore_index`` argument: + +For :class:`DataFrame` objects which don't have a meaningful index, the ``ignore_index`` +ignores overlapping indexes. .. ipython:: python result = pd.concat([df1, df4], ignore_index=True, sort=False) + result .. ipython:: python :suppress: @@ -257,17 +181,18 @@ .. _merging.mixed_ndims: -Concatenating with mixed ndims -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Concatenating :class:`Series` and :class:`DataFrame` together +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can concatenate a mix of ``Series`` and ``DataFrame`` objects. The -``Series`` will be transformed to ``DataFrame`` with the column name as -the name of the ``Series``. +You can concatenate a mix of :class:`Series` and :class:`DataFrame` objects. The +:class:`Series` will be transformed to :class:`DataFrame` with the column name as +the name of the :class:`Series`. .. ipython:: python s1 = pd.Series(["X0", "X1", "X2", "X3"], name="X") result = pd.concat([df1, s1], axis=1) + result .. 
ipython:: python
   :suppress:

@@ -276,19 +201,13 @@
   p.plot([df1, s1], result, labels=["df1", "s1"], vertical=False);
   plt.close("all");

-.. note::
-
-   Since we're concatenating a ``Series`` to a ``DataFrame``, we could have
-   achieved the same result with :meth:`DataFrame.assign`. To concatenate an
-   arbitrary number of pandas objects (``DataFrame`` or ``Series``), use
-   ``concat``.
-
-If unnamed ``Series`` are passed they will be numbered consecutively.
+Unnamed :class:`Series` will be numbered consecutively.

 .. ipython:: python

    s2 = pd.Series(["_0", "_1", "_2", "_3"])
    result = pd.concat([df1, s2, s2, s2], axis=1)
+   result

 .. ipython:: python
    :suppress:
@@ -297,11 +216,12 @@
   p.plot([df1, s2], result, labels=["df1", "s2"], vertical=False);
   plt.close("all");

-Passing ``ignore_index=True`` will drop all name references.
+``ignore_index=True`` will drop all name references.

 .. ipython:: python

    result = pd.concat([df1, s1], axis=1, ignore_index=True)
+   result

 .. ipython:: python
    :suppress:
@@ -310,48 +230,45 @@
   p.plot([df1, s1], result, labels=["df1", "s1"], vertical=False);
   plt.close("all");

-More concatenating with group keys
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Resulting ``keys``
+~~~~~~~~~~~~~~~~~~

-A fairly common use of the ``keys`` argument is to override the column names
-when creating a new ``DataFrame`` based on existing ``Series``.
-Notice how the default behaviour consists on letting the resulting ``DataFrame``
-inherit the parent ``Series``' name, when these existed.
+The ``keys`` argument adds another axis level to the resulting index or column (creating
+a :class:`MultiIndex`), associating specific keys with each original :class:`DataFrame`.

 .. ipython:: python

-   s3 = pd.Series([0, 1, 2, 3], name="foo")
-   s4 = pd.Series([0, 1, 2, 3])
-   s5 = pd.Series([0, 1, 4, 5])
-
-   pd.concat([s3, s4, s5], axis=1)
-
-Through the ``keys`` argument we can override the existing column names.
+   result = pd.concat(frames, keys=["x", "y", "z"])
+   result
+   result.loc["y"]

 .. ipython:: python
+   :suppress:

-   pd.concat([s3, s4, s5], axis=1, keys=["red", "blue", "yellow"])
+   @savefig merging_concat_keys.png
+   p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True)
+   plt.close("all");

-Let's consider a variation of the very first example presented:
+The ``keys`` argument can override the column names
+when creating a new :class:`DataFrame` based on existing :class:`Series`.

 .. ipython:: python

-   result = pd.concat(frames, keys=["x", "y", "z"])
-
-.. ipython:: python
-   :suppress:
+   s3 = pd.Series([0, 1, 2, 3], name="foo")
+   s4 = pd.Series([0, 1, 2, 3])
+   s5 = pd.Series([0, 1, 4, 5])

-   @savefig merging_concat_group_keys2.png
-   p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True);
-   plt.close("all");
+   pd.concat([s3, s4, s5], axis=1)
+   pd.concat([s3, s4, s5], axis=1, keys=["red", "blue", "yellow"])

-You can also pass a dict to ``concat`` in which case the dict keys will be used
-for the ``keys`` argument (unless other keys are specified):
+You can also pass a dict to :func:`concat` in which case the dict keys will be used
+for the ``keys`` argument unless another ``keys`` argument is specified:

 .. ipython:: python

    pieces = {"x": df1, "y": df2, "z": df3}
    result = pd.concat(pieces)
+   result

 .. ipython:: python
    :suppress:
@@ -363,6 +280,7 @@
 .. ipython:: python

    result = pd.concat(pieces, keys=["z", "y"])
+   result

.. 
ipython:: python :suppress: @@ -371,21 +289,21 @@ p.plot([df1, df2, df3], result, labels=["df1", "df2", "df3"], vertical=True); plt.close("all"); -The MultiIndex created has levels that are constructed from the passed keys and -the index of the ``DataFrame`` pieces: +The :class:`MultiIndex` created has levels that are constructed from the passed keys and +the index of the :class:`DataFrame` pieces: .. ipython:: python result.index.levels -If you wish to specify other levels (as will occasionally be the case), you can -do so using the ``levels`` argument: +``levels`` argument allows specifying resulting levels associated with the ``keys`` .. ipython:: python result = pd.concat( pieces, keys=["x", "y", "z"], levels=[["z", "y", "x", "w"]], names=["group_key"] ) + result .. ipython:: python :suppress: @@ -398,21 +316,19 @@ result.index.levels -This is fairly esoteric, but it is actually necessary for implementing things -like GroupBy where the order of a categorical variable is meaningful. - .. _merging.append.row: -Appending rows to a DataFrame -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Appending rows to a :class:`DataFrame` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If you have a series that you want to append as a single row to a ``DataFrame``, you can convert the row into a -``DataFrame`` and use ``concat`` +If you have a :class:`Series` that you want to append as a single row to a :class:`DataFrame`, you can convert the row into a +:class:`DataFrame` and use :func:`concat` .. ipython:: python s2 = pd.Series(["X0", "X1", "X2", "X3"], index=["A", "B", "C", "D"]) result = pd.concat([df1, s2.to_frame().T], ignore_index=True) + result .. ipython:: python :suppress: @@ -421,131 +337,35 @@ p.plot([df1, s2], result, labels=["df1", "s2"], vertical=True); plt.close("all"); -You should use ``ignore_index`` with this method to instruct DataFrame to -discard its index. If you wish to preserve the index, you should construct an -appropriately-indexed DataFrame and append or concatenate those objects. - .. _merging.join: -Database-style DataFrame or named Series joining/merging --------------------------------------------------------- - -pandas has full-featured, **high performance** in-memory join operations -idiomatically very similar to relational databases like SQL. These methods -perform significantly better (in some cases well over an order of magnitude -better) than other open source implementations (like ``base::merge.data.frame`` -in R). The reason for this is careful algorithmic design and the internal layout -of the data in ``DataFrame``. - -See the :ref:`cookbook` for some advanced strategies. +:func:`~pandas.merge` +--------------------- -Users who are familiar with SQL but new to pandas might be interested in a +:func:`~pandas.merge` performs join operations similar to relational databases like SQL. +Users who are familiar with SQL but new to pandas can reference a :ref:`comparison with SQL`. -pandas provides a single function, :func:`~pandas.merge`, as the entry point for -all standard database join operations between ``DataFrame`` or named ``Series`` objects: +Merge types +~~~~~~~~~~~ -:: +:func:`~pandas.merge` implements common SQL style joining operations. - pd.merge( - left, - right, - how="inner", - on=None, - left_on=None, - right_on=None, - left_index=False, - right_index=False, - sort=True, - suffixes=("_x", "_y"), - copy=True, - indicator=False, - validate=None, - ) - -* ``left``: A DataFrame or named Series object. -* ``right``: Another DataFrame or named Series object. 
-* ``on``: Column or index level names to join on. Must be found in both the left - and right DataFrame and/or Series objects. If not passed and ``left_index`` and - ``right_index`` are ``False``, the intersection of the columns in the - DataFrames and/or Series will be inferred to be the join keys. -* ``left_on``: Columns or index levels from the left DataFrame or Series to use as - keys. Can either be column names, index level names, or arrays with length - equal to the length of the DataFrame or Series. -* ``right_on``: Columns or index levels from the right DataFrame or Series to use as - keys. Can either be column names, index level names, or arrays with length - equal to the length of the DataFrame or Series. -* ``left_index``: If ``True``, use the index (row labels) from the left - DataFrame or Series as its join key(s). In the case of a DataFrame or Series with a MultiIndex - (hierarchical), the number of levels must match the number of join keys - from the right DataFrame or Series. -* ``right_index``: Same usage as ``left_index`` for the right DataFrame or Series -* ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``, ``'cross'``. Defaults - to ``inner``. See below for more detailed description of each method. -* ``sort``: Sort the result DataFrame by the join keys in lexicographical - order. Defaults to ``True``, setting to ``False`` will improve performance - substantially in many cases. -* ``suffixes``: A tuple of string suffixes to apply to overlapping - columns. Defaults to ``('_x', '_y')``. -* ``copy``: Always copy data (default ``True``) from the passed DataFrame or named Series - objects, even when reindexing is not necessary. Cannot be avoided in many - cases but may improve performance / memory usage. The cases where copying - can be avoided are somewhat pathological but this option is provided - nonetheless. -* ``indicator``: Add a column to the output DataFrame called ``_merge`` - with information on the source of each row. ``_merge`` is Categorical-type - and takes on a value of ``left_only`` for observations whose merge key - only appears in ``'left'`` DataFrame or Series, ``right_only`` for observations whose - merge key only appears in ``'right'`` DataFrame or Series, and ``both`` if the - observation's merge key is found in both. - -* ``validate`` : string, default None. - If specified, checks if merge is of specified type. - - * "one_to_one" or "1:1": checks if merge keys are unique in both - left and right datasets. - * "one_to_many" or "1:m": checks if merge keys are unique in left - dataset. - * "many_to_one" or "m:1": checks if merge keys are unique in right - dataset. - * "many_to_many" or "m:m": allowed, but does not result in checks. - -The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` or named ``Series`` -and ``right`` is a subclass of ``DataFrame``, the return type will still be ``DataFrame``. - -``merge`` is a function in the pandas namespace, and it is also available as a -``DataFrame`` instance method :meth:`~DataFrame.merge`, with the calling -``DataFrame`` being implicitly considered the left object in the join. - -The related :meth:`~DataFrame.join` method, uses ``merge`` internally for the -index-on-index (by default) and column(s)-on-index join. If you are joining on -index only, you may wish to use ``DataFrame.join`` to save yourself some typing. 
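To make the index-on-index equivalence described above concrete, a minimal sketch
(frames invented for illustration) of the same join expressed with
:meth:`DataFrame.join` and with :func:`merge`:

.. code-block:: python

   import pandas as pd

   left = pd.DataFrame({"A": ["A0", "A1"]}, index=["K0", "K1"])
   right = pd.DataFrame({"B": ["B0", "B1"]}, index=["K0", "K2"])

   # DataFrame.join performs a left join on the index by default ...
   via_join = left.join(right)

   # ... which matches an index-on-index merge with how="left"
   via_merge = pd.merge(left, right, left_index=True, right_index=True, how="left")

   via_join.equals(via_merge)  # True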
- -Brief primer on merge methods (relational algebra) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Experienced users of relational databases like SQL will be familiar with the -terminology used to describe join operations between two SQL-table like -structures (``DataFrame`` objects). There are several cases to consider which -are very important to understand: - -* **one-to-one** joins: for example when joining two ``DataFrame`` objects on - their indexes (which must contain unique values). -* **many-to-one** joins: for example when joining an index (unique) to one or - more columns in a different ``DataFrame``. -* **many-to-many** joins: joining columns on columns. +* **one-to-one**: joining two :class:`DataFrame` objects on + their indexes which must contain unique values. +* **many-to-one**: joining a unique index to one or + more columns in a different :class:`DataFrame`. +* **many-to-many** : joining columns on columns. .. note:: - When joining columns on columns (potentially a many-to-many join), any - indexes on the passed ``DataFrame`` objects **will be discarded**. + When joining columns on columns, potentially a many-to-many join, any + indexes on the passed :class:`DataFrame` objects **will be discarded**. -It is worth spending some time understanding the result of the **many-to-many** -join case. In SQL / standard relational algebra, if a key combination appears -more than once in both tables, the resulting table will have the **Cartesian -product** of the associated data. Here is a very basic example with one unique -key combination: +For a **many-to-many** join, if a key combination appears +more than once in both tables, the :class:`DataFrame` will have the **Cartesian +product** of the associated data. .. ipython:: python @@ -565,6 +385,7 @@ } ) result = pd.merge(left, right, on="key") + result .. ipython:: python :suppress: @@ -573,41 +394,8 @@ p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -Here is a more complicated example with multiple join keys. Only the keys -appearing in ``left`` and ``right`` are present (the intersection), since -``how='inner'`` by default. - -.. ipython:: python - - left = pd.DataFrame( - { - "key1": ["K0", "K0", "K1", "K2"], - "key2": ["K0", "K1", "K0", "K1"], - "A": ["A0", "A1", "A2", "A3"], - "B": ["B0", "B1", "B2", "B3"], - } - ) - - right = pd.DataFrame( - { - "key1": ["K0", "K1", "K1", "K2"], - "key2": ["K0", "K0", "K0", "K0"], - "C": ["C0", "C1", "C2", "C3"], - "D": ["D0", "D1", "D2", "D3"], - } - ) - - result = pd.merge(left, right, on=["key1", "key2"]) - -.. ipython:: python - :suppress: - - @savefig merging_merge_on_key_multiple.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); - -The ``how`` argument to ``merge`` specifies how to determine which keys are to -be included in the resulting table. If a key combination **does not appear** in +The ``how`` argument to :func:`~pandas.merge` specifies which keys are +included in the resulting table. If a key combination **does not appear** in either the left or right tables, the values in the joined table will be ``NA``. Here is a summary of the ``how`` options and their SQL equivalent names: @@ -623,7 +411,24 @@ .. 
ipython:: python + left = pd.DataFrame( + { + "key1": ["K0", "K0", "K1", "K2"], + "key2": ["K0", "K1", "K0", "K1"], + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + } + ) + right = pd.DataFrame( + { + "key1": ["K0", "K1", "K1", "K2"], + "key2": ["K0", "K0", "K0", "K0"], + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + } + ) result = pd.merge(left, right, how="left", on=["key1", "key2"]) + result .. ipython:: python :suppress: @@ -635,6 +440,7 @@ .. ipython:: python result = pd.merge(left, right, how="right", on=["key1", "key2"]) + result .. ipython:: python :suppress: @@ -645,6 +451,7 @@ .. ipython:: python result = pd.merge(left, right, how="outer", on=["key1", "key2"]) + result .. ipython:: python :suppress: @@ -656,6 +463,7 @@ .. ipython:: python result = pd.merge(left, right, how="inner", on=["key1", "key2"]) + result .. ipython:: python :suppress: @@ -667,6 +475,7 @@ .. ipython:: python result = pd.merge(left, right, how="cross") + result .. ipython:: python :suppress: @@ -675,10 +484,9 @@ p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -You can merge a mult-indexed Series and a DataFrame, if the names of -the MultiIndex correspond to the columns from the DataFrame. Transform -the Series to a DataFrame using :meth:`Series.reset_index` before merging, -as shown in the following example. +You can :class:`Series` and a :class:`DataFrame` with a :class:`MultiIndex` if the names of +the :class:`MultiIndex` correspond to the columns from the :class:`DataFrame`. Transform +the :class:`Series` to a :class:`DataFrame` using :meth:`Series.reset_index` before merging .. ipython:: python @@ -696,7 +504,7 @@ pd.merge(df, ser.reset_index(), on=["Let", "Num"]) -Here is another example with duplicate join keys in DataFrames: +Performing an outer join with duplicate join keys in :class:`DataFrame` .. ipython:: python @@ -705,6 +513,7 @@ right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]}) result = pd.merge(left, right, on="B", how="outer") + result .. ipython:: python :suppress: @@ -716,21 +525,17 @@ .. warning:: - Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions, which may result in memory overflow. It is the user' s responsibility to manage duplicate values in keys before joining large DataFrames. + Merging on duplicate keys significantly increase the dimensions of the result + and can cause a memory overflow. .. _merging.validation: -Checking for duplicate keys -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Merge key uniqueness +~~~~~~~~~~~~~~~~~~~~ -Users can use the ``validate`` argument to automatically check whether there -are unexpected duplicates in their merge keys. Key uniqueness is checked before -merge operations and so should protect against memory overflows. Checking key -uniqueness is also a good way to ensure user data structures are as expected. - -In the following example, there are duplicate values of ``B`` in the right -``DataFrame``. As this is not a one-to-one merge -- as specified in the -``validate`` argument -- an exception will be raised. +The ``validate`` argument checks whether the uniqueness of merge keys. +Key uniqueness is checked before merge operations and can protect against memory overflows +and unexpected key duplication. .. 
ipython:: python :okexcept: @@ -739,8 +544,8 @@ right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]}) result = pd.merge(left, right, on="B", how="outer", validate="one_to_one") -If the user is aware of the duplicates in the right ``DataFrame`` but wants to -ensure there are no duplicates in the left DataFrame, one can use the +If the user is aware of the duplicates in the right :class:`DataFrame` but wants to +ensure there are no duplicates in the left :class:`DataFrame`, one can use the ``validate='one_to_many'`` argument instead, which will not raise an exception. .. ipython:: python @@ -750,8 +555,8 @@ .. _merging.indicator: -The merge indicator -~~~~~~~~~~~~~~~~~~~ +Merge result indicator +~~~~~~~~~~~~~~~~~~~~~~ :func:`~pandas.merge` accepts the argument ``indicator``. If ``True``, a Categorical-type column called ``_merge`` will be added to the output object @@ -771,97 +576,53 @@ df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]}) pd.merge(df1, df2, on="col1", how="outer", indicator=True) -The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column. +A string argument to ``indicator`` will use the value as the name for the indicator column. .. ipython:: python pd.merge(df1, df2, on="col1", how="outer", indicator="indicator_column") -.. _merging.dtypes: - -Merge dtypes -~~~~~~~~~~~~ - -Merging will preserve the dtype of the join keys. - -.. ipython:: python - - left = pd.DataFrame({"key": [1], "v1": [10]}) - left - right = pd.DataFrame({"key": [1, 2], "v1": [20, 30]}) - right - -We are able to preserve the join keys: - -.. ipython:: python - - pd.merge(left, right, how="outer") - pd.merge(left, right, how="outer").dtypes - -Of course if you have missing values that are introduced, then the -resulting dtype will be upcast. - -.. ipython:: python - - pd.merge(left, right, how="outer", on="key") - pd.merge(left, right, how="outer", on="key").dtypes - -Merging will preserve ``category`` dtypes of the mergands. See also the section on :ref:`categoricals `. +Overlapping value columns +~~~~~~~~~~~~~~~~~~~~~~~~~ -The left frame. +The merge ``suffixes`` argument takes a tuple of list of strings to append to +overlapping column names in the input :class:`DataFrame` to disambiguate the result +columns: .. ipython:: python - from pandas.api.types import CategoricalDtype - - X = pd.Series(np.random.choice(["foo", "bar"], size=(10,))) - X = X.astype(CategoricalDtype(categories=["foo", "bar"])) - - left = pd.DataFrame( - {"X": X, "Y": np.random.choice(["one", "two", "three"], size=(10,))} - ) - left - left.dtypes + left = pd.DataFrame({"k": ["K0", "K1", "K2"], "v": [1, 2, 3]}) + right = pd.DataFrame({"k": ["K0", "K0", "K3"], "v": [4, 5, 6]}) -The right frame. + result = pd.merge(left, right, on="k") + result .. ipython:: python + :suppress: - right = pd.DataFrame( - { - "X": pd.Series(["foo", "bar"], dtype=CategoricalDtype(["foo", "bar"])), - "Z": [1, 2], - } - ) - right - right.dtypes - -The merged result: + @savefig merging_merge_overlapped.png + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. ipython:: python - result = pd.merge(left, right, how="outer") + result = pd.merge(left, right, on="k", suffixes=("_l", "_r")) result - result.dtypes - -.. note:: - The category dtypes must be *exactly* the same, meaning the same categories and the ordered attribute. - Otherwise the result will coerce to the categories' dtype. - -.. 
note:: - - Merging on ``category`` dtypes that are the same can be quite performant compared to ``object`` dtype merging. +.. ipython:: python + :suppress: -.. _merging.join.index: + @savefig merging_merge_overlapped_suffix.png + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); -Joining on index -~~~~~~~~~~~~~~~~ +:meth:`DataFrame.join` +---------------------- -:meth:`DataFrame.join` is a convenient method for combining the columns of two -potentially differently-indexed ``DataFrames`` into a single result -``DataFrame``. Here is a very basic example: +:meth:`DataFrame.join` combines the columns of multiple, +potentially differently-indexed :class:`DataFrame` into a single result +:class:`DataFrame`. .. ipython:: python @@ -874,6 +635,7 @@ ) result = left.join(right) + result .. ipython:: python :suppress: @@ -885,6 +647,7 @@ .. ipython:: python result = left.join(right, how="outer") + result .. ipython:: python :suppress: @@ -893,11 +656,10 @@ p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -The same as above, but with ``how='inner'``. - .. ipython:: python result = left.join(right, how="inner") + result .. ipython:: python :suppress: @@ -906,50 +668,9 @@ p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -The data alignment here is on the indexes (row labels). This same behavior can -be achieved using ``merge`` plus additional arguments instructing it to use the -indexes: - -.. ipython:: python - - result = pd.merge(left, right, left_index=True, right_index=True, how="outer") - -.. ipython:: python - :suppress: - - @savefig merging_merge_index_outer.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); - -.. ipython:: python - - result = pd.merge(left, right, left_index=True, right_index=True, how="inner") - -.. ipython:: python - :suppress: - - @savefig merging_merge_index_inner.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); - -Joining key columns on an index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -:meth:`~DataFrame.join` takes an optional ``on`` argument which may be a column -or multiple column names, which specifies that the passed ``DataFrame`` is to be -aligned on that column in the ``DataFrame``. These two function calls are -completely equivalent: - -:: - - left.join(right, on=key_or_keys) - pd.merge( - left, right, left_on=key_or_keys, right_index=True, how="left", sort=False - ) - -Obviously you can choose whichever form you find more convenient. For -many-to-one joins (where one of the ``DataFrame``'s is already indexed by the -join key), using ``join`` may be more convenient. Here is a simple example: +:meth:`DataFrame.join` takes an optional ``on`` argument which may be a column +or multiple column names that the passed :class:`DataFrame` is to be +aligned. .. ipython:: python @@ -964,6 +685,7 @@ right = pd.DataFrame({"C": ["C0", "C1"], "D": ["D0", "D1"]}, index=["K0", "K1"]) result = left.join(right, on="key") + result .. ipython:: python :suppress: @@ -977,6 +699,7 @@ result = pd.merge( left, right, left_on="key", right_index=True, how="left", sort=False ) + result .. ipython:: python :suppress: @@ -987,7 +710,7 @@ .. _merging.multikey_join: -To join on multiple keys, the passed DataFrame must have a ``MultiIndex``: +To join on multiple keys, the passed :class:`DataFrame` must have a :class:`MultiIndex`: .. 
ipython:: python @@ -1006,12 +729,8 @@ right = pd.DataFrame( {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, index=index ) - -Now this can be joined by passing the two key column names: - -.. ipython:: python - result = left.join(right, on=["key1", "key2"]) + result .. ipython:: python :suppress: @@ -1022,14 +741,14 @@ .. _merging.df_inner_join: -The default for ``DataFrame.join`` is to perform a left join (essentially a -"VLOOKUP" operation, for Excel users), which uses only the keys found in the -calling DataFrame. Other join types, for example inner join, can be just as -easily performed: +The default for :class:`DataFrame.join` is to perform a left join +which uses only the keys found in the +calling :class:`DataFrame`. Other join types can be specified with ``how``. .. ipython:: python result = left.join(right, on=["key1", "key2"], how="inner") + result .. ipython:: python :suppress: @@ -1038,16 +757,13 @@ p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -As you can see, this drops any rows where there was no match. - .. _merging.join_on_mi: Joining a single Index to a MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can join a singly-indexed ``DataFrame`` with a level of a MultiIndexed ``DataFrame``. -The level will match on the name of the index of the singly-indexed frame against -a level name of the MultiIndexed frame. +You can join a :class:`DataFrame` with a :class:`Index` to a :class:`DataFrame` with a :class:`MultiIndex` on a level. +The ``name`` of the :class:`Index` with match the level name of the :class:`MultiIndex`. .. ipython:: python @@ -1066,6 +782,7 @@ ) result = left.join(right, how="inner") + result .. ipython:: python @@ -1075,29 +792,13 @@ p.plot([left, right], result, labels=["left", "right"], vertical=False); plt.close("all"); -This is equivalent but less verbose and more memory efficient / faster than this. - -.. ipython:: python - - result = pd.merge( - left.reset_index(), right.reset_index(), on=["key"], how="inner" - ).set_index(["key","Y"]) - -.. ipython:: python - :suppress: - - @savefig merging_merge_multiindex_alternative.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); - .. _merging.join_with_two_multi_indexes: -Joining with two MultiIndexes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Joining with two :class:`MultiIndex` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This is supported in a limited way, provided that the index for the right -argument is completely used in the join, and is a subset of the indices in -the left argument, as in this example: +The :class:`MultiIndex` of the input argument must be completely used +in the join and is a subset of the indices in the left argument. .. ipython:: python @@ -1115,9 +816,6 @@ left.join(right, on=["abc", "xy"], how="inner") -If that condition is not satisfied, a join with two multi-indexes can be -done using the following code. - .. ipython:: python leftindex = pd.MultiIndex.from_tuples( @@ -1137,6 +835,7 @@ result = pd.merge( left.reset_index(), right.reset_index(), on=["key"], how="inner" ).set_index(["key", "X", "Y"]) + result .. ipython:: python :suppress: @@ -1152,7 +851,7 @@ Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters may refer to either column names or index level names. This enables merging -``DataFrame`` instances on a combination of index levels and columns without +:class:`DataFrame` instances on a combination of index levels and columns without resetting indexes. .. 
ipython:: python @@ -1180,6 +879,7 @@ ) result = left.merge(right, on=["key1", "key2"]) + result .. ipython:: python :suppress: @@ -1190,76 +890,23 @@ .. note:: - When DataFrames are merged on a string that matches an index level in both - frames, the index level is preserved as an index level in the resulting - DataFrame. + When :class:`DataFrame` are joined on a string that matches an index level in both + arguments, the index level is preserved as an index level in the resulting + :class:`DataFrame`. .. note:: - When DataFrames are merged using only some of the levels of a ``MultiIndex``, - the extra levels will be dropped from the resulting merge. In order to - preserve those levels, use ``reset_index`` on those level names to move - those levels to columns prior to doing the merge. -.. note:: - - If a string matches both a column name and an index level name, then a - warning is issued and the column takes precedence. This will result in an - ambiguity error in a future version. - -Overlapping value columns -~~~~~~~~~~~~~~~~~~~~~~~~~ - -The merge ``suffixes`` argument takes a tuple of list of strings to append to -overlapping column names in the input ``DataFrame``\ s to disambiguate the result -columns: - -.. ipython:: python - - left = pd.DataFrame({"k": ["K0", "K1", "K2"], "v": [1, 2, 3]}) - right = pd.DataFrame({"k": ["K0", "K0", "K3"], "v": [4, 5, 6]}) - - result = pd.merge(left, right, on="k") - -.. ipython:: python - :suppress: - - @savefig merging_merge_overlapped.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); - -.. ipython:: python - - result = pd.merge(left, right, on="k", suffixes=("_l", "_r")) - -.. ipython:: python - :suppress: - - @savefig merging_merge_overlapped_suffix.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); - -:meth:`DataFrame.join` has ``lsuffix`` and ``rsuffix`` arguments which behave -similarly. - -.. ipython:: python - - left = left.set_index("k") - right = right.set_index("k") - result = left.join(right, lsuffix="_l", rsuffix="_r") - -.. ipython:: python - :suppress: - - @savefig merging_merge_overlapped_multi_suffix.png - p.plot([left, right], result, labels=["left", "right"], vertical=False); - plt.close("all"); + When :class:`DataFrame` are joined using only some of the levels of a :class:`MultiIndex`, + the extra levels will be dropped from the resulting join. To + preserve those levels, use :meth:`DataFrame.reset_index` on those level + names to move those levels to columns prior to the join. .. _merging.multiple_join: -Joining multiple DataFrames -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Joining multiple :class:`DataFrame` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A list or tuple of ``DataFrames`` can also be passed to :meth:`~DataFrame.join` +A list or tuple of ``:class:`DataFrame``` can also be passed to :meth:`~DataFrame.join` to join them together on their indexes. .. ipython:: python @@ -1281,12 +928,12 @@ .. _merging.combine_first.update: -Merging together values within Series or DataFrame columns -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +:meth:`DataFrame.combine_first` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Another fairly common situation is to have two like-indexed (or similarly -indexed) ``Series`` or ``DataFrame`` objects and wanting to "patch" values in -one object from values for matching indices in the other. 
Here is an example: +:meth:`DataFrame.combine_first` update missing values from one :class:`DataFrame` +with the non-missing values in another :class:`DataFrame` in the corresponding +location. .. ipython:: python @@ -1294,12 +941,8 @@ [[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]] ) df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5.0, 1.6, 4]], index=[1, 2]) - -For this, use the :meth:`~DataFrame.combine_first` method: - -.. ipython:: python - result = df1.combine_first(df2) + result .. ipython:: python :suppress: @@ -1308,39 +951,13 @@ p.plot([df1, df2], result, labels=["df1", "df2"], vertical=False); plt.close("all"); -Note that this method only takes values from the right ``DataFrame`` if they are -missing in the left ``DataFrame``. A related method, :meth:`~DataFrame.update`, -alters non-NA values in place: - -.. ipython:: python - :suppress: - - df1_copy = df1.copy() - -.. ipython:: python - - df1.update(df2) - -.. ipython:: python - :suppress: - - @savefig merging_update.png - p.plot([df1_copy, df2], df1, labels=["df1", "df2"], vertical=False); - plt.close("all"); - -.. _merging.time_series: - -Timeseries friendly merging ---------------------------- - .. _merging.merge_ordered: -Merging ordered data -~~~~~~~~~~~~~~~~~~~~ +:func:`merge_ordered` +--------------------- -A :func:`merge_ordered` function allows combining time series and other -ordered data. In particular it has an optional ``fill_method`` keyword to -fill/interpolate missing data: +:func:`merge_ordered` combines order data such as numeric or time series data +with optional filling of missing data with ``fill_method``. .. ipython:: python @@ -1354,19 +971,16 @@ .. _merging.merge_asof: -Merging asof -~~~~~~~~~~~~ - -A :func:`merge_asof` is similar to an ordered left-join except that we match on -nearest key rather than equal keys. For each row in the ``left`` ``DataFrame``, -we select the last row in the ``right`` ``DataFrame`` whose ``on`` key is less -than the left's key. Both DataFrames must be sorted by the key. +:func:`merge_asof` +--------------------- -Optionally an asof merge can perform a group-wise merge. This matches the -``by`` key equally, in addition to the nearest match on the ``on`` key. +:func:`merge_asof` is similar to an ordered left-join except that mactches are on the +nearest key rather than equal keys. For each row in the ``left`` :class:`DataFrame`, +the last row in the ``right`` :class:`DataFrame` are selected where the ``on`` key is less +than the left's key. Both :class:`DataFrame` must be sorted by the key. -For example; we might have ``trades`` and ``quotes`` and we want to ``asof`` -merge them. +Optionally an :func:`merge_asof` can perform a group-wise merge by matching the +``by`` key in addition to the nearest match on the ``on`` key. .. ipython:: python @@ -1408,25 +1022,17 @@ }, columns=["time", "ticker", "bid", "ask"], ) - -.. ipython:: python - trades quotes - -By default we are taking the asof of the quotes. - -.. ipython:: python - pd.merge_asof(trades, quotes, on="time", by="ticker") -We only asof within ``2ms`` between the quote time and the trade time. +:func:`merge_asof` within ``2ms`` between the quote time and the trade time. .. ipython:: python pd.merge_asof(trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")) -We only asof within ``10ms`` between the quote time and the trade time and we +:func:`merge_asof` within ``10ms`` between the quote time and the trade time and exclude exact matches on time. 
Note that though we exclude the exact matches (of the quotes), prior quotes **do** propagate to that point in time. @@ -1443,14 +1049,11 @@ .. _merging.compare: -Comparing objects ------------------ - -The :meth:`~Series.compare` and :meth:`~DataFrame.compare` methods allow you to -compare two DataFrame or Series, respectively, and summarize their differences. +:meth:`~Series.compare` +----------------------- -For example, you might want to compare two ``DataFrame`` and stack their differences -side by side. +The :meth:`Series.compare` and :meth:`DataFrame.compare` methods allow you to +compare two :class:`DataFrame` or :class:`Series`, respectively, and summarize their differences. .. ipython:: python @@ -1463,36 +1066,29 @@ columns=["col1", "col2", "col3"], ) df - -.. ipython:: python - df2 = df.copy() df2.loc[0, "col1"] = "c" df2.loc[2, "col3"] = 4.0 df2 - -.. ipython:: python - df.compare(df2) By default, if two corresponding values are equal, they will be shown as ``NaN``. Furthermore, if all values in an entire row / column, the row / column will be omitted from the result. The remaining differences will be aligned on columns. -If you wish, you may choose to stack the differences on rows. +Stack the differences on rows. .. ipython:: python df.compare(df2, align_axis=0) -If you wish to keep all original rows and columns, set ``keep_shape`` argument -to ``True``. +Keep all original rows and columns with ``keep_shape=True`` .. ipython:: python df.compare(df2, keep_shape=True) -You may also keep all the original values even if they are equal. +Keep all the original values even if they are equal. .. ipython:: python diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/missing_data.rst pandas-2.2.2+dfsg/doc/source/user_guide/missing_data.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/missing_data.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/missing_data.rst 2024-04-10 17:42:52.000000000 +0000 @@ -6,350 +6,462 @@ Working with missing data ************************* -In this section, we will discuss missing (also referred to as NA) values in -pandas. +Values considered "missing" +~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. note:: +pandas uses different sentinel values to represent a missing (also referred to as NA) +depending on the data type. - The choice of using ``NaN`` internally to denote missing data was largely - for simplicity and performance reasons. - Starting from pandas 1.0, some optional data types start experimenting - with a native ``NA`` scalar using a mask-based approach. See - :ref:`here ` for more. +``numpy.nan`` for NumPy data types. The disadvantage of using NumPy data types +is that the original data type will be coerced to ``np.float64`` or ``object``. -See the :ref:`cookbook` for some advanced strategies. +.. ipython:: python -Values considered "missing" -~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pd.Series([1, 2], dtype=np.int64).reindex([0, 1, 2]) + pd.Series([True, False], dtype=np.bool_).reindex([0, 1, 2]) + +:class:`NaT` for NumPy ``np.datetime64``, ``np.timedelta64``, and :class:`PeriodDtype`. For typing applications, +use :class:`api.types.NaTType`. -As data comes in many shapes and forms, pandas aims to be flexible with regard -to handling missing data. While ``NaN`` is the default missing value marker for -reasons of computational speed and convenience, we need to be able to easily -detect this value with data of different types: floating point, integer, -boolean, and general object. 
In many cases, however, the Python ``None`` will -arise and we wish to also consider that "missing" or "not available" or "NA". +.. ipython:: python + + pd.Series([1, 2], dtype=np.dtype("timedelta64[ns]")).reindex([0, 1, 2]) + pd.Series([1, 2], dtype=np.dtype("datetime64[ns]")).reindex([0, 1, 2]) + pd.Series(["2020", "2020"], dtype=pd.PeriodDtype("D")).reindex([0, 1, 2]) -.. _missing.isna: +:class:`NA` for :class:`StringDtype`, :class:`Int64Dtype` (and other bit widths), +:class:`Float64Dtype`(and other bit widths), :class:`BooleanDtype` and :class:`ArrowDtype`. +These types will maintain the original data type of the data. +For typing applications, use :class:`api.types.NAType`. .. ipython:: python - df = pd.DataFrame( - np.random.randn(5, 3), - index=["a", "c", "e", "f", "h"], - columns=["one", "two", "three"], - ) - df["four"] = "bar" - df["five"] = df["one"] > 0 - df - df2 = df.reindex(["a", "b", "c", "d", "e", "f", "g", "h"]) - df2 + pd.Series([1, 2], dtype="Int64").reindex([0, 1, 2]) + pd.Series([True, False], dtype="boolean[pyarrow]").reindex([0, 1, 2]) -To make detecting missing values easier (and across different array dtypes), -pandas provides the :func:`isna` and -:func:`notna` functions, which are also methods on -Series and DataFrame objects: +To detect these missing value, use the :func:`isna` or :func:`notna` methods. .. ipython:: python - df2["one"] - pd.isna(df2["one"]) - df2["four"].notna() - df2.isna() + ser = pd.Series([pd.Timestamp("2020-01-01"), pd.NaT]) + ser + pd.isna(ser) + + +.. note:: + + :func:`isna` or :func:`notna` will also consider ``None`` a missing value. + + .. ipython:: python + + ser = pd.Series([1, None], dtype=object) + ser + pd.isna(ser) .. warning:: - One has to be mindful that in Python (and NumPy), the ``nan's`` don't compare equal, but ``None's`` **do**. - Note that pandas/NumPy uses the fact that ``np.nan != np.nan``, and treats ``None`` like ``np.nan``. + Equality compaisons between ``np.nan``, :class:`NaT`, and :class:`NA` + do not act like ``None`` .. ipython:: python None == None # noqa: E711 np.nan == np.nan + pd.NaT == pd.NaT + pd.NA == pd.NA - So as compared to above, a scalar equality comparison versus a ``None/np.nan`` doesn't provide useful information. + Therefore, an equality comparison between a :class:`DataFrame` or :class:`Series` + with one of these missing values does not provide the same information as + :func:`isna` or :func:`notna`. .. ipython:: python - df2["one"] == np.nan + ser = pd.Series([True, None], dtype="boolean[pyarrow]") + ser == pd.NA + pd.isna(ser) + -Integer dtypes and missing data -------------------------------- +.. _missing_data.NA: + +:class:`NA` semantics +~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: -Because ``NaN`` is a float, a column of integers with even one missing values -is cast to floating-point dtype (see :ref:`gotchas.intna` for more). pandas -provides a nullable integer array, which can be used by explicitly requesting -the dtype: + Experimental: the behaviour of :class:`NA`` can still change without warning. + +Starting from pandas 1.0, an experimental :class:`NA` value (singleton) is +available to represent scalar missing values. The goal of :class:`NA` is provide a +"missing" indicator that can be used consistently across data types +(instead of ``np.nan``, ``None`` or ``pd.NaT`` depending on the data type). + +For example, when having missing values in a :class:`Series` with the nullable integer +dtype, it will use :class:`NA`: .. 
ipython:: python - pd.Series([1, 2, np.nan, 4], dtype=pd.Int64Dtype()) + s = pd.Series([1, 2, None], dtype="Int64") + s + s[2] + s[2] is pd.NA -Alternatively, the string alias ``dtype='Int64'`` (note the capital ``"I"``) can be -used. +Currently, pandas does not yet use those data types using :class:`NA` by default +a :class:`DataFrame` or :class:`Series`, so you need to specify +the dtype explicitly. An easy way to convert to those dtypes is explained in the +:ref:`conversion section `. -See :ref:`integer_na` for more. +Propagation in arithmetic and comparison operations +--------------------------------------------------- -Datetimes ---------- -.. note:: - If you are adding type checking to your application, you may need access to ``NaTType`` and ``NAType``. +In general, missing values *propagate* in operations involving :class:`NA`. When +one of the operands is unknown, the outcome of the operation is also unknown. -For datetime64[ns] types, ``NaT`` represents missing values. This is a pseudo-native -sentinel value that can be represented by NumPy in a singular dtype (datetime64[ns]). -pandas objects provide compatibility between ``NaT`` and ``NaN``. +For example, :class:`NA` propagates in arithmetic operations, similarly to +``np.nan``: .. ipython:: python - df2 = df.copy() - df2["timestamp"] = pd.Timestamp("20120101") - df2 - df2.loc[["a", "c", "h"], ["one", "timestamp"]] = np.nan - df2 - df2.dtypes.value_counts() + pd.NA + 1 + "a" * pd.NA -.. _missing.inserting: +There are a few special cases when the result is known, even when one of the +operands is ``NA``. -Inserting missing data -~~~~~~~~~~~~~~~~~~~~~~ +.. ipython:: python -You can insert missing values by simply assigning to containers. The -actual missing value used will be chosen based on the dtype. + pd.NA ** 0 + 1 ** pd.NA -For example, numeric containers will always use ``NaN`` regardless of -the missing value type chosen: +In equality and comparison operations, :class:`NA` also propagates. This deviates +from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always +return ``False``. .. ipython:: python - s = pd.Series([1., 2., 3.]) - s.loc[0] = None - s - -Likewise, datetime containers will always use ``NaT``. + pd.NA == 1 + pd.NA == pd.NA + pd.NA < 2.5 -For object containers, pandas will use the value given: +To check if a value is equal to :class:`NA`, use :func:`isna` .. ipython:: python - s = pd.Series(["a", "b", "c"]) - s.loc[0] = None - s.loc[1] = np.nan - s + pd.isna(pd.NA) -.. _missing_data.calculations: -Calculations with missing data -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. note:: -Missing values propagate naturally through arithmetic operations between pandas -objects. + An exception on this basic propagation rule are *reductions* (such as the + mean or the minimum), where pandas defaults to skipping missing values. See the + :ref:`calculation section ` for more. + +Logical operations +------------------ + +For logical operations, :class:`NA` follows the rules of the +`three-valued logic `__ (or +*Kleene logic*, similarly to R, SQL and Julia). This logic means to only +propagate missing values when it is logically required. + +For example, for the logical "or" operation (``|``), if one of the operands +is ``True``, we already know the result will be ``True``, regardless of the +other value (so regardless the missing value would be ``True`` or ``False``). +In this case, :class:`NA` does not propagate: .. 
ipython:: python - df = df2.loc[:, ["one", "two", "three"]] - a = df2.loc[df2.index[:5], ["one", "two"]].ffill() - b = df2.loc[df2.index[:5], ["one", "two", "three"]] - a - b - a + b + True | False + True | pd.NA + pd.NA | True -The descriptive statistics and computational methods discussed in the -:ref:`data structure overview ` (and listed :ref:`here -` and :ref:`here `) are all written to -account for missing data. For example: +On the other hand, if one of the operands is ``False``, the result depends +on the value of the other operand. Therefore, in this case :class:`NA` +propagates: -* When summing data, NA (missing) values will be treated as zero. -* If the data are all NA, the result will be 0. -* Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` ignore NA values by default, but preserve them in the resulting arrays. To override this behaviour and include NA values, use ``skipna=False``. +.. ipython:: python + + False | True + False | False + False | pd.NA + +The behaviour of the logical "and" operation (``&``) can be derived using +similar logic (where now :class:`NA` will not propagate if one of the operands +is already ``False``): .. ipython:: python - df - df["one"].sum() - df.mean(1) - df.cumsum() - df.cumsum(skipna=False) + False & True + False & False + False & pd.NA +.. ipython:: python -.. _missing_data.numeric_sum: + True & True + True & False + True & pd.NA -Sum/prod of empties/nans -~~~~~~~~~~~~~~~~~~~~~~~~ -The sum of an empty or all-NA Series or column of a DataFrame is 0. +``NA`` in a boolean context +--------------------------- + +Since the actual value of an NA is unknown, it is ambiguous to convert NA +to a boolean value. .. ipython:: python + :okexcept: - pd.Series([np.nan]).sum() + bool(pd.NA) - pd.Series([], dtype="float64").sum() +This also means that :class:`NA` cannot be used in a context where it is +evaluated to a boolean, such as ``if condition: ...`` where ``condition`` can +potentially be :class:`NA`. In such cases, :func:`isna` can be used to check +for :class:`NA` or ``condition`` being :class:`NA` can be avoided, for example by +filling missing values beforehand. -The product of an empty or all-NA Series or column of a DataFrame is 1. +A similar situation occurs when using :class:`Series` or :class:`DataFrame` objects in ``if`` +statements, see :ref:`gotchas.truth`. + +NumPy ufuncs +------------ + +:attr:`pandas.NA` implements NumPy's ``__array_ufunc__`` protocol. Most ufuncs +work with ``NA``, and generally return ``NA``: .. ipython:: python - pd.Series([np.nan]).prod() + np.log(pd.NA) + np.add(pd.NA, 1) - pd.Series([], dtype="float64").prod() +.. warning:: + Currently, ufuncs involving an ndarray and ``NA`` will return an + object-dtype filled with NA values. -NA values in GroupBy -~~~~~~~~~~~~~~~~~~~~ + .. ipython:: python + + a = np.array([1, 2, 3]) + np.greater(a, pd.NA) + + The return type here may change to return a different array type + in the future. + +See :ref:`dsintro.numpy_interop` for more on ufuncs. + +.. _missing_data.NA.conversion: + +Conversion +^^^^^^^^^^ + +If you have a :class:`DataFrame` or :class:`Series` using ``np.nan``, +:meth:`Series.convert_dtypes` and :meth:`DataFrame.convert_dtypes` +in :class:`DataFrame` that can convert data to use the data types that use :class:`NA` +such as :class:`Int64Dtype` or :class:`ArrowDtype`. This is especially helpful after reading +in data sets from IO methods where data types were inferred. -NA groups in GroupBy are automatically excluded. 
This behavior is consistent -with R, for example: +In this example, while the dtypes of all columns are changed, we show the results for +the first 10 columns. .. ipython:: python - df - df.groupby("one").mean() + import io + data = io.StringIO("a,b\n,True\n2,") + df = pd.read_csv(data) + df.dtypes + df_conv = df.convert_dtypes() + df_conv + df_conv.dtypes -See the groupby section :ref:`here ` for more information. +.. _missing.inserting: -Cleaning / filling missing data --------------------------------- +Inserting missing data +~~~~~~~~~~~~~~~~~~~~~~ -pandas objects are equipped with various data manipulation methods for dealing -with missing data. +You can insert missing values by simply assigning to a :class:`Series` or :class:`DataFrame`. +The missing value sentinel used will be chosen based on the dtype. -.. _missing_data.fillna: +.. ipython:: python + + ser = pd.Series([1., 2., 3.]) + ser.loc[0] = None + ser + + ser = pd.Series([pd.Timestamp("2021"), pd.Timestamp("2021")]) + ser.iloc[0] = np.nan + ser + + ser = pd.Series([True, False], dtype="boolean[pyarrow]") + ser.iloc[0] = None + ser + +For ``object`` types, pandas will use the value given: -Filling missing values: fillna +.. ipython:: python + + s = pd.Series(["a", "b", "c"], dtype=object) + s.loc[0] = None + s.loc[1] = np.nan + s + +.. _missing_data.calculations: + +Calculations with missing data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`~DataFrame.fillna` can "fill in" NA values with non-NA data in a couple -of ways, which we illustrate: +Missing values propagate through arithmetic operations between pandas objects. + +.. ipython:: python + + ser1 = pd.Series([np.nan, np.nan, 2, 3]) + ser2 = pd.Series([np.nan, 1, np.nan, 4]) + ser1 + ser2 + ser1 + ser2 + +The descriptive statistics and computational methods discussed in the +:ref:`data structure overview ` (and listed :ref:`here +` and :ref:`here `) are all +account for missing data. + +When summing data, NA values or empty data will be treated as zero. + +.. ipython:: python + + pd.Series([np.nan]).sum() + pd.Series([], dtype="float64").sum() -**Replace NA with a scalar value** +When taking the product, NA values or empty data will be treated as 1. .. ipython:: python - df2 - df2.fillna(0) - df2["one"].fillna("missing") + pd.Series([np.nan]).prod() + pd.Series([], dtype="float64").prod() -**Fill gaps forward or backward** +Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` +ignore NA values by default preserve them in the result. This behavior can be changed +with ``skipna`` + +* Cumulative methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` ignore NA values by default, but preserve them in the resulting arrays. To override this behaviour and include NA values, use ``skipna=False``. -Using the same filling arguments as :ref:`reindexing `, we -can propagate non-NA values forward or backward: .. ipython:: python - df - df.ffill() + ser = pd.Series([1, np.nan, 3, np.nan]) + ser + ser.cumsum() + ser.cumsum(skipna=False) -.. _missing_data.fillna.limit: +.. _missing_data.dropna: -**Limit the amount of filling** +Dropping missing data +~~~~~~~~~~~~~~~~~~~~~ -If we only want consecutive gaps filled up to a certain number of data points, -we can use the ``limit`` keyword: +:meth:`~DataFrame.dropna` dropa rows or columns with missing data. .. 
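A side note on the empty and all-NA sums and products shown above (an illustrative sketch, not from the patch): the ``min_count`` keyword restores a missing result when too few valid values are present:

.. code-block:: python

   import numpy as np
   import pandas as pd

   pd.Series([np.nan]).sum()                          # 0.0, the default
   pd.Series([np.nan]).sum(min_count=1)               # nan, at least one valid value required
   pd.Series([], dtype="float64").prod(min_count=1)   # nan instead of 1.0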
ipython:: python - df.iloc[2:4, :] = np.nan + df = pd.DataFrame([[np.nan, 1, 2], [1, 2, np.nan], [1, 2, 3]]) df - df.ffill(limit=1) + df.dropna() + df.dropna(axis=1) -To remind you, these are the available filling methods: + ser = pd.Series([1, pd.NA], dtype="int64[pyarrow]") + ser.dropna() -.. csv-table:: - :header: "Method", "Action" - :widths: 30, 50 +Filling missing data +~~~~~~~~~~~~~~~~~~~~ - pad / ffill, Fill values forward - bfill / backfill, Fill values backward +.. _missing_data.fillna: -With time series data, using pad/ffill is extremely common so that the "last -known value" is available at every time point. +Filling by value +---------------- -:meth:`~DataFrame.ffill` is equivalent to ``fillna(method='ffill')`` -and :meth:`~DataFrame.bfill` is equivalent to ``fillna(method='bfill')`` +:meth:`~DataFrame.fillna` replaces NA values with non-NA data. -.. _missing_data.PandasObject: +Replace NA with a scalar value -Filling with a PandasObject -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. ipython:: python + + data = {"np": [1.0, np.nan, np.nan, 2], "arrow": pd.array([1.0, pd.NA, pd.NA, 2], dtype="float64[pyarrow]")} + df = pd.DataFrame(data) + df + df.fillna(0) -You can also fillna using a dict or Series that is alignable. The labels of the dict or index of the Series -must match the columns of the frame you wish to fill. The -use case of this is to fill a DataFrame with the mean of that column. +Fill gaps forward or backward .. ipython:: python - dff = pd.DataFrame(np.random.randn(10, 3), columns=list("ABC")) - dff.iloc[3:5, 0] = np.nan - dff.iloc[4:6, 1] = np.nan - dff.iloc[5:8, 2] = np.nan - dff + df.ffill() + df.bfill() - dff.fillna(dff.mean()) - dff.fillna(dff.mean()["B":"C"]) +.. _missing_data.fillna.limit: -Same result as above, but is aligning the 'fill' value which is -a Series in this case. +Limit the number of NA values filled .. ipython:: python - dff.where(pd.notna(dff), dff.mean(), axis="columns") + df.ffill(limit=1) +NA values can be replaced with corresponding value from a :class:`Series` or :class:`DataFrame` +where the index and column aligns between the original object and the filled object. -.. _missing_data.dropna: +.. ipython:: python -Dropping axis labels with missing data: dropna -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + dff = pd.DataFrame(np.arange(30, dtype=np.float64).reshape(10, 3), columns=list("ABC")) + dff.iloc[3:5, 0] = np.nan + dff.iloc[4:6, 1] = np.nan + dff.iloc[5:8, 2] = np.nan + dff + dff.fillna(dff.mean()) -You may wish to simply exclude labels from a data set which refer to missing -data. To do this, use :meth:`~DataFrame.dropna`: +.. note:: -.. ipython:: python + :meth:`DataFrame.where` can also be used to fill NA values.Same result as above. - df["two"] = df["two"].fillna(0) - df["three"] = df["three"].fillna(0) - df - df.dropna(axis=0) - df.dropna(axis=1) - df["one"].dropna() + .. ipython:: python + + dff.where(pd.notna(dff), dff.mean(), axis="columns") -An equivalent :meth:`~Series.dropna` is available for Series. -DataFrame.dropna has considerably more options than Series.dropna, which can be -examined :ref:`in the API `. .. _missing_data.interpolate: Interpolation -~~~~~~~~~~~~~ +------------- -Both Series and DataFrame objects have :meth:`~DataFrame.interpolate` -that, by default, performs linear interpolation at missing data points. +:meth:`DataFrame.interpolate` and :meth:`Series.interpolate` fills NA values +using various interpolation methods. .. 
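Relating to the ``fillna`` examples above, one more fill variant as a sketch (not part of the patch; the column names and values are made up): a dict keyed by column label supplies per-column fill values:

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame({"A": [1.0, np.nan, 3.0], "B": [np.nan, 5.0, np.nan]})
   df.fillna({"A": 0.0, "B": df["B"].mean()})   # fill each column differently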
ipython:: python - np.random.seed(123456) - idx = pd.date_range("1/1/2000", periods=100, freq="BM") - ts = pd.Series(np.random.randn(100), index=idx) - ts[1:5] = np.nan - ts[20:30] = np.nan - ts[60:80] = np.nan - ts = ts.cumsum() + df = pd.DataFrame( + { + "A": [1, 2.1, np.nan, 4.7, 5.6, 6.8], + "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4], + } + ) + df + df.interpolate() + + idx = pd.date_range("2020-01-01", periods=10, freq="D") + data = np.random.default_rng(2).integers(0, 10, 10).astype(np.float64) + ts = pd.Series(data, index=idx) + ts.iloc[[1, 2, 5, 6, 9]] = np.nan ts - ts.count() @savefig series_before_interpolate.png ts.plot() .. ipython:: python ts.interpolate() - ts.interpolate().count() - @savefig series_interpolate.png ts.interpolate().plot() -Index aware interpolation is available via the ``method`` keyword: +Interpolation relative to a :class:`Timestamp` in the :class:`DatetimeIndex` +is available by setting ``method="time"`` .. ipython:: python - ts2 = ts.iloc[[0, 1, 30, 60, 99]] + ts2 = ts.iloc[[0, 1, 3, 7, 9]] ts2 ts2.interpolate() ts2.interpolate(method="time") @@ -360,46 +472,36 @@ idx = [0.0, 1.0, 10.0] ser = pd.Series([0.0, np.nan, 10.0], idx) - ser ser.interpolate() ser.interpolate(method="values") -You can also interpolate with a DataFrame: - -.. ipython:: python - - df = pd.DataFrame( - { - "A": [1, 2.1, np.nan, 4.7, 5.6, 6.8], - "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4], - } - ) - df - df.interpolate() - -The ``method`` argument gives access to fancier interpolation methods. If you have scipy_ installed, you can pass the name of a 1-d interpolation routine to ``method``. -You'll want to consult the full scipy interpolation documentation_ and reference guide_ for details. -The appropriate interpolation method will depend on the type of data you are working with. - -* If you are dealing with a time series that is growing at an increasing rate, - ``method='quadratic'`` may be appropriate. -* If you have values approximating a cumulative distribution function, - then ``method='pchip'`` should work well. -* To fill missing values with goal of smooth plotting, consider ``method='akima'``. +as specified in the scipy interpolation documentation_ and reference guide_. +The appropriate interpolation method will depend on the data type. -.. warning:: +.. tip:: - These methods require ``scipy``. + If you are dealing with a time series that is growing at an increasing rate, + use ``method='barycentric'``. -.. ipython:: python + If you have values approximating a cumulative distribution function, + use ``method='pchip'``. - df.interpolate(method="barycentric") + To fill missing values with goal of smooth plotting use ``method='akima'``. - df.interpolate(method="pchip") + .. ipython:: python - df.interpolate(method="akima") + df = pd.DataFrame( + { + "A": [1, 2.1, np.nan, 4.7, 5.6, 6.8], + "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4], + } + ) + df + df.interpolate(method="barycentric") + df.interpolate(method="pchip") + df.interpolate(method="akima") When interpolating via a polynomial or spline approximation, you must also specify the degree or order of the approximation: @@ -407,10 +509,9 @@ .. ipython:: python df.interpolate(method="spline", order=2) - df.interpolate(method="polynomial", order=2) -Compare several methods: +Comparing several methods. .. ipython:: python @@ -425,11 +526,7 @@ @savefig compare_interpolations.png df.plot() -Another use case is interpolation at *new* values. -Suppose you have 100 observations from some distribution. 
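The "interpolation at new values" idea can be sketched as follows (a minimal illustration with made-up index values, not taken from the patch): reindex onto the enlarged index first, then interpolate against the index values:

.. code-block:: python

   import pandas as pd

   ser = pd.Series([0.0, 2.0, 4.0], index=[0.0, 2.0, 4.0])
   new_index = ser.index.union(pd.Index([1.0, 3.0]))

   # the new labels start as NaN and are filled from the surrounding points
   ser.reindex(new_index).interpolate(method="values")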
And let's suppose -that you're particularly interested in what's happening around the middle. -You can mix pandas' ``reindex`` and ``interpolate`` methods to interpolate -at the new values. +Interpolating new observations from expanding data with :meth:`Series.reindex`. .. ipython:: python @@ -447,21 +544,17 @@ .. _missing_data.interp_limits: Interpolation limits --------------------- +^^^^^^^^^^^^^^^^^^^^ -Like other pandas fill methods, :meth:`~DataFrame.interpolate` accepts a ``limit`` keyword -argument. Use this argument to limit the number of consecutive ``NaN`` values -filled since the last valid observation: +:meth:`~DataFrame.interpolate` accepts a ``limit`` keyword +argument to limit the number of consecutive ``NaN`` values +filled since the last valid observation .. ipython:: python ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan]) ser - - # fill all consecutive values in a forward direction ser.interpolate() - - # fill one consecutive value in a forward direction ser.interpolate(limit=1) By default, ``NaN`` values are filled in a ``forward`` direction. Use @@ -469,17 +562,12 @@ .. ipython:: python - # fill one consecutive value backwards ser.interpolate(limit=1, limit_direction="backward") - - # fill one consecutive value in both directions ser.interpolate(limit=1, limit_direction="both") - - # fill all consecutive values in both directions ser.interpolate(limit_direction="both") -By default, ``NaN`` values are filled whether they are inside (surrounded by) -existing valid values, or outside existing valid values. The ``limit_area`` +By default, ``NaN`` values are filled whether they are surrounded by +existing valid values or outside existing valid values. The ``limit_area`` parameter restricts filling to either inside or outside values. .. ipython:: python @@ -495,58 +583,46 @@ .. _missing_data.replace: -Replacing generic values -~~~~~~~~~~~~~~~~~~~~~~~~ -Often times we want to replace arbitrary values with other values. +Replacing values +---------------- -:meth:`~Series.replace` in Series and :meth:`~DataFrame.replace` in DataFrame provides an efficient yet -flexible way to perform such replacements. - -For a Series, you can replace a single value or a list of values by another -value: +:meth:`Series.replace` and :meth:`DataFrame.replace` can be used similar to +:meth:`Series.fillna` and :meth:`DataFrame.fillna` to replace or insert missing values. .. ipython:: python - ser = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0]) - - ser.replace(0, 5) - -You can replace a list of values by a list of other values: - -.. ipython:: python - - ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0]) + df = pd.DataFrame(np.eye(3)) + df + df_missing = df.replace(0, np.nan) + df_missing + df_filled = df_missing.replace(np.nan, 2) + df_filled -You can also specify a mapping dict: +Replacing more than one value is possible by passing a list. .. ipython:: python - ser.replace({0: 10, 1: 100}) + df_filled.replace([1, 44], [2, 28]) -For a DataFrame, you can specify individual values by column: +Replacing using a mapping dict. .. ipython:: python - df = pd.DataFrame({"a": [0, 1, 2, 3, 4], "b": [5, 6, 7, 8, 9]}) - - df.replace({"a": 0, "b": 5}, 100) + df_filled.replace({1: 44, 2: 28}) .. _missing_data.replace_expression: -String/regular expression replacement -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Regular expression replacement +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. note:: Python strings prefixed with the ``r`` character such as ``r'hello world'`` - are so-called "raw" strings. 
They have different semantics regarding - backslashes than strings without this prefix. Backslashes in raw strings - will be interpreted as an escaped backslash, e.g., ``r'\' == '\\'``. You - should `read about them - `__ - if this is unclear. + are `"raw" strings `_. + They have different semantics regarding backslashes than strings without this prefix. + Backslashes in raw strings will be interpreted as an escaped backslash, e.g., ``r'\' == '\\'``. -Replace the '.' with ``NaN`` (str -> str): +Replace the '.' with ``NaN`` .. ipython:: python @@ -554,349 +630,47 @@ df = pd.DataFrame(d) df.replace(".", np.nan) -Now do it with a regular expression that removes surrounding whitespace -(regex -> regex): +Replace the '.' with ``NaN`` with regular expression that removes surrounding whitespace .. ipython:: python df.replace(r"\s*\.\s*", np.nan, regex=True) -Replace a few different values (list -> list): - -.. ipython:: python - - df.replace(["a", "."], ["b", np.nan]) - -list of regex -> list of regex: +Replace with a list of regexes. .. ipython:: python df.replace([r"\.", r"(a)"], ["dot", r"\1stuff"], regex=True) -Only search in column ``'b'`` (dict -> dict): - -.. ipython:: python - - df.replace({"b": "."}, {"b": np.nan}) - -Same as the previous example, but use a regular expression for -searching instead (dict of regex -> dict): +Replace with a regex in a mapping dict. .. ipython:: python df.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True) -You can pass nested dictionaries of regular expressions that use ``regex=True``: +Pass nested dictionaries of regular expressions that use the ``regex`` keyword. .. ipython:: python df.replace({"b": {"b": r""}}, regex=True) - -Alternatively, you can pass the nested dictionary like so: - -.. ipython:: python - df.replace(regex={"b": {r"\s*\.\s*": np.nan}}) - -You can also use the group of a regular expression match when replacing (dict -of regex -> dict of regex), this works for lists as well. - -.. ipython:: python - df.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True) -You can pass a list of regular expressions, of which those that match -will be replaced with a scalar (list of regex -> regex). +Pass a list of regular expressions that will replace matches with a scalar. .. ipython:: python - df.replace([r"\s*\.\s*", r"a|b"], np.nan, regex=True) + df.replace([r"\s*\.\s*", r"a|b"], "placeholder", regex=True) All of the regular expression examples can also be passed with the ``to_replace`` argument as the ``regex`` argument. In this case the ``value`` argument must be passed explicitly by name or ``regex`` must be a nested -dictionary. The previous example, in this case, would then be: +dictionary. .. ipython:: python - df.replace(regex=[r"\s*\.\s*", r"a|b"], value=np.nan) - -This can be convenient if you do not want to pass ``regex=True`` every time you -want to use a regular expression. + df.replace(regex=[r"\s*\.\s*", r"a|b"], value="placeholder") .. note:: - Anywhere in the above ``replace`` examples that you see a regular expression - a compiled regular expression is valid as well. - -Numeric replacement -~~~~~~~~~~~~~~~~~~~ - -:meth:`~DataFrame.replace` is similar to :meth:`~DataFrame.fillna`. - -.. ipython:: python - - df = pd.DataFrame(np.random.randn(10, 2)) - df[np.random.rand(df.shape[0]) > 0.5] = 1.5 - df.replace(1.5, np.nan) - -Replacing more than one value is possible by passing a list. - -.. 
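For the note about compiled regular expressions, a sketch that is not part of the patch (the frame contents are made up):

.. code-block:: python

   import re

   import numpy as np
   import pandas as pd

   df = pd.DataFrame({"a": ["a", "b", " . "], "b": [" .", "b", "c"]})
   pattern = re.compile(r"\s*\.\s*")

   # a compiled pattern is accepted wherever a regex string is
   df.replace(regex=pattern, value=np.nan)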
ipython:: python - - df00 = df.iloc[0, 0] - df.replace([1.5, df00], [np.nan, "a"]) - df[1].dtype - -Missing data casting rules and indexing ---------------------------------------- - -While pandas supports storing arrays of integer and boolean type, these types -are not capable of storing missing data. Until we can switch to using a native -NA type in NumPy, we've established some "casting rules". When a reindexing -operation introduces missing data, the Series will be cast according to the -rules introduced in the table below. - -.. csv-table:: - :header: "data type", "Cast to" - :widths: 40, 40 - - integer, float - boolean, object - float, no cast - object, no cast - -For example: - -.. ipython:: python - - s = pd.Series(np.random.randn(5), index=[0, 2, 4, 6, 7]) - s > 0 - (s > 0).dtype - crit = (s > 0).reindex(list(range(8))) - crit - crit.dtype - -Ordinarily NumPy will complain if you try to use an object array (even if it -contains boolean values) instead of a boolean array to get or set values from -an ndarray (e.g. selecting values based on some criteria). If a boolean vector -contains NAs, an exception will be generated: - -.. ipython:: python - :okexcept: - - reindexed = s.reindex(list(range(8))).fillna(0) - reindexed[crit] - -However, these can be filled in using :meth:`~DataFrame.fillna` and it will work fine: - -.. ipython:: python - - reindexed[crit.fillna(False)] - reindexed[crit.fillna(True)] - -pandas provides a nullable integer dtype, but you must explicitly request it -when creating the series or column. Notice that we use a capital "I" in -the ``dtype="Int64"``. - -.. ipython:: python - - s = pd.Series([0, 1, np.nan, 3, 4], dtype="Int64") - s - -See :ref:`integer_na` for more. - - -.. _missing_data.NA: - -Experimental ``NA`` scalar to denote missing values -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. warning:: - - Experimental: the behaviour of ``pd.NA`` can still change without warning. - -Starting from pandas 1.0, an experimental ``pd.NA`` value (singleton) is -available to represent scalar missing values. At this moment, it is used in -the nullable :doc:`integer `, boolean and -:ref:`dedicated string ` data types as the missing value indicator. - -The goal of ``pd.NA`` is provide a "missing" indicator that can be used -consistently across data types (instead of ``np.nan``, ``None`` or ``pd.NaT`` -depending on the data type). - -For example, when having missing values in a Series with the nullable integer -dtype, it will use ``pd.NA``: - -.. ipython:: python - - s = pd.Series([1, 2, None], dtype="Int64") - s - s[2] - s[2] is pd.NA - -Currently, pandas does not yet use those data types by default (when creating -a DataFrame or Series, or when reading in data), so you need to specify -the dtype explicitly. An easy way to convert to those dtypes is explained -:ref:`here `. - -Propagation in arithmetic and comparison operations ---------------------------------------------------- - -In general, missing values *propagate* in operations involving ``pd.NA``. When -one of the operands is unknown, the outcome of the operation is also unknown. - -For example, ``pd.NA`` propagates in arithmetic operations, similarly to -``np.nan``: - -.. ipython:: python - - pd.NA + 1 - "a" * pd.NA - -There are a few special cases when the result is known, even when one of the -operands is ``NA``. - -.. ipython:: python - - pd.NA ** 0 - 1 ** pd.NA - -In equality and comparison operations, ``pd.NA`` also propagates. 
This deviates -from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always -return ``False``. - -.. ipython:: python - - pd.NA == 1 - pd.NA == pd.NA - pd.NA < 2.5 - -To check if a value is equal to ``pd.NA``, the :func:`isna` function can be -used: - -.. ipython:: python - - pd.isna(pd.NA) - -An exception on this basic propagation rule are *reductions* (such as the -mean or the minimum), where pandas defaults to skipping missing values. See -:ref:`above ` for more. - -Logical operations ------------------- - -For logical operations, ``pd.NA`` follows the rules of the -`three-valued logic `__ (or -*Kleene logic*, similarly to R, SQL and Julia). This logic means to only -propagate missing values when it is logically required. - -For example, for the logical "or" operation (``|``), if one of the operands -is ``True``, we already know the result will be ``True``, regardless of the -other value (so regardless the missing value would be ``True`` or ``False``). -In this case, ``pd.NA`` does not propagate: - -.. ipython:: python - - True | False - True | pd.NA - pd.NA | True - -On the other hand, if one of the operands is ``False``, the result depends -on the value of the other operand. Therefore, in this case ``pd.NA`` -propagates: - -.. ipython:: python - - False | True - False | False - False | pd.NA - -The behaviour of the logical "and" operation (``&``) can be derived using -similar logic (where now ``pd.NA`` will not propagate if one of the operands -is already ``False``): - -.. ipython:: python - - False & True - False & False - False & pd.NA - -.. ipython:: python - - True & True - True & False - True & pd.NA - - -``NA`` in a boolean context ---------------------------- - -Since the actual value of an NA is unknown, it is ambiguous to convert NA -to a boolean value. The following raises an error: - -.. ipython:: python - :okexcept: - - bool(pd.NA) - -This also means that ``pd.NA`` cannot be used in a context where it is -evaluated to a boolean, such as ``if condition: ...`` where ``condition`` can -potentially be ``pd.NA``. In such cases, :func:`isna` can be used to check -for ``pd.NA`` or ``condition`` being ``pd.NA`` can be avoided, for example by -filling missing values beforehand. - -A similar situation occurs when using Series or DataFrame objects in ``if`` -statements, see :ref:`gotchas.truth`. - -NumPy ufuncs ------------- - -:attr:`pandas.NA` implements NumPy's ``__array_ufunc__`` protocol. Most ufuncs -work with ``NA``, and generally return ``NA``: - -.. ipython:: python - - np.log(pd.NA) - np.add(pd.NA, 1) - -.. warning:: - - Currently, ufuncs involving an ndarray and ``NA`` will return an - object-dtype filled with NA values. - - .. ipython:: python - - a = np.array([1, 2, 3]) - np.greater(a, pd.NA) - - The return type here may change to return a different array type - in the future. - -See :ref:`dsintro.numpy_interop` for more on ufuncs. - -.. _missing_data.NA.conversion: - -Conversion ----------- - -If you have a DataFrame or Series using traditional types that have missing data -represented using ``np.nan``, there are convenience methods -:meth:`~Series.convert_dtypes` in Series and :meth:`~DataFrame.convert_dtypes` -in DataFrame that can convert data to use the newer dtypes for integers, strings and -booleans listed :ref:`here `. This is especially helpful after reading -in data sets when letting the readers such as :meth:`read_csv` and :meth:`read_excel` -infer default dtypes. 
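A small illustration of the reduction behaviour referred to above, as a sketch that is not part of the patch:

.. code-block:: python

   import pandas as pd

   s = pd.Series([1, 2, None], dtype="Int64")
   s.mean()               # 1.5 -- missing values are skipped by default
   s.mean(skipna=False)   # <NA> -- propagate the missing value instead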
- -In this example, while the dtypes of all columns are changed, we show the results for -the first 10 columns. - -.. ipython:: python - - bb = pd.read_csv("data/baseball.csv", index_col="id") - bb[bb.columns[:10]].dtypes - -.. ipython:: python - - bbn = bb.convert_dtypes() - bbn[bbn.columns[:10]].dtypes + A regular expression object from ``re.compile`` is a valid input as well. diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/reshaping.rst pandas-2.2.2+dfsg/doc/source/user_guide/reshaping.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/reshaping.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/reshaping.rst 2024-04-10 17:42:52.000000000 +0000 @@ -136,7 +136,7 @@ .. ipython:: python - pd.pivot_table(df, values="D", index=pd.Grouper(freq="M", key="F"), columns="C") + pd.pivot_table(df, values="D", index=pd.Grouper(freq="ME", key="F"), columns="C") .. _reshaping.pivot.margins: @@ -480,7 +480,7 @@ .. versionadded:: 1.5.0 -:func:`~pandas.from_dummies` coverts the output of :func:`~pandas.get_dummies` back into +:func:`~pandas.from_dummies` converts the output of :func:`~pandas.get_dummies` back into a :class:`Series` of categorical values from indicator values. .. ipython:: python diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/scale.rst pandas-2.2.2+dfsg/doc/source/user_guide/scale.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/scale.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/scale.rst 2024-04-10 17:42:52.000000000 +0000 @@ -40,7 +40,7 @@ return df timeseries = [ - make_timeseries(freq="1T", seed=i).rename(columns=lambda x: f"{x}_{i}") + make_timeseries(freq="1min", seed=i).rename(columns=lambda x: f"{x}_{i}") for i in range(10) ] ts_wide = pd.concat(timeseries, axis=1) @@ -87,7 +87,7 @@ .. ipython:: python :okwarning: - ts = make_timeseries(freq="30S", seed=0) + ts = make_timeseries(freq="30s", seed=0) ts.to_parquet("timeseries.parquet") ts = pd.read_parquet("timeseries.parquet") ts @@ -156,7 +156,7 @@ Chunking works well when the operation you're performing requires zero or minimal coordination between chunks. For more complicated workflows, you're better off - :ref:`using another library `. + :ref:`using other libraries `. Suppose we have an even larger "logical dataset" on disk that's a directory of parquet files. Each file in the directory represents a different year of the entire dataset. @@ -173,7 +173,7 @@ pathlib.Path("data/timeseries").mkdir(exist_ok=True) for i, (start, end) in enumerate(zip(starts, ends)): - ts = make_timeseries(start=start, end=end, freq="1T", seed=i) + ts = make_timeseries(start=start, end=end, freq="1min", seed=i) ts.to_parquet(f"data/timeseries/ts-{i:0>2d}.parquet") @@ -219,160 +219,10 @@ .. _scale.other_libraries: -Use Dask --------- +Use Other Libraries +------------------- -pandas is just one library offering a DataFrame API. Because of its popularity, -pandas' API has become something of a standard that other libraries implement. -The pandas documentation maintains a list of libraries implementing a DataFrame API -in `the ecosystem page `_. - -For example, `Dask`_, a parallel computing library, has `dask.dataframe`_, a -pandas-like API for working with larger than memory datasets in parallel. Dask -can use multiple threads or processes on a single machine, or a cluster of -machines to process data in parallel. - - -We'll import ``dask.dataframe`` and notice that the API feels similar to pandas. 
-We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in. - -.. ipython:: python - :okwarning: - - import dask.dataframe as dd - - ddf = dd.read_parquet("data/timeseries/ts*.parquet", engine="pyarrow") - ddf - -Inspecting the ``ddf`` object, we see a few things - -* There are familiar attributes like ``.columns`` and ``.dtypes`` -* There are familiar methods like ``.groupby``, ``.sum``, etc. -* There are new attributes like ``.npartitions`` and ``.divisions`` - -The partitions and divisions are how Dask parallelizes computation. A **Dask** -DataFrame is made up of many pandas :class:`pandas.DataFrame`. A single method call on a -Dask DataFrame ends up making many pandas method calls, and Dask knows how to -coordinate everything to get the result. - -.. ipython:: python - - ddf.columns - ddf.dtypes - ddf.npartitions - -One major difference: the ``dask.dataframe`` API is *lazy*. If you look at the -repr above, you'll notice that the values aren't actually printed out; just the -column names and dtypes. That's because Dask hasn't actually read the data yet. -Rather than executing immediately, doing operations build up a **task graph**. - -.. ipython:: python - :okwarning: - - ddf - ddf["name"] - ddf["name"].value_counts() - -Each of these calls is instant because the result isn't being computed yet. -We're just building up a list of computation to do when someone needs the -result. Dask knows that the return type of a :class:`pandas.Series.value_counts` -is a pandas :class:`pandas.Series` with a certain dtype and a certain name. So the Dask version -returns a Dask Series with the same dtype and the same name. - -To get the actual result you can call ``.compute()``. - -.. ipython:: python - :okwarning: - - %time ddf["name"].value_counts().compute() - -At that point, you get back the same thing you'd get with pandas, in this case -a concrete pandas :class:`pandas.Series` with the count of each ``name``. - -Calling ``.compute`` causes the full task graph to be executed. This includes -reading the data, selecting the columns, and doing the ``value_counts``. The -execution is done *in parallel* where possible, and Dask tries to keep the -overall memory footprint small. You can work with datasets that are much larger -than memory, as long as each partition (a regular pandas :class:`pandas.DataFrame`) fits in memory. - -By default, ``dask.dataframe`` operations use a threadpool to do operations in -parallel. We can also connect to a cluster to distribute the work on many -machines. In this case we'll connect to a local "cluster" made up of several -processes on this single machine. - -.. code-block:: python - - >>> from dask.distributed import Client, LocalCluster - - >>> cluster = LocalCluster() - >>> client = Client(cluster) - >>> client - - -Once this ``client`` is created, all of Dask's computation will take place on -the cluster (which is just processes in this case). - -Dask implements the most used parts of the pandas API. For example, we can do -a familiar groupby aggregation. - -.. ipython:: python - :okwarning: - - %time ddf.groupby("name")[["x", "y"]].mean().compute().head() - -The grouping and aggregation is done out-of-core and in parallel. - -When Dask knows the ``divisions`` of a dataset, certain optimizations are -possible. When reading parquet datasets written by dask, the divisions will be -known automatically. In this case, since we created the parquet files manually, -we need to supply the divisions manually. - -.. 
ipython:: python - :okwarning: - - N = 12 - starts = [f"20{i:>02d}-01-01" for i in range(N)] - ends = [f"20{i:>02d}-12-13" for i in range(N)] - - divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),) - ddf.divisions = divisions - ddf - -Now we can do things like fast random access with ``.loc``. - -.. ipython:: python - :okwarning: - - ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute() - -Dask knows to just look in the 3rd partition for selecting values in 2002. It -doesn't need to look at any other data. - -Many workflows involve a large amount of data and processing it in a way that -reduces the size to something that fits in memory. In this case, we'll resample -to daily frequency and take the mean. Once we've taken the mean, we know the -results will fit in memory, so we can safely call ``compute`` without running -out of memory. At that point it's just a regular pandas object. - -.. ipython:: python - :okwarning: - - @savefig dask_resample.png - ddf[["x", "y"]].resample("1D").mean().cumsum().compute().plot() - -.. ipython:: python - :suppress: - - import shutil - - shutil.rmtree("data/timeseries") - -These Dask examples have all be done using multiple processes on a single -machine. Dask can be `deployed on a cluster -`_ to scale up to even larger -datasets. - -You see more dask examples at https://examples.dask.org. - -.. _Dask: https://dask.org -.. _dask.dataframe: https://docs.dask.org/en/latest/dataframe.html +There are other libraries which provide similar APIs to pandas and work nicely with pandas DataFrame, +and can give you the ability to scale your large dataset processing and analytics +by parallel runtime, distributed memory, clustering, etc. You can find more information +in `the ecosystem page `_. diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/timedeltas.rst pandas-2.2.2+dfsg/doc/source/user_guide/timedeltas.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/timedeltas.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/timedeltas.rst 2024-04-10 17:42:52.000000000 +0000 @@ -390,9 +390,9 @@ .. ipython:: python - pd.timedelta_range(start="1 days", end="2 days", freq="30T") + pd.timedelta_range(start="1 days", end="2 days", freq="30min") - pd.timedelta_range(start="1 days", periods=5, freq="2D5H") + pd.timedelta_range(start="1 days", periods=5, freq="2D5h") Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced diff -Nru pandas-2.1.4+dfsg/doc/source/user_guide/timeseries.rst pandas-2.2.2+dfsg/doc/source/user_guide/timeseries.rst --- pandas-2.1.4+dfsg/doc/source/user_guide/timeseries.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/user_guide/timeseries.rst 2024-04-10 17:42:52.000000000 +0000 @@ -28,7 +28,7 @@ .. ipython:: python - dti = pd.date_range("2018-01-01", periods=3, freq="H") + dti = pd.date_range("2018-01-01", periods=3, freq="h") dti Manipulating and converting date times with timezone information @@ -43,10 +43,10 @@ .. 
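An aside on the lowercase offset aliases appearing in these hunks (a sketch assuming pandas 2.2, not taken from the patch; the dates are arbitrary):

.. code-block:: python

   import pandas as pd

   pd.timedelta_range(start="1 days", periods=4, freq="6h")   # formerly "6H"
   pd.date_range("2012-01-01", periods=4, freq="QE")          # quarter end, formerly "Q"

   # the old uppercase spellings still parse in 2.2 but emit a FutureWarning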
ipython:: python - idx = pd.date_range("2018-01-01", periods=5, freq="H") + idx = pd.date_range("2018-01-01", periods=5, freq="h") ts = pd.Series(range(len(idx)), index=idx) ts - ts.resample("2H").mean() + ts.resample("2h").mean() Performing date and time arithmetic with absolute or relative time increments @@ -107,7 +107,7 @@ pd.Series(pd.period_range("1/1/2011", freq="M", periods=3)) pd.Series([pd.DateOffset(1), pd.DateOffset(2)]) - pd.Series(pd.date_range("1/1/2011", freq="M", periods=3)) + pd.Series(pd.date_range("1/1/2011", freq="ME", periods=3)) Lastly, pandas represents null date times, time deltas, and time spans as ``NaT`` which is useful for representing missing or null date like values and behaves similar @@ -294,12 +294,6 @@ pd.to_datetime(['2009/07/31', 'asd'], errors='raise') -Pass ``errors='ignore'`` to return the original input when unparsable: - -.. ipython:: python - - pd.to_datetime(["2009/07/31", "asd"], errors="ignore") - Pass ``errors='coerce'`` to convert unparsable data to ``NaT`` (not a time): .. ipython:: python @@ -450,7 +444,7 @@ .. ipython:: python - pd.date_range(start, periods=1000, freq="M") + pd.date_range(start, periods=1000, freq="ME") pd.bdate_range(start, periods=250, freq="BQS") @@ -461,7 +455,7 @@ .. ipython:: python - pd.date_range(start, end, freq="BM") + pd.date_range(start, end, freq="BME") pd.date_range(start, end, freq="W") @@ -557,7 +551,7 @@ .. ipython:: python - rng = pd.date_range(start, end, freq="BM") + rng = pd.date_range(start, end, freq="BME") ts = pd.Series(np.random.randn(len(rng)), index=rng) ts.index ts[:5].index @@ -603,7 +597,7 @@ dft = pd.DataFrame( np.random.randn(100000, 1), columns=["A"], - index=pd.date_range("20130101", periods=100000, freq="T"), + index=pd.date_range("20130101", periods=100000, freq="min"), ) dft dft.loc["2013"] @@ -641,7 +635,7 @@ np.random.randn(20, 1), columns=["A"], index=pd.MultiIndex.from_product( - [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] + [pd.date_range("20130101", periods=10, freq="12h"), ["a", "b"]] ), ) dft2 @@ -882,34 +876,34 @@ :class:`~pandas.tseries.offsets.Week`, ``'W'``, "one week, optionally anchored on a day of the week" :class:`~pandas.tseries.offsets.WeekOfMonth`, ``'WOM'``, "the x-th day of the y-th week of each month" :class:`~pandas.tseries.offsets.LastWeekOfMonth`, ``'LWOM'``, "the x-th day of the last week of each month" - :class:`~pandas.tseries.offsets.MonthEnd`, ``'M'``, "calendar month end" + :class:`~pandas.tseries.offsets.MonthEnd`, ``'ME'``, "calendar month end" :class:`~pandas.tseries.offsets.MonthBegin`, ``'MS'``, "calendar month begin" - :class:`~pandas.tseries.offsets.BMonthEnd` or :class:`~pandas.tseries.offsets.BusinessMonthEnd`, ``'BM'``, "business month end" + :class:`~pandas.tseries.offsets.BMonthEnd` or :class:`~pandas.tseries.offsets.BusinessMonthEnd`, ``'BME'``, "business month end" :class:`~pandas.tseries.offsets.BMonthBegin` or :class:`~pandas.tseries.offsets.BusinessMonthBegin`, ``'BMS'``, "business month begin" - :class:`~pandas.tseries.offsets.CBMonthEnd` or :class:`~pandas.tseries.offsets.CustomBusinessMonthEnd`, ``'CBM'``, "custom business month end" + :class:`~pandas.tseries.offsets.CBMonthEnd` or :class:`~pandas.tseries.offsets.CustomBusinessMonthEnd`, ``'CBME'``, "custom business month end" :class:`~pandas.tseries.offsets.CBMonthBegin` or :class:`~pandas.tseries.offsets.CustomBusinessMonthBegin`, ``'CBMS'``, "custom business month begin" - :class:`~pandas.tseries.offsets.SemiMonthEnd`, ``'SM'``, "15th (or other day_of_month) and 
calendar month end" + :class:`~pandas.tseries.offsets.SemiMonthEnd`, ``'SME'``, "15th (or other day_of_month) and calendar month end" :class:`~pandas.tseries.offsets.SemiMonthBegin`, ``'SMS'``, "15th (or other day_of_month) and calendar month begin" - :class:`~pandas.tseries.offsets.QuarterEnd`, ``'Q'``, "calendar quarter end" + :class:`~pandas.tseries.offsets.QuarterEnd`, ``'QE'``, "calendar quarter end" :class:`~pandas.tseries.offsets.QuarterBegin`, ``'QS'``, "calendar quarter begin" - :class:`~pandas.tseries.offsets.BQuarterEnd`, ``'BQ``, "business quarter end" + :class:`~pandas.tseries.offsets.BQuarterEnd`, ``'BQE``, "business quarter end" :class:`~pandas.tseries.offsets.BQuarterBegin`, ``'BQS'``, "business quarter begin" :class:`~pandas.tseries.offsets.FY5253Quarter`, ``'REQ'``, "retail (aka 52-53 week) quarter" - :class:`~pandas.tseries.offsets.YearEnd`, ``'A'``, "calendar year end" - :class:`~pandas.tseries.offsets.YearBegin`, ``'AS'`` or ``'BYS'``,"calendar year begin" - :class:`~pandas.tseries.offsets.BYearEnd`, ``'BA'``, "business year end" - :class:`~pandas.tseries.offsets.BYearBegin`, ``'BAS'``, "business year begin" + :class:`~pandas.tseries.offsets.YearEnd`, ``'YE'``, "calendar year end" + :class:`~pandas.tseries.offsets.YearBegin`, ``'YS'`` or ``'BYS'``,"calendar year begin" + :class:`~pandas.tseries.offsets.BYearEnd`, ``'BYE'``, "business year end" + :class:`~pandas.tseries.offsets.BYearBegin`, ``'BYS'``, "business year begin" :class:`~pandas.tseries.offsets.FY5253`, ``'RE'``, "retail (aka 52-53 week) year" :class:`~pandas.tseries.offsets.Easter`, None, "Easter holiday" - :class:`~pandas.tseries.offsets.BusinessHour`, ``'BH'``, "business hour" - :class:`~pandas.tseries.offsets.CustomBusinessHour`, ``'CBH'``, "custom business hour" + :class:`~pandas.tseries.offsets.BusinessHour`, ``'bh'``, "business hour" + :class:`~pandas.tseries.offsets.CustomBusinessHour`, ``'cbh'``, "custom business hour" :class:`~pandas.tseries.offsets.Day`, ``'D'``, "one absolute day" - :class:`~pandas.tseries.offsets.Hour`, ``'H'``, "one hour" - :class:`~pandas.tseries.offsets.Minute`, ``'T'`` or ``'min'``,"one minute" - :class:`~pandas.tseries.offsets.Second`, ``'S'``, "one second" - :class:`~pandas.tseries.offsets.Milli`, ``'L'`` or ``'ms'``, "one millisecond" - :class:`~pandas.tseries.offsets.Micro`, ``'U'`` or ``'us'``, "one microsecond" - :class:`~pandas.tseries.offsets.Nano`, ``'N'``, "one nanosecond" + :class:`~pandas.tseries.offsets.Hour`, ``'h'``, "one hour" + :class:`~pandas.tseries.offsets.Minute`, ``'min'``,"one minute" + :class:`~pandas.tseries.offsets.Second`, ``'s'``, "one second" + :class:`~pandas.tseries.offsets.Milli`, ``'ms'``, "one millisecond" + :class:`~pandas.tseries.offsets.Micro`, ``'us'``, "one microsecond" + :class:`~pandas.tseries.offsets.Nano`, ``'ns'``, "one nanosecond" ``DateOffsets`` additionally have :meth:`rollforward` and :meth:`rollback` methods for moving a date forward or backward respectively to a valid offset @@ -1246,29 +1240,36 @@ "C", "custom business day frequency" "D", "calendar day frequency" "W", "weekly frequency" - "M", "month end frequency" - "SM", "semi-month end frequency (15th and end of month)" - "BM", "business month end frequency" - "CBM", "custom business month end frequency" + "ME", "month end frequency" + "SME", "semi-month end frequency (15th and end of month)" + "BME", "business month end frequency" + "CBME", "custom business month end frequency" "MS", "month start frequency" "SMS", "semi-month start frequency (1st and 15th)" "BMS", 
"business month start frequency" "CBMS", "custom business month start frequency" - "Q", "quarter end frequency" - "BQ", "business quarter end frequency" + "QE", "quarter end frequency" + "BQE", "business quarter end frequency" "QS", "quarter start frequency" "BQS", "business quarter start frequency" - "A, Y", "year end frequency" - "BA, BY", "business year end frequency" - "AS, YS", "year start frequency" - "BAS, BYS", "business year start frequency" - "BH", "business hour frequency" - "H", "hourly frequency" - "T, min", "minutely frequency" - "S", "secondly frequency" - "L, ms", "milliseconds" - "U, us", "microseconds" - "N", "nanoseconds" + "YE", "year end frequency" + "BYE", "business year end frequency" + "YS", "year start frequency" + "BYS", "business year start frequency" + "h", "hourly frequency" + "bh", "business hour frequency" + "cbh", "custom business hour frequency" + "min", "minutely frequency" + "s", "secondly frequency" + "ms", "milliseconds" + "us", "microseconds" + "ns", "nanoseconds" + +.. deprecated:: 2.2.0 + + Aliases ``H``, ``BH``, ``CBH``, ``T``, ``S``, ``L``, ``U``, and ``N`` + are deprecated in favour of the aliases ``h``, ``bh``, ``cbh``, + ``min``, ``s``, ``ms``, ``us``, and ``ns``. .. note:: @@ -1316,13 +1317,18 @@ "W", "weekly frequency" "M", "monthly frequency" "Q", "quarterly frequency" - "A, Y", "yearly frequency" - "H", "hourly frequency" - "T, min", "minutely frequency" - "S", "secondly frequency" - "L, ms", "milliseconds" - "U, us", "microseconds" - "N", "nanoseconds" + "Y", "yearly frequency" + "h", "hourly frequency" + "min", "minutely frequency" + "s", "secondly frequency" + "ms", "milliseconds" + "us", "microseconds" + "ns", "nanoseconds" + +.. deprecated:: 2.2.0 + + Aliases ``A``, ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` are deprecated in favour of the aliases + ``Y``, ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns``. Combining aliases @@ -1343,7 +1349,7 @@ pd.date_range(start, periods=10, freq="2h20min") - pd.date_range(start, periods=10, freq="1D10U") + pd.date_range(start, periods=10, freq="1D10us") Anchored offsets ~~~~~~~~~~~~~~~~ @@ -1361,30 +1367,30 @@ "W\-THU", "weekly frequency (Thursdays)" "W\-FRI", "weekly frequency (Fridays)" "W\-SAT", "weekly frequency (Saturdays)" - "(B)Q(S)\-DEC", "quarterly frequency, year ends in December. Same as 'Q'" - "(B)Q(S)\-JAN", "quarterly frequency, year ends in January" - "(B)Q(S)\-FEB", "quarterly frequency, year ends in February" - "(B)Q(S)\-MAR", "quarterly frequency, year ends in March" - "(B)Q(S)\-APR", "quarterly frequency, year ends in April" - "(B)Q(S)\-MAY", "quarterly frequency, year ends in May" - "(B)Q(S)\-JUN", "quarterly frequency, year ends in June" - "(B)Q(S)\-JUL", "quarterly frequency, year ends in July" - "(B)Q(S)\-AUG", "quarterly frequency, year ends in August" - "(B)Q(S)\-SEP", "quarterly frequency, year ends in September" - "(B)Q(S)\-OCT", "quarterly frequency, year ends in October" - "(B)Q(S)\-NOV", "quarterly frequency, year ends in November" - "(B)A(S)\-DEC", "annual frequency, anchored end of December. 
Same as 'A'" - "(B)A(S)\-JAN", "annual frequency, anchored end of January" - "(B)A(S)\-FEB", "annual frequency, anchored end of February" - "(B)A(S)\-MAR", "annual frequency, anchored end of March" - "(B)A(S)\-APR", "annual frequency, anchored end of April" - "(B)A(S)\-MAY", "annual frequency, anchored end of May" - "(B)A(S)\-JUN", "annual frequency, anchored end of June" - "(B)A(S)\-JUL", "annual frequency, anchored end of July" - "(B)A(S)\-AUG", "annual frequency, anchored end of August" - "(B)A(S)\-SEP", "annual frequency, anchored end of September" - "(B)A(S)\-OCT", "annual frequency, anchored end of October" - "(B)A(S)\-NOV", "annual frequency, anchored end of November" + "(B)Q(E)(S)\-DEC", "quarterly frequency, year ends in December. Same as 'QE'" + "(B)Q(E)(S)\-JAN", "quarterly frequency, year ends in January" + "(B)Q(E)(S)\-FEB", "quarterly frequency, year ends in February" + "(B)Q(E)(S)\-MAR", "quarterly frequency, year ends in March" + "(B)Q(E)(S)\-APR", "quarterly frequency, year ends in April" + "(B)Q(E)(S)\-MAY", "quarterly frequency, year ends in May" + "(B)Q(E)(S)\-JUN", "quarterly frequency, year ends in June" + "(B)Q(E)(S)\-JUL", "quarterly frequency, year ends in July" + "(B)Q(E)(S)\-AUG", "quarterly frequency, year ends in August" + "(B)Q(E)(S)\-SEP", "quarterly frequency, year ends in September" + "(B)Q(E)(S)\-OCT", "quarterly frequency, year ends in October" + "(B)Q(E)(S)\-NOV", "quarterly frequency, year ends in November" + "(B)Y(E)(S)\-DEC", "annual frequency, anchored end of December. Same as 'YE'" + "(B)Y(E)(S)\-JAN", "annual frequency, anchored end of January" + "(B)Y(E)(S)\-FEB", "annual frequency, anchored end of February" + "(B)Y(E)(S)\-MAR", "annual frequency, anchored end of March" + "(B)Y(E)(S)\-APR", "annual frequency, anchored end of April" + "(B)Y(E)(S)\-MAY", "annual frequency, anchored end of May" + "(B)Y(E)(S)\-JUN", "annual frequency, anchored end of June" + "(B)Y(E)(S)\-JUL", "annual frequency, anchored end of July" + "(B)Y(E)(S)\-AUG", "annual frequency, anchored end of August" + "(B)Y(E)(S)\-SEP", "annual frequency, anchored end of September" + "(B)Y(E)(S)\-OCT", "annual frequency, anchored end of October" + "(B)Y(E)(S)\-NOV", "annual frequency, anchored end of November" These can be used as arguments to ``date_range``, ``bdate_range``, constructors for ``DatetimeIndex``, as well as various other timeseries-related functions @@ -1574,7 +1580,7 @@ ts.shift(5, freq="D") ts.shift(5, freq=pd.offsets.BDay()) - ts.shift(5, freq="BM") + ts.shift(5, freq="BME") Note that with when ``freq`` is specified, the leading entry is no longer NaN because the data is not being realigned. @@ -1635,7 +1641,7 @@ .. ipython:: python - rng = pd.date_range("1/1/2012", periods=100, freq="S") + rng = pd.date_range("1/1/2012", periods=100, freq="s") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) @@ -1680,7 +1686,7 @@ .. warning:: The default values for ``label`` and ``closed`` is '**left**' for all - frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' + frequency offsets except for 'ME', 'YE', 'QE', 'BME', 'BYE', 'BQE', and 'W' which all have a default of 'right'. 
This might unintendedly lead to looking ahead, where the value for a later @@ -1725,11 +1731,11 @@ # from secondly to every 250 milliseconds - ts[:2].resample("250L").asfreq() + ts[:2].resample("250ms").asfreq() - ts[:2].resample("250L").ffill() + ts[:2].resample("250ms").ffill() - ts[:2].resample("250L").ffill(limit=2) + ts[:2].resample("250ms").ffill(limit=2) Sparse resampling ~~~~~~~~~~~~~~~~~ @@ -1752,7 +1758,7 @@ .. ipython:: python - ts.resample("3T").sum() + ts.resample("3min").sum() We can instead only resample those groups where we have points as follows: @@ -1764,9 +1770,10 @@ def round(t, freq): # round a Timestamp to a specified freq freq = to_offset(freq) - return pd.Timestamp((t.value // freq.delta.value) * freq.delta.value) + td = pd.Timedelta(freq) + return pd.Timestamp((t.value // td.value) * td.value) - ts.groupby(partial(round, freq="3T")).sum() + ts.groupby(partial(round, freq="3min")).sum() .. _timeseries.aggregate: @@ -1783,10 +1790,10 @@ df = pd.DataFrame( np.random.randn(1000, 3), - index=pd.date_range("1/1/2012", freq="S", periods=1000), + index=pd.date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) - r = df.resample("3T") + r = df.resample("3min") r.mean() We can select a specific column or columns using standard getitem. @@ -1846,7 +1853,7 @@ ), ) df - df.resample("M", on="date")[["a"]].sum() + df.resample("ME", on="date")[["a"]].sum() Similarly, if you instead want to resample by a datetimelike level of ``MultiIndex``, its name or location can be passed to the @@ -1854,7 +1861,7 @@ .. ipython:: python - df.resample("M", level="d")[["a"]].sum() + df.resample("ME", level="d")[["a"]].sum() .. _timeseries.iterating-label: @@ -1879,7 +1886,7 @@ ] ), ) - resampled = small.resample("H") + resampled = small.resample("h") for name, group in resampled: print("Group: ", name) @@ -1985,20 +1992,20 @@ .. ipython:: python - pd.Period("2012", freq="A-DEC") + pd.Period("2012", freq="Y-DEC") pd.Period("2012-1-1", freq="D") - pd.Period("2012-1-1 19:00", freq="H") + pd.Period("2012-1-1 19:00", freq="h") - pd.Period("2012-1-1 19:00", freq="5H") + pd.Period("2012-1-1 19:00", freq="5h") Adding and subtracting integers from periods shifts the period by its own frequency. Arithmetic is not allowed between ``Period`` with different ``freq`` (span). .. ipython:: python - p = pd.Period("2012", freq="A-DEC") + p = pd.Period("2012", freq="Y-DEC") p + 1 p - 3 p = pd.Period("2012-01", freq="2M") @@ -2007,11 +2014,11 @@ p == pd.Period("2012-01", freq="3M") -If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised. +If ``Period`` freq is daily or higher (``D``, ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised. .. ipython:: python - p = pd.Period("2014-07-01 09:00", freq="H") + p = pd.Period("2014-07-01 09:00", freq="h") p + pd.offsets.Hour(2) p + datetime.timedelta(minutes=120) p + np.timedelta64(7200, "s") @@ -2040,7 +2047,7 @@ .. ipython:: python - pd.Period("2012", freq="A-DEC") - pd.Period("2002", freq="A-DEC") + pd.Period("2012", freq="Y-DEC") - pd.Period("2002", freq="Y-DEC") PeriodIndex and period_range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2087,7 +2094,7 @@ .. 
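For the sparse-resampling pattern above, an alternative sketch (not from the patch; the data are made up) groups directly by the floored timestamps instead of defining a helper function:

.. code-block:: python

   import pandas as pd

   rng = pd.date_range("2014-01-01", periods=100, freq="D")
   ts = pd.Series(range(100), index=rng)
   sparse = ts.iloc[[0, 1, 2, 50, 51, 98, 99]]

   # only bins that actually contain points are produced
   sparse.groupby(sparse.index.floor("3D")).sum()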
ipython:: python - idx = pd.period_range("2014-07-01 09:00", periods=5, freq="H") + idx = pd.period_range("2014-07-01 09:00", periods=5, freq="h") idx idx + pd.offsets.Hour(2) @@ -2127,7 +2134,7 @@ pi.astype("datetime64[ns]") # convert to PeriodIndex - dti = pd.date_range("2011-01-01", freq="M", periods=3) + dti = pd.date_range("2011-01-01", freq="ME", periods=3) dti dti.astype("period[M]") @@ -2155,16 +2162,16 @@ dfp = pd.DataFrame( np.random.randn(600, 1), columns=["A"], - index=pd.period_range("2013-01-01 9:00", periods=600, freq="T"), + index=pd.period_range("2013-01-01 9:00", periods=600, freq="min"), ) dfp - dfp.loc["2013-01-01 10H"] + dfp.loc["2013-01-01 10h"] As with ``DatetimeIndex``, the endpoints will be included in the result. The example below slices data starting from 10:00 to 11:59. .. ipython:: python - dfp["2013-01-01 10H":"2013-01-01 11H"] + dfp["2013-01-01 10h":"2013-01-01 11h"] Frequency conversion and resampling with PeriodIndex @@ -2174,7 +2181,7 @@ .. ipython:: python - p = pd.Period("2011", freq="A-DEC") + p = pd.Period("2011", freq="Y-DEC") p We can convert it to a monthly frequency. Using the ``how`` parameter, we can @@ -2201,10 +2208,10 @@ p = pd.Period("2011-12", freq="M") - p.asfreq("A-NOV") + p.asfreq("Y-NOV") Note that since we converted to an annual frequency that ends the year in -November, the monthly period of December 2011 is actually in the 2012 A-NOV +November, the monthly period of December 2011 is actually in the 2012 Y-NOV period. .. _timeseries.quarterly: @@ -2246,7 +2253,7 @@ .. ipython:: python - rng = pd.date_range("1/1/2012", periods=5, freq="M") + rng = pd.date_range("1/1/2012", periods=5, freq="ME") ts = pd.Series(np.random.randn(len(rng)), index=rng) @@ -2276,7 +2283,7 @@ ts = pd.Series(np.random.randn(len(prng)), prng) - ts.index = (prng.asfreq("M", "e") + 1).asfreq("H", "s") + 9 + ts.index = (prng.asfreq("M", "e") + 1).asfreq("h", "s") + 9 ts.head() @@ -2495,7 +2502,7 @@ .. ipython:: python - didx = pd.date_range(start="2014-08-01 09:00", freq="H", periods=3, tz="US/Eastern") + didx = pd.date_range(start="2014-08-01 09:00", freq="h", periods=3, tz="US/Eastern") didx didx.tz_localize(None) didx.tz_convert(None) @@ -2592,7 +2599,7 @@ .. ipython:: python - dti = pd.date_range(start="2015-03-29 02:30:00", periods=3, freq="H") + dti = pd.date_range(start="2015-03-29 02:30:00", periods=3, freq="h") # 2:30 is a nonexistent time Localization of nonexistent times will raise an error by default. @@ -2609,7 +2616,7 @@ dti dti.tz_localize("Europe/Warsaw", nonexistent="shift_forward") dti.tz_localize("Europe/Warsaw", nonexistent="shift_backward") - dti.tz_localize("Europe/Warsaw", nonexistent=pd.Timedelta(1, unit="H")) + dti.tz_localize("Europe/Warsaw", nonexistent=pd.Timedelta(1, unit="h")) dti.tz_localize("Europe/Warsaw", nonexistent="NaT") diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/index.rst pandas-2.2.2+dfsg/doc/source/whatsnew/index.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/index.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/index.rst 2024-04-10 17:42:52.000000000 +0000 @@ -10,6 +10,16 @@ see the `commit logs `_. For install and upgrade instructions, see :ref:`install`. +Version 2.2 +----------- + +.. 
toctree:: + :maxdepth: 2 + + v2.2.2 + v2.2.1 + v2.2.0 + Version 2.1 ----------- diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.10.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.10.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.10.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.10.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -180,19 +180,36 @@ DataFrame constructor with no columns specified. The v0.9.0 behavior (names ``X0``, ``X1``, ...) can be reproduced by specifying ``prefix='X'``: -.. ipython:: python - :okexcept: +.. code-block:: ipython - import io + In [6]: import io - data = """ - a,b,c - 1,Yes,2 - 3,No,4 - """ - print(data) - pd.read_csv(io.StringIO(data), header=None) - pd.read_csv(io.StringIO(data), header=None, prefix="X") + In [7]: data = """ + ...: a,b,c + ...: 1,Yes,2 + ...: 3,No,4 + ...: """ + ...: + + In [8]: print(data) + + a,b,c + 1,Yes,2 + 3,No,4 + + In [9]: pd.read_csv(io.StringIO(data), header=None) + Out[9]: + 0 1 2 + 0 a b c + 1 1 Yes 2 + 2 3 No 4 + + In [10]: pd.read_csv(io.StringIO(data), header=None, prefix="X") + Out[10]: + X0 X1 X2 + 0 a b c + 1 1 Yes 2 + 2 3 No 4 - Values like ``'Yes'`` and ``'No'`` are not interpreted as boolean by default, though this can be controlled by new ``true_values`` and ``false_values`` diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.11.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.11.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.11.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.11.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -347,7 +347,7 @@ .. ipython:: python df = pd.DataFrame({'A': range(5), 'B': range(5)}) - df.to_hdf('store.h5', 'table', append=True) + df.to_hdf('store.h5', key='table', append=True) pd.read_hdf('store.h5', 'table', where=['index > 2']) .. ipython:: python @@ -367,15 +367,27 @@ - You can now select with a string from a DataFrame with a datelike index, in a similar way to a Series (:issue:`3070`) - .. ipython:: python - :okexcept: + .. code-block:: ipython - idx = pd.date_range("2001-10-1", periods=5, freq='M') - ts = pd.Series(np.random.rand(len(idx)), index=idx) - ts['2001'] + In [30]: idx = pd.date_range("2001-10-1", periods=5, freq='M') - df = pd.DataFrame({'A': ts}) - df['2001'] + In [31]: ts = pd.Series(np.random.rand(len(idx)), index=idx) + + In [32]: ts['2001'] + Out[32]: + 2001-10-31 0.117967 + 2001-11-30 0.702184 + 2001-12-31 0.414034 + Freq: M, dtype: float64 + + In [33]: df = pd.DataFrame({'A': ts}) + + In [34]: df['2001'] + Out[34]: + A + 2001-10-31 0.117967 + 2001-11-30 0.702184 + 2001-12-31 0.414034 - ``Squeeze`` to possibly remove length 1 dimensions from an object. diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.12.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.12.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.12.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.12.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -250,9 +250,9 @@ .. 
ipython:: python - from pandas._testing import makeCustomDataframe as mkdf - - df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) + mi_idx = pd.MultiIndex.from_arrays([[1, 2, 3, 4], list("abcd")], names=list("ab")) + mi_col = pd.MultiIndex.from_arrays([[1, 2], list("ab")], names=list("cd")) + df = pd.DataFrame(np.ones((4, 2)), index=mi_idx, columns=mi_col) df.to_csv("mi.csv") print(open("mi.csv").read()) pd.read_csv("mi.csv", header=[0, 1, 2, 3], index_col=[0, 1]) diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.13.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.13.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.13.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.13.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -385,7 +385,7 @@ dfq = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'), index=pd.date_range('20130101', periods=10)) - dfq.to_hdf(path, 'dfq', format='table', data_columns=True) + dfq.to_hdf(path, key='dfq', format='table', data_columns=True) Use boolean expressions, with in-line function evaluation. @@ -415,9 +415,9 @@ path = 'test.h5' df = pd.DataFrame(np.random.randn(10, 2)) - df.to_hdf(path, 'df_table', format='table') - df.to_hdf(path, 'df_table2', append=True) - df.to_hdf(path, 'df_fixed') + df.to_hdf(path, key='df_table', format='table') + df.to_hdf(path, key='df_table2', append=True) + df.to_hdf(path, key='df_fixed') with pd.HDFStore(path) as store: print(store) @@ -537,7 +537,6 @@ is frequency conversion. See :ref:`the docs` for the docs. .. ipython:: python - :okexcept: import datetime td = pd.Series(pd.date_range('20130101', periods=4)) - pd.Series( @@ -546,13 +545,41 @@ td[3] = np.nan td + .. code-block:: ipython + # to days - td / np.timedelta64(1, 'D') - td.astype('timedelta64[D]') + In [63]: td / np.timedelta64(1, 'D') + Out[63]: + 0 31.000000 + 1 31.000000 + 2 31.003507 + 3 NaN + dtype: float64 + + In [64]: td.astype('timedelta64[D]') + Out[64]: + 0 31.0 + 1 31.0 + 2 31.0 + 3 NaN + dtype: float64 # to seconds - td / np.timedelta64(1, 's') - td.astype('timedelta64[s]') + In [65]: td / np.timedelta64(1, 's') + Out[65]: + 0 2678400.0 + 1 2678400.0 + 2 2678703.0 + 3 NaN + dtype: float64 + + In [66]: td.astype('timedelta64[s]') + Out[66]: + 0 2678400.0 + 1 2678400.0 + 2 2678703.0 + 3 NaN + dtype: float64 Dividing or multiplying a ``timedelta64[ns]`` Series by an integer or integer Series @@ -642,9 +669,16 @@ Period conversions in the range of seconds and below were reworked and extended up to nanoseconds. Periods in the nanosecond range are now available. - .. ipython:: python + .. code-block:: python - pd.date_range('2013-01-01', periods=5, freq='5N') + In [79]: pd.date_range('2013-01-01', periods=5, freq='5N') + Out[79]: + DatetimeIndex([ '2013-01-01 00:00:00', + '2013-01-01 00:00:00.000000005', + '2013-01-01 00:00:00.000000010', + '2013-01-01 00:00:00.000000015', + '2013-01-01 00:00:00.000000020'], + dtype='datetime64[ns]', freq='5N') or with frequency as offset diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.13.1.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.13.1.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.13.1.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.13.1.rst 2024-04-10 17:42:52.000000000 +0000 @@ -29,11 +29,10 @@ This would previously segfault: - .. ipython:: python + .. 
code-block:: python df = pd.DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) df["A"].iloc[0] = np.nan - df The recommended way to do this type of assignment is: diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.14.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.14.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.14.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.14.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -328,19 +328,36 @@ - groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation: - .. ipython:: python + .. code-block:: ipython - df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - g.head(1) # filters DataFrame + In [1]: df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + + In [2]: g = df.groupby('A') - g.apply(lambda x: x.head(1)) # used to simply fall-through + In [3]: g.head(1) # filters DataFrame + Out[3]: + A B + 0 1 2 + 2 5 6 + + In [4]: g.apply(lambda x: x.head(1)) # used to simply fall-through + Out[4]: + A B + A + 1 0 1 2 + 5 2 5 6 - groupby head and tail respect column selection: - .. ipython:: python + .. code-block:: ipython + + In [19]: g[['B']].head(1) + Out[19]: + B + 0 2 + 2 6 - g[['B']].head(1) + [2 rows x 1 columns] - groupby ``nth`` now reduces by default; filtering can be achieved by passing ``as_index=False``. With an optional ``dropna`` argument to ignore NaN. See :ref:`the docs `. @@ -843,22 +860,61 @@ datetime.datetime(2013, 9, 5, 10, 0)]}) df - df.pivot_table(values='Quantity', - index=pd.Grouper(freq='M', key='Date'), - columns=pd.Grouper(freq='M', key='PayDay'), - aggfunc="sum") + .. code-block:: ipython + + In [75]: df.pivot_table(values='Quantity', + ....: index=pd.Grouper(freq='M', key='Date'), + ....: columns=pd.Grouper(freq='M', key='PayDay'), + ....: aggfunc="sum") + Out[75]: + PayDay 2013-09-30 2013-10-31 2013-11-30 + Date + 2013-09-30 NaN 3.0 NaN + 2013-10-31 6.0 NaN 1.0 + 2013-11-30 NaN 9.0 NaN + + [3 rows x 3 columns] - Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`) - Add :meth:`~Series.nsmallest` and :meth:`Series.nlargest` methods to Series, See :ref:`the docs ` (:issue:`3960`) - ``PeriodIndex`` fully supports partial string indexing like ``DatetimeIndex`` (:issue:`7043`) - .. ipython:: python + .. code-block:: ipython + + In [76]: prng = pd.period_range('2013-01-01 09:00', periods=100, freq='H') + + In [77]: ps = pd.Series(np.random.randn(len(prng)), index=prng) - prng = pd.period_range('2013-01-01 09:00', periods=100, freq='H') - ps = pd.Series(np.random.randn(len(prng)), index=prng) - ps - ps['2013-01-02'] + In [78]: ps + Out[78]: + 2013-01-01 09:00 0.015696 + 2013-01-01 10:00 -2.242685 + 2013-01-01 11:00 1.150036 + 2013-01-01 12:00 0.991946 + 2013-01-01 13:00 0.953324 + ... + 2013-01-05 08:00 0.285296 + 2013-01-05 09:00 0.484288 + 2013-01-05 10:00 1.363482 + 2013-01-05 11:00 -0.781105 + 2013-01-05 12:00 -0.468018 + Freq: H, Length: 100, dtype: float64 + + In [79]: ps['2013-01-02'] + Out[79]: + 2013-01-02 00:00 0.553439 + 2013-01-02 01:00 1.318152 + 2013-01-02 02:00 -0.469305 + 2013-01-02 03:00 0.675554 + 2013-01-02 04:00 -1.817027 + ... + 2013-01-02 19:00 0.036142 + 2013-01-02 20:00 -2.074978 + 2013-01-02 21:00 0.247792 + 2013-01-02 22:00 -0.897157 + 2013-01-02 23:00 -0.136795 + Freq: H, Length: 24, dtype: float64 - ``read_excel`` can now read milliseconds in Excel dates and times with xlrd >= 0.9.3. 
(:issue:`5945`) - ``pd.stats.moments.rolling_var`` now uses Welford's method for increased numerical stability (:issue:`6817`) diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.15.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.15.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.15.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.15.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -185,7 +185,29 @@ .. ipython:: python pd.timedelta_range('1 days', periods=5, freq='D') - pd.timedelta_range(start='1 days', end='2 days', freq='30T') + +.. code-block:: python + + In [20]: pd.timedelta_range(start='1 days', end='2 days', freq='30T') + Out[20]: + TimedeltaIndex(['1 days 00:00:00', '1 days 00:30:00', '1 days 01:00:00', + '1 days 01:30:00', '1 days 02:00:00', '1 days 02:30:00', + '1 days 03:00:00', '1 days 03:30:00', '1 days 04:00:00', + '1 days 04:30:00', '1 days 05:00:00', '1 days 05:30:00', + '1 days 06:00:00', '1 days 06:30:00', '1 days 07:00:00', + '1 days 07:30:00', '1 days 08:00:00', '1 days 08:30:00', + '1 days 09:00:00', '1 days 09:30:00', '1 days 10:00:00', + '1 days 10:30:00', '1 days 11:00:00', '1 days 11:30:00', + '1 days 12:00:00', '1 days 12:30:00', '1 days 13:00:00', + '1 days 13:30:00', '1 days 14:00:00', '1 days 14:30:00', + '1 days 15:00:00', '1 days 15:30:00', '1 days 16:00:00', + '1 days 16:30:00', '1 days 17:00:00', '1 days 17:30:00', + '1 days 18:00:00', '1 days 18:30:00', '1 days 19:00:00', + '1 days 19:30:00', '1 days 20:00:00', '1 days 20:30:00', + '1 days 21:00:00', '1 days 21:30:00', '1 days 22:00:00', + '1 days 22:30:00', '1 days 23:00:00', '1 days 23:30:00', + '2 days 00:00:00'], + dtype='timedelta64[ns]', freq='30T') You can now use a ``TimedeltaIndex`` as the index of a pandas object @@ -310,16 +332,37 @@ - ``tz_localize(None)`` for tz-aware ``Timestamp`` and ``DatetimeIndex`` now removes timezone holding local time, previously this resulted in ``Exception`` or ``TypeError`` (:issue:`7812`) - .. ipython:: python + .. 
code-block:: ipython + + In [58]: ts = pd.Timestamp('2014-08-01 09:00', tz='US/Eastern') + + In[59]: ts + Out[59]: Timestamp('2014-08-01 09:00:00-0400', tz='US/Eastern') + + In [60]: ts.tz_localize(None) + Out[60]: Timestamp('2014-08-01 09:00:00') - ts = pd.Timestamp('2014-08-01 09:00', tz='US/Eastern') - ts - ts.tz_localize(None) - - didx = pd.date_range(start='2014-08-01 09:00', freq='H', - periods=10, tz='US/Eastern') - didx - didx.tz_localize(None) + In [61]: didx = pd.date_range(start='2014-08-01 09:00', freq='H', + ....: periods=10, tz='US/Eastern') + ....: + + In [62]: didx + Out[62]: + DatetimeIndex(['2014-08-01 09:00:00-04:00', '2014-08-01 10:00:00-04:00', + '2014-08-01 11:00:00-04:00', '2014-08-01 12:00:00-04:00', + '2014-08-01 13:00:00-04:00', '2014-08-01 14:00:00-04:00', + '2014-08-01 15:00:00-04:00', '2014-08-01 16:00:00-04:00', + '2014-08-01 17:00:00-04:00', '2014-08-01 18:00:00-04:00'], + dtype='datetime64[ns, US/Eastern]', freq='H') + + In [63]: didx.tz_localize(None) + Out[63]: + DatetimeIndex(['2014-08-01 09:00:00', '2014-08-01 10:00:00', + '2014-08-01 11:00:00', '2014-08-01 12:00:00', + '2014-08-01 13:00:00', '2014-08-01 14:00:00', + '2014-08-01 15:00:00', '2014-08-01 16:00:00', + '2014-08-01 17:00:00', '2014-08-01 18:00:00'], + dtype='datetime64[ns]', freq=None) - ``tz_localize`` now accepts the ``ambiguous`` keyword which allows for passing an array of bools indicating whether the date belongs in DST or not, 'NaT' for setting transition times to NaT, @@ -1028,16 +1071,35 @@ If ``Period`` freq is ``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``, ``Timedelta``-like can be added if the result can have same freq. Otherwise, only the same ``offsets`` can be added. - .. ipython:: python + .. code-block:: ipython + + In [104]: idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') + + In [105]: idx + Out[105]: + PeriodIndex(['2014-07-01 09:00', '2014-07-01 10:00', '2014-07-01 11:00', + '2014-07-01 12:00', '2014-07-01 13:00'], + dtype='period[H]') + + In [106]: idx + pd.offsets.Hour(2) + Out[106]: + PeriodIndex(['2014-07-01 11:00', '2014-07-01 12:00', '2014-07-01 13:00', + '2014-07-01 14:00', '2014-07-01 15:00'], + dtype='period[H]') + + In [107]: idx + pd.Timedelta('120m') + Out[107]: + PeriodIndex(['2014-07-01 11:00', '2014-07-01 12:00', '2014-07-01 13:00', + '2014-07-01 14:00', '2014-07-01 15:00'], + dtype='period[H]') + + In [108]: idx = pd.period_range('2014-07', periods=5, freq='M') + + In [109]: idx + Out[109]: PeriodIndex(['2014-07', '2014-08', '2014-09', '2014-10', '2014-11'], dtype='period[M]') - idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') - idx - idx + pd.offsets.Hour(2) - idx + pd.Timedelta('120m') - - idx = pd.period_range('2014-07', periods=5, freq='M') - idx - idx + pd.offsets.MonthEnd(3) + In [110]: idx + pd.offsets.MonthEnd(3) + Out[110]: PeriodIndex(['2014-10', '2014-11', '2014-12', '2015-01', '2015-02'], dtype='period[M]') - Added experimental compatibility with ``openpyxl`` for versions >= 2.0. The ``DataFrame.to_excel`` method ``engine`` keyword now recognizes ``openpyxl1`` and ``openpyxl2`` diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.15.2.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.15.2.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.15.2.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.15.2.rst 2024-04-10 17:42:52.000000000 +0000 @@ -24,25 +24,61 @@ - Indexing in ``MultiIndex`` beyond lex-sort depth is now supported, though a lexically sorted index will have a better performance. 
(:issue:`2646`) - .. ipython:: python - :okexcept: - :okwarning: - - df = pd.DataFrame({'jim':[0, 0, 1, 1], - 'joe':['x', 'x', 'z', 'y'], - 'jolie':np.random.rand(4)}).set_index(['jim', 'joe']) - df - df.index.lexsort_depth + .. code-block:: ipython + + In [1]: df = pd.DataFrame({'jim':[0, 0, 1, 1], + ...: 'joe':['x', 'x', 'z', 'y'], + ...: 'jolie':np.random.rand(4)}).set_index(['jim', 'joe']) + ...: + + In [2]: df + Out[2]: + jolie + jim joe + 0 x 0.126970 + x 0.966718 + 1 z 0.260476 + y 0.897237 + + [4 rows x 1 columns] + + In [3]: df.index.lexsort_depth + Out[3]: 1 # in prior versions this would raise a KeyError # will now show a PerformanceWarning - df.loc[(1, 'z')] + In [4]: df.loc[(1, 'z')] + Out[4]: + jolie + jim joe + 1 z 0.260476 + + [1 rows x 1 columns] # lexically sorting - df2 = df.sort_index() - df2 - df2.index.lexsort_depth - df2.loc[(1,'z')] + In [5]: df2 = df.sort_index() + + In [6]: df2 + Out[6]: + jolie + jim joe + 0 x 0.126970 + x 0.966718 + 1 y 0.897237 + z 0.260476 + + [4 rows x 1 columns] + + In [7]: df2.index.lexsort_depth + Out[7]: 2 + + In [8]: df2.loc[(1,'z')] + Out[8]: + jolie + jim joe + 1 z 0.260476 + + [1 rows x 1 columns] - Bug in unique of Series with ``category`` dtype, which returned all categories regardless whether they were "used" or not (see :issue:`8559` for the discussion). diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.17.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.17.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.17.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.17.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -632,9 +632,10 @@ To keep the previous behavior, you can use ``errors='ignore'``: -.. ipython:: python +.. code-block:: ipython - pd.to_datetime(["2009-07-31", "asd"], errors="ignore") + In [4]: pd.to_datetime(["2009-07-31", "asd"], errors="ignore") + Out[4]: Index(['2009-07-31', 'asd'], dtype='object') Furthermore, ``pd.to_timedelta`` has gained a similar API, of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword has been deprecated in favor of ``errors='coerce'``. @@ -793,7 +794,7 @@ In [27]: df_with_missing.to_hdf('file.h5', - 'df_with_missing', + key='df_with_missing', format='table', mode='w') @@ -809,7 +810,7 @@ .. ipython:: python - df_with_missing.to_hdf("file.h5", "df_with_missing", format="table", mode="w") + df_with_missing.to_hdf("file.h5", key="df_with_missing", format="table", mode="w") pd.read_hdf("file.h5", "df_with_missing") diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.17.1.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.17.1.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.17.1.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.17.1.rst 2024-04-10 17:42:52.000000000 +0000 @@ -43,7 +43,7 @@ the visual styling of a DataFrame based on the data. The styling is accomplished with HTML and CSS. Accesses the styler class with the :attr:`pandas.DataFrame.style`, attribute, -an instance of :class:`~pandas.core.style.Styler` with your data attached. +an instance of :class:`.Styler` with your data attached. Here's a quick example: @@ -58,7 +58,7 @@ .. raw:: html :file: whatsnew_0171_html_table.html -:class:`~pandas.core.style.Styler` interacts nicely with the Jupyter Notebook. +:class:`.Styler` interacts nicely with the Jupyter Notebook. See the :ref:`documentation ` for more. .. 
_whatsnew_0171.enhancements: diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.18.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.18.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.18.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.18.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -808,11 +808,19 @@ performed with the ``Resampler`` objects with :meth:`~Resampler.backfill`, :meth:`~Resampler.ffill`, :meth:`~Resampler.fillna` and :meth:`~Resampler.asfreq` methods. -.. ipython:: python +.. code-block:: ipython - s = pd.Series(np.arange(5, dtype='int64'), + In [89]: s = pd.Series(np.arange(5, dtype='int64'), index=pd.date_range('2010-01-01', periods=5, freq='Q')) - s + + In [90]: s + Out[90]: + 2010-03-31 0 + 2010-06-30 1 + 2010-09-30 2 + 2010-12-31 3 + 2011-03-31 4 + Freq: Q-DEC, Length: 5, dtype: int64 Previously @@ -837,9 +845,24 @@ New API -.. ipython:: python +.. code-block:: ipython - s.resample('M').ffill() + In [91]: s.resample('M').ffill() + Out[91]: + 2010-03-31 0 + 2010-04-30 0 + 2010-05-31 0 + 2010-06-30 1 + 2010-07-31 1 + 2010-08-31 1 + 2010-09-30 2 + 2010-10-31 2 + 2010-11-30 2 + 2010-12-31 3 + 2011-01-31 3 + 2011-02-28 3 + 2011-03-31 4 + Freq: M, Length: 13, dtype: int64 .. note:: @@ -985,10 +1008,16 @@ ^^^^^^^^^^^^^^^^^ - ``DataFrame.between_time`` and ``Series.between_time`` now only parse a fixed set of time strings. Parsing of date strings is no longer supported and raises a ``ValueError``. (:issue:`11818`) - .. ipython:: python + .. code-block:: ipython + + In [107]: s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10)) - s = pd.Series(range(10), pd.date_range('2015-01-01', freq='H', periods=10)) - s.between_time("7:00am", "9:00am") + In [108]: s.between_time("7:00am", "9:00am") + Out[108]: + 2015-01-01 07:00:00 7 + 2015-01-01 08:00:00 8 + 2015-01-01 09:00:00 9 + Freq: H, Length: 3, dtype: int64 This will now raise. diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.18.1.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.18.1.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.18.1.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.18.1.rst 2024-04-10 17:42:52.000000000 +0000 @@ -77,9 +77,52 @@ df = pd.DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) df -.. ipython:: python +.. code-block:: ipython - df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) + In [1]: df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) + Out[1]: + A + 1 0 NaN + 1 NaN + 2 NaN + 3 1.5 + 4 2.5 + 5 3.5 + 6 4.5 + 7 5.5 + 8 6.5 + 9 7.5 + 10 8.5 + 11 9.5 + 12 10.5 + 13 11.5 + 14 12.5 + 15 13.5 + 16 14.5 + 17 15.5 + 18 16.5 + 19 17.5 + 2 20 NaN + 21 NaN + 22 NaN + 23 21.5 + 24 22.5 + 25 23.5 + 26 24.5 + 27 25.5 + 28 26.5 + 29 27.5 + 30 28.5 + 31 29.5 + 3 32 NaN + 33 NaN + 34 NaN + 35 33.5 + 36 34.5 + 37 35.5 + 38 36.5 + 39 37.5 + Name: B, dtype: float64 Now you can do: @@ -101,15 +144,53 @@ df -.. ipython:: python +.. code-block:: ipython - df.groupby("group").apply(lambda x: x.resample("1D").ffill()) + In[1]: df.groupby("group").apply(lambda x: x.resample("1D").ffill()) + Out[1]: + group val + group date + 1 2016-01-03 1 5 + 2016-01-04 1 5 + 2016-01-05 1 5 + 2016-01-06 1 5 + 2016-01-07 1 5 + 2016-01-08 1 5 + 2016-01-09 1 5 + 2016-01-10 1 6 + 2 2016-01-17 2 7 + 2016-01-18 2 7 + 2016-01-19 2 7 + 2016-01-20 2 7 + 2016-01-21 2 7 + 2016-01-22 2 7 + 2016-01-23 2 7 + 2016-01-24 2 8 Now you can do: -.. ipython:: python +.. 
code-block:: ipython - df.groupby("group").resample("1D").ffill() + In[1]: df.groupby("group").resample("1D").ffill() + Out[1]: + group val + group date + 1 2016-01-03 1 5 + 2016-01-04 1 5 + 2016-01-05 1 5 + 2016-01-06 1 5 + 2016-01-07 1 5 + 2016-01-08 1 5 + 2016-01-09 1 5 + 2016-01-10 1 6 + 2 2016-01-17 2 7 + 2016-01-18 2 7 + 2016-01-19 2 7 + 2016-01-20 2 7 + 2016-01-21 2 7 + 2016-01-22 2 7 + 2016-01-23 2 7 + 2016-01-24 2 8 .. _whatsnew_0181.enhancements.method_chain: @@ -175,26 +256,78 @@ Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiIndex`` (:issue:`10331`) -.. ipython:: python +.. code-block:: ipython + + In [20]: dft2 = pd.DataFrame( + ....: np.random.randn(20, 1), + ....: columns=["A"], + ....: index=pd.MultiIndex.from_product( + ....: [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] + ....: ), + ....: ) + ....: + + In [21]: dft2 + Out[21]: + A + 2013-01-01 00:00:00 a 0.469112 + b -0.282863 + 2013-01-01 12:00:00 a -1.509059 + b -1.135632 + 2013-01-02 00:00:00 a 1.212112 + ... ... + 2013-01-04 12:00:00 b 0.271860 + 2013-01-05 00:00:00 a -0.424972 + b 0.567020 + 2013-01-05 12:00:00 a 0.276232 + b -1.087401 + + [20 rows x 1 columns] + + In [22]: dft2.loc["2013-01-05"] + Out[22]: + A + 2013-01-05 00:00:00 a -0.424972 + b 0.567020 + 2013-01-05 12:00:00 a 0.276232 + b -1.087401 - dft2 = pd.DataFrame( - np.random.randn(20, 1), - columns=["A"], - index=pd.MultiIndex.from_product( - [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] - ), - ) - dft2 - dft2.loc["2013-01-05"] + [4 rows x 1 columns] On other levels -.. ipython:: python +.. code-block:: ipython + + In [26]: idx = pd.IndexSlice + + In [27]: dft2 = dft2.swaplevel(0, 1).sort_index() + + In [28]: dft2 + Out[28]: + A + a 2013-01-01 00:00:00 0.469112 + 2013-01-01 12:00:00 -1.509059 + 2013-01-02 00:00:00 1.212112 + 2013-01-02 12:00:00 0.119209 + 2013-01-03 00:00:00 -0.861849 + ... ... + b 2013-01-03 12:00:00 1.071804 + 2013-01-04 00:00:00 -0.706771 + 2013-01-04 12:00:00 0.271860 + 2013-01-05 00:00:00 0.567020 + 2013-01-05 12:00:00 -1.087401 + + [20 rows x 1 columns] + + In [29]: dft2.loc[idx[:, "2013-01-05"], :] + Out[29]: + A + a 2013-01-05 00:00:00 -0.424972 + 2013-01-05 12:00:00 0.276232 + b 2013-01-05 00:00:00 0.567020 + 2013-01-05 12:00:00 -1.087401 - idx = pd.IndexSlice - dft2 = dft2.swaplevel(0, 1).sort_index() - dft2 - dft2.loc[idx[:, "2013-01-05"], :] + [4 rows x 1 columns] .. _whatsnew_0181.enhancements.assembling: diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.19.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.19.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.19.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.19.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -329,11 +329,13 @@ **SemiMonthEnd**: -.. ipython:: python +.. code-block:: python - pd.Timestamp("2016-01-01") + SemiMonthEnd() + In [46]: pd.Timestamp("2016-01-01") + SemiMonthEnd() + Out[46]: Timestamp('2016-01-15 00:00:00') - pd.date_range("2015-01-01", freq="SM", periods=4) + In [47]: pd.date_range("2015-01-01", freq="SM", periods=4) + Out[47]: DatetimeIndex(['2015-01-15', '2015-01-31', '2015-02-15', '2015-02-28'], dtype='datetime64[ns]', freq='SM-15') **SemiMonthBegin**: @@ -345,11 +347,13 @@ Using the anchoring suffix, you can also specify the day of month to use instead of the 15th. -.. ipython:: python +.. 
code-block:: python - pd.date_range("2015-01-01", freq="SMS-16", periods=4) + In [50]: pd.date_range("2015-01-01", freq="SMS-16", periods=4) + Out[50]: DatetimeIndex(['2015-01-01', '2015-01-16', '2015-02-01', '2015-02-16'], dtype='datetime64[ns]', freq='SMS-16') - pd.date_range("2015-01-01", freq="SM-14", periods=4) + In [51]: pd.date_range("2015-01-01", freq="SM-14", periods=4) + Out[51]: DatetimeIndex(['2015-01-14', '2015-01-31', '2015-02-14', '2015-02-28'], dtype='datetime64[ns]', freq='SM-14') .. _whatsnew_0190.enhancements.index: @@ -498,8 +502,26 @@ ), ) df - df.resample("M", on="date")[["a"]].sum() - df.resample("M", level="d")[["a"]].sum() + + .. code-block:: ipython + + In [74]: df.resample("M", on="date")[["a"]].sum() + Out[74]: + a + date + 2015-01-31 6 + 2015-02-28 4 + + [2 rows x 1 columns] + + In [75]: df.resample("M", level="d")[["a"]].sum() + Out[75]: + a + d + 2015-01-31 6 + 2015-02-28 4 + + [2 rows x 1 columns] - The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the docs for more details (:issue:`13577`). - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`) diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.20.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.20.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.20.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.20.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -614,11 +614,18 @@ ``map`` on a ``Series`` with ``datetime64`` values may return ``int64`` dtypes rather than ``int32`` -.. ipython:: python +.. code-block:: ipython - s = pd.Series(pd.date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H') - .tz_localize('Asia/Tokyo')) - s + In [64]: s = pd.Series(pd.date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H') + ....: .tz_localize('Asia/Tokyo')) + ....: + + In [65]: s + Out[65]: + 0 2011-01-02 00:00:00+09:00 + 1 2011-01-02 01:00:00+09:00 + 2 2011-01-02 02:00:00+09:00 + Length: 3, dtype: datetime64[ns, Asia/Tokyo] Previous behavior: @@ -633,9 +640,14 @@ New behavior: -.. ipython:: python +.. code-block:: ipython - s.map(lambda x: x.hour) + In [66]: s.map(lambda x: x.hour) + Out[66]: + 0 0 + 1 1 + 2 2 + Length: 3, dtype: int64 .. _whatsnew_0200.api_breaking.index_dt_field: @@ -659,10 +671,12 @@ New behavior: -.. ipython:: python +.. code-block:: ipython - idx = pd.date_range("2015-01-01", periods=5, freq='10H') - idx.hour + In [67]: idx = pd.date_range("2015-01-01", periods=5, freq='10H') + + In [68]: idx.hour + Out[68]: Index([0, 10, 20, 6, 16], dtype='int32') This has the advantage that specific ``Index`` methods are still available on the result. On the other hand, this might have backward incompatibilities: e.g. @@ -872,11 +886,23 @@ This is *unchanged* from prior versions, but shown for illustration purposes: -.. ipython:: python +.. code-block:: python - df = pd.DataFrame(np.arange(6), columns=['value'], - index=pd.MultiIndex.from_product([list('BA'), range(3)])) - df + In [81]: df = pd.DataFrame(np.arange(6), columns=['value'], + ....: index=pd.MultiIndex.from_product([list('BA'), range(3)])) + ....: + In [82]: df + + Out[82]: + value + B 0 0 + 1 1 + 2 2 + A 0 3 + 1 4 + 2 5 + + [6 rows x 1 columns] .. code-block:: python @@ -1059,7 +1085,7 @@ .. 
ipython:: python df = pd.DataFrame({'unparsed_date': ['2014-01-01', '2014-01-01']}) - df.to_hdf('store.h5', 'key', format='table', data_columns=True) + df.to_hdf('store.h5', key='key', format='table', data_columns=True) df.dtypes Previous behavior: diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.20.2.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.20.2.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.20.2.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.20.2.rst 2024-04-10 17:42:52.000000000 +0000 @@ -28,8 +28,8 @@ - Unblocked access to additional compression types supported in pytables: 'blosc:blosclz, 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`) - ``Series`` provides a ``to_latex`` method (:issue:`16180`) -- A new groupby method :meth:`~pandas.core.groupby.GroupBy.ngroup`, - parallel to the existing :meth:`~pandas.core.groupby.GroupBy.cumcount`, +- A new groupby method :meth:`.GroupBy.ngroup`, + parallel to the existing :meth:`.GroupBy.cumcount`, has been added to return the group order (:issue:`11642`); see :ref:`here `. diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.21.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.21.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.21.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.21.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -306,7 +306,7 @@ New functions or methods """""""""""""""""""""""" -- :meth:`~pandas.core.resample.Resampler.nearest` is added to support nearest-neighbor upsampling (:issue:`17496`). +- :meth:`.Resampler.nearest` is added to support nearest-neighbor upsampling (:issue:`17496`). - :class:`~pandas.Index` has added support for a ``to_frame`` method (:issue:`15230`). New keywords @@ -392,7 +392,7 @@ The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames no longer depends on whether `bottleneck `__ is installed, and return value of ``sum`` and ``prod`` on an empty Series has changed (:issue:`9422`, :issue:`15507`). -Calling ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, will result in ``NaN``. See the :ref:`docs `. +Calling ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, will result in ``NaN``. See the :ref:`docs `. .. ipython:: python @@ -635,17 +635,22 @@ New behavior: -.. ipython:: python +.. code-block:: ipython - pi = pd.period_range('2017-01', periods=12, freq='M') + In [1]: pi = pd.period_range('2017-01', periods=12, freq='M') - s = pd.Series(np.arange(12), index=pi) + In [2]: s = pd.Series(np.arange(12), index=pi) - resampled = s.resample('2Q').mean() + In [3]: resampled = s.resample('2Q').mean() - resampled + In [4]: resampled + Out[4]: + 2017Q1 2.5 + 2017Q3 8.5 + Freq: 2Q-DEC, dtype: float64 - resampled.index + In [5]: resampled.index + Out[5]: PeriodIndex(['2017Q1', '2017Q3'], dtype='period[2Q-DEC]') Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a DataFrame with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior. @@ -671,15 +676,35 @@ New behavior: -.. ipython:: python +.. 
code-block:: ipython + + In [56]: pi = pd.period_range(start='2000-01-01', freq='D', periods=10) - pi = pd.period_range(start='2000-01-01', freq='D', periods=10) + In [57]: s = pd.Series(np.arange(10), index=pi) - s = pd.Series(np.arange(10), index=pi) + In [58]: s.resample('H').ohlc() + Out[58]: + open high low close + 2000-01-01 00:00 0.0 0.0 0.0 0.0 + 2000-01-01 01:00 NaN NaN NaN NaN + 2000-01-01 02:00 NaN NaN NaN NaN + 2000-01-01 03:00 NaN NaN NaN NaN + 2000-01-01 04:00 NaN NaN NaN NaN + ... ... ... ... ... + 2000-01-10 19:00 NaN NaN NaN NaN + 2000-01-10 20:00 NaN NaN NaN NaN + 2000-01-10 21:00 NaN NaN NaN NaN + 2000-01-10 22:00 NaN NaN NaN NaN + 2000-01-10 23:00 NaN NaN NaN NaN - s.resample('H').ohlc() + [240 rows x 4 columns] + + In [59]: s.resample('M').ohlc() + Out[59]: + open high low close + 2000-01 0 9 0 9 - s.resample('M').ohlc() + [1 rows x 4 columns] .. _whatsnew_0210.api_breaking.pandas_eval: diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.22.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.22.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.22.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.22.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -187,16 +187,27 @@ *pandas 0.22.0* -.. ipython:: python +.. code-block:: ipython - idx = pd.DatetimeIndex(["2017-01-01", "2017-01-02"]) - pd.Series([1, 2], index=idx).resample("12H").sum() + In [14]: idx = pd.DatetimeIndex(["2017-01-01", "2017-01-02"]) + In [15]: pd.Series([1, 2], index=idx).resample("12H").sum() + Out[15]: + 2017-01-01 00:00:00 1 + 2017-01-01 12:00:00 0 + 2017-01-02 00:00:00 2 + Freq: 12H, Length: 3, dtype: int64 Once again, the ``min_count`` keyword is available to restore the 0.21 behavior. -.. ipython:: python +.. code-block:: ipython + + In [16]: pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1) + Out[16]: + 2017-01-01 00:00:00 1.0 + 2017-01-01 12:00:00 NaN + 2017-01-02 00:00:00 2.0 + Freq: 12H, Length: 3, dtype: float64 - pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1) Rolling and expanding ^^^^^^^^^^^^^^^^^^^^^ diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.23.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.23.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.23.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.23.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -286,12 +286,33 @@ df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) df -.. ipython:: python - pd.pivot_table(df, values='values', index=['A', 'B'], - dropna=True) - pd.pivot_table(df, values='values', index=['A', 'B'], - dropna=False) +.. code-block:: ipython + + In [1]: pd.pivot_table(df, values='values', index=['A', 'B'], dropna=True) + + Out[1]: + values + A B + a c 1.0 + d 2.0 + b c 3.0 + d 4.0 + + In [2]: pd.pivot_table(df, values='values', index=['A', 'B'], dropna=False) + + Out[2]: + values + A B + a c 1.0 + d 2.0 + y NaN + b c 3.0 + d 4.0 + y NaN + z c NaN + d NaN + y NaN .. _whatsnew_0230.enhancements.window_raw: @@ -299,8 +320,8 @@ Rolling/Expanding.apply() accepts ``raw=False`` to pass a ``Series`` to the function ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, -:func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` have gained a ``raw=None`` parameter. 
+:func:`Series.rolling().apply() <.Rolling.apply>`, :func:`DataFrame.rolling().apply() <.Rolling.apply>`, +:func:`Series.expanding().apply() <.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <.Expanding.apply>` have gained a ``raw=None`` parameter. This is similar to :func:`DataFame.apply`. This parameter, if ``True`` allows one to send a ``np.ndarray`` to the applied function. If ``False`` a ``Series`` will be passed. The default is ``None``, which preserves backward compatibility, so this will default to ``True``, sending an ``np.ndarray``. In a future version the default will be changed to ``False``, sending a ``Series``. (:issue:`5071`, :issue:`20584`) @@ -524,7 +545,7 @@ - ``Categorical.rename_categories``, ``CategoricalIndex.rename_categories`` and :attr:`Series.cat.rename_categories` can now take a callable as their argument (:issue:`18862`) - :class:`Interval` and :class:`IntervalIndex` have gained a ``length`` attribute (:issue:`18789`) -- ``Resampler`` objects now have a functioning :attr:`~pandas.core.resample.Resampler.pipe` method. +- ``Resampler`` objects now have a functioning :attr:`.Resampler.pipe` method. Previously, calls to ``pipe`` were diverted to the ``mean`` method (:issue:`17905`). - :func:`~pandas.api.types.is_scalar` now returns ``True`` for ``DateOffset`` objects (:issue:`18943`). - :func:`DataFrame.pivot` now accepts a list for the ``values=`` kwarg (:issue:`17160`). @@ -536,7 +557,7 @@ - ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`) - :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`) -- Added :func:`pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing` and :func:`pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing` (:issue:`17015`) +- Added :func:`.SeriesGroupBy.is_monotonic_increasing` and :func:`.SeriesGroupBy.is_monotonic_decreasing` (:issue:`17015`) - For subclassed ``DataFrames``, :func:`DataFrame.apply` will now preserve the ``Series`` subclass (if defined) when passing the data to the applied function (:issue:`19822`) - :func:`DataFrame.from_dict` now accepts a ``columns`` argument that can be used to specify the column names when ``orient='index'`` is used (:issue:`18529`) - Added option ``display.html.use_mathjax`` so `MathJax `_ can be disabled when rendering tables in ``Jupyter`` notebooks (:issue:`19856`, :issue:`19824`) @@ -547,7 +568,7 @@ ``SQLAlchemy`` dialects supporting multi-value inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`) - :func:`read_html` now accepts a ``displayed_only`` keyword argument to controls whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) - :func:`read_html` now reads all ```` elements in a ````, not just the first. (:issue:`20690`) -- :meth:`~pandas.core.window.Rolling.quantile` and :meth:`~pandas.core.window.Expanding.quantile` now accept the ``interpolation`` keyword, ``linear`` by default (:issue:`20497`) +- :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile` now accept the ``interpolation`` keyword, ``linear`` by default (:issue:`20497`) - zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. 
(:issue:`17778`) - :class:`~pandas.tseries.offsets.WeekOfMonth` constructor now supports ``n=0`` (:issue:`20517`). - :class:`DataFrame` and :class:`Series` now support matrix multiplication (``@``) operator (:issue:`10259`) for Python>=3.5 @@ -1052,7 +1073,7 @@ - :func:`DatetimeIndex.strftime` and :func:`PeriodIndex.strftime` now return an ``Index`` instead of a numpy array to be consistent with similar accessors (:issue:`20127`) - Constructing a Series from a list of length 1 no longer broadcasts this list when a longer index is specified (:issue:`19714`, :issue:`20391`). - :func:`DataFrame.to_dict` with ``orient='index'`` no longer casts int columns to float for a DataFrame with only int and float columns (:issue:`18580`) -- A user-defined-function that is passed to :func:`Series.rolling().aggregate() `, :func:`DataFrame.rolling().aggregate() `, or its expanding cousins, will now *always* be passed a ``Series``, rather than a ``np.array``; ``.apply()`` only has the ``raw`` keyword, see :ref:`here `. This is consistent with the signatures of ``.aggregate()`` across pandas (:issue:`20584`) +- A user-defined-function that is passed to :func:`Series.rolling().aggregate() <.Rolling.aggregate>`, :func:`DataFrame.rolling().aggregate() <.Rolling.aggregate>`, or its expanding cousins, will now *always* be passed a ``Series``, rather than a ``np.array``; ``.apply()`` only has the ``raw`` keyword, see :ref:`here `. This is consistent with the signatures of ``.aggregate()`` across pandas (:issue:`20584`) - Rolling and Expanding types raise ``NotImplementedError`` upon iteration (:issue:`11704`). .. _whatsnew_0230.deprecations: @@ -1084,8 +1105,7 @@ - ``Index.summary()`` is deprecated and will be removed in a future version (:issue:`18217`) - ``NDFrame.get_ftype_counts()`` is deprecated and will be removed in a future version (:issue:`18243`) - The ``convert_datetime64`` parameter in :func:`DataFrame.to_records` has been deprecated and will be removed in a future version. The NumPy bug motivating this parameter has been resolved. The default value for this parameter has also changed from ``True`` to ``None`` (:issue:`18160`). -- :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, - :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` have deprecated passing an ``np.array`` by default. One will need to pass the new ``raw`` parameter to be explicit about what is passed (:issue:`20584`) +- :func:`Series.rolling().apply() <.Rolling.apply>`, :func:`DataFrame.rolling().apply() <.Rolling.apply>`, :func:`Series.expanding().apply() <.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <.Expanding.apply>` have deprecated passing an ``np.array`` by default. One will need to pass the new ``raw`` parameter to be explicit about what is passed (:issue:`20584`) - The ``data``, ``base``, ``strides``, ``flags`` and ``itemsize`` properties of the ``Series`` and ``Index`` classes have been deprecated and will be removed in a future version (:issue:`20419`). 
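The ``raw`` keyword discussed in the ``Rolling``/``Expanding`` ``.apply()`` notes above controls whether the user-defined function receives a ``np.ndarray`` (``raw=True``) or a ``Series`` (``raw=False``). A minimal sketch of the difference, with illustrative variable names that are not taken from the patch itself:

.. code-block:: python

   import numpy as np
   import pandas as pd

   s = pd.Series(np.arange(5, dtype="float64"))

   # raw=True: the function sees a plain ndarray (the faster path).
   s.rolling(3).apply(lambda x: x.max() - x.min(), raw=True)

   # raw=False: the function sees a Series, so index-aware methods
   # such as idxmax() are available, at some performance cost.
   s.rolling(3).apply(lambda x: x.idxmax(), raw=False)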
@@ -1159,15 +1179,15 @@ - Improved performance of :func:`MultiIndex.remove_unused_levels` when there are no unused levels, at the cost of a reduction in performance when there are (:issue:`19289`) - Improved performance of :func:`Index.get_loc` for non-unique indexes (:issue:`19478`) - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) -- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` (:issue:`15779`) +- Improved performance of :func:`.GroupBy.rank` (:issue:`15779`) - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) -- Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`) -- Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`) -- Improved performance of :func:`pandas.core.groupby.GroupBy.pct_change` (:issue:`19165`) +- Improved performance of :func:`.GroupBy.ffill` and :func:`.GroupBy.bfill` (:issue:`11296`) +- Improved performance of :func:`.GroupBy.any` and :func:`.GroupBy.all` (:issue:`15435`) +- Improved performance of :func:`.GroupBy.pct_change` (:issue:`19165`) - Improved performance of :func:`Series.isin` in the case of categorical dtypes (:issue:`20003`) - Improved performance of ``getattr(Series, attr)`` when the Series has certain index types. This manifested in slow printing of large Series with a ``DatetimeIndex`` (:issue:`19764`) - Fixed a performance regression for :func:`GroupBy.nth` and :func:`GroupBy.last` with some object columns (:issue:`19283`) -- Improved performance of :func:`pandas.core.arrays.Categorical.from_codes` (:issue:`18501`) +- Improved performance of :func:`.Categorical.from_codes` (:issue:`18501`) .. _whatsnew_0230.docs: @@ -1412,13 +1432,13 @@ - Bug in :func:`DataFrame.groupby` where aggregation by ``first``/``last``/``min``/``max`` was causing timestamps to lose precision (:issue:`19526`) - Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) - Bug in :func:`DataFrame.groupby` passing the ``on=`` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) -- Bug in :func:`DataFrame.resample().aggregate ` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) +- Bug in :func:`DataFrame.resample().aggregate <.Resampler.aggregate>` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) - Bug in :func:`DataFrameGroupBy.cumsum` and :func:`DataFrameGroupBy.cumprod` when ``skipna`` was passed (:issue:`19806`) - Bug in :func:`DataFrame.resample` that dropped timezone information (:issue:`13238`) - Bug in :func:`DataFrame.groupby` where transformations using ``np.all`` and ``np.any`` were raising a ``ValueError`` (:issue:`20653`) - Bug in :func:`DataFrame.resample` where ``ffill``, ``bfill``, ``pad``, ``backfill``, ``fillna``, ``interpolate``, and ``asfreq`` were ignoring ``loffset``. 
(:issue:`20744`) - Bug in :func:`DataFrame.groupby` when applying a function that has mixed data types and the user supplied function can fail on the grouping column (:issue:`20949`) -- Bug in :func:`DataFrameGroupBy.rolling().apply() ` where operations performed against the associated :class:`DataFrameGroupBy` object could impact the inclusion of the grouped item(s) in the result (:issue:`14013`) +- Bug in :func:`DataFrameGroupBy.rolling().apply() <.Rolling.apply>` where operations performed against the associated :class:`DataFrameGroupBy` object could impact the inclusion of the grouped item(s) in the result (:issue:`14013`) Sparse ^^^^^^ diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.23.1.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.23.1.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.23.1.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.23.1.rst 2024-04-10 17:42:52.000000000 +0000 @@ -100,8 +100,8 @@ **Groupby/resample/rolling** - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) -- Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) -- Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` +- Bug in :func:`.GroupBy.ffill` and :func:`.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) +- Bug in :func:`.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` - Bug in :func:`pandas.DataFrame.rolling` and :func:`pandas.Series.rolling` which incorrectly accepted a 0 window size rather than raising (:issue:`21286`) **Data-type specific** diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.24.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.24.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.24.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.24.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -286,6 +286,7 @@ .. ipython:: python + from io import StringIO result = pd.read_html(StringIO("""
@@ -334,7 +335,7 @@ df.style.pipe(format_and_align).set_caption('Summary of results.') Similar methods already exist for other classes in pandas, including :meth:`DataFrame.pipe`, -:meth:`GroupBy.pipe() `, and :meth:`Resampler.pipe() `. +:meth:`GroupBy.pipe() <.GroupBy.pipe>`, and :meth:`Resampler.pipe() <.Resampler.pipe>`. .. _whatsnew_0240.enhancements.rename_axis: @@ -404,7 +405,7 @@ now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) and a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`) - The result of :meth:`~DataFrame.resample` is now iterable similar to ``groupby()`` (:issue:`15314`). -- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`pandas.core.resample.Resampler.quantile` (:issue:`15023`). +- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`.Resampler.quantile` (:issue:`15023`). - :meth:`DataFrame.resample` and :meth:`Series.resample` with a :class:`PeriodIndex` will now respect the ``base`` argument in the same fashion as with a :class:`DatetimeIndex`. (:issue:`23882`) - :meth:`pandas.api.types.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) @@ -1377,18 +1378,22 @@ *New behavior*: -.. ipython:: python - :okexcept: - :okwarning: +.. code-block:: ipython + + In [108]: ts = pd.Timestamp('1994-05-06 12:15:16', freq=pd.offsets.Hour()) + + In[109]: ts + 2 * ts.freq + Out[109]: Timestamp('1994-05-06 14:15:16', freq='H') + + In [110]: tdi = pd.timedelta_range('1D', periods=2) - ts = pd.Timestamp('1994-05-06 12:15:16', freq=pd.offsets.Hour()) - ts + 2 * ts.freq + In [111]: tdi - np.array([2 * tdi.freq, 1 * tdi.freq]) + Out[111]: TimedeltaIndex(['-1 days', '1 days'], dtype='timedelta64[ns]', freq=None) - tdi = pd.timedelta_range('1D', periods=2) - tdi - np.array([2 * tdi.freq, 1 * tdi.freq]) + In [112]: dti = pd.date_range('2001-01-01', periods=2, freq='7D') - dti = pd.date_range('2001-01-01', periods=2, freq='7D') - dti + pd.Index([1 * dti.freq, 2 * dti.freq]) + In [113]: dti + pd.Index([1 * dti.freq, 2 * dti.freq]) + Out[113]: DatetimeIndex(['2001-01-08', '2001-01-22'], dtype='datetime64[ns]', freq=None) .. _whatsnew_0240.deprecations.integer_tz: @@ -1544,7 +1549,7 @@ shows similar speed improvements as above (:issue:`21659`) - Improved performance of :meth:`CategoricalIndex.equals` when comparing to another :class:`CategoricalIndex` (:issue:`24023`) - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) -- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) +- Improved performance of :func:`.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) - Improved performance of :meth:`Series.at` and :meth:`Index.get_value` for Extension Arrays values (e.g. 
:class:`Categorical`) (:issue:`24204`) - Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` @@ -1852,28 +1857,28 @@ GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :func:`pandas.core.window.Rolling.min` and :func:`pandas.core.window.Rolling.max` with ``closed='left'``, a datetime-like index and only one entry in the series leading to segfault (:issue:`24718`) -- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) +- Bug in :func:`.Rolling.min` and :func:`.Rolling.max` with ``closed='left'``, a datetime-like index and only one entry in the series leading to segfault (:issue:`24718`) +- Bug in :func:`.GroupBy.first` and :func:`.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) - Bug in :meth:`DateFrame.resample` when downsampling across a DST boundary (:issue:`8531`) - Bug in date anchoring for :meth:`DateFrame.resample` with offset :class:`Day` when n > 1 (:issue:`24127`) -- Bug where ``ValueError`` is wrongly raised when calling :func:`~pandas.core.groupby.SeriesGroupBy.count` method of a +- Bug where ``ValueError`` is wrongly raised when calling :func:`.SeriesGroupBy.count` method of a ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). -- Multiple bugs in :func:`pandas.core.window.Rolling.min` with ``closed='left'`` and a +- Multiple bugs in :func:`.Rolling.min` with ``closed='left'`` and a datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) -- Bug in :meth:`pandas.core.resample.Resampler.apply` when passing positional arguments to applied func (:issue:`14615`). +- Bug in :meth:`.Resampler.apply` when passing positional arguments to applied func (:issue:`14615`). - Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). -- Bug in :meth:`pandas.core.resample.Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). -- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. (:issue:`22487`) -- :func:`pandas.core.groupby.RollingGroupby.agg` and :func:`pandas.core.groupby.ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`) +- Bug in :meth:`.Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). +- Bug in :meth:`.SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. (:issue:`22487`) +- :func:`.RollingGroupby.agg` and :func:`.ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`) - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`) - Bug in :meth:`DataFrame.expanding` in which the ``axis`` argument was not being respected during aggregations (:issue:`23372`) -- Bug in :meth:`pandas.core.groupby.GroupBy.transform` which caused missing values when the input function can accept a :class:`DataFrame` but renames it (:issue:`23455`). 
-- Bug in :func:`pandas.core.groupby.GroupBy.nth` where column order was not always preserved (:issue:`20760`) -- Bug in :meth:`pandas.core.groupby.GroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ``ZeroDivisionError`` (:issue:`23666`). -- Calling :meth:`pandas.core.groupby.GroupBy.rank` with empty groups and ``pct=True`` was raising a ``ZeroDivisionError`` (:issue:`22519`) +- Bug in :meth:`.GroupBy.transform` which caused missing values when the input function can accept a :class:`DataFrame` but renames it (:issue:`23455`). +- Bug in :func:`.GroupBy.nth` where column order was not always preserved (:issue:`20760`) +- Bug in :meth:`.GroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ``ZeroDivisionError`` (:issue:`23666`). +- Calling :meth:`.GroupBy.rank` with empty groups and ``pct=True`` was raising a ``ZeroDivisionError`` (:issue:`22519`) - Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). - Bug in :meth:`DataFrame.groupby` did not respect the ``observed`` argument when selecting a column and instead always used ``observed=False`` (:issue:`23970`) -- Bug in :func:`pandas.core.groupby.SeriesGroupBy.pct_change` or :func:`pandas.core.groupby.DataFrameGroupBy.pct_change` would previously work across groups when calculating the percent change, where it now correctly works per group (:issue:`21200`, :issue:`21235`). +- Bug in :func:`.SeriesGroupBy.pct_change` or :func:`.DataFrameGroupBy.pct_change` would previously work across groups when calculating the percent change, where it now correctly works per group (:issue:`21200`, :issue:`21235`). - Bug preventing hash table creation with very large number (2^32) of rows (:issue:`22805`) - Bug in groupby when grouping on categorical causes ``ValueError`` and incorrect grouping if ``observed=True`` and ``nan`` is present in categorical column (:issue:`24740`, :issue:`21151`). @@ -1887,7 +1892,7 @@ - Bug in :meth:`DataFrame.where` with an empty DataFrame and empty ``cond`` having non-bool dtype (:issue:`21947`) - Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`) - Bug in :meth:`DataFrame.replace` raises RecursionError when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`) -- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) +- :func:`.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) - Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) - Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) @@ -1905,7 +1910,7 @@ - Bug in :func:`pandas.concat` when joining ``Series`` datetimetz with ``Series`` category would lose timezone (:issue:`23816`) - Bug in :meth:`DataFrame.join` when joining on partial MultiIndex would drop names (:issue:`20452`). 
- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) -- Constructing a DataFrame with an index argument that wasn't already an instance of :class:`~pandas.core.Index` was broken (:issue:`22227`). +- Constructing a DataFrame with an index argument that wasn't already an instance of :class:`.Index` was broken (:issue:`22227`). - Bug in :class:`DataFrame` prevented list subclasses to be used to construction (:issue:`21226`) - Bug in :func:`DataFrame.unstack` and :func:`DataFrame.pivot_table` returning a misleading error message when the resulting DataFrame has more elements than int32 can handle. Now, the error message is improved, pointing towards the actual problem (:issue:`20601`) - Bug in :func:`DataFrame.unstack` where a ``ValueError`` was raised when unstacking timezone aware values (:issue:`18338`) diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.24.2.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.24.2.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.24.2.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.24.2.rst 2024-04-10 17:42:52.000000000 +0000 @@ -54,7 +54,7 @@ **Reshaping** -- Bug in :meth:`~pandas.core.groupby.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`) +- Bug in :meth:`.GroupBy.transform` where applying a function to a timezone aware column would return a timezone naive result (:issue:`24198`) - Bug in :func:`DataFrame.join` when joining on a timezone aware :class:`DatetimeIndex` (:issue:`23931`) **Visualization** diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.25.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.25.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.25.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.25.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -89,7 +89,7 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can now provide multiple lambda functions to a list-like aggregation in -:class:`pandas.core.groupby.GroupBy.agg` (:issue:`26430`). +:class:`.GroupBy.agg` (:issue:`26430`). .. ipython:: python @@ -225,7 +225,7 @@ - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) - :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`) -- :meth:`pandas.core.window.Rolling` supports exponential (or Poisson) window type (:issue:`21303`) +- :meth:`.Rolling` supports exponential (or Poisson) window type (:issue:`21303`) - Error message for missing required imports now includes the original import error's text (:issue:`23868`) - :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`) - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`) @@ -311,7 +311,7 @@ ``GroupBy.apply`` on ``DataFrame`` evaluates first group only once ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The implementation of :meth:`DataFrameGroupBy.apply() ` +The implementation of :meth:`.DataFrameGroupBy.apply` previously evaluated the supplied function consistently twice on the first group to infer if it is safe to use a fast code path. 
Particularly for functions with side effects, this was an undesired behavior and may have led to surprises. (:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`, :issue:`20084`, :issue:`21417`) @@ -493,7 +493,7 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The methods ``ffill``, ``bfill``, ``pad`` and ``backfill`` of -:class:`DataFrameGroupBy ` +:class:`.DataFrameGroupBy` previously included the group labels in the return value, which was inconsistent with other groupby transforms. Now only the filled values are returned. (:issue:`21521`) @@ -885,8 +885,8 @@ - :meth:`Timestamp.strptime` will now rise a ``NotImplementedError`` (:issue:`25016`) - Comparing :class:`Timestamp` with unsupported objects now returns :py:obj:`NotImplemented` instead of raising ``TypeError``. This implies that unsupported rich comparisons are delegated to the other object, and are now consistent with Python 3 behavior for ``datetime`` objects (:issue:`24011`) - Bug in :meth:`DatetimeIndex.snap` which didn't preserving the ``name`` of the input :class:`Index` (:issue:`25575`) -- The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) -- The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) +- The ``arg`` argument in :meth:`.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) +- The ``arg`` argument in :meth:`.Window.aggregate` has been renamed to ``func`` (:issue:`26372`) - Most pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) - The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) @@ -991,7 +991,7 @@ - :meth:`DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) - Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) -- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`) +- Improved performance of :meth:`.GroupBy.quantile` (:issue:`20405`) - Improved performance of slicing and other selected operation on a :class:`RangeIndex` (:issue:`26565`, :issue:`26617`, :issue:`26722`) - :class:`RangeIndex` now performs standard lookup without instantiating an actual hashtable, hence saving memory (:issue:`16685`) - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`) @@ -1117,7 +1117,7 @@ - Bug in :meth:`DataFrame.loc` and :meth:`Series.loc` where ``KeyError`` was not raised for a ``MultiIndex`` when the key was less than or equal to the number of levels in the :class:`MultiIndex` (:issue:`14885`). - Bug in which :meth:`DataFrame.append` produced an erroneous warning indicating that a ``KeyError`` will be thrown in the future when the data to be appended contains new columns (:issue:`22252`). - Bug in which :meth:`DataFrame.to_csv` caused a segfault for a reindexed data frame, when the indices were single-level :class:`MultiIndex` (:issue:`26303`). 
-- Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`pandas.core.frame.DataFrame` would raise error (:issue:`26390`) +- Fixed bug where assigning a :class:`arrays.PandasArray` to a :class:`.DataFrame` would raise error (:issue:`26390`) - Allow keyword arguments for callable local reference used in the :meth:`DataFrame.query` string (:issue:`26426`) - Fixed a ``KeyError`` when indexing a :class:`MultiIndex` level with a list containing exactly one label, which is missing (:issue:`27148`) - Bug which produced ``AttributeError`` on partial matching :class:`Timestamp` in a :class:`MultiIndex` (:issue:`26944`) @@ -1190,28 +1190,28 @@ GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`) -- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`) -- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`) -- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) -- Bug in :func:`pandas.core.groupby.GroupBy.size` when grouping only NA values (:issue:`23050`) +- Bug in :meth:`.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`) +- Bug in :meth:`.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`) +- Bug in :func:`.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`) +- Bug in :func:`.GroupBy.first` and :func:`.GroupBy.last` where timezone information would be dropped (:issue:`21603`) +- Bug in :func:`.GroupBy.size` when grouping only NA values (:issue:`23050`) - Bug in :func:`Series.groupby` where ``observed`` kwarg was previously ignored (:issue:`24880`) - Bug in :func:`Series.groupby` where using ``groupby`` with a :class:`MultiIndex` Series with a list of labels equal to the length of the series caused incorrect grouping (:issue:`25704`) - Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`) - Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`) -- Bug in :meth:`pandas.core.window.Rolling.min` and :meth:`pandas.core.window.Rolling.max` that caused a memory leak (:issue:`25893`) -- Bug in :meth:`pandas.core.window.Rolling.count` and ``pandas.core.window.Expanding.count`` was previously ignoring the ``axis`` keyword (:issue:`13503`) -- Bug in :meth:`pandas.core.groupby.GroupBy.idxmax` and :meth:`pandas.core.groupby.GroupBy.idxmin` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`) -- Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) -- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) -- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise a ``ValueError`` 
(:issue:`26208`) -- Bug in :meth:`pandas.core.frame.DataFrame.groupby` where passing a :class:`pandas.core.groupby.grouper.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`) -- Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. (:issue:`26310`) -- Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`) -- Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`) -- Improved :class:`pandas.core.window.Rolling`, :class:`pandas.core.window.Window` and :class:`pandas.core.window.ExponentialMovingWindow` functions to exclude nuisance columns from results instead of raising errors and raise a ``DataError`` only if all columns are nuisance (:issue:`12537`) -- Bug in :meth:`pandas.core.window.Rolling.max` and :meth:`pandas.core.window.Rolling.min` where incorrect results are returned with an empty variable window (:issue:`26005`) -- Raise a helpful exception when an unsupported weighted window function is used as an argument of :meth:`pandas.core.window.Window.aggregate` (:issue:`26597`) +- Bug in :meth:`.Rolling.min` and :meth:`.Rolling.max` that caused a memory leak (:issue:`25893`) +- Bug in :meth:`.Rolling.count` and ``.Expanding.count`` was previously ignoring the ``axis`` keyword (:issue:`13503`) +- Bug in :meth:`.GroupBy.idxmax` and :meth:`.GroupBy.idxmin` with datetime column would return incorrect dtype (:issue:`25444`, :issue:`15306`) +- Bug in :meth:`.GroupBy.cumsum`, :meth:`.GroupBy.cumprod`, :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) +- Bug in :meth:`.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) +- Bug in :meth:`.SeriesGroupBy.transform` where transforming an empty group would raise a ``ValueError`` (:issue:`26208`) +- Bug in :meth:`.DataFrame.groupby` where passing a :class:`.Grouper` would return incorrect groups when using the ``.groups`` accessor (:issue:`26326`) +- Bug in :meth:`.GroupBy.agg` where incorrect results are returned for uint64 columns. 
(:issue:`26310`) +- Bug in :meth:`.Rolling.median` and :meth:`.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`) +- Bug in :meth:`.Rolling.median` and :meth:`.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`) +- Improved :class:`.Rolling`, :class:`.Window` and :class:`.ExponentialMovingWindow` functions to exclude nuisance columns from results instead of raising errors and raise a ``DataError`` only if all columns are nuisance (:issue:`12537`) +- Bug in :meth:`.Rolling.max` and :meth:`.Rolling.min` where incorrect results are returned with an empty variable window (:issue:`26005`) +- Raise a helpful exception when an unsupported weighted window function is used as an argument of :meth:`.Window.aggregate` (:issue:`26597`) Reshaping ^^^^^^^^^ diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.25.1.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.25.1.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.25.1.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.25.1.rst 2024-04-10 17:42:52.000000000 +0000 @@ -86,10 +86,10 @@ ^^^^^^^^^^^^^^^^^^^^^^^^ - Fixed regression in :meth:`pands.core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`) -- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`) -- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) +- Bug in :meth:`.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`) +- Bug in :meth:`.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) - Bug in windowing over read-only arrays (:issue:`27766`) -- Fixed segfault in ``pandas.core.groupby.DataFrameGroupBy.quantile`` when an invalid quantile was passed (:issue:`27470`) +- Fixed segfault in ``.DataFrameGroupBy.quantile`` when an invalid quantile was passed (:issue:`27470`) Reshaping ^^^^^^^^^ diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.25.2.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.25.2.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.25.2.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.25.2.rst 2024-04-10 17:42:52.000000000 +0000 @@ -31,8 +31,8 @@ GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`). -- Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`) +- Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`.DataFrameGroupBy.quantile` (:issue:`28113`). 
+- Bug in :meth:`.GroupBy.shift`, :meth:`.GroupBy.bfill` and :meth:`.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`) Other ^^^^^ diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v0.4.x.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v0.4.x.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v0.4.x.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v0.4.x.rst 2024-04-10 17:42:52.000000000 +0000 @@ -11,8 +11,7 @@ - Added Python 3 support using 2to3 (:issue:`200`) - :ref:`Added ` ``name`` attribute to ``Series``, now prints as part of ``Series.__repr__`` -- :ref:`Added ` instance methods ``isnull`` and ``notnull`` to - Series (:issue:`209`, :issue:`203`) +- :meth:`Series.isnull`` and :meth:`Series.notnull` (:issue:`209`, :issue:`203`) - :ref:`Added ` ``Series.align`` method for aligning two series with choice of join method (ENH56_) - :ref:`Added ` method ``get_level_values`` to diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v1.0.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v1.0.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v1.0.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v1.0.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -242,7 +242,7 @@ - :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue:`30270`) - :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`) - :meth:`read_stata` can read Stata 119 dta files. (:issue:`28250`) -- Implemented :meth:`pandas.core.window.Window.var` and :meth:`pandas.core.window.Window.std` functions (:issue:`26597`) +- Implemented :meth:`.Window.var` and :meth:`.Window.std` functions (:issue:`26597`) - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`) - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`) @@ -987,7 +987,7 @@ - The 'outer' method on Numpy ufuncs, e.g. 
``np.subtract.outer`` operating on :class:`Series` objects is no longer supported, and will raise ``NotImplementedError`` (:issue:`27198`) - Removed ``Series.get_dtype_counts`` and ``DataFrame.get_dtype_counts`` (:issue:`27145`) - Changed the default "fill_value" argument in :meth:`Categorical.take` from ``True`` to ``False`` (:issue:`20841`) -- Changed the default value for the ``raw`` argument in :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` from ``None`` to ``False`` (:issue:`20584`) +- Changed the default value for the ``raw`` argument in :func:`Series.rolling().apply() <.Rolling.apply>`, :func:`DataFrame.rolling().apply() <.Rolling.apply>`, :func:`Series.expanding().apply() <.Expanding.apply>`, and :func:`DataFrame.expanding().apply() <.Expanding.apply>` from ``None`` to ``False`` (:issue:`20584`) - Removed deprecated behavior of :meth:`Series.argmin` and :meth:`Series.argmax`, use :meth:`Series.idxmin` and :meth:`Series.idxmax` for the old behavior (:issue:`16955`) - Passing a tz-aware ``datetime.datetime`` or :class:`Timestamp` into the :class:`Timestamp` constructor with the ``tz`` argument now raises a ``ValueError`` (:issue:`23621`) - Removed ``Series.base``, ``Index.base``, ``Categorical.base``, ``Series.flags``, ``Index.flags``, ``PeriodArray.flags``, ``Series.strides``, ``Index.strides``, ``Series.itemsize``, ``Index.itemsize``, ``Series.data``, ``Index.data`` (:issue:`20721`) @@ -1079,7 +1079,7 @@ - Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`) - Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`) - Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`) -- Bug in :func:`pandas.to_datetime` failing for ``deques`` when using ``cache=True`` (the default) (:issue:`29403`) +- Bug in :func:`pandas.to_datetime` failing for ``deque`` objects when using ``cache=True`` (the default) (:issue:`29403`) - Bug in :meth:`Series.item` with ``datetime64`` or ``timedelta64`` dtype, :meth:`DatetimeIndex.item`, and :meth:`TimedeltaIndex.item` returning an integer instead of a :class:`Timestamp` or :class:`Timedelta` (:issue:`30175`) - Bug in :class:`DatetimeIndex` addition when adding a non-optimized :class:`DateOffset` incorrectly dropping timezone information (:issue:`30336`) - Bug in :meth:`DataFrame.drop` where attempting to drop non-existent values from a DatetimeIndex would yield a confusing error message (:issue:`30399`) @@ -1217,7 +1217,7 @@ - Bug in :meth:`core.groupby.DataFrameGroupBy.apply` only showing output from a single group when function returns an :class:`Index` (:issue:`28652`) - Bug in :meth:`DataFrame.groupby` with multiple groups where an ``IndexError`` would be raised if any group contained all NA values (:issue:`20519`) -- Bug in :meth:`pandas.core.resample.Resampler.size` and :meth:`pandas.core.resample.Resampler.count` returning wrong dtype when used with an empty :class:`Series` or :class:`DataFrame` (:issue:`28427`) +- Bug in :meth:`.Resampler.size` and :meth:`.Resampler.count` returning wrong dtype when used with an empty :class:`Series` or :class:`DataFrame` (:issue:`28427`) - Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue:`28192`) - Bug in :meth:`DataFrame.rolling` not allowing rolling over multi-index levels 
(:issue:`15584`). - Bug in :meth:`DataFrame.rolling` not allowing rolling on monotonic decreasing time indexes (:issue:`19248`). diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v1.0.2.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v1.0.2.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v1.0.2.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v1.0.2.rst 2024-04-10 17:42:52.000000000 +0000 @@ -19,8 +19,8 @@ - Fixed regression in :meth:`.DataFrameGroupBy.agg` and :meth:`.SeriesGroupBy.agg` which were failing on frames with :class:`MultiIndex` columns and a custom function (:issue:`31777`) - Fixed regression in ``groupby(..).rolling(..).apply()`` (``RollingGroupby``) where the ``raw`` parameter was ignored (:issue:`31754`) -- Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) -- Fixed regression in :meth:`groupby(..).nunique() ` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) +- Fixed regression in :meth:`rolling(..).corr() <.Rolling.corr>` when using a time offset (:issue:`31789`) +- Fixed regression in :meth:`groupby(..).nunique() <.DataFrameGroupBy.nunique>` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) - Fixed regression in ``DataFrame.groupby`` raising a ``ValueError`` from an internal operation (:issue:`31802`) - Fixed regression in :meth:`.DataFrameGroupBy.agg` and :meth:`.SeriesGroupBy.agg` calling a user-provided function an extra time on an empty input (:issue:`31760`) diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v1.1.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v1.1.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v1.1.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v1.1.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -313,9 +313,9 @@ - :meth:`melt` has gained an ``ignore_index`` (default ``True``) argument that, if set to ``False``, prevents the method from dropping the index (:issue:`17440`). 
- :meth:`Series.update` now accepts objects that can be coerced to a :class:`Series`, such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`) -- :meth:`~pandas.core.groupby.DataFrameGroupBy.transform` and :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` have gained ``engine`` and ``engine_kwargs`` arguments that support executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`) -- :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`) -- :class:`~pandas.core.groupby.DataFrameGroupBy` and :class:`~pandas.core.groupby.SeriesGroupBy` now implement the ``sample`` method for doing random sampling within groups (:issue:`31775`) +- :meth:`.DataFrameGroupBy.transform` and :meth:`.DataFrameGroupBy.aggregate` have gained ``engine`` and ``engine_kwargs`` arguments that support executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`) +- :meth:`.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`) +- :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` now implement the ``sample`` method for doing random sampling within groups (:issue:`31775`) - :meth:`DataFrame.to_numpy` now supports the ``na_value`` keyword to control the NA sentinel in the output array (:issue:`33820`) - Added :class:`api.extension.ExtensionArray.equals` to the extension array interface, similar to :meth:`Series.equals` (:issue:`27081`) - The minimum supported dta version has increased to 105 in :func:`read_stata` and :class:`~pandas.io.stata.StataReader` (:issue:`26667`). @@ -327,10 +327,10 @@ and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`). - :meth:`HDFStore.put` now accepts a ``track_times`` parameter. This parameter is passed to the ``create_table`` method of ``PyTables`` (:issue:`32682`). - :meth:`Series.plot` and :meth:`DataFrame.plot` now accepts ``xlabel`` and ``ylabel`` parameters to present labels on x and y axis (:issue:`9093`). -- Made :class:`pandas.core.window.rolling.Rolling` and :class:`pandas.core.window.expanding.Expanding` iterable(:issue:`11704`) +- Made :class:`.Rolling` and :class:`.Expanding` iterable(:issue:`11704`) - Made ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) -- :meth:`~pandas.core.groupby.DataFrameGroupBy.groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). +- :meth:`.DataFrameGroupBy.groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). - :func:`read_json` now accepts an ``nrows`` parameter. (:issue:`33916`). - :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) - :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example @@ -344,7 +344,7 @@ - :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similar to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`). 
- :meth:`DataFrame.to_markdown` and :meth:`Series.to_markdown` now accept ``index`` argument as an alias for tabulate's ``showindex`` (:issue:`32667`) - :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable Boolean dtype (:issue:`34859`) -- :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) +- :class:`.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) - ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`) - :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for ``yerr`` and/or ``xerr``, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) @@ -629,7 +629,7 @@ df.groupby("a", as_index=False).nunique() -The method :meth:`~pandas.core.groupby.DataFrameGroupBy.size` would previously ignore ``as_index=False``. Now the grouping columns are returned as columns, making the result a :class:`DataFrame` instead of a :class:`Series`. (:issue:`32599`) +The method :meth:`.DataFrameGroupBy.size` would previously ignore ``as_index=False``. Now the grouping columns are returned as columns, making the result a :class:`DataFrame` instead of a :class:`Series`. (:issue:`32599`) *Previous behavior*: @@ -650,10 +650,10 @@ .. _whatsnew_110.api_breaking.groupby_results_lost_as_index_false: -:meth:`~pandas.core.groupby.DataFrameGroupby.agg` lost results with ``as_index=False`` when relabeling columns -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:meth:`.DataFrameGroupby.agg` lost results with ``as_index=False`` when relabeling columns +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previously :meth:`~pandas.core.groupby.DataFrameGroupby.agg` lost the result columns, when the ``as_index`` option was +Previously :meth:`.DataFrameGroupby.agg` lost the result columns, when the ``as_index`` option was set to ``False`` and the result columns were relabeled. In this case the result values were replaced with the previous index (:issue:`32240`). @@ -878,14 +878,14 @@ sparse values from ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). -- Performance improvement for groupby methods :meth:`~pandas.core.groupby.groupby.Groupby.first` - and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`) +- Performance improvement for groupby methods :meth:`.Groupby.first` + and :meth:`.Groupby.last` (:issue:`34178`) - Performance improvement in :func:`factorize` for nullable (integer and Boolean) dtypes (:issue:`33064`). 
- Performance improvement when constructing :class:`Categorical` objects (:issue:`33921`) - Fixed performance regression in :func:`pandas.qcut` and :func:`pandas.cut` (:issue:`33921`) - Performance improvement in reductions (``sum``, ``prod``, ``min``, ``max``) for nullable (integer and Boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). - Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) -- Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`) +- Performance improvement in :class:`.RollingGroupby` (:issue:`34052`) - Performance improvement in arithmetic operations (``sub``, ``add``, ``mul``, ``div``) for :class:`MultiIndex` (:issue:`34297`) - Performance improvement in ``DataFrame[bool_indexer]`` when ``bool_indexer`` is a ``list`` (:issue:`33924`) - Significant performance improvement of :meth:`io.formats.style.Styler.render` with styles added with various ways such as :meth:`io.formats.style.Styler.apply`, :meth:`io.formats.style.Styler.applymap` or :meth:`io.formats.style.Styler.bar` (:issue:`19917`) diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v1.1.1.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v1.1.1.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v1.1.1.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v1.1.1.rst 2024-04-10 17:42:52.000000000 +0000 @@ -30,7 +30,7 @@ - Fixed regression where :func:`pandas.merge_asof` would raise a ``UnboundLocalError`` when ``left_index``, ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) - Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`) -- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`) +- Fixed regression in :meth:`.DataFrameGroupBy.aggregate` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`) - Fixed memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`) .. --------------------------------------------------------------------------- diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v1.2.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v1.2.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v1.2.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v1.2.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -793,13 +793,13 @@ - Bug in :meth:`DataFrame.resample` that would throw a ``ValueError`` when resampling from ``"D"`` to ``"24H"`` over a transition into daylight savings time (DST) (:issue:`35219`) - Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`) - Bug in :meth:`.DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply`` (:issue:`34656`) -- Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. 
``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values (:issue:`9959`) +- Bug when subsetting columns on a :class:`.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values (:issue:`9959`) - Bug in :meth:`.DataFrameGroupBy.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) - Bug in :meth:`.DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) - Bug in :meth:`.Rolling.sum` returned wrong values when dtypes where mixed between float and integer and ``axis=1`` (:issue:`20649`, :issue:`35596`) - Bug in :meth:`.Rolling.count` returned ``np.nan`` with :class:`~pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in the window (:issue:`35579`) -- Bug where :class:`pandas.core.window.Rolling` produces incorrect window sizes when using a ``PeriodIndex`` (:issue:`34225`) +- Bug where :class:`.Rolling` produces incorrect window sizes when using a ``PeriodIndex`` (:issue:`34225`) - Bug in :meth:`.DataFrameGroupBy.ffill` and :meth:`.DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`) - Bug in :meth:`.RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`) - Bug in :meth:`.DataFrameGroupBy.rolling` returning wrong values with partial centered window (:issue:`36040`) diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v1.4.2.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v1.4.2.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v1.4.2.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v1.4.2.rst 2024-04-10 17:42:52.000000000 +0000 @@ -33,7 +33,7 @@ - Fix some cases for subclasses that define their ``_constructor`` properties as general callables (:issue:`46018`) - Fixed "longtable" formatting in :meth:`.Styler.to_latex` when ``column_format`` is given in extended format (:issue:`46037`) - Fixed incorrect rendering in :meth:`.Styler.format` with ``hyperlinks="html"`` when the url contains a colon or other special characters (:issue:`46389`) -- Improved error message in :class:`~pandas.core.window.Rolling` when ``window`` is a frequency and ``NaT`` is in the rolling axis (:issue:`46087`) +- Improved error message in :class:`.Rolling` when ``window`` is a frequency and ``NaT`` is in the rolling axis (:issue:`46087`) .. --------------------------------------------------------------------------- diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v1.5.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v1.5.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v1.5.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v1.5.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -112,14 +112,33 @@ of pandas, not specifying ``group_keys`` will default to the same behavior as ``group_keys=False``. -.. ipython:: python +.. 
code-block:: ipython - df = pd.DataFrame( - {'a': range(6)}, - index=pd.date_range("2021-01-01", periods=6, freq="8H") - ) - df.resample("D", group_keys=True).apply(lambda x: x) - df.resample("D", group_keys=False).apply(lambda x: x) + In [11]: df = pd.DataFrame( + ....: {'a': range(6)}, + ....: index=pd.date_range("2021-01-01", periods=6, freq="8H") + ....: ) + ....: + + In [12]: df.resample("D", group_keys=True).apply(lambda x: x) + Out[12]: + a + 2021-01-01 2021-01-01 00:00:00 0 + 2021-01-01 08:00:00 1 + 2021-01-01 16:00:00 2 + 2021-01-02 2021-01-02 00:00:00 3 + 2021-01-02 08:00:00 4 + 2021-01-02 16:00:00 5 + + In [13]: df.resample("D", group_keys=False).apply(lambda x: x) + Out[13]: + a + 2021-01-01 00:00:00 0 + 2021-01-01 08:00:00 1 + 2021-01-01 16:00:00 2 + 2021-01-02 00:00:00 3 + 2021-01-02 08:00:00 4 + 2021-01-02 16:00:00 5 Previously, the resulting index would depend upon the values returned by ``apply``, as seen in the following example. @@ -461,20 +480,21 @@ *Old Behavior* -.. ipython:: python - - index = pd.date_range( - start='2020-12-28 00:00:00', - end='2020-12-28 02:00:00', - freq='1H', - ) - a = pd.Series( - data=range(3), - index=index, - ) - .. code-block:: ipython + In [32]: index = pd.date_range( + ....: start='2020-12-28 00:00:00', + ....: end='2020-12-28 02:00:00', + ....: freq='1H', + ....: ) + ....: + + In [33]: a = pd.Series( + ....: data=range(3), + ....: index=index, + ....: ) + ....: + In [4]: from io import StringIO In [5]: a.to_json(date_format='iso') @@ -485,12 +505,16 @@ *New Behavior* -.. ipython:: python +.. code-block:: ipython + + In [34]: from io import StringIO + + In [35]: a.to_json(date_format='iso') + Out[35]: '{"2020-12-28T00:00:00.000Z":0,"2020-12-28T01:00:00.000Z":1,"2020-12-28T02:00:00.000Z":2}' - from io import StringIO - a.to_json(date_format='iso') # Roundtripping now works - pd.read_json(StringIO(a.to_json(date_format='iso')), typ="series").index == a.index + In [36]: pd.read_json(StringIO(a.to_json(date_format='iso')), typ="series").index == a.index + Out[36]: array([ True, True, True]) .. _whatsnew_150.notable_bug_fixes.groupby_value_counts_categorical: diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v2.0.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v2.0.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v2.0.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v2.0.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -76,7 +76,7 @@ .. ipython:: python - idx = pd.date_range(start='1/1/2018', periods=3, freq='M') + idx = pd.date_range(start='1/1/2018', periods=3, freq='ME') idx.array.year idx.year diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v2.1.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v2.1.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v2.1.0.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v2.1.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -432,7 +432,7 @@ In [3]: ser[0] = 'not an int64' FutureWarning: - Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. + Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'not an int64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first. 
In [4]: ser @@ -789,6 +789,7 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmax` returns wrong dtype when used on an empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`) +- Bug in :meth:`DataFrame.groupby.rank` on nullable datatypes when passing ``na_option="bottom"`` or ``na_option="top"`` (:issue:`54206`) - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` in incorrectly allowing non-fixed ``freq`` when resampling on a :class:`TimedeltaIndex` (:issue:`51896`) - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` losing time zone when resampling empty data (:issue:`53664`) - Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where ``origin`` has no effect in resample when values are outside of axis (:issue:`53662`) diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v2.1.4.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v2.1.4.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v2.1.4.rst 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v2.1.4.rst 2024-04-10 17:42:52.000000000 +0000 @@ -22,8 +22,9 @@ ~~~~~~~~~ - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) - Bug in :class:`Series` when trying to cast date-like string inputs to :class:`ArrowDtype` of ``pyarrow.timestamp`` (:issue:`56266`) -- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55753`) +- Bug in :class:`Timestamp` construction with ``ts_input="now"`` or ``ts_input="today"`` giving a different unit from :meth:`Timestamp.now` or :meth:`Timestamp.today` (:issue:`55879`) - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) +- Fixed bug in :func:`read_csv` not respecting object dtype when ``infer_string`` option is set (:issue:`56047`) - Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`) - Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) @@ -34,7 +35,6 @@ - Fixed bug in :meth:`Series.reset_index` not preserving object dtype when ``infer_string`` is set (:issue:`56160`) - Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`) - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) -- .. --------------------------------------------------------------------------- .. _whatsnew_214.contributors: @@ -42,4 +42,4 @@ Contributors ~~~~~~~~~~~~ -.. contributors:: v2.1.3..v2.1.4|HEAD +.. contributors:: v2.1.3..v2.1.4 diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v2.2.0.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v2.2.0.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v2.2.0.rst 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v2.2.0.rst 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,949 @@ +.. _whatsnew_220: + +What's new in 2.2.0 (January 19, 2024) +-------------------------------------- + +These are the changes in pandas 2.2.0. 
See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_220.upcoming_changes: + +Upcoming changes in pandas 3.0 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +pandas 3.0 will bring two bigger changes to the default behavior of pandas. + +Copy-on-Write +^^^^^^^^^^^^^ + +The currently optional mode Copy-on-Write will be enabled by default in pandas 3.0. There +won't be an option to keep the current behavior enabled. The new behavioral semantics are +explained in the :ref:`user guide about Copy-on-Write `. + +The new behavior can be enabled since pandas 2.0 with the following option: + +.. code-block:: ipython + + pd.options.mode.copy_on_write = True + +This change brings different changes in behavior in how pandas operates with respect to +copies and views. Some of these changes allow a clear deprecation, like the changes in +chained assignment. Other changes are more subtle and thus, the warnings are hidden behind +an option that can be enabled in pandas 2.2. + +.. code-block:: ipython + + pd.options.mode.copy_on_write = "warn" + +This mode will warn in many different scenarios that aren't actually relevant to +most queries. We recommend exploring this mode, but it is not necessary to get rid +of all of these warnings. The :ref:`migration guide ` +explains the upgrade process in more detail. + +Dedicated string data type (backed by Arrow) by default +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Historically, pandas represented string columns with NumPy object data type. This +representation has numerous problems, including slow performance and a large memory +footprint. This will change in pandas 3.0. pandas will start inferring string columns +as a new ``string`` data type, backed by Arrow, which represents strings contiguous in memory. This brings +a huge performance and memory improvement. + +Old behavior: + +.. code-block:: ipython + + In [1]: ser = pd.Series(["a", "b"]) + Out[1]: + 0 a + 1 b + dtype: object + +New behavior: + + +.. code-block:: ipython + + In [1]: ser = pd.Series(["a", "b"]) + Out[1]: + 0 a + 1 b + dtype: string + +The string data type that is used in these scenarios will mostly behave as NumPy +object would, including missing value semantics and general operations on these +columns. + +This change includes a few additional changes across the API: + +- Currently, specifying ``dtype="string"`` creates a dtype that is backed by Python strings + which are stored in a NumPy array. This will change in pandas 3.0, this dtype + will create an Arrow backed string column. +- The column names and the Index will also be backed by Arrow strings. +- PyArrow will become a required dependency with pandas 3.0 to accommodate this change. + +This future dtype inference logic can be enabled with: + +.. code-block:: ipython + + pd.options.future.infer_string = True + +.. _whatsnew_220.enhancements: + +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_220.enhancements.adbc_support: + +ADBC Driver support in to_sql and read_sql +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_sql` and :meth:`~DataFrame.to_sql` now work with `Apache Arrow ADBC +`_ drivers. Compared to +traditional drivers used via SQLAlchemy, ADBC drivers should provide +significant performance improvements, better type support and cleaner +nullability handling. + +.. 
code-block:: ipython + + import adbc_driver_postgresql.dbapi as pg_dbapi + + df = pd.DataFrame( + [ + [1, 2, 3], + [4, 5, 6], + ], + columns=['a', 'b', 'c'] + ) + uri = "postgresql://postgres:postgres@localhost/postgres" + with pg_dbapi.connect(uri) as conn: + df.to_sql("pandas_table", conn, index=False) + + # for round-tripping + with pg_dbapi.connect(uri) as conn: + df2 = pd.read_sql("pandas_table", conn) + +The Arrow type system offers a wider array of types that can more closely match +what databases like PostgreSQL can offer. To illustrate, note this (non-exhaustive) +listing of types available in different databases and pandas backends: + ++-----------------+-----------------------+----------------+---------+ +|numpy/pandas |arrow |postgres |sqlite | ++=================+=======================+================+=========+ +|int16/Int16 |int16 |SMALLINT |INTEGER | ++-----------------+-----------------------+----------------+---------+ +|int32/Int32 |int32 |INTEGER |INTEGER | ++-----------------+-----------------------+----------------+---------+ +|int64/Int64 |int64 |BIGINT |INTEGER | ++-----------------+-----------------------+----------------+---------+ +|float32 |float32 |REAL |REAL | ++-----------------+-----------------------+----------------+---------+ +|float64 |float64 |DOUBLE PRECISION|REAL | ++-----------------+-----------------------+----------------+---------+ +|object |string |TEXT |TEXT | ++-----------------+-----------------------+----------------+---------+ +|bool |``bool_`` |BOOLEAN | | ++-----------------+-----------------------+----------------+---------+ +|datetime64[ns] |timestamp(us) |TIMESTAMP | | ++-----------------+-----------------------+----------------+---------+ +|datetime64[ns,tz]|timestamp(us,tz) |TIMESTAMPTZ | | ++-----------------+-----------------------+----------------+---------+ +| |date32 |DATE | | ++-----------------+-----------------------+----------------+---------+ +| |month_day_nano_interval|INTERVAL | | ++-----------------+-----------------------+----------------+---------+ +| |binary |BINARY |BLOB | ++-----------------+-----------------------+----------------+---------+ +| |decimal128 |DECIMAL [#f1]_ | | ++-----------------+-----------------------+----------------+---------+ +| |list |ARRAY [#f1]_ | | ++-----------------+-----------------------+----------------+---------+ +| |struct |COMPOSITE TYPE | | +| | | [#f1]_ | | ++-----------------+-----------------------+----------------+---------+ + +.. rubric:: Footnotes + +.. [#f1] Not implemented as of writing, but theoretically possible + +If you are interested in preserving database types as best as possible +throughout the lifecycle of your DataFrame, users are encouraged to +leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql` + +.. code-block:: ipython + + # for round-tripping + with pg_dbapi.connect(uri) as conn: + df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow") + +This will prevent your data from being converted to the traditional pandas/NumPy +type system, which often converts SQL types in ways that make them impossible to +round-trip. + +For a full list of ADBC drivers and their development status, see the `ADBC Driver +Implementation Status `_ +documentation. + +.. _whatsnew_220.enhancements.case_when: + +Create a pandas Series based on one or more conditions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`Series.case_when` function has been added to create a Series object based on one or more conditions. 
(:issue:`39154`) + +.. ipython:: python + + import pandas as pd + + df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6])) + default=pd.Series('default', index=df.index) + default.case_when( + caselist=[ + (df.a == 1, 'first'), # condition, replacement + (df.a.gt(1) & df.b.eq(5), 'second'), # condition, replacement + ], + ) + +.. _whatsnew_220.enhancements.to_numpy_ea: + +``to_numpy`` for NumPy nullable and Arrow types converts to suitable NumPy dtype +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``to_numpy`` for NumPy nullable and Arrow types will now convert to a +suitable NumPy dtype instead of ``object`` dtype for nullable and PyArrow backed extension dtypes. + +*Old behavior:* + +.. code-block:: ipython + + In [1]: ser = pd.Series([1, 2, 3], dtype="Int64") + In [2]: ser.to_numpy() + Out[2]: array([1, 2, 3], dtype=object) + +*New behavior:* + +.. ipython:: python + + ser = pd.Series([1, 2, 3], dtype="Int64") + ser.to_numpy() + + ser = pd.Series([1, 2, 3], dtype="timestamp[ns][pyarrow]") + ser.to_numpy() + +The default NumPy dtype (without any arguments) is determined as follows: + +- float dtypes are cast to NumPy floats +- integer dtypes without missing values are cast to NumPy integer dtypes +- integer dtypes with missing values are cast to NumPy float dtypes and ``NaN`` is used as missing value indicator +- boolean dtypes without missing values are cast to NumPy bool dtype +- boolean dtypes with missing values keep object dtype +- datetime and timedelta types are cast to Numpy datetime64 and timedelta64 types respectively and ``NaT`` is used as missing value indicator + +.. _whatsnew_220.enhancements.struct_accessor: + +Series.struct accessor for PyArrow structured data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``Series.struct`` accessor provides attributes and methods for processing +data with ``struct[pyarrow]`` dtype Series. For example, +:meth:`Series.struct.explode` converts PyArrow structured data to a pandas +DataFrame. (:issue:`54938`) + +.. ipython:: python + + import pyarrow as pa + series = pd.Series( + [ + {"project": "pandas", "version": "2.2.0"}, + {"project": "numpy", "version": "1.25.2"}, + {"project": "pyarrow", "version": "13.0.0"}, + ], + dtype=pd.ArrowDtype( + pa.struct([ + ("project", pa.string()), + ("version", pa.string()), + ]) + ), + ) + series.struct.explode() + +Use :meth:`Series.struct.field` to index into a (possible nested) +struct field. + + +.. ipython:: python + + series.struct.field("project") + +.. _whatsnew_220.enhancements.list_accessor: + +Series.list accessor for PyArrow list data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``Series.list`` accessor provides attributes and methods for processing +data with ``list[pyarrow]`` dtype Series. For example, +:meth:`Series.list.__getitem__` allows indexing pyarrow lists in +a Series. (:issue:`55323`) + +.. ipython:: python + + import pyarrow as pa + series = pd.Series( + [ + [1, 2, 3], + [4, 5], + [6], + ], + dtype=pd.ArrowDtype( + pa.list_(pa.int64()) + ), + ) + series.list[0] + +.. _whatsnew_220.enhancements.calamine: + +Calamine engine for :func:`read_excel` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``calamine`` engine was added to :func:`read_excel`. +It uses ``python-calamine``, which provides Python bindings for the Rust library `calamine `__. +This engine supports Excel files (``.xlsx``, ``.xlsm``, ``.xls``, ``.xlsb``) and OpenDocument spreadsheets (``.ods``) (:issue:`50395`). + +There are two advantages of this engine: + +1. 
Calamine is often faster than other engines, some benchmarks show results up to 5x faster than 'openpyxl', 20x - 'odf', 4x - 'pyxlsb', and 1.5x - 'xlrd'. + But, 'openpyxl' and 'pyxlsb' are faster in reading a few rows from large files because of lazy iteration over rows. +2. Calamine supports the recognition of datetime in ``.xlsb`` files, unlike 'pyxlsb' which is the only other engine in pandas that can read ``.xlsb`` files. + +.. code-block:: python + + pd.read_excel("path_to_file.xlsb", engine="calamine") + + +For more, see :ref:`io.calamine` in the user guide on IO tools. + +.. _whatsnew_220.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- :meth:`~DataFrame.to_sql` with method parameter set to ``multi`` works with Oracle on the backend +- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`). +- :func:`get_dummies` now returning extension dtypes ``boolean`` or ``bool[pyarrow]`` that are compatible with the input dtype (:issue:`56273`) +- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"`` (:issue:`54480`) +- :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`) +- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs` (:issue:`54264`) +- :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`) +- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) +- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) +- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) +- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area``; 3rd party :class:`.ExtensionArray` authors need to add this argument to the method ``_pad_or_backfill`` (:issue:`56492`) +- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`) +- Implement :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` for :class:`ArrowDtype` and masked dtypes (:issue:`56267`) +- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) +- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`) +- Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`) +- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`) +- Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`) +- The dtypes ``string[pyarrow]`` and ``string[pyarrow_numpy]`` now both utilize the ``large_string`` type from PyArrow to avoid overflow for long columns (:issue:`56259`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_220.notable_bug_fixes: + +Notable bug fixes +~~~~~~~~~~~~~~~~~ + +These are bug fixes that might have notable behavior changes. + +.. 
_whatsnew_220.notable_bug_fixes.merge_sort_behavior: + +:func:`merge` and :meth:`DataFrame.join` now consistently follow documented sort behavior +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` did not +always return a result that followed the documented sort behavior. pandas now +follows the documented sort behavior in merge and join operations (:issue:`54611`, :issue:`56426`, :issue:`56443`). + +As documented, ``sort=True`` sorts the join keys lexicographically in the resulting +:class:`DataFrame`. With ``sort=False``, the order of the join keys depends on the +join type (``how`` keyword): + +- ``how="left"``: preserve the order of the left keys +- ``how="right"``: preserve the order of the right keys +- ``how="inner"``: preserve the order of the left keys +- ``how="outer"``: sort keys lexicographically + +One example with changing behavior is inner joins with non-unique left join keys +and ``sort=False``: + +.. ipython:: python + + left = pd.DataFrame({"a": [1, 2, 1]}) + right = pd.DataFrame({"a": [1, 2]}) + result = pd.merge(left, right, how="inner", on="a", sort=False) + +*Old Behavior* + +.. code-block:: ipython + + In [5]: result + Out[5]: + a + 0 1 + 1 1 + 2 2 + +*New Behavior* + +.. ipython:: python + + result + +.. _whatsnew_220.notable_bug_fixes.multiindex_join_different_levels: + +:func:`merge` and :meth:`DataFrame.join` no longer reorder levels when levels differ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, :func:`merge` and :meth:`DataFrame.join` would reorder +index levels when joining on two indexes with different levels (:issue:`34133`). + +.. ipython:: python + + left = pd.DataFrame({"left": 1}, index=pd.MultiIndex.from_tuples([("x", 1), ("x", 2)], names=["A", "B"])) + right = pd.DataFrame({"right": 2}, index=pd.MultiIndex.from_tuples([(1, 1), (2, 2)], names=["B", "C"])) + left + right + result = left.join(right) + +*Old Behavior* + +.. code-block:: ipython + + In [5]: result + Out[5]: + left right + B A C + 1 x 1 1 2 + 2 x 2 1 2 + +*New Behavior* + +.. ipython:: python + + result + +.. _whatsnew_220.api_breaking.deps: + +Increased minimum versions for dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +For `optional dependencies `_ the general recommendation is to use the latest version. +Optional dependencies below the lowest tested version may still work but are not considered supported. +The following table lists the optional dependencies that have had their minimum tested version increased. 
+ ++-----------------+---------------------+ +| Package | New Minimum Version | ++=================+=====================+ +| beautifulsoup4 | 4.11.2 | ++-----------------+---------------------+ +| blosc | 1.21.3 | ++-----------------+---------------------+ +| bottleneck | 1.3.6 | ++-----------------+---------------------+ +| fastparquet | 2022.12.0 | ++-----------------+---------------------+ +| fsspec | 2022.11.0 | ++-----------------+---------------------+ +| gcsfs | 2022.11.0 | ++-----------------+---------------------+ +| lxml | 4.9.2 | ++-----------------+---------------------+ +| matplotlib | 3.6.3 | ++-----------------+---------------------+ +| numba | 0.56.4 | ++-----------------+---------------------+ +| numexpr | 2.8.4 | ++-----------------+---------------------+ +| qtpy | 2.3.0 | ++-----------------+---------------------+ +| openpyxl | 3.1.0 | ++-----------------+---------------------+ +| psycopg2 | 2.9.6 | ++-----------------+---------------------+ +| pyreadstat | 1.2.0 | ++-----------------+---------------------+ +| pytables | 3.8.0 | ++-----------------+---------------------+ +| pyxlsb | 1.0.10 | ++-----------------+---------------------+ +| s3fs | 2022.11.0 | ++-----------------+---------------------+ +| scipy | 1.10.0 | ++-----------------+---------------------+ +| sqlalchemy | 2.0.0 | ++-----------------+---------------------+ +| tabulate | 0.9.0 | ++-----------------+---------------------+ +| xarray | 2022.12.0 | ++-----------------+---------------------+ +| xlsxwriter | 3.0.5 | ++-----------------+---------------------+ +| zstandard | 0.19.0 | ++-----------------+---------------------+ +| pyqt5 | 5.15.8 | ++-----------------+---------------------+ +| tzdata | 2022.7 | ++-----------------+---------------------+ + +See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. + +.. _whatsnew_220.api_breaking.other: + +Other API changes +^^^^^^^^^^^^^^^^^ +- The hash values of nullable extension dtypes changed to improve the performance of the hashing operation (:issue:`56507`) +- ``check_exact`` now only takes effect for floating-point dtypes in :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal`. In particular, integer dtypes are always checked exactly (:issue:`55882`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_220.deprecations: + +Deprecations +~~~~~~~~~~~~ + +Chained assignment +^^^^^^^^^^^^^^^^^^ + +In preparation of larger upcoming changes to the copy / view behaviour in pandas 3.0 +(:ref:`copy_on_write`, PDEP-7), we started deprecating *chained assignment*. + +Chained assignment occurs when you try to update a pandas DataFrame or Series through +two subsequent indexing operations. Depending on the type and order of those operations +this currently does or does not work. + +A typical example is as follows: + +.. code-block:: python + + df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) + + # first selecting rows with a mask, then assigning values to a column + # -> this has never worked and raises a SettingWithCopyWarning + df[df["bar"] > 5]["foo"] = 100 + + # first selecting the column, and then assigning to a subset of that column + # -> this currently works + df["foo"][df["bar"] > 5] = 100 + +This second example of chained assignment currently works to update the original ``df``. +This will no longer work in pandas 3.0, and therefore we started deprecating this: + +.. 
code-block:: python + + >>> df["foo"][df["bar"] > 5] = 100 + FutureWarning: ChainedAssignmentError: behaviour will change in pandas 3.0! + You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy. + A typical example is when you are setting values in a column of a DataFrame, like: + + df["col"][row_indexer] = value + + Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`. + + See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy + +You can fix this warning and ensure your code is ready for pandas 3.0 by removing +the usage of chained assignment. Typically, this can be done by doing the assignment +in a single step using for example ``.loc``. For the example above, we can do: + +.. code-block:: python + + df.loc[df["bar"] > 5, "foo"] = 100 + +The same deprecation applies to inplace methods that are done in a chained manner, such as: + +.. code-block:: python + + >>> df["foo"].fillna(0, inplace=True) + FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. + The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. + + For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. + +When the goal is to update the column in the DataFrame ``df``, the alternative here is +to call the method on ``df`` itself, such as ``df.fillna({"foo": 0}, inplace=True)``. + +See more details in the :ref:`migration guide `. + + +Deprecate aliases ``M``, ``Q``, ``Y``, etc. in favour of ``ME``, ``QE``, ``YE``, etc. for offsets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Deprecated the following frequency aliases (:issue:`9586`): + ++-------------------------------+------------------+------------------+ +|offsets |deprecated aliases|new aliases | ++===============================+==================+==================+ +|:class:`MonthEnd` | ``M`` | ``ME`` | ++-------------------------------+------------------+------------------+ +|:class:`BusinessMonthEnd` | ``BM`` | ``BME`` | ++-------------------------------+------------------+------------------+ +|:class:`SemiMonthEnd` | ``SM`` | ``SME`` | ++-------------------------------+------------------+------------------+ +|:class:`CustomBusinessMonthEnd`| ``CBM`` | ``CBME`` | ++-------------------------------+------------------+------------------+ +|:class:`QuarterEnd` | ``Q`` | ``QE`` | ++-------------------------------+------------------+------------------+ +|:class:`BQuarterEnd` | ``BQ`` | ``BQE`` | ++-------------------------------+------------------+------------------+ +|:class:`YearEnd` | ``Y`` | ``YE`` | ++-------------------------------+------------------+------------------+ +|:class:`BYearEnd` | ``BY`` | ``BYE`` | ++-------------------------------+------------------+------------------+ + +For example: + +*Previous behavior*: + +.. 
code-block:: ipython + + In [8]: pd.date_range('2020-01-01', periods=3, freq='Q-NOV') + Out[8]: + DatetimeIndex(['2020-02-29', '2020-05-31', '2020-08-31'], + dtype='datetime64[ns]', freq='Q-NOV') + +*Future behavior*: + +.. ipython:: python + + pd.date_range('2020-01-01', periods=3, freq='QE-NOV') + +Deprecated automatic downcasting +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Deprecated the automatic downcasting of object dtype results in a number of +methods. These would silently change the dtype in a hard to predict manner since the +behavior was value dependent. Additionally, pandas is moving away from silent dtype +changes (:issue:`54710`, :issue:`54261`). + +These methods are: + +- :meth:`Series.replace` and :meth:`DataFrame.replace` +- :meth:`DataFrame.fillna`, :meth:`Series.fillna` +- :meth:`DataFrame.ffill`, :meth:`Series.ffill` +- :meth:`DataFrame.bfill`, :meth:`Series.bfill` +- :meth:`DataFrame.mask`, :meth:`Series.mask` +- :meth:`DataFrame.where`, :meth:`Series.where` +- :meth:`DataFrame.clip`, :meth:`Series.clip` + +Explicitly call :meth:`DataFrame.infer_objects` to replicate the current behavior in the future. + +.. code-block:: ipython + + result = result.infer_objects(copy=False) + +Or explicitly cast all-round floats to ints using ``astype``. + +Set the following option to opt into the future behavior: + +.. code-block:: ipython + + In [9]: pd.set_option("future.no_silent_downcasting", True) + +Other Deprecations +^^^^^^^^^^^^^^^^^^ +- Changed :meth:`Timedelta.resolution_string` to return ``h``, ``min``, ``s``, ``ms``, ``us``, and ``ns`` instead of ``H``, ``T``, ``S``, ``L``, ``U``, and ``N``, for compatibility with respective deprecations in frequency aliases (:issue:`52536`) +- Deprecated :attr:`offsets.Day.delta`, :attr:`offsets.Hour.delta`, :attr:`offsets.Minute.delta`, :attr:`offsets.Second.delta`, :attr:`offsets.Milli.delta`, :attr:`offsets.Micro.delta`, :attr:`offsets.Nano.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`) +- Deprecated :func:`pandas.api.types.is_interval` and :func:`pandas.api.types.is_period`, use ``isinstance(obj, pd.Interval)`` and ``isinstance(obj, pd.Period)`` instead (:issue:`55264`) +- Deprecated :func:`read_gbq` and :meth:`DataFrame.to_gbq`. 
Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`) +- Deprecated :meth:`.DataFrameGroupBy.fillna` and :meth:`.SeriesGroupBy.fillna`; use :meth:`.DataFrameGroupBy.ffill`, :meth:`.DataFrameGroupBy.bfill` for forward and backward filling or :meth:`.DataFrame.fillna` to fill with a single value (or the Series equivalents) (:issue:`55718`) +- Deprecated :meth:`DateOffset.is_anchored`, use ``obj.n == 1`` for non-Tick subclasses (for Tick this was always False) (:issue:`55388`) +- Deprecated :meth:`DatetimeArray.__init__` and :meth:`TimedeltaArray.__init__`, use :func:`array` instead (:issue:`55623`) +- Deprecated :meth:`Index.format`, use ``index.astype(str)`` or ``index.map(formatter)`` instead (:issue:`55413`) +- Deprecated :meth:`Series.ravel`, the underlying array is already 1D, so ravel is not necessary (:issue:`52511`) +- Deprecated :meth:`Series.resample` and :meth:`DataFrame.resample` with a :class:`PeriodIndex` (and the 'convention' keyword), convert to :class:`DatetimeIndex` (with ``.to_timestamp()``) before resampling instead (:issue:`53481`) +- Deprecated :meth:`Series.view`, use :meth:`Series.astype` instead to change the dtype (:issue:`20251`) +- Deprecated :meth:`offsets.Tick.is_anchored`, use ``False`` instead (:issue:`55388`) +- Deprecated ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock``, use public APIs instead (:issue:`55139`) +- Deprecated ``year``, ``month``, ``quarter``, ``day``, ``hour``, ``minute``, and ``second`` keywords in the :class:`PeriodIndex` constructor, use :meth:`PeriodIndex.from_fields` instead (:issue:`55960`) +- Deprecated accepting a type as an argument in :meth:`Index.view`, call without any arguments instead (:issue:`55709`) +- Deprecated allowing non-integer ``periods`` argument in :func:`date_range`, :func:`timedelta_range`, :func:`period_range`, and :func:`interval_range` (:issue:`56036`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_clipboard` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_csv` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_dict` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_excel` except ``excel_writer`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_gbq` except ``destination_table`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_hdf` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_html` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_json` except ``path_or_buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_latex` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_markdown` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_parquet` except ``path`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_pickle` except ``path`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_string` except ``buf`` (:issue:`54229`) +- Deprecated allowing non-keyword arguments in :meth:`DataFrame.to_xml` except ``path_or_buffer`` (:issue:`54229`) +- Deprecated allowing passing :class:`BlockManager` 
objects to :class:`DataFrame` or :class:`SingleBlockManager` objects to :class:`Series` (:issue:`52419`) +- Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) +- Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) +- Deprecated dtype inference in :class:`Index`, :class:`Series` and :class:`DataFrame` constructors when giving a pandas input, call ``.infer_objects`` on the input to keep the current behavior (:issue:`56012`) +- Deprecated dtype inference when setting a :class:`Index` into a :class:`DataFrame`, cast explicitly instead (:issue:`56102`) +- Deprecated including the groups in computations when using :meth:`.DataFrameGroupBy.apply` and :meth:`.DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) +- Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`) +- Deprecated not passing a tuple to :class:`.DataFrameGroupBy.get_group` or :class:`.SeriesGroupBy.get_group` when grouping by a length-1 list-like (:issue:`25971`) +- Deprecated string ``AS`` denoting frequency in :class:`YearBegin` and strings ``AS-DEC``, ``AS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`54275`) +- Deprecated string ``A`` denoting frequency in :class:`YearEnd` and strings ``A-DEC``, ``A-JAN``, etc. denoting annual frequencies with various fiscal year ends (:issue:`54275`) +- Deprecated string ``BAS`` denoting frequency in :class:`BYearBegin` and strings ``BAS-DEC``, ``BAS-JAN``, etc. denoting annual frequencies with various fiscal year starts (:issue:`54275`) +- Deprecated string ``BA`` denoting frequency in :class:`BYearEnd` and strings ``BA-DEC``, ``BA-JAN``, etc. 
denoting annual frequencies with various fiscal year ends (:issue:`54275`) +- Deprecated strings ``H``, ``BH``, and ``CBH`` denoting frequencies in :class:`Hour`, :class:`BusinessHour`, :class:`CustomBusinessHour` (:issue:`52536`) +- Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`) +- Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`) +- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`) +- Deprecated support for combining parsed datetime columns in :func:`read_csv` along with the ``keep_date_col`` keyword (:issue:`55569`) +- Deprecated the :attr:`.DataFrameGroupBy.grouper` and :attr:`SeriesGroupBy.grouper`; these attributes will be removed in a future version of pandas (:issue:`56521`) +- Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`) +- Deprecated the ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep="\\s+"`` instead (:issue:`55569`) +- Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`) +- Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`) +- Deprecated the ``kind`` keyword in :meth:`Series.resample` and :meth:`DataFrame.resample`, explicitly cast the object's ``index`` instead (:issue:`55895`) +- Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`) +- Deprecated the ``unit`` keyword in :class:`TimedeltaIndex` construction, use :func:`to_timedelta` instead (:issue:`55499`) +- Deprecated the ``verbose`` keyword in :func:`read_csv` and :func:`read_table` (:issue:`55569`) +- Deprecated the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype`; in a future version replace will change the values while preserving the categories. To change the categories, use ``ser.cat.rename_categories`` instead (:issue:`55147`) +- Deprecated the behavior of :meth:`Series.value_counts` and :meth:`Index.value_counts` with object dtype; in a future version these will not perform dtype inference on the resulting :class:`Index`, do ``result.index = result.index.infer_objects()`` to retain the old behavior (:issue:`56161`) +- Deprecated the default of ``observed=False`` in :meth:`DataFrame.pivot_table`; will be ``True`` in a future version (:issue:`56236`) +- Deprecated the extension test classes ``BaseNoReduceTests``, ``BaseBooleanReduceTests``, and ``BaseNumericReduceTests``, use ``BaseReduceTests`` instead (:issue:`54663`) +- Deprecated the option ``mode.data_manager`` and the ``ArrayManager``; only the ``BlockManager`` will be available in future versions (:issue:`55043`) +- Deprecated the previous implementation of :class:`DataFrame.stack`; specify ``future_stack=True`` to adopt the future version (:issue:`53515`) +- + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_220.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement in :func:`.testing.assert_frame_equal` and :func:`.testing.assert_series_equal` (:issue:`55949`, :issue:`55971`) +- Performance improvement in :func:`concat` with ``axis=1`` and objects with unaligned indexes (:issue:`55084`) +- Performance improvement in :func:`get_dummies` (:issue:`56089`) +- Performance improvement in :func:`merge` and :func:`merge_ordered` when joining on sorted ascending keys (:issue:`56115`) +- Performance improvement in :func:`merge_asof` when ``by`` is not ``None`` (:issue:`55580`, :issue:`55678`) +- Performance improvement in :func:`read_stata` for files with many variables (:issue:`55515`) +- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`) +- Performance improvement in :meth:`DataFrame.join` when joining on unordered categorical indexes (:issue:`56345`) +- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` when indexing with a :class:`MultiIndex` (:issue:`56062`) +- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`) +- Performance improvement in :meth:`DataFrame.to_dict` on converting DataFrame to dictionary (:issue:`50990`) +- Performance improvement in :meth:`Index.difference` (:issue:`55108`) +- Performance improvement in :meth:`Index.sort_values` when index is already sorted (:issue:`56128`) +- Performance improvement in :meth:`MultiIndex.get_indexer` when ``method`` is not ``None`` (:issue:`55839`) +- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`) +- Performance improvement in :meth:`Series.str.get_dummies` when dtype is ``"string[pyarrow]"`` or ``"string[pyarrow_numpy]"`` (:issue:`56110`) +- Performance improvement in :meth:`Series.str` methods (:issue:`55736`) +- Performance improvement in :meth:`Series.value_counts` and :meth:`Series.mode` for masked dtypes (:issue:`54984`, :issue:`55340`) +- Performance improvement in :meth:`.DataFrameGroupBy.nunique` and :meth:`.SeriesGroupBy.nunique` (:issue:`55972`) +- Performance improvement in :meth:`.SeriesGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.DataFrameGroupBy.idxmin` (:issue:`54234`) +- Performance improvement when hashing a nullable extension array (:issue:`56507`) +- Performance improvement when indexing into a non-unique index (:issue:`55816`) +- Performance improvement when indexing with more than 4 keys (:issue:`54550`) +- Performance improvement when localizing time to UTC (:issue:`55241`) + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_220.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ +- :meth:`Categorical.isin` raising ``InvalidIndexError`` for categorical containing overlapping :class:`Interval` values (:issue:`34974`) +- Bug in :meth:`CategoricalDtype.__eq__` returning ``False`` for unordered categorical data with mixed types (:issue:`55468`) +- Bug when casting ``pa.dictionary`` to :class:`CategoricalDtype` using a ``pa.DictionaryArray`` as categories (:issue:`56672`) + +Datetimelike +^^^^^^^^^^^^ +- Bug in :class:`DatetimeIndex` construction when passing both a ``tz`` and either ``dayfirst`` or ``yearfirst`` ignoring dayfirst/yearfirst (:issue:`55813`) +- Bug in :class:`DatetimeIndex` when passing an object-dtype ndarray of float objects and a ``tz`` incorrectly localizing the result (:issue:`55780`) +- Bug in :func:`Series.isin` with :class:`DatetimeTZDtype` dtype and comparison values that are all ``NaT`` incorrectly returning all-``False`` even if the series contains ``NaT`` entries (:issue:`56427`) +- Bug in :func:`concat` raising ``AttributeError`` when concatenating all-NA DataFrame with :class:`DatetimeTZDtype` dtype DataFrame (:issue:`52093`) +- Bug in :func:`testing.assert_extension_array_equal` that could use the wrong unit when comparing resolutions (:issue:`55730`) +- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing a list of mixed-string-and-numeric types incorrectly raising (:issue:`55780`) +- Bug in :func:`to_datetime` and :class:`DatetimeIndex` when passing mixed-type objects with a mix of timezones or mix of timezone-awareness failing to raise ``ValueError`` (:issue:`55693`) +- Bug in :meth:`.Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in :meth:`DatetimeIndex.shift` with non-nanosecond resolution incorrectly returning with nanosecond resolution (:issue:`56117`) +- Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`) +- Bug in :meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` always caching :meth:`Index.is_unique` as ``True`` when first value in index is ``NaT`` (:issue:`55755`) +- Bug in :meth:`Index.view` to a datetime64 dtype with non-supported resolution incorrectly raising (:issue:`55710`) +- Bug in :meth:`Series.dt.round` with non-nanosecond resolution and ``NaT`` entries incorrectly raising ``OverflowError`` (:issue:`56158`) +- Bug in :meth:`Series.fillna` with non-nanosecond resolution dtypes and higher-resolution vector values returning incorrect (internally-corrupted) results (:issue:`56410`) +- Bug in :meth:`Timestamp.unit` being inferred incorrectly from an ISO8601 format string with minute or hour resolution and a timezone offset (:issue:`56208`) +- Bug in ``.astype`` converting from a higher-resolution ``datetime64`` dtype to a lower-resolution ``datetime64`` dtype (e.g. 
``datetime64[us]->datetime64[ms]``) silently overflowing with values near the lower implementation bound (:issue:`55979`) +- Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`) +- Bug in addition or subtraction of :class:`BusinessDay` offset with ``offset`` attribute to non-nanosecond :class:`Index`, :class:`Series`, or :class:`DataFrame` column giving incorrect results (:issue:`55608`) +- Bug in addition or subtraction of :class:`DateOffset` objects with microsecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` columns with non-nanosecond resolution (:issue:`55595`) +- Bug in addition or subtraction of very large :class:`.Tick` objects with :class:`Timestamp` or :class:`Timedelta` objects raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond :class:`DatetimeTZDtype` and inputs that would be out of bounds with nanosecond resolution incorrectly raising ``OutOfBoundsDatetime`` (:issue:`54620`) +- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` (or :class:`DatetimeTZDtype`) from mixed-numeric inputs treating those as nanoseconds instead of as multiples of the dtype's unit (which would happen with non-mixed numeric inputs) (:issue:`56004`) +- Bug in creating a :class:`Index`, :class:`Series`, or :class:`DataFrame` with a non-nanosecond ``datetime64`` dtype and inputs that would be out of bounds for a ``datetime64[ns]`` incorrectly raising ``OutOfBoundsDatetime`` (:issue:`55756`) +- Bug in parsing datetime strings with nanosecond resolution with non-ISO8601 formats incorrectly truncating sub-microsecond components (:issue:`56051`) +- Bug in parsing datetime strings with sub-second resolution and trailing zeros incorrectly inferring second or millisecond resolution (:issue:`55737`) +- Bug in the results of :func:`to_datetime` with a floating-dtype argument with ``unit`` not matching the pointwise results of :class:`Timestamp` (:issue:`56037`) +- Fixed regression where :func:`concat` would raise an error when concatenating ``datetime64`` columns with differing resolutions (:issue:`53641`) + +Timedelta +^^^^^^^^^ +- Bug in :class:`Timedelta` construction raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`) +- Bug in rendering (``__repr__``) of :class:`TimedeltaIndex` and :class:`Series` with timedelta64 values with non-nanosecond resolution entries that are all multiples of 24 hours failing to use the compact representation used in the nanosecond cases (:issue:`55405`) + +Timezones +^^^^^^^^^ +- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) +- Bug in :class:`Timestamp` construction with an ambiguous value and a ``pytz`` timezone failing to raise ``pytz.AmbiguousTimeError`` (:issue:`55657`) +- Bug in :meth:`Timestamp.tz_localize` with ``nonexistent="shift_forward"`` around UTC+0 during DST (:issue:`51501`) + +Numeric +^^^^^^^ +- Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) +- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`) +- Bug in 
:meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) +- Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` matching float ``0.0`` with ``False`` and vice versa (:issue:`55398`) +- Bug in :meth:`Series.round` raising for nullable boolean dtype (:issue:`55936`) + +Conversion +^^^^^^^^^^ +- Bug in :meth:`DataFrame.astype` when called with ``str`` on an unpickled array; the array might change in-place (:issue:`54654`) +- Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`) +- Bug in :meth:`Series.convert_dtypes` not converting an all-NA column to ``null[pyarrow]`` (:issue:`55346`) +- Bug in :meth:`DataFrame.loc` not raising the "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`) + +Strings +^^^^^^^ +- Bug in :func:`pandas.api.types.is_string_dtype` when checking whether an object array with no elements is of the string dtype (:issue:`54661`) +- Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`) +- Bug in :meth:`DataFrame.reindex` not matching :class:`Index` with ``string[pyarrow_numpy]`` dtype (:issue:`56106`) +- Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`) +- Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`) +- Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`) +- Bug in :meth:`Series.str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string())`` allowing partial matches when the regex ends in the literal ``//$`` (:issue:`56652`) +- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`) +- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`) +- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`) +- Bug in comparison operations for ``dtype="string[pyarrow_numpy]"`` raising if dtypes can't be compared (:issue:`56008`) + +Interval +^^^^^^^^ +- Bug in :class:`Interval` ``__repr__`` not displaying UTC offsets for :class:`Timestamp` bounds. 
Additionally the hour, minute and second components will now be shown (:issue:`55015`) +- Bug in :meth:`IntervalIndex.factorize` and :meth:`Series.factorize` with :class:`IntervalDtype` with datetime64 or timedelta64 intervals not preserving non-nanosecond units (:issue:`56099`) +- Bug in :meth:`IntervalIndex.from_arrays` when passed ``datetime64`` or ``timedelta64`` arrays with mismatched resolutions constructing an invalid ``IntervalArray`` object (:issue:`55714`) +- Bug in :meth:`IntervalIndex.from_tuples` raising if subtype is a nullable extension dtype (:issue:`56765`) +- Bug in :meth:`IntervalIndex.get_indexer` with datetime or timedelta intervals incorrectly matching on integer targets (:issue:`47772`) +- Bug in :meth:`IntervalIndex.get_indexer` with timezone-aware datetime intervals incorrectly matching on a sequence of timezone-naive targets (:issue:`47772`) +- Bug in setting values on a :class:`Series` with an :class:`IntervalIndex` using a slice incorrectly raising (:issue:`54722`) + +Indexing +^^^^^^^^ +- Bug in :meth:`DataFrame.loc` mutating a boolean indexer when :class:`DataFrame` has a :class:`MultiIndex` (:issue:`56635`) +- Bug in :meth:`DataFrame.loc` when setting :class:`Series` with extension dtype into NumPy dtype (:issue:`55604`) +- Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`) +- Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`) +- Fixed bug when creating new column with missing values when setting a single string value (:issue:`56204`) + +Missing +^^^^^^^ +- Bug in :meth:`DataFrame.update` wasn't updating in-place for tz-aware datetime64 dtypes (:issue:`56227`) + +MultiIndex +^^^^^^^^^^ +- Bug in :meth:`MultiIndex.get_indexer` not raising ``ValueError`` when ``method`` provided and index is non-monotonic (:issue:`53452`) + +I/O +^^^ +- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified (:issue:`56323`) +- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified (:issue:`55677`) +- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning; this now yields a :class:`.errors.ParserWarning` (:issue:`54296`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``quotechar`` was ignored (:issue:`52266`) +- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a CSV with no headers (:issue:`54459`) +- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when the file contains ``NaN`` or ``Inf`` (:issue:`54564`) +- Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`) +- Bug in :meth:`DataFrame.to_excel`, with ``OdsWriter`` (``ods`` files) writing Boolean/string value (:issue:`54994`) +- Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`) +- Bug in :meth:`DataFrame.to_stata` raising for extension dtypes (:issue:`54671`) +- Bug in :meth:`~pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string cell contains an annotation (:issue:`55200`) +- Bug in :meth:`~pandas.read_excel` with an ODS file without cached formatted cell for float values (:issue:`55219`) +- 
Bug where :meth:`DataFrame.to_json` would raise an ``OverflowError`` instead of a ``TypeError`` with unsupported NumPy types (:issue:`55403`) + +Period +^^^^^^ +- Bug in :class:`PeriodIndex` construction when more than one of ``data``, ``ordinal`` and ``**fields`` are passed failing to raise ``ValueError`` (:issue:`55961`) +- Bug in :class:`Period` addition silently wrapping around instead of raising ``OverflowError`` (:issue:`55503`) +- Bug in casting from :class:`PeriodDtype` with ``astype`` to ``datetime64`` or :class:`DatetimeTZDtype` with non-nanosecond unit incorrectly returning with nanosecond unit (:issue:`55958`) + +Plotting +^^^^^^^^ +- Bug in :meth:`DataFrame.plot.box` with ``vert=False`` and a Matplotlib ``Axes`` created with ``sharey=True`` (:issue:`54941`) +- Bug in :meth:`DataFrame.plot.scatter` discarding string columns (:issue:`56142`) +- Bug in :meth:`Series.plot` when reusing an ``ax`` object failing to raise when a ``how`` keyword is passed (:issue:`55953`) + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ +- Bug in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` would not retain :class:`.Categorical` dtype when the index was a :class:`.CategoricalIndex` that contained NA values (:issue:`54234`) +- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` when ``observed=False`` and ``f="idxmin"`` or ``f="idxmax"`` would incorrectly raise on unobserved categories (:issue:`54234`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` could result in incorrect sorting if the columns of the DataFrame or name of the Series are integers (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would not respect ``sort=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`55951`) +- Bug in :meth:`.DataFrameGroupBy.value_counts` and :meth:`.SeriesGroupBy.value_counts` would sort by proportions rather than frequencies when ``sort=True`` and ``normalize=True`` (:issue:`55951`) +- Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`) +- Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`) +- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` where grouping by a combination of ``Decimal`` and NA values would fail when ``sort=True`` (:issue:`54847`) +- Bug in :meth:`DataFrame.groupby` for DataFrame subclasses when selecting a subset of columns to apply the function to (:issue:`56761`) +- Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) +- Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`) +- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) +- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) +- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) 
+- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where either the ``index`` or ``on`` column was :class:`ArrowDtype` with ``pyarrow.timestamp`` type (:issue:`55849`) + +Reshaping +^^^^^^^^^ +- Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) +- Bug in :func:`concat` renaming :class:`Series` when ``ignore_index=False`` (:issue:`15047`) +- Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) +- Bug in :func:`merge_asof` raising incorrect error for string dtype (:issue:`56444`) +- Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`) +- Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`) +- Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`) +- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`) +- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) +- Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) +- Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) +- Bug in :meth:`DataFrame.pivot_table` where the row margin is incorrect when the columns have numeric names (:issue:`26568`) +- Bug in :meth:`DataFrame.pivot` with numeric columns and extension dtype for data (:issue:`56528`) +- Bug in :meth:`DataFrame.stack` with ``future_stack=True`` would not preserve NA values in the index (:issue:`56573`) + +Sparse +^^^^^^ +- Bug in :meth:`arrays.SparseArray.take` when using a different fill value than the array's fill value (:issue:`55181`) + +Other +^^^^^ +- :meth:`DataFrame.__dataframe__` did not support pyarrow large strings (:issue:`56702`) +- Bug in :func:`DataFrame.describe` when formatting percentiles in the resulting percentile 99.999% is rounded to 100% (:issue:`55765`) +- Bug in :func:`api.interchange.from_dataframe` where it raised ``NotImplementedError`` when handling empty string columns (:issue:`56703`) +- Bug in :func:`cut` and :func:`qcut` with ``datetime64`` dtype values with non-nanosecond units incorrectly returning nanosecond-unit bins (:issue:`56101`) +- Bug in :func:`cut` incorrectly allowing cutting of timezone-aware datetimes with timezone-naive bins (:issue:`54964`) +- Bug in :func:`infer_freq` and :meth:`DatetimeIndex.inferred_freq` with weekly frequencies and non-nanosecond resolutions (:issue:`55609`) +- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) +- Bug in :meth:`DataFrame.from_dict` which would always sort the rows of the created :class:`DataFrame`. 
(:issue:`55683`) +- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` raising a ``ValueError`` (:issue:`56478`) +- Bug in rendering ``inf`` values inside a :class:`DataFrame` with the ``use_inf_as_na`` option enabled (:issue:`55483`) +- Bug in rendering a :class:`Series` with a :class:`MultiIndex` when one of the index level's names is 0 not having that name displayed (:issue:`55415`) +- Bug in the error message when assigning an empty :class:`DataFrame` to a column (:issue:`55956`) +- Bug when time-like strings were being cast to :class:`ArrowDtype` with ``pyarrow.time64`` type (:issue:`56463`) +- Fixed a spurious deprecation warning from ``numba`` >= 0.58.0 when passing a numpy ufunc in :class:`core.window.Rolling.apply` with ``engine="numba"`` (:issue:`55247`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_220.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v2.1.4..v2.2.0 diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v2.2.1.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v2.2.1.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v2.2.1.rst 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v2.2.1.rst 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,90 @@ +.. _whatsnew_221: + +What's new in 2.2.1 (February 22, 2024) +--------------------------------------- + +These are the changes in pandas 2.2.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.enhancements: + +Enhancements +~~~~~~~~~~~~ +- Added ``pyarrow`` pip extra so users can install pandas and pyarrow with pip with ``pip install pandas[pyarrow]`` (:issue:`54466`) + +.. 
_whatsnew_221.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Fixed memory leak in :func:`read_csv` (:issue:`57039`) +- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) +- Fixed regression causing overflow for near-minimum timestamps (:issue:`57150`) +- Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) +- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) +- Fixed regression in :func:`read_json` where an :class:`Index` would be returned instead of a :class:`RangeIndex` (:issue:`57429`) +- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) +- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) +- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) +- Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) +- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.loc` which was unnecessarily throwing "incompatible dtype warning" when expanding with partial row indexer and multiple columns (see `PDEP6 `_) (:issue:`56503`) +- Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`) +- Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) +- Fixed regression in :meth:`DataFrame.query` with all ``NaT`` column with object dtype (:issue:`57068`) +- Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) +- Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) +- Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) +- Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) +- Fixed regression in :meth:`DataFrame.to_sql` when ``method="multi"`` is passed and the dialect type is not Oracle (:issue:`57310`) +- Fixed regression in :meth:`DataFrame.transpose` with nullable extension dtypes not having F-contiguous data potentially causing exceptions when used (:issue:`57315`) +- Fixed regression in :meth:`DataFrame.update` emitting incorrect warnings about downcasting (:issue:`57124`) +- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, 
:meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) +- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) +- Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) +- Fixed regression in :meth:`Series.astype` introducing decimals when converting from integer with missing values to string dtype (:issue:`57418`) +- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) +- Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) +- Fixed regression in addition or subtraction of :class:`DateOffset` objects with millisecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` (:issue:`57529`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for Nullable integers (:issue:`55069`) +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting columns names to strings (:issue:`55069`) +- Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) +- Fixed bug in :meth:`PeriodIndex.asfreq` which was silently converting frequencies which are not supported as period frequencies instead of raising an error (:issue:`56945`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.other: + +Other +~~~~~ + +.. note:: + + The ``DeprecationWarning`` that was raised when pandas was imported without PyArrow being + installed has been removed. This decision was made because the warning was too noisy for too + many users and a lot of feedback was collected about the decision to make PyArrow a required + dependency. Pandas is currently considering the decision whether or not PyArrow should be added + as a hard dependency in 3.0. Interested users can follow the discussion + `here `_. + +- Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; achieving ``skipna=False`` used to be available via :meth:`DataFrameGroupBy.nth`, but the behavior was changed in pandas 2.0.0 (:issue:`57019`) +- Added the argument ``skipna`` to :meth:`Resampler.first`, :meth:`Resampler.last` (:issue:`57019`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v2.2.0..v2.2.1 diff -Nru pandas-2.1.4+dfsg/doc/source/whatsnew/v2.2.2.rst pandas-2.2.2+dfsg/doc/source/whatsnew/v2.2.2.rst --- pandas-2.1.4+dfsg/doc/source/whatsnew/v2.2.2.rst 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/doc/source/whatsnew/v2.2.2.rst 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,59 @@ +.. 
_whatsnew_222: + +What's new in 2.2.2 (April 10, 2024) +--------------------------------------- + +These are the changes in pandas 2.2.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_220.np2_compat: + +Pandas 2.2.2 is now compatible with numpy 2.0 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pandas 2.2.2 is the first version of pandas that is generally compatible with the upcoming +numpy 2.0 release, and wheels for pandas 2.2.2 will work with both numpy 1.x and 2.x. + +One major caveat is that arrays created with numpy 2.0's new ``StringDtype`` will convert +to ``object`` dtyped arrays upon :class:`Series`/:class:`DataFrame` creation. +Full support for numpy 2.0's StringDtype is expected to land in pandas 3.0. + +As usual, please report any bugs discovered to our `issue tracker `_. + +.. _whatsnew_222.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable dtype with missing values (:issue:`56702`) +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pyarrow nullable dtype with missing values (:issue:`57664`) +- Avoid issuing a spurious ``DeprecationWarning`` when a custom :class:`DataFrame` or :class:`Series` subclass method is called (:issue:`57553`) +- Fixed regression in precision of :func:`to_datetime` with string and ``unit`` input (:issue:`57051`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_222.bug_fixes: + +Bug fixes +~~~~~~~~~ +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the column's type was nullable boolean (:issue:`55332`) +- :meth:`DataFrame.__dataframe__` was showing a bytemask instead of a bitmask for the ``'string[pyarrow]'`` validity buffer (:issue:`57762`) +- :meth:`DataFrame.__dataframe__` was showing a non-null validity buffer (instead of ``None``) for ``'string[pyarrow]'`` columns without missing values (:issue:`57761`) +- :meth:`DataFrame.to_sql` was failing to find the right table when using the schema argument (:issue:`57539`) + +.. --------------------------------------------------------------------------- +.. _whatsnew_222.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_222.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v2.2.1..v2.2.2|HEAD diff -Nru pandas-2.1.4+dfsg/environment.yml pandas-2.2.2+dfsg/environment.yml --- pandas-2.1.4+dfsg/environment.yml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/environment.yml 2024-04-10 17:42:52.000000000 +0000 @@ -8,7 +8,7 @@ # build dependencies - versioneer[toml] - - cython=0.29.33 + - cython=3.0.5 - meson[ninja]=1.2.1 - meson-python=0.13.1 @@ -16,6 +16,8 @@ - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 + - pytest-qt>=4.2.0 + - pyqt>=5.15.9 - coverage # required dependencies @@ -24,37 +26,40 @@ - pytz # optional dependencies - - beautifulsoup4>=4.11.1 + - beautifulsoup4>=4.11.2 + # https://github.com/conda-forge/pytables-feedstock/issues/97 + - c-blosc2=2.13.2 - blosc - - bottleneck>=1.3.4 - - fastparquet>=0.8.1 - - fsspec>=2022.05.0 + - bottleneck>=1.3.6 + - fastparquet>=2022.12.0 + - fsspec>=2022.11.0 - html5lib>=1.1 - hypothesis>=6.46.1 - - gcsfs>=2022.05.0 + - gcsfs>=2022.11.0 - ipython - jinja2>=3.1.2 - - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.8 - - numba>=0.55.2 - - numexpr>=2.8.0 - - openpyxl>=3.0.10 + - lxml>=4.9.2 + - matplotlib>=3.6.3 + - numba>=0.56.4 + - numexpr>=2.8.4 + - openpyxl>=3.1.0 - odfpy>=1.4.1 - py - - psycopg2>=2.9.3 - - pyarrow>=7.0.0 + - psycopg2>=2.9.6 + - pyarrow>=10.0.1 - pymysql>=1.0.2 - - pyreadstat>=1.1.5 - - pytables>=3.7.0 - - pyxlsb>=1.0.9 - - s3fs>=2022.05.0 - - scipy>=1.8.1 - - sqlalchemy>=1.4.36 - - tabulate>=0.8.10 - - xarray>=2022.03.0 + - pyreadstat>=1.2.0 + - pytables>=3.8.0 + - python-calamine>=0.1.7 + - pyxlsb>=1.0.10 + - s3fs>=2022.11.0 + - scipy>=1.10.0 + - sqlalchemy>=2.0.0 + - tabulate>=0.9.0 + - xarray>=2022.12.0 - xlrd>=2.0.1 - - xlsxwriter>=3.0.3 - - zstandard>=0.17.0 + - xlsxwriter>=3.0.5 + - zstandard>=0.19.0 # downstream packages - dask-core @@ -65,17 +70,17 @@ - flask # benchmarks - - asv>=0.5.1 + - asv>=0.6.1 ## The compiler packages are meta-packages and install the correct compiler (activation) packages on the respective platforms. 
- c-compiler - cxx-compiler # code checks - - flake8=6.0.0 # run in subprocess over docstring examples - - mypy=1.4.1 # pre-commit uses locally installed mypy + - flake8=6.1.0 # run in subprocess over docstring examples + - mypy=1.8.0 # pre-commit uses locally installed mypy - tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py - - pre-commit>=2.15.0 + - pre-commit>=3.6.0 # documentation - gitpython # obtain contributors from git for whatsnew @@ -83,7 +88,7 @@ - google-auth - natsort # DataFrame.sort_values doctest - numpydoc - - pydata-sphinx-theme=0.13 + - pydata-sphinx-theme=0.14 - pytest-cython # doctest - sphinx - sphinx-design @@ -95,16 +100,16 @@ - types-setuptools # documentation (jupyter notebooks) - - nbconvert>=6.4.5 + - nbconvert>=7.11.0 - nbsphinx - pandoc - ipywidgets - nbformat - - notebook>=6.0.3 + - notebook>=7.0.6 - ipykernel # web - - jinja2 # in optional dependencies, but documented here as needed + # - jinja2 # already listed in optional dependencies, but documented here for reference - markdown - feedparser - pyyaml @@ -112,7 +117,8 @@ - pygments # Code highlighting - pip: + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - dataframe-api-compat>=0.1.7 - - sphinx-toggleprompt # conda-forge version has stricter pins on jinja2 - typing_extensions; python_version<"3.11" - - tzdata>=2022.1 + - tzdata>=2022.7 diff -Nru pandas-2.1.4+dfsg/generate_pxi.py pandas-2.2.2+dfsg/generate_pxi.py --- pandas-2.1.4+dfsg/generate_pxi.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/generate_pxi.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,7 +4,7 @@ from Cython import Tempita -def process_tempita(pxifile, outfile): +def process_tempita(pxifile, outfile) -> None: with open(pxifile, encoding="utf-8") as f: tmpl = f.read() pyxcontent = Tempita.sub(tmpl) @@ -13,7 +13,7 @@ f.write(pyxcontent) -def main(): +def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("infile", type=str, help="Path to the input file") parser.add_argument("-o", "--outdir", type=str, help="Path to the output directory") diff -Nru pandas-2.1.4+dfsg/generate_version.py pandas-2.2.2+dfsg/generate_version.py --- pandas-2.1.4+dfsg/generate_version.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/generate_version.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,7 +10,7 @@ sys.path.insert(0, "") -def write_version_info(path): +def write_version_info(path) -> None: version = None git_version = None @@ -29,7 +29,7 @@ file.write(f'__git_version__="{git_version}"\n') -def main(): +def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( "-o", diff -Nru pandas-2.1.4+dfsg/meson.build pandas-2.2.2+dfsg/meson.build --- pandas-2.1.4+dfsg/meson.build 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/meson.build 2024-04-10 17:42:52.000000000 +0000 @@ -7,7 +7,8 @@ meson_version: '>=1.2.1', default_options: [ 'buildtype=release', - 'c_std=c99' + 'c_std=c11', + 'warning_level=2', ] ) diff -Nru pandas-2.1.4+dfsg/pandas/__init__.py pandas-2.2.2+dfsg/pandas/__init__.py --- pandas-2.1.4+dfsg/pandas/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,5 +1,8 @@ from __future__ import annotations +import os +import warnings + __docformat__ = "restructuredtext" # Let users know if they're missing any of our hard dependencies @@ -21,7 +24,7 @@ try: # numpy compat from pandas.compat import ( - is_numpy_dev as _is_numpy_dev, # pyright: ignore[reportUnusedImport] # noqa: F401,E501 
+ is_numpy_dev as _is_numpy_dev, # pyright: ignore[reportUnusedImport] # noqa: F401 ) except ImportError as _err: # pragma: no cover _module = _err.name @@ -190,6 +193,17 @@ __git_version__ = v.get("full-revisionid") del get_versions, v +# GH#55043 - deprecation of the data_manager option +if "PANDAS_DATA_MANAGER" in os.environ: + warnings.warn( + "The env variable PANDAS_DATA_MANAGER is set. The data_manager option is " + "deprecated and will be removed in a future version. Only the BlockManager " + "will be available. Unset this environment variable to silence this warning.", + FutureWarning, + stacklevel=2, + ) + +del warnings, os # module level doc-string __doc__ = """ diff -Nru pandas-2.1.4+dfsg/pandas/_config/__init__.py pandas-2.2.2+dfsg/pandas/_config/__init__.py --- pandas-2.1.4+dfsg/pandas/_config/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_config/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,6 +15,7 @@ "option_context", "options", "using_copy_on_write", + "warn_copy_on_write", ] from pandas._config import config from pandas._config import dates # pyright: ignore[reportUnusedImport] # noqa: F401 @@ -32,7 +33,18 @@ def using_copy_on_write() -> bool: _mode_options = _global_config["mode"] - return _mode_options["copy_on_write"] and _mode_options["data_manager"] == "block" + return ( + _mode_options["copy_on_write"] is True + and _mode_options["data_manager"] == "block" + ) + + +def warn_copy_on_write() -> bool: + _mode_options = _global_config["mode"] + return ( + _mode_options["copy_on_write"] == "warn" + and _mode_options["data_manager"] == "block" + ) def using_nullable_dtypes() -> bool: diff -Nru pandas-2.1.4+dfsg/pandas/_config/config.py pandas-2.2.2+dfsg/pandas/_config/config.py --- pandas-2.1.4+dfsg/pandas/_config/config.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_config/config.py 2024-04-10 17:42:52.000000000 +0000 @@ -220,6 +220,8 @@ class DictWrapper: """provide attribute-style access to a nested dict""" + d: dict[str, Any] + def __init__(self, d: dict[str, Any], prefix: str = "") -> None: object.__setattr__(self, "d", d) object.__setattr__(self, "prefix", prefix) @@ -250,7 +252,7 @@ else: return _get_option(prefix) - def __dir__(self) -> Iterable[str]: + def __dir__(self) -> list[str]: return list(self.d.keys()) diff -Nru pandas-2.1.4+dfsg/pandas/_libs/__init__.py pandas-2.2.2+dfsg/pandas/_libs/__init__.py --- pandas-2.1.4+dfsg/pandas/_libs/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,8 +13,8 @@ # Below imports needs to happen first to ensure pandas top level # module gets monkeypatched with the pandas_datetime_CAPI # see pandas_datetime_exec in pd_datetime.c -import pandas._libs.pandas_parser # noqa: E501 # isort: skip # type: ignore[reportUnusedImport] -import pandas._libs.pandas_datetime # noqa: F401,E501 # isort: skip # type: ignore[reportUnusedImport] +import pandas._libs.pandas_parser # isort: skip # type: ignore[reportUnusedImport] +import pandas._libs.pandas_datetime # noqa: F401 # isort: skip # type: ignore[reportUnusedImport] from pandas._libs.interval import Interval from pandas._libs.tslibs import ( NaT, diff -Nru pandas-2.1.4+dfsg/pandas/_libs/algos.pyx pandas-2.2.2+dfsg/pandas/_libs/algos.pyx --- pandas-2.1.4+dfsg/pandas/_libs/algos.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/algos.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -814,7 +814,7 @@ return True, True, True if 
timelike and arr[0] == NPY_NAT: - return False, False, True + return False, False, False if numeric_object_t is not object: with nogil: @@ -998,8 +998,7 @@ N = len(values) if labels is not None: - # TODO(cython3): cast won't be necessary (#2992) - assert len(labels) == N + assert len(labels) == N out = np.empty(N) grp_sizes = np.ones(N, dtype=np.int64) @@ -1145,107 +1144,7 @@ # that sorted value for retrieval back from the original # values / masked_vals arrays # TODO(cython3): de-duplicate once cython supports conditional nogil - if numeric_object_t is object: - with gil: - for i in range(N): - at_end = i == N - 1 - - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change. Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]], - masked_vals[sort_indexer[i+1]]) - - # We'll need this check later anyway to determine group size, so just - # compute it here since shortcircuiting won't help - group_changed = at_end or (check_labels and - (labels[sort_indexer[i]] - != labels[sort_indexer[i+1]])) - - # Update out only when there is a transition of values or labels. - # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the starting index of the current group (grp_start) - # and the current index - if (next_val_diff or group_changed or (check_mask and - (mask[sort_indexer[i]] - ^ mask[sort_indexer[i+1]]))): - - # If keep_na, check for missing values and assign back - # to the result where appropriate - if keep_na and check_mask and mask[sort_indexer[i]]: - grp_na_count = dups - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = NaN - elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = i - grp_start + 1 - - # With n as the previous rank in the group and m as the number - # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, - # then rankings should be n + 1, n + 2 ... n + m - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = j + 1 - grp_start - - # If TIEBREAK_FIRST and descending, the ranking should be - # n + m, n + (m - 1) ... n + 1. This is equivalent to - # (i - dups + 1) + (i - j + 1) - grp_start - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[sort_indexer[j]] = grp_vals_seen - - # Look forward to the next value (using the sorting in - # lexsort_indexer). If the value does not equal the current - # value then we need to reset the dups and sum_ranks, knowing - # that a new value is coming up. The conditional also needs - # to handle nan equality and the end of iteration. If group - # changes we do not record seeing a new value in the group - if not group_changed and (next_val_diff or (check_mask and - (mask[sort_indexer[i]] - ^ mask[sort_indexer[i+1]]))): - dups = sum_ranks = 0 - grp_vals_seen += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. 
If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. Fill in the size of each - # group encountered (used by pct calculations later). Also be - # sure to reset any of the items helping to calculate dups - if group_changed: - - # If not dense tiebreak, group size used to compute - # percentile will be # of non-null elements in group - if tiebreak != TIEBREAK_DENSE: - grp_size = i - grp_start + 1 - grp_na_count - - # Otherwise, it will be the number of distinct values - # in the group, subtracting 1 if NaNs are present - # since that is a distinct value we shouldn't count - else: - grp_size = grp_vals_seen - (grp_na_count > 0) - - for j in range(grp_start, i + 1): - grp_sizes[sort_indexer[j]] = grp_size - - dups = sum_ranks = 0 - grp_na_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 - else: + with gil(numeric_object_t is object): for i in range(N): at_end = i == N - 1 @@ -1255,8 +1154,12 @@ dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or (masked_vals[sort_indexer[i]] - != masked_vals[sort_indexer[i+1]]) + if numeric_object_t is object: + next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]], + masked_vals[sort_indexer[i+1]]) + else: + next_val_diff = at_end or (masked_vals[sort_indexer[i]] + != masked_vals[sort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help @@ -1269,10 +1172,9 @@ # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if (next_val_diff or group_changed - or (check_mask and - (mask[sort_indexer[i]] ^ mask[sort_indexer[i+1]]))): - + if (next_val_diff or group_changed or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): # If keep_na, check for missing values and assign back # to the result where appropriate if keep_na and check_mask and mask[sort_indexer[i]]: @@ -1483,7 +1385,7 @@ cdef: Py_ssize_t i, j, sx, sy, start, stop bint f_contig = arr.flags.f_contiguous - # bint f_contig = arr.is_f_contig() # TODO(cython3) + # bint f_contig = arr.is_f_contig() # TODO(cython3) once arr is memoryview diff_t left, right # Disable for unsupported dtype combinations, diff -Nru pandas-2.1.4+dfsg/pandas/_libs/algos_take_helper.pxi.in pandas-2.2.2+dfsg/pandas/_libs/algos_take_helper.pxi.in --- pandas-2.1.4+dfsg/pandas/_libs/algos_take_helper.pxi.in 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/algos_take_helper.pxi.in 2024-04-10 17:42:52.000000000 +0000 @@ -184,6 +184,17 @@ fv = fill_value + {{if c_type_in == c_type_out != "object"}} + with nogil: + for i in range(n): + for j in range(k): + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + {{else}} for i in range(n): for j in range(k): idx = indexer[j] @@ -195,6 +206,7 @@ {{else}} out[i, j] = values[i, idx] {{endif}} + {{endif}} @cython.wraparound(False) diff -Nru pandas-2.1.4+dfsg/pandas/_libs/arrays.pyi pandas-2.2.2+dfsg/pandas/_libs/arrays.pyi --- pandas-2.1.4+dfsg/pandas/_libs/arrays.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/arrays.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -26,7 +26,7 @@ def size(self) -> int: ... @property def nbytes(self) -> int: ... - def copy(self): ... + def copy(self, order=...): ... def delete(self, loc, axis=...): ... def swapaxes(self, axis1, axis2): ... 
def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ... diff -Nru pandas-2.1.4+dfsg/pandas/_libs/arrays.pyx pandas-2.2.2+dfsg/pandas/_libs/arrays.pyx --- pandas-2.1.4+dfsg/pandas/_libs/arrays.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/arrays.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -126,8 +126,7 @@ @property def size(self) -> int: - # TODO(cython3): use self._ndarray.size - return cnp.PyArray_SIZE(self._ndarray) + return self._ndarray.size @property def nbytes(self) -> int: diff -Nru pandas-2.1.4+dfsg/pandas/_libs/dtypes.pxd pandas-2.2.2+dfsg/pandas/_libs/dtypes.pxd --- pandas-2.1.4+dfsg/pandas/_libs/dtypes.pxd 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/dtypes.pxd 2024-04-10 17:42:52.000000000 +0000 @@ -34,3 +34,8 @@ ctypedef fused numeric_object_t: numeric_t object + +ctypedef fused uint8_int64_object_t: + uint8_t + int64_t + object diff -Nru pandas-2.1.4+dfsg/pandas/_libs/groupby.pyi pandas-2.2.2+dfsg/pandas/_libs/groupby.pyi --- pandas-2.1.4+dfsg/pandas/_libs/groupby.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/groupby.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -44,7 +44,6 @@ labels: np.ndarray, # ndarray[int64_t] sorted_labels: npt.NDArray[np.intp], mask: npt.NDArray[np.uint8], - direction: Literal["ffill", "bfill"], limit: int, # int64_t dropna: bool, ) -> None: ... @@ -55,7 +54,7 @@ mask: np.ndarray, # const uint8_t[::1] val_test: Literal["any", "all"], skipna: bool, - nullable: bool, + result_mask: np.ndarray | None, ) -> None: ... def group_sum( out: np.ndarray, # complexfloatingintuint_t[:, ::1] @@ -137,6 +136,7 @@ result_mask: npt.NDArray[np.bool_] | None = ..., min_count: int = ..., # Py_ssize_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_nth( out: np.ndarray, # rank_t[:, ::1] @@ -148,6 +148,7 @@ min_count: int = ..., # int64_t rank: int = ..., # int64_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_rank( out: np.ndarray, # float64_t[:, ::1] @@ -181,6 +182,18 @@ mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., ) -> None: ... +def group_idxmin_idxmax( + out: npt.NDArray[np.intp], + counts: npt.NDArray[np.int64], + values: np.ndarray, # ndarray[groupby_t, ndim=2] + labels: npt.NDArray[np.intp], + min_count: int = ..., + is_datetimelike: bool = ..., + mask: np.ndarray | None = ..., + name: str = ..., + skipna: bool = ..., + result_mask: np.ndarray | None = ..., +) -> None: ... def group_cummin( out: np.ndarray, # groupby_t[:, ::1] values: np.ndarray, # ndarray[groupby_t, ndim=2] diff -Nru pandas-2.1.4+dfsg/pandas/_libs/groupby.pyx pandas-2.2.2+dfsg/pandas/_libs/groupby.pyx --- pandas-2.1.4+dfsg/pandas/_libs/groupby.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/groupby.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -695,54 +695,37 @@ N, K = (values).shape - if sum_t is object: - # NB: this does not use 'compensation' like the non-object track does. + with nogil(sum_t is not object): for i in range(N): lab = labels[i] if lab < 0: continue counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if not checknull(val): - nobs[lab, j] += 1 - - if nobs[lab, j] == 1: - # i.e. we haven't added anything yet; avoid TypeError - # if e.g. 
val is a str and sumx[lab, j] is 0 - t = val - else: - t = sumx[lab, j] + val - sumx[lab, j] = t - for i in range(ncounts): for j in range(K): - if nobs[i, j] < min_count: - out[i, j] = None + val = values[i, j] + if uses_mask: + isna_entry = mask[i, j] else: - out[i, j] = sumx[i, j] - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue + isna_entry = _treat_as_na(val, is_datetimelike) - counts[lab] += 1 - for j in range(K): - val = values[i, j] + if not isna_entry: + nobs[lab, j] += 1 - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if sum_t is object: + # NB: this does not use 'compensation' like the non-object + # track does. + if nobs[lab, j] == 1: + # i.e. we haven't added anything yet; avoid TypeError + # if e.g. val is a str and sumx[lab, j] is 0 + t = val + else: + t = sumx[lab, j] + val + sumx[lab, j] = t - if not isna_entry: - nobs[lab, j] += 1 + else: y = val - compensation[lab, j] t = sumx[lab, j] + y compensation[lab, j] = t - sumx[lab, j] - y @@ -755,9 +738,9 @@ compensation[lab, j] = 0 sumx[lab, j] = t - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx - ) + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx + ) @cython.wraparound(False) @@ -809,9 +792,9 @@ nobs[lab, j] += 1 prodx[lab, j] *= val - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx - ) + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx + ) @cython.wraparound(False) @@ -1320,9 +1303,8 @@ cdef bint _treat_as_na(numeric_object_complex_t val, bint is_datetimelike) noexcept nogil: if numeric_object_complex_t is object: - # Should never be used, but we need to avoid the `val != val` below - # or else cython will raise about gil acquisition. - raise NotImplementedError + with gil: + return checknull(val) elif numeric_object_complex_t is int64_t: return is_datetimelike and val == NPY_NAT @@ -1369,7 +1351,7 @@ ctypedef fused mincount_t: - numeric_t + numeric_object_t complex64_t complex128_t @@ -1385,7 +1367,7 @@ int64_t[:, ::1] nobs, int64_t min_count, mincount_t[:, ::1] resx, -) noexcept nogil: +) noexcept: """ Check if the number of observations for a group is below min_count, and if so set the result for that group to the appropriate NA-like value. @@ -1393,38 +1375,40 @@ cdef: Py_ssize_t i, j - for i in range(ncounts): - for j in range(K): - - if nobs[i, j] < min_count: - # if we are integer dtype, not is_datetimelike, and - # not uses_mask, then getting here implies that - # counts[i] < min_count, which means we will - # be cast to float64 and masked at the end - # of WrappedCythonOp._call_cython_op. So we can safely - # set a placeholder value in out[i, j]. - if uses_mask: - result_mask[i, j] = True - # set out[i, j] to 0 to be deterministic, as - # it was initialized with np.empty. Also ensures - # we can downcast out if appropriate. - out[i, j] = 0 - elif ( - mincount_t is float32_t - or mincount_t is float64_t - or mincount_t is complex64_t - or mincount_t is complex128_t - ): - out[i, j] = NAN - elif mincount_t is int64_t: - # Per above, this is a placeholder in - # non-is_datetimelike cases. 
- out[i, j] = NPY_NAT + with nogil(mincount_t is not object): + for i in range(ncounts): + for j in range(K): + if nobs[i, j] >= min_count: + out[i, j] = resx[i, j] else: - # placeholder, see above - out[i, j] = 0 - else: - out[i, j] = resx[i, j] + # if we are integer dtype, not is_datetimelike, and + # not uses_mask, then getting here implies that + # counts[i] < min_count, which means we will + # be cast to float64 and masked at the end + # of WrappedCythonOp._call_cython_op. So we can safely + # set a placeholder value in out[i, j]. + if uses_mask: + result_mask[i, j] = True + # set out[i, j] to 0 to be deterministic, as + # it was initialized with np.empty. Also ensures + # we can downcast out if appropriate. + out[i, j] = 0 + elif ( + mincount_t is float32_t + or mincount_t is float64_t + or mincount_t is complex64_t + or mincount_t is complex128_t + ): + out[i, j] = NAN + elif mincount_t is int64_t: + # Per above, this is a placeholder in + # non-is_datetimelike cases. + out[i, j] = NPY_NAT + elif mincount_t is object: + out[i, j] = None + else: + # placeholder, see above + out[i, j] = 0 # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can @@ -1440,6 +1424,7 @@ uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1452,9 +1437,7 @@ bint uses_mask = mask is not None bint isna_entry - # TODO(cython3): - # Instead of `labels.shape[0]` use `len(labels)` - if not len(values) == labels.shape[0]: + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) @@ -1466,8 +1449,7 @@ N, K = (values).shape - if numeric_object_t is object: - # TODO(cython3): De-duplicate once conditional-nogil is available + with nogil(numeric_object_t is not object): for i in range(N): lab = labels[i] if lab < 0: @@ -1477,46 +1459,23 @@ for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = checknull(val) - - if not isna_entry: - # TODO(cython3): use _treat_as_na here once - # conditional-nogil is available. 
- nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] < min_count: - out[i, j] = None - else: - out[i, j] = resx[i, j] - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - + if skipna: if uses_mask: isna_entry = mask[i, j] else: isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue + + nobs[lab, j] += 1 + resx[lab, j] = val - if not isna_entry: - nobs[lab, j] += 1 - resx[lab, j] = val - - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx - ) + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] + + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx + ) # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can @@ -1533,6 +1492,7 @@ int64_t min_count=-1, int64_t rank=1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1545,9 +1505,7 @@ bint uses_mask = mask is not None bint isna_entry - # TODO(cython3): - # Instead of `labels.shape[0]` use `len(labels)` - if not len(values) == labels.shape[0]: + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) @@ -1559,8 +1517,7 @@ N, K = (values).shape - if numeric_object_t is object: - # TODO(cython3): De-duplicate once conditional-nogil is available + with nogil(numeric_object_t is not object): for i in range(N): lab = labels[i] if lab < 0: @@ -1570,49 +1527,23 @@ for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = checknull(val) - - if not isna_entry: - # TODO(cython3): use _treat_as_na here once - # conditional-nogil is available. 
- nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] < min_count: - out[i, j] = None - else: - out[i, j] = resx[i, j] - - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - + if skipna: if uses_mask: isna_entry = mask[i, j] else: isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue + + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] - if not isna_entry: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx - ) + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx + ) @cython.boundscheck(False) @@ -1752,9 +1683,7 @@ bint uses_mask = mask is not None bint isna_entry - # TODO(cython3): - # Instead of `labels.shape[0]` use `len(labels)` - if not len(values) == labels.shape[0]: + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) @@ -1789,9 +1718,123 @@ if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val - _check_below_mincount( - out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max - ) + _check_below_mincount( + out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max + ) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_idxmin_idxmax( + intp_t[:, ::1] out, + int64_t[::1] counts, + ndarray[numeric_object_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + str name="idxmin", + bint skipna=True, + uint8_t[:, ::1] result_mask=None, +): + """ + Compute index of minimum/maximum of columns of `values`, in row groups `labels`. + + This function only computes the row number where the minimum/maximum occurs, we'll + take the corresponding index value after this function. + + Parameters + ---------- + out : np.ndarray[intp, ndim=2] + Array to store result in. + counts : np.ndarray[int64] + Input as a zeroed array, populated by group sizes during algorithm + values : np.ndarray[numeric_object_t, ndim=2] + Values to find column-wise min/max of. + labels : np.ndarray[np.intp] + Labels to group by. + min_count : Py_ssize_t, default -1 + The minimum number of non-NA group elements, NA result if threshold + is not met. + is_datetimelike : bool + True if `values` contains datetime-like entries. + name : {"idxmin", "idxmax"}, default "idxmin" + Whether to compute idxmin or idxmax. + mask : ndarray[bool, ndim=2], optional + If not None, indices represent missing values, + otherwise the mask will not be used + skipna : bool, default True + Flag to ignore nan values during truth testing + result_mask : ndarray[bool, ndim=2], optional + If not None, these specify locations in the output that are NA. + Modified in-place. + + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. 
+ `counts` is modified to hold group sizes + """ + cdef: + Py_ssize_t i, j, N, K, lab + numeric_object_t val + numeric_object_t[:, ::1] group_min_or_max + uint8_t[:, ::1] seen + bint uses_mask = mask is not None + bint isna_entry + bint compute_max = name == "idxmax" + + assert name == "idxmin" or name == "idxmax" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + N, K = (values).shape + + if numeric_object_t is object: + group_min_or_max = np.empty((out).shape, dtype=object) + seen = np.zeros((out).shape, dtype=np.uint8) + else: + group_min_or_max = np.empty_like(out, dtype=values.dtype) + seen = np.zeros_like(out, dtype=np.uint8) + + # When using transform, we need a valid value for take in the case + # a category is not observed; these values will be dropped + out[:] = 0 + + with nogil(numeric_object_t is not object): + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + for j in range(K): + if not skipna and out[lab, j] == -1: + # Once we've hit NA there is no going back + continue + + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + + if isna_entry: + if not skipna or not seen[lab, j]: + out[lab, j] = -1 + else: + if not seen[lab, j]: + seen[lab, j] = True + group_min_or_max[lab, j] = val + out[lab, j] = i + elif compute_max: + if val > group_min_or_max[lab, j]: + group_min_or_max[lab, j] = val + out[lab, j] = i + else: + if val < group_min_or_max[lab, j]: + group_min_or_max[lab, j] = val + out[lab, j] = i @cython.wraparound(False) diff -Nru pandas-2.1.4+dfsg/pandas/_libs/hashtable.pyi pandas-2.2.2+dfsg/pandas/_libs/hashtable.pyi --- pandas-2.1.4+dfsg/pandas/_libs/hashtable.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/hashtable.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -20,7 +20,6 @@ def factorize( self, values: np.ndarray, - sort: bool = ..., na_sentinel=..., na_value=..., mask=..., @@ -157,9 +156,9 @@ def __contains__(self, key: Hashable) -> bool: ... def sizeof(self, deep: bool = ...) -> int: ... def get_state(self) -> dict[str, int]: ... - # TODO: `item` type is subclass-specific - def get_item(self, item): ... # TODO: return type? - def set_item(self, item, val) -> None: ... + # TODO: `val/key` type is subclass-specific + def get_item(self, val): ... # TODO: return type? + def set_item(self, key, val) -> None: ... def get_na(self): ... # TODO: return type? def set_na(self, val) -> None: ... def map_locations( @@ -185,6 +184,7 @@ self, values: np.ndarray, # np.ndarray[subclass-specific] return_inverse: bool = ..., + mask=..., ) -> ( tuple[ np.ndarray, # np.ndarray[subclass-specific] @@ -198,6 +198,7 @@ na_sentinel: int = ..., na_value: object = ..., mask=..., + ignore_na: bool = True, ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific] class Complex128HashTable(HashTable): ... @@ -240,7 +241,7 @@ values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = ..., -) -> tuple[np.ndarray, npt.NDArray[np.int64]]: ... # np.ndarray[same-as-values] +) -> tuple[np.ndarray, npt.NDArray[np.int64], int]: ... 
# np.ndarray[same-as-values] # arr and values should have same dtype def ismember( diff -Nru pandas-2.1.4+dfsg/pandas/_libs/hashtable_class_helper.pxi.in pandas-2.2.2+dfsg/pandas/_libs/hashtable_class_helper.pxi.in --- pandas-2.1.4+dfsg/pandas/_libs/hashtable_class_helper.pxi.in 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/hashtable_class_helper.pxi.in 2024-04-10 17:42:52.000000000 +0000 @@ -1239,9 +1239,10 @@ na_value=na_value, ignore_na=ignore_na, return_inverse=True) + # Add unused mask parameter for compat with other signatures def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, object mask=None): # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, @@ -1496,9 +1497,10 @@ na_value=na_value, ignore_na=ignore_na, return_inverse=True) + # Add unused mask parameter for compat with other signatures def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, object mask=None): # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, diff -Nru pandas-2.1.4+dfsg/pandas/_libs/hashtable_func_helper.pxi.in pandas-2.2.2+dfsg/pandas/_libs/hashtable_func_helper.pxi.in --- pandas-2.1.4+dfsg/pandas/_libs/hashtable_func_helper.pxi.in 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/hashtable_func_helper.pxi.in 2024-04-10 17:42:52.000000000 +0000 @@ -36,7 +36,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None): {{endif}} cdef: - Py_ssize_t i = 0 + Py_ssize_t i = 0, na_counter = 0, na_add = 0 Py_ssize_t n = len(values) kh_{{ttype}}_t *table @@ -49,9 +49,6 @@ bint uses_mask = mask is not None bint isna_entry = False - if uses_mask and not dropna: - raise NotImplementedError("uses_mask not implemented with dropna=False") - # we track the order in which keys are first seen (GH39009), # khash-map isn't insertion-ordered, thus: # table maps keys to counts @@ -82,25 +79,31 @@ for i in range(n): val = {{to_c_type}}(values[i]) + if uses_mask: + isna_entry = mask[i] + if dropna: - if uses_mask: - isna_entry = mask[i] - else: + if not uses_mask: isna_entry = is_nan_{{c_type}}(val) if not dropna or not isna_entry: - k = kh_get_{{ttype}}(table, val) - if k != table.n_buckets: - table.vals[k] += 1 + if uses_mask and isna_entry: + na_counter += 1 else: - k = kh_put_{{ttype}}(table, val, &ret) - table.vals[k] = 1 - result_keys.append(val) + k = kh_get_{{ttype}}(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_{{ttype}}(table, val, &ret) + table.vals[k] = 1 + result_keys.append(val) {{endif}} # collect counts in the order corresponding to result_keys: + if na_counter > 0: + na_add = 1 cdef: - int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64) + int64_t[::1] result_counts = np.empty(table.size + na_add, dtype=np.int64) for i in range(table.size): {{if dtype == 'object'}} @@ -110,9 +113,13 @@ {{endif}} result_counts[i] = table.vals[k] + if na_counter > 0: + result_counts[table.size] = na_counter + result_keys.append(val) + kh_destroy_{{ttype}}(table) - return result_keys.to_array(), result_counts.base + return result_keys.to_array(), result_counts.base, na_counter @cython.wraparound(False) @@ 
-397,12 +404,13 @@ cdef: ndarray[htfunc_t] keys ndarray[htfunc_t] modes + ndarray[uint8_t] res_mask = None int64_t[::1] counts - int64_t count, max_count = -1 - Py_ssize_t nkeys, k, j = 0 + int64_t count, _, max_count = -1 + Py_ssize_t nkeys, k, na_counter, j = 0 - keys, counts = value_count(values, dropna, mask=mask) + keys, counts, na_counter = value_count(values, dropna, mask=mask) nkeys = len(keys) modes = np.empty(nkeys, dtype=values.dtype) @@ -433,7 +441,10 @@ modes[j] = keys[k] - return modes[:j + 1] + if na_counter > 0: + res_mask = np.zeros(j+1, dtype=np.bool_) + res_mask[j] = True + return modes[:j + 1], res_mask {{py: diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/datetime/date_conversions.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/datetime/date_conversions.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/datetime/date_conversions.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/datetime/date_conversions.h 2024-04-10 17:42:52.000000000 +0000 @@ -12,19 +12,13 @@ #include // Scales value inplace from nanosecond resolution to unit resolution -int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit); +int scaleNanosecToUnit(int64_t *value, NPY_DATETIMEUNIT unit); // Converts an int64 object representing a date to ISO format // up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z // while base="ns" yields "2020-01-01T00:00:00.000000000Z" // len is mutated to save the length of the returned string -char *int64ToIso(int64_t value, - NPY_DATETIMEUNIT valueUnit, - NPY_DATETIMEUNIT base, - size_t *len); - -// TODO(username): this function doesn't do a lot; should augment or -// replace with scaleNanosecToUnit -npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base); +char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit, + NPY_DATETIMEUNIT base, size_t *len); char *int64ToIsoDuration(int64_t value, size_t *len); diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/datetime/pd_datetime.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/datetime/pd_datetime.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/datetime/pd_datetime.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/datetime/pd_datetime.h 2024-04-10 17:42:52.000000000 +0000 @@ -19,12 +19,12 @@ #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#endif // NPY_NO_DEPRECATED_API +#endif // NPY_NO_DEPRECATED_API -#include +#include "pandas/datetime/date_conversions.h" #include "pandas/vendored/numpy/datetime/np_datetime.h" #include "pandas/vendored/numpy/datetime/np_datetime_strings.h" -#include "pandas/datetime/date_conversions.h" +#include #ifdef __cplusplus extern "C" { @@ -33,9 +33,8 @@ typedef struct { npy_datetime (*npy_datetimestruct_to_datetime)(NPY_DATETIMEUNIT, const npy_datetimestruct *); - int (*scaleNanosecToUnit)(npy_int64 *, NPY_DATETIMEUNIT); + int (*scaleNanosecToUnit)(int64_t *, NPY_DATETIMEUNIT); char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, size_t *); - npy_datetime (*NpyDateTimeToEpoch)(npy_datetime, NPY_DATETIMEUNIT); char *(*PyDateTimeToIso)(PyObject *, NPY_DATETIMEUNIT, size_t *); npy_datetime (*PyDateTimeToEpoch)(PyObject *, NPY_DATETIMEUNIT); char *(*int64ToIsoDuration)(int64_t, size_t *); @@ -51,7 +50,7 @@ NPY_DATETIMEUNIT *, int *, int *, const char *, int, FormatRequirement); int (*get_datetime_iso_8601_strlen)(int, NPY_DATETIMEUNIT); - int (*make_iso_8601_datetime)(npy_datetimestruct *, char *, int, int, + int 
(*make_iso_8601_datetime)(npy_datetimestruct *, char *, size_t, int, NPY_DATETIMEUNIT); int (*make_iso_8601_timedelta)(pandas_timedeltastruct *, char *, size_t *); } PandasDateTime_CAPI; diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/inline_helper.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/inline_helper.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/inline_helper.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/inline_helper.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -/* -Copyright (c) 2016, PyData Development Team -All rights reserved. - -Distributed under the terms of the BSD Simplified License. - -The full license is in the LICENSE file, distributed with this software. -*/ - -#pragma once - -#ifndef PANDAS_INLINE - #if defined(__clang__) - #define PANDAS_INLINE static __inline__ __attribute__ ((__unused__)) - #elif defined(__GNUC__) - #define PANDAS_INLINE static __inline__ - #elif defined(_MSC_VER) - #define PANDAS_INLINE static __inline - #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L - #define PANDAS_INLINE static inline - #else - #define PANDAS_INLINE - #endif // __GNUC__ -#endif // PANDAS_INLINE diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/parser/io.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/parser/io.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/parser/io.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/parser/io.h 2024-04-10 17:42:52.000000000 +0000 @@ -10,22 +10,22 @@ #pragma once #define PY_SSIZE_T_CLEAN -#include #include "tokenizer.h" +#include #define FS(source) ((file_source *)source) typedef struct _rd_source { - PyObject *obj; - PyObject *buffer; - size_t position; + PyObject *obj; + PyObject *buffer; + size_t position; } rd_source; #define RDS(source) ((rd_source *)source) void *new_rd_source(PyObject *obj); -int del_rd_source(void *src); +void del_rd_source(void *src); -void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, +char *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors); diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/parser/pd_parser.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/parser/pd_parser.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/parser/pd_parser.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/parser/pd_parser.h 2024-04-10 17:42:52.000000000 +0000 @@ -13,15 +13,15 @@ #endif #define PY_SSIZE_T_CLEAN -#include #include "pandas/parser/tokenizer.h" +#include typedef struct { int (*to_double)(char *, double *, char, char, int *); int (*floatify)(PyObject *, double *, int *); void *(*new_rd_source)(PyObject *); - int (*del_rd_source)(void *); - void *(*buffer_rd_bytes)(void *, size_t, size_t *, int *, const char *); + void (*del_rd_source)(void *); + char *(*buffer_rd_bytes)(void *, size_t, size_t *, int *, const char *); void (*uint_state_init)(uint_state *); int (*uint64_conflict)(uint_state *); void (*coliter_setup)(coliter_t *, parser_t *, int64_t, int64_t); @@ -30,7 +30,7 @@ void (*parser_free)(parser_t *); void (*parser_del)(parser_t *); int (*parser_add_skiprow)(parser_t *, int64_t); - int (*parser_set_skipfirstnrows)(parser_t *, int64_t); + void (*parser_set_skipfirstnrows)(parser_t *, int64_t); void (*parser_set_default_options)(parser_t *); int (*parser_consume_rows)(parser_t *, size_t); int (*parser_trim_buffers)(parser_t *); @@ -81,11 +81,10 @@ 
PandasParserAPI->parser_set_default_options((self)) #define parser_consume_rows(self, nrows) \ PandasParserAPI->parser_consume_rows((self), (nrows)) -#define parser_trim_buffers(self) \ - PandasParserAPI->parser_trim_buffers((self)) -#define tokenize_all_rows(self, encoding_errors) \ +#define parser_trim_buffers(self) PandasParserAPI->parser_trim_buffers((self)) +#define tokenize_all_rows(self, encoding_errors) \ PandasParserAPI->tokenize_all_rows((self), (encoding_errors)) -#define tokenize_nrows(self, nrows, encoding_errors) \ +#define tokenize_nrows(self, nrows, encoding_errors) \ PandasParserAPI->tokenize_nrows((self), (nrows), (encoding_errors)) #define str_to_int64(p_item, int_min, int_max, error, t_sep) \ PandasParserAPI->str_to_int64((p_item), (int_min), (int_max), (error), \ @@ -104,7 +103,7 @@ PandasParserAPI->round_trip((p), (q), (decimal), (sci), (tsep), \ (skip_trailing), (error), (maybe_int)) #define to_boolean(item, val) PandasParserAPI->to_boolean((item), (val)) -#endif /* !defined(_PANDAS_PARSER_IMPL) */ +#endif /* !defined(_PANDAS_PARSER_IMPL) */ #ifdef __cplusplus } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/parser/tokenizer.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/parser/tokenizer.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/parser/tokenizer.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/parser/tokenizer.h 2024-04-10 17:42:52.000000000 +0000 @@ -19,17 +19,12 @@ #define ERROR_INVALID_CHARS 3 #include -#include "pandas/inline_helper.h" -#include "pandas/portable.h" - -#include "pandas/vendored/klib/khash.h" #define STREAM_INIT_SIZE 32 #define REACHED_EOF 1 #define CALLING_READ_FAILED 2 - /* C flat file parsing low level code for pandas / NumPy @@ -46,7 +41,7 @@ #define TRACE(X) printf X; #else #define TRACE(X) -#endif // VERBOSE +#endif // VERBOSE #define PARSER_OUT_OF_MEMORY -1 @@ -56,131 +51,127 @@ */ typedef enum { - START_RECORD, - START_FIELD, - ESCAPED_CHAR, - IN_FIELD, - IN_QUOTED_FIELD, - ESCAPE_IN_QUOTED_FIELD, - QUOTE_IN_QUOTED_FIELD, - EAT_CRNL, - EAT_CRNL_NOP, - EAT_WHITESPACE, - EAT_COMMENT, - EAT_LINE_COMMENT, - WHITESPACE_LINE, - START_FIELD_IN_SKIP_LINE, - IN_FIELD_IN_SKIP_LINE, - IN_QUOTED_FIELD_IN_SKIP_LINE, - QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE, - FINISHED + START_RECORD, + START_FIELD, + ESCAPED_CHAR, + IN_FIELD, + IN_QUOTED_FIELD, + ESCAPE_IN_QUOTED_FIELD, + QUOTE_IN_QUOTED_FIELD, + EAT_CRNL, + EAT_CRNL_NOP, + EAT_WHITESPACE, + EAT_COMMENT, + EAT_LINE_COMMENT, + WHITESPACE_LINE, + START_FIELD_IN_SKIP_LINE, + IN_FIELD_IN_SKIP_LINE, + IN_QUOTED_FIELD_IN_SKIP_LINE, + QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE, + FINISHED } ParserState; typedef enum { - QUOTE_MINIMAL, - QUOTE_ALL, - QUOTE_NONNUMERIC, - QUOTE_NONE + QUOTE_MINIMAL, + QUOTE_ALL, + QUOTE_NONNUMERIC, + QUOTE_NONE } QuoteStyle; -typedef enum { - ERROR, - WARN, - SKIP -} BadLineHandleMethod; +typedef enum { ERROR, WARN, SKIP } BadLineHandleMethod; -typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read, +typedef char *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors); -typedef int (*io_cleanup)(void *src); +typedef void (*io_cleanup)(void *src); typedef struct parser_t { - void *source; - io_callback cb_io; - io_cleanup cb_cleanup; - - int64_t chunksize; // Number of bytes to prepare for each chunk - char *data; // pointer to data to be processed - int64_t datalen; // amount of data available - int64_t datapos; - - // where to write out tokenized data - char *stream; 
- uint64_t stream_len; - uint64_t stream_cap; - - // Store words in (potentially ragged) matrix for now, hmm - char **words; - int64_t *word_starts; // where we are in the stream - uint64_t words_len; - uint64_t words_cap; - uint64_t max_words_cap; // maximum word cap encountered - - char *pword_start; // pointer to stream start of current field - int64_t word_start; // position start of current field - - int64_t *line_start; // position in words for start of line - int64_t *line_fields; // Number of fields in each line - uint64_t lines; // Number of (good) lines observed - uint64_t file_lines; // Number of lines (including bad or skipped) - uint64_t lines_cap; // Vector capacity - - // Tokenizing stuff - ParserState state; - int doublequote; /* is " represented by ""? */ - char delimiter; /* field separator */ - int delim_whitespace; /* delimit by consuming space/tabs instead */ - char quotechar; /* quote character */ - char escapechar; /* escape character */ - char lineterminator; - int skipinitialspace; /* ignore spaces following delimiter? */ - int quoting; /* style of quoting to write */ - - char commentchar; - int allow_embedded_newline; - - int usecols; // Boolean: 1: usecols provided, 0: none provided - - Py_ssize_t expected_fields; - BadLineHandleMethod on_bad_lines; - - // floating point options - char decimal; - char sci; - - // thousands separator (comma, period) - char thousands; - - int header; // Boolean: 1: has header, 0: no header - int64_t header_start; // header row start - uint64_t header_end; // header row end - - void *skipset; - PyObject *skipfunc; - int64_t skip_first_N_rows; - int64_t skip_footer; - double (*double_converter)(const char *, char **, - char, char, char, int, int *, int *); - - // error handling - char *warn_msg; - char *error_msg; + void *source; + io_callback cb_io; + io_cleanup cb_cleanup; + + int64_t chunksize; // Number of bytes to prepare for each chunk + char *data; // pointer to data to be processed + int64_t datalen; // amount of data available + int64_t datapos; + + // where to write out tokenized data + char *stream; + uint64_t stream_len; + uint64_t stream_cap; + + // Store words in (potentially ragged) matrix for now, hmm + char **words; + int64_t *word_starts; // where we are in the stream + uint64_t words_len; + uint64_t words_cap; + uint64_t max_words_cap; // maximum word cap encountered + + char *pword_start; // pointer to stream start of current field + int64_t word_start; // position start of current field + + int64_t *line_start; // position in words for start of line + int64_t *line_fields; // Number of fields in each line + uint64_t lines; // Number of (good) lines observed + uint64_t file_lines; // Number of lines (including bad or skipped) + uint64_t lines_cap; // Vector capacity + + // Tokenizing stuff + ParserState state; + int doublequote; /* is " represented by ""? */ + char delimiter; /* field separator */ + int delim_whitespace; /* delimit by consuming space/tabs instead */ + char quotechar; /* quote character */ + char escapechar; /* escape character */ + char lineterminator; + int skipinitialspace; /* ignore spaces following delimiter? 
*/ + int quoting; /* style of quoting to write */ + + char commentchar; + int allow_embedded_newline; + + int usecols; // Boolean: 1: usecols provided, 0: none provided + + Py_ssize_t expected_fields; + BadLineHandleMethod on_bad_lines; + + // floating point options + char decimal; + char sci; + + // thousands separator (comma, period) + char thousands; + + int header; // Boolean: 1: has header, 0: no header + int64_t header_start; // header row start + uint64_t header_end; // header row end + + void *skipset; + PyObject *skipfunc; + int64_t skip_first_N_rows; + int64_t skip_footer; + double (*double_converter)(const char *, char **, char, char, char, int, + int *, int *); + + // error handling + char *warn_msg; + char *error_msg; - int skip_empty_lines; + int skip_empty_lines; } parser_t; typedef struct coliter_t { - char **words; - int64_t *line_start; - int64_t col; + char **words; + int64_t *line_start; + int64_t col; } coliter_t; void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start); -#define COLITER_NEXT(iter, word) \ - do { \ - const int64_t i = *iter.line_start++ + iter.col; \ - word = i >= *iter.line_start ? "" : iter.words[i]; \ - } while (0) +#define COLITER_NEXT(iter, word) \ + do { \ + const int64_t i = *iter.line_start++ + iter.col; \ + word = i >= *iter.line_start ? "" : iter.words[i]; \ + } while (0) parser_t *parser_new(void); @@ -192,7 +183,7 @@ int parser_add_skiprow(parser_t *self, int64_t row); -int parser_set_skipfirstnrows(parser_t *self, int64_t nrows); +void parser_set_skipfirstnrows(parser_t *self, int64_t nrows); void parser_free(parser_t *self); @@ -208,9 +199,9 @@ // and want to free memory from the token stream typedef struct uint_state { - int seen_sint; - int seen_uint; - int seen_null; + int seen_sint; + int seen_uint; + int seen_null; } uint_state; void uint_state_init(uint_state *self); @@ -223,9 +214,9 @@ int *error, char tsep); double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); -double precise_xstrtod(const char *p, char **q, char decimal, - char sci, char tsep, int skip_trailing, - int *error, int *maybe_int); +double precise_xstrtod(const char *p, char **q, char decimal, char sci, + char tsep, int skip_trailing, int *error, + int *maybe_int); // GH-15140 - round_trip requires and acquires the GIL on its own double round_trip(const char *p, char **q, char decimal, char sci, char tsep, diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/portable.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/portable.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/portable.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/portable.h 2024-04-10 17:42:52.000000000 +0000 @@ -16,9 +16,22 @@ #endif // GH-23516 - works around locale perf issues -// from MUSL libc, MIT Licensed - see LICENSES +// from MUSL libc, licence at LICENSES/MUSL_LICENSE #define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u) -#define getdigit_ascii(c, default) (isdigit_ascii(c) ? ((int)((c) - '0')) : default) +#define getdigit_ascii(c, default) \ + (isdigit_ascii(c) ? ((int)((c) - '0')) : default) #define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5)) #define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c) & 0x5f) : (c)) #define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? 
((c) | 0x20) : (c)) + +#if defined(_WIN32) +#define PD_FALLTHROUGH \ + do { \ + } while (0) /* fallthrough */ +#elif __has_attribute(__fallthrough__) +#define PD_FALLTHROUGH __attribute__((__fallthrough__)) +#else +#define PD_FALLTHROUGH \ + do { \ + } while (0) /* fallthrough */ +#endif diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/skiplist.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/skiplist.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/skiplist.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/skiplist.h 2024-04-10 17:42:52.000000000 +0000 @@ -19,279 +19,278 @@ #include #include #include -#include "pandas/inline_helper.h" -PANDAS_INLINE float __skiplist_nanf(void) { - const union { - int __i; - float __f; - } __bint = {0x7fc00000UL}; - return __bint.__f; +static inline float __skiplist_nanf(void) { + const union { + int __i; + float __f; + } __bint = {0x7fc00000UL}; + return __bint.__f; } #define PANDAS_NAN ((double)__skiplist_nanf()) -PANDAS_INLINE double Log2(double val) { return log(val) / log(2.); } +static inline double Log2(double val) { return log(val) / log(2.); } typedef struct node_t node_t; struct node_t { - node_t **next; - int *width; - double value; - int is_nil; - int levels; - int ref_count; + node_t **next; + int *width; + double value; + int is_nil; + int levels; + int ref_count; }; typedef struct { - node_t *head; - node_t **tmp_chain; - int *tmp_steps; - int size; - int maxlevels; + node_t *head; + node_t **tmp_chain; + int *tmp_steps; + int size; + int maxlevels; } skiplist_t; -PANDAS_INLINE double urand(void) { - return ((double)rand() + 1) / ((double)RAND_MAX + 2); +static inline double urand(void) { + return ((double)rand() + 1) / ((double)RAND_MAX + 2); } -PANDAS_INLINE int int_min(int a, int b) { return a < b ? a : b; } +static inline int int_min(int a, int b) { return a < b ? 
a : b; } -PANDAS_INLINE node_t *node_init(double value, int levels) { - node_t *result; - result = (node_t *)malloc(sizeof(node_t)); - if (result) { - result->value = value; - result->levels = levels; - result->is_nil = 0; - result->ref_count = 0; - result->next = (node_t **)malloc(levels * sizeof(node_t *)); - result->width = (int *)malloc(levels * sizeof(int)); - if (!(result->next && result->width) && (levels != 0)) { - free(result->next); - free(result->width); - free(result); - return NULL; - } +static inline node_t *node_init(double value, int levels) { + node_t *result; + result = (node_t *)malloc(sizeof(node_t)); + if (result) { + result->value = value; + result->levels = levels; + result->is_nil = 0; + result->ref_count = 0; + result->next = (node_t **)malloc(levels * sizeof(node_t *)); + result->width = (int *)malloc(levels * sizeof(int)); + if (!(result->next && result->width) && (levels != 0)) { + free(result->next); + free(result->width); + free(result); + return NULL; } - return result; + } + return result; } // do this ourselves -PANDAS_INLINE void node_incref(node_t *node) { ++(node->ref_count); } +static inline void node_incref(node_t *node) { ++(node->ref_count); } -PANDAS_INLINE void node_decref(node_t *node) { --(node->ref_count); } +static inline void node_decref(node_t *node) { --(node->ref_count); } static void node_destroy(node_t *node) { - int i; - if (node) { - if (node->ref_count <= 1) { - for (i = 0; i < node->levels; ++i) { - node_destroy(node->next[i]); - } - free(node->next); - free(node->width); - // printf("Reference count was 1, freeing\n"); - free(node); - } else { - node_decref(node); - } - // pretty sure that freeing the struct above will be enough - } -} - -PANDAS_INLINE void skiplist_destroy(skiplist_t *skp) { - if (skp) { - node_destroy(skp->head); - free(skp->tmp_steps); - free(skp->tmp_chain); - free(skp); + int i; + if (node) { + if (node->ref_count <= 1) { + for (i = 0; i < node->levels; ++i) { + node_destroy(node->next[i]); + } + free(node->next); + free(node->width); + // printf("Reference count was 1, freeing\n"); + free(node); + } else { + node_decref(node); } + // pretty sure that freeing the struct above will be enough + } } -PANDAS_INLINE skiplist_t *skiplist_init(int expected_size) { - skiplist_t *result; - node_t *NIL, *head; - int maxlevels, i; - - maxlevels = 1 + Log2((double)expected_size); - result = (skiplist_t *)malloc(sizeof(skiplist_t)); - if (!result) { - return NULL; - } - result->tmp_chain = (node_t **)malloc(maxlevels * sizeof(node_t *)); - result->tmp_steps = (int *)malloc(maxlevels * sizeof(int)); - result->maxlevels = maxlevels; - result->size = 0; - - head = result->head = node_init(PANDAS_NAN, maxlevels); - NIL = node_init(0.0, 0); - - if (!(result->tmp_chain && result->tmp_steps && result->head && NIL)) { - skiplist_destroy(result); - node_destroy(NIL); - return NULL; - } - - node_incref(head); - - NIL->is_nil = 1; +static inline void skiplist_destroy(skiplist_t *skp) { + if (skp) { + node_destroy(skp->head); + free(skp->tmp_steps); + free(skp->tmp_chain); + free(skp); + } +} + +static inline skiplist_t *skiplist_init(int expected_size) { + skiplist_t *result; + node_t *NIL, *head; + int maxlevels, i; + + maxlevels = 1 + Log2((double)expected_size); + result = (skiplist_t *)malloc(sizeof(skiplist_t)); + if (!result) { + return NULL; + } + result->tmp_chain = (node_t **)malloc(maxlevels * sizeof(node_t *)); + result->tmp_steps = (int *)malloc(maxlevels * sizeof(int)); + result->maxlevels = maxlevels; + result->size = 0; 
+ + head = result->head = node_init(PANDAS_NAN, maxlevels); + NIL = node_init(0.0, 0); + + if (!(result->tmp_chain && result->tmp_steps && result->head && NIL)) { + skiplist_destroy(result); + node_destroy(NIL); + return NULL; + } + + node_incref(head); + + NIL->is_nil = 1; + + for (i = 0; i < maxlevels; ++i) { + head->next[i] = NIL; + head->width[i] = 1; + node_incref(NIL); + } - for (i = 0; i < maxlevels; ++i) { - head->next[i] = NIL; - head->width[i] = 1; - node_incref(NIL); - } - - return result; + return result; } // 1 if left < right, 0 if left == right, -1 if left > right -PANDAS_INLINE int _node_cmp(node_t *node, double value) { - if (node->is_nil || node->value > value) { - return -1; - } else if (node->value < value) { - return 1; - } else { - return 0; - } -} - -PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) { - node_t *node; - int level; - - if (i < 0 || i >= skp->size) { - *ret = 0; - return 0; - } - - node = skp->head; - ++i; - for (level = skp->maxlevels - 1; level >= 0; --level) { - while (node->width[level] <= i) { - i -= node->width[level]; - node = node->next[level]; - } +static inline int _node_cmp(node_t *node, double value) { + if (node->is_nil || node->value > value) { + return -1; + } else if (node->value < value) { + return 1; + } else { + return 0; + } +} + +static inline double skiplist_get(skiplist_t *skp, int i, int *ret) { + node_t *node; + int level; + + if (i < 0 || i >= skp->size) { + *ret = 0; + return 0; + } + + node = skp->head; + ++i; + for (level = skp->maxlevels - 1; level >= 0; --level) { + while (node->width[level] <= i) { + i -= node->width[level]; + node = node->next[level]; } + } - *ret = 1; - return node->value; + *ret = 1; + return node->value; } // Returns the lowest rank of all elements with value `value`, as opposed to the // highest rank returned by `skiplist_insert`. -PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) { - node_t *node; - int level, rank = 0; - - node = skp->head; - for (level = skp->maxlevels - 1; level >= 0; --level) { - while (_node_cmp(node->next[level], value) > 0) { - rank += node->width[level]; - node = node->next[level]; - } +static inline int skiplist_min_rank(skiplist_t *skp, double value) { + node_t *node; + int level, rank = 0; + + node = skp->head; + for (level = skp->maxlevels - 1; level >= 0; --level) { + while (_node_cmp(node->next[level], value) > 0) { + rank += node->width[level]; + node = node->next[level]; } + } - return rank + 1; + return rank + 1; } // Returns the rank of the inserted element. When there are duplicates, // `rank` is the highest of the group, i.e. 
the 'max' method of // https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html -PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { - node_t *node, *prevnode, *newnode, *next_at_level; - int *steps_at_level; - int size, steps, level, rank = 0; - node_t **chain; - - chain = skp->tmp_chain; - - steps_at_level = skp->tmp_steps; - memset(steps_at_level, 0, skp->maxlevels * sizeof(int)); - - node = skp->head; - - for (level = skp->maxlevels - 1; level >= 0; --level) { - next_at_level = node->next[level]; - while (_node_cmp(next_at_level, value) >= 0) { - steps_at_level[level] += node->width[level]; - rank += node->width[level]; - node = next_at_level; - next_at_level = node->next[level]; - } - chain[level] = node; - } +static inline int skiplist_insert(skiplist_t *skp, double value) { + node_t *node, *prevnode, *newnode, *next_at_level; + int *steps_at_level; + int size, steps, level, rank = 0; + node_t **chain; + + chain = skp->tmp_chain; - size = int_min(skp->maxlevels, 1 - ((int)Log2(urand()))); + steps_at_level = skp->tmp_steps; + memset(steps_at_level, 0, skp->maxlevels * sizeof(int)); - newnode = node_init(value, size); - if (!newnode) { - return -1; + node = skp->head; + + for (level = skp->maxlevels - 1; level >= 0; --level) { + next_at_level = node->next[level]; + while (_node_cmp(next_at_level, value) >= 0) { + steps_at_level[level] += node->width[level]; + rank += node->width[level]; + node = next_at_level; + next_at_level = node->next[level]; } - steps = 0; + chain[level] = node; + } - for (level = 0; level < size; ++level) { - prevnode = chain[level]; - newnode->next[level] = prevnode->next[level]; + size = int_min(skp->maxlevels, 1 - ((int)Log2(urand()))); - prevnode->next[level] = newnode; - node_incref(newnode); // increment the reference count + newnode = node_init(value, size); + if (!newnode) { + return -1; + } + steps = 0; - newnode->width[level] = prevnode->width[level] - steps; - prevnode->width[level] = steps + 1; + for (level = 0; level < size; ++level) { + prevnode = chain[level]; + newnode->next[level] = prevnode->next[level]; - steps += steps_at_level[level]; - } + prevnode->next[level] = newnode; + node_incref(newnode); // increment the reference count - for (level = size; level < skp->maxlevels; ++level) { - chain[level]->width[level] += 1; - } + newnode->width[level] = prevnode->width[level] - steps; + prevnode->width[level] = steps + 1; + + steps += steps_at_level[level]; + } - ++(skp->size); + for (level = size; level < skp->maxlevels; ++level) { + chain[level]->width[level] += 1; + } - return rank + 1; + ++(skp->size); + + return rank + 1; } -PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) { - int level, size; - node_t *node, *prevnode, *tmpnode, *next_at_level; - node_t **chain; - - chain = skp->tmp_chain; - node = skp->head; - - for (level = skp->maxlevels - 1; level >= 0; --level) { - next_at_level = node->next[level]; - while (_node_cmp(next_at_level, value) > 0) { - node = next_at_level; - next_at_level = node->next[level]; - } - chain[level] = node; - } +static inline int skiplist_remove(skiplist_t *skp, double value) { + int level, size; + node_t *node, *prevnode, *tmpnode, *next_at_level; + node_t **chain; - if (value != chain[0]->next[0]->value) { - return 0; + chain = skp->tmp_chain; + node = skp->head; + + for (level = skp->maxlevels - 1; level >= 0; --level) { + next_at_level = node->next[level]; + while (_node_cmp(next_at_level, value) > 0) { + node = next_at_level; + next_at_level = node->next[level]; 
} + chain[level] = node; + } - size = chain[0]->next[0]->levels; + if (value != chain[0]->next[0]->value) { + return 0; + } - for (level = 0; level < size; ++level) { - prevnode = chain[level]; + size = chain[0]->next[0]->levels; - tmpnode = prevnode->next[level]; + for (level = 0; level < size; ++level) { + prevnode = chain[level]; - prevnode->width[level] += tmpnode->width[level] - 1; - prevnode->next[level] = tmpnode->next[level]; + tmpnode = prevnode->next[level]; - tmpnode->next[level] = NULL; - node_destroy(tmpnode); // decrement refcount or free - } + prevnode->width[level] += tmpnode->width[level] - 1; + prevnode->next[level] = tmpnode->next[level]; - for (level = size; level < skp->maxlevels; ++level) { - --(chain[level]->width[level]); - } + tmpnode->next[level] = NULL; + node_destroy(tmpnode); // decrement refcount or free + } - --(skp->size); - return 1; + for (level = size; level < skp->maxlevels; ++level) { + --(chain[level]->width[level]); + } + + --(skp->size); + return 1; } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/vendored/klib/khash.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/vendored/klib/khash.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/vendored/klib/khash.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/vendored/klib/khash.h 2024-04-10 17:42:52.000000000 +0000 @@ -1,27 +1,4 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ +// Licence at LICENSES/KLIB_LICENSE /* An example: @@ -29,38 +6,38 @@ #include "khash.h" KHASH_MAP_INIT_INT(32, char) int main() { - int ret, is_missing; - khiter_t k; - khash_t(32) *h = kh_init(32); - k = kh_put(32, h, 5, &ret); - if (!ret) kh_del(32, h, k); - kh_value(h, k) = 10; - k = kh_get(32, h, 10); - is_missing = (k == kh_end(h)); - k = kh_get(32, h, 5); - kh_del(32, h, k); - for (k = kh_begin(h); k != kh_end(h); ++k) - if (kh_exist(h, k)) kh_value(h, k) = 1; - kh_destroy(32, h); - return 0; + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; } */ /* 2011-09-16 (0.2.6): - * The capacity is a power of 2. This seems to dramatically improve the - speed for simple keys. 
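The khash changelog above notes that keeping the capacity a power of two "dramatically improves the speed for simple keys": with a power-of-two bucket count the table can reduce a hash to a bucket index with a single AND against n_buckets - 1 instead of a modulo. A standalone illustration (not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void) {
  const uint32_t n_buckets = 64; /* khash keeps this a power of two */
  const uint32_t mask = n_buckets - 1;
  for (uint32_t k = 0; k < 100000; ++k) {
    /* the masked index equals the modulo index whenever n_buckets is 2^m */
    assert((k & mask) == (k % n_buckets));
  }
  return 0;
}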
Thank Zilong Tan for the suggestion. Reference: + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: - - https://github.com/stefanocasazza/ULib - - https://nothings.org/computer/judy/ + - https://github.com/stefanocasazza/ULib + - https://nothings.org/computer/judy/ - * Allow to optionally use linear probing which usually has better - performance for random input. Double hashing is still the default as it - is more robust to certain non-random input. + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as + it is more robust to certain non-random input. - * Added Wang's integer hash function (not used by default). This hash - function is more robust to certain non-random input. + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. 2011-02-14 (0.2.5): @@ -72,32 +49,31 @@ 2008-09-19 (0.2.3): - * Corrected the example - * Improved interfaces + * Corrected the example + * Improved interfaces 2008-09-11 (0.2.2): - * Improved speed a little in kh_put() + * Improved speed a little in kh_put() 2008-09-10 (0.2.1): - * Added kh_clear() - * Fixed a compiling error + * Added kh_clear() + * Fixed a compiling error 2008-09-02 (0.2.0): - * Changed to token concatenation which increases flexibility. + * Changed to token concatenation which increases flexibility. 2008-08-31 (0.1.2): - * Fixed a bug in kh_get(), which has not been tested previously. + * Fixed a bug in kh_get(), which has not been tested previously. 2008-08-31 (0.1.1): - * Added destructor + * Added destructor */ - #ifndef __AC_KHASH_H #define __AC_KHASH_H @@ -109,11 +85,9 @@ #define AC_VERSION_KHASH_H "0.2.6" +#include #include #include -#include -#include "pandas/inline_helper.h" - // hooks for memory allocator, C-runtime allocator used per default #ifndef KHASH_MALLOC @@ -132,7 +106,6 @@ #define KHASH_FREE free #endif - #if UINT_MAX == 0xffffffffu typedef unsigned int khuint32_t; typedef signed int khint32_t; @@ -168,262 +141,311 @@ typedef khuint32_t khuint_t; typedef khuint_t khiter_t; -#define __ac_isempty(flag, i) ((flag[i>>5]>>(i&0x1fU))&1) +#define __ac_isempty(flag, i) ((flag[i >> 5] >> (i & 0x1fU)) & 1) #define __ac_isdel(flag, i) (0) #define __ac_iseither(flag, i) __ac_isempty(flag, i) #define __ac_set_isdel_false(flag, i) (0) -#define __ac_set_isempty_false(flag, i) (flag[i>>5]&=~(1ul<<(i&0x1fU))) -#define __ac_set_isempty_true(flag, i) (flag[i>>5]|=(1ul<<(i&0x1fU))) +#define __ac_set_isempty_false(flag, i) (flag[i >> 5] &= ~(1ul << (i & 0x1fU))) +#define __ac_set_isempty_true(flag, i) (flag[i >> 5] |= (1ul << (i & 0x1fU))) #define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) #define __ac_set_isdel_true(flag, i) ((void)0) - -// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp -khuint32_t PANDAS_INLINE murmur2_32to32(khuint32_t k){ - const khuint32_t SEED = 0xc70f6907UL; - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - const khuint32_t M_32 = 0x5bd1e995; - const int R_32 = 24; - - // Initialize the hash to a 'random' value - khuint32_t h = SEED ^ 4; - - //handle 4 bytes: - k *= M_32; - k ^= k >> R_32; - k *= M_32; - - h *= M_32; - h ^= k; - - // Do a few final mixes of the hash to ensure the "last few - // bytes" are well-incorporated. (Really needed here?) 
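The __ac_isempty / __ac_set_isempty_* macros reformatted above pack one occupancy bit per bucket into 32-bit words: bucket i lives in word i >> 5, at bit i & 0x1f. A small standalone illustration of that indexing (not pandas code):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void) {
  uint32_t flags[4];                  /* covers 4 * 32 = 128 buckets */
  memset(flags, 0xff, sizeof(flags)); /* all-ones means every bucket is empty */

  const uint32_t i = 37;                             /* mark bucket 37 occupied */
  flags[i >> 5] &= ~(1ul << (i & 0x1fU));            /* __ac_set_isempty_false */
  assert(((flags[i >> 5] >> (i & 0x1fU)) & 1) == 0); /* bucket 37 now occupied */
  assert(((flags[38 >> 5] >> (38 & 0x1fU)) & 1) == 1); /* bucket 38 still empty */
  return 0;
}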
- h ^= h >> 13; - h *= M_32; - h ^= h >> 15; - return h; +// specializations of +// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp +static inline khuint32_t murmur2_32to32(khuint32_t k) { + const khuint32_t SEED = 0xc70f6907UL; + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + const khuint32_t M_32 = 0x5bd1e995; + const int R_32 = 24; + + // Initialize the hash to a 'random' value + khuint32_t h = SEED ^ 4; + + // handle 4 bytes: + k *= M_32; + k ^= k >> R_32; + k *= M_32; + + h *= M_32; + h ^= k; + + // Do a few final mixes of the hash to ensure the "last few + // bytes" are well-incorporated. (Really needed here?) + h ^= h >> 13; + h *= M_32; + h ^= h >> 15; + return h; } -// it is possible to have a special x64-version, which would need less operations, but -// using 32bit version always has also some benefits: +// it is possible to have a special x64-version, which would need less +// operations, but using 32bit version always has also some benefits: // - one code for 32bit and 64bit builds // - the same case for 32bit and 64bit builds -// - no performance difference could be measured compared to a possible x64-version +// - no performance difference could be measured compared to a possible +// x64-version -khuint32_t PANDAS_INLINE murmur2_32_32to32(khuint32_t k1, khuint32_t k2){ - const khuint32_t SEED = 0xc70f6907UL; - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - const khuint32_t M_32 = 0x5bd1e995; - const int R_32 = 24; - - // Initialize the hash to a 'random' value - khuint32_t h = SEED ^ 4; - - //handle first 4 bytes: - k1 *= M_32; - k1 ^= k1 >> R_32; - k1 *= M_32; - - h *= M_32; - h ^= k1; - - //handle second 4 bytes: - k2 *= M_32; - k2 ^= k2 >> R_32; - k2 *= M_32; - - h *= M_32; - h ^= k2; - - // Do a few final mixes of the hash to ensure the "last few - // bytes" are well-incorporated. - h ^= h >> 13; - h *= M_32; - h ^= h >> 15; - return h; +static inline khuint32_t murmur2_32_32to32(khuint32_t k1, khuint32_t k2) { + const khuint32_t SEED = 0xc70f6907UL; + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + const khuint32_t M_32 = 0x5bd1e995; + const int R_32 = 24; + + // Initialize the hash to a 'random' value + khuint32_t h = SEED ^ 4; + + // handle first 4 bytes: + k1 *= M_32; + k1 ^= k1 >> R_32; + k1 *= M_32; + + h *= M_32; + h ^= k1; + + // handle second 4 bytes: + k2 *= M_32; + k2 ^= k2 >> R_32; + k2 *= M_32; + + h *= M_32; + h ^= k2; + + // Do a few final mixes of the hash to ensure the "last few + // bytes" are well-incorporated. + h ^= h >> 13; + h *= M_32; + h ^= h >> 15; + return h; } -khuint32_t PANDAS_INLINE murmur2_64to32(khuint64_t k){ - khuint32_t k1 = (khuint32_t)k; - khuint32_t k2 = (khuint32_t)(k >> 32); +static inline khuint32_t murmur2_64to32(khuint64_t k) { + khuint32_t k1 = (khuint32_t)k; + khuint32_t k2 = (khuint32_t)(k >> 32); - return murmur2_32_32to32(k1, k2); + return murmur2_32_32to32(k1, k2); } - #ifdef KHASH_LINEAR #define __ac_inc(k, m) 1 #else #define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m) #endif -#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5) +#define __ac_fsize(m) ((m) < 32 ? 
1 : (m) >> 5) #ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#define kroundup32(x) \ + (--(x), (x) |= (x) >> 1, (x) |= (x) >> 2, (x) |= (x) >> 4, (x) |= (x) >> 8, \ + (x) |= (x) >> 16, ++(x)) #endif static const double __ac_HASH_UPPER = 0.77; -#define KHASH_DECLARE(name, khkey_t, khval_t) \ - typedef struct { \ - khuint_t n_buckets, size, n_occupied, upper_bound; \ - khuint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - extern kh_##name##_t *kh_init_##name(); \ - extern void kh_destroy_##name(kh_##name##_t *h); \ - extern void kh_clear_##name(kh_##name##_t *h); \ - extern khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ - extern void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets); \ - extern khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ - extern void kh_del_##name(kh_##name##_t *h, khuint_t x); - -#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - typedef struct { \ - khuint_t n_buckets, size, n_occupied, upper_bound; \ - khuint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \ - } \ - SCOPE void kh_destroy_##name(kh_##name##_t *h) \ - { \ - if (h) { \ - KHASH_FREE(h->keys); KHASH_FREE(h->flags); \ - KHASH_FREE(h->vals); \ - KHASH_FREE(h); \ - } \ - } \ - SCOPE void kh_clear_##name(kh_##name##_t *h) \ - { \ - if (h && h->flags) { \ - memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khuint32_t)); \ - h->size = h->n_occupied = 0; \ - } \ - } \ - SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ - { \ - if (h->n_buckets) { \ - khuint_t inc, k, i, last, mask; \ - mask = h->n_buckets - 1; \ - k = __hash_func(key); i = k & mask; \ - inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - i = (i + inc) & mask; \ - if (i == last) return h->n_buckets; \ - } \ - return __ac_iseither(h->flags, i)? h->n_buckets : i; \ - } else return 0; \ - } \ - SCOPE void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets) \ - { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. 
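kroundup32, reflowed above, rounds a 32-bit count up to the next power of two by smearing the highest set bit into every lower position. An equivalent standalone function, for illustration only (the patch keeps the macro form):

#include <assert.h>
#include <stdint.h>

/* behaves like kroundup32 from the hunk above, written as a function */
static uint32_t roundup_pow2(uint32_t x) {
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return x + 1;
}

int main(void) {
  assert(roundup_pow2(3) == 4);
  assert(roundup_pow2(4) == 4); /* powers of two are left unchanged */
  assert(roundup_pow2(1000) == 1024);
  return 0;
}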
*/ \ - khuint32_t *new_flags = 0; \ - khuint_t j = 1; \ - { \ - kroundup32(new_n_buckets); \ - if (new_n_buckets < 4) new_n_buckets = 4; \ - if (h->size >= (khuint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ - else { /* hash table size to be changed (shrink or expand); rehash */ \ - new_flags = (khuint32_t*)KHASH_MALLOC(__ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \ - memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \ - if (h->n_buckets < new_n_buckets) { /* expand */ \ - h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \ - } /* otherwise shrink */ \ - } \ - } \ - if (j) { /* rehashing is needed */ \ - for (j = 0; j != h->n_buckets; ++j) { \ - if (__ac_iseither(h->flags, j) == 0) { \ - khkey_t key = h->keys[j]; \ - khval_t val; \ - khuint_t new_mask; \ - new_mask = new_n_buckets - 1; \ - if (kh_is_map) val = h->vals[j]; \ - __ac_set_isempty_true(h->flags, j); \ - while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khuint_t inc, k, i; \ - k = __hash_func(key); \ - i = k & new_mask; \ - inc = __ac_inc(k, new_mask); \ - while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ - __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ - { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ - if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isempty_true(h->flags, i); /* mark it as deleted in the old hash table */ \ - } else { /* write the element and jump out of the loop */ \ - h->keys[i] = key; \ - if (kh_is_map) h->vals[i] = val; \ - break; \ - } \ - } \ - } \ - } \ - if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ - h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ - KHASH_FREE(h->flags); /* free the working space */ \ - h->flags = new_flags; \ - h->n_buckets = new_n_buckets; \ - h->n_occupied = h->size; \ - h->upper_bound = (khuint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ - } \ - } \ - SCOPE khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ - { \ - khuint_t x; \ - if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ - if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ - else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ - } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ - { \ - khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ - x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ - if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ - else { \ - inc = __ac_inc(k, mask); last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (__ac_isdel(h->flags, i)) site = i; \ - i = (i + inc) & mask; \ - if (i == last) { x = site; break; } \ - } \ - if (x == h->n_buckets) { \ - if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ - else x = i; \ - } \ - } \ - } \ - if (__ac_isempty(h->flags, x)) { /* not present at all */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; ++h->n_occupied; \ - *ret = 1; \ - } else if (__ac_isdel(h->flags, 
x)) { /* deleted */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; \ - *ret = 2; \ - } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ - return x; \ - } \ - SCOPE void kh_del_##name(kh_##name##_t *h, khuint_t x) \ - { \ - if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ - __ac_set_isdel_true(h->flags, x); \ - --h->size; \ - } \ - } - -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - KHASH_INIT2(name, PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + typedef struct { \ + khuint_t n_buckets, size, n_occupied, upper_bound; \ + khuint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + extern kh_##name##_t *kh_init_##name(); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets); \ + extern khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khuint_t x); + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, \ + __hash_equal) \ + typedef struct { \ + khuint_t n_buckets, size, n_occupied, upper_bound; \ + khuint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t *)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) { \ + if (h) { \ + KHASH_FREE(h->keys); \ + KHASH_FREE(h->flags); \ + KHASH_FREE(h->vals); \ + KHASH_FREE(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khuint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) { \ + if (h->n_buckets) { \ + khuint_t inc, k, i, last, mask; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); \ + i = k & mask; \ + inc = __ac_inc(k, mask); \ + last = i; /* inc==1 for linear probing */ \ + while (!__ac_isempty(h->flags, i) && \ + (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + inc) & mask; \ + if (i == last) \ + return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i) ? h->n_buckets : i; \ + } else \ + return 0; \ + } \ + SCOPE void kh_resize_##name( \ + kh_##name##_t *h, \ + khuint_t new_n_buckets) { /* This function uses 0.25*n_bucktes bytes of \ + working space instead of \ + [sizeof(key_t+val_t)+.25]*n_buckets. 
*/ \ + khuint32_t *new_flags = 0; \ + khuint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) \ + new_n_buckets = 4; \ + if (h->size >= (khuint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) \ + j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khuint32_t *)KHASH_MALLOC(__ac_fsize(new_n_buckets) * \ + sizeof(khuint32_t)); \ + memset(new_flags, 0xff, \ + __ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + h->keys = (khkey_t *)KHASH_REALLOC(h->keys, \ + new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t *)KHASH_REALLOC(h->vals, new_n_buckets * \ + sizeof(khval_t)); \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khuint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) \ + val = h->vals[j]; \ + __ac_set_isempty_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khuint_t inc, k, i; \ + k = __hash_func(key); \ + i = k & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) \ + i = (i + inc) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && \ + __ac_iseither(h->flags, i) == \ + 0) { /* kick out the existing element */ \ + { \ + khkey_t tmp = h->keys[i]; \ + h->keys[i] = key; \ + key = tmp; \ + } \ + if (kh_is_map) { \ + khval_t tmp = h->vals[i]; \ + h->vals[i] = val; \ + val = tmp; \ + } \ + __ac_set_isempty_true( \ + h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) \ + h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t *)KHASH_REALLOC(h->keys, \ + new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t *)KHASH_REALLOC(h->vals, \ + new_n_buckets * sizeof(khval_t)); \ + } \ + KHASH_FREE(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khuint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + SCOPE khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) { \ + khuint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size << 1)) \ + kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ + else \ + kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ + } /* TODO: to implement automatically shrinking; resize() already support \ + shrinking */ \ + { \ + khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ + x = site = h->n_buckets; \ + k = __hash_func(key); \ + i = k & mask; \ + if (__ac_isempty(h->flags, i)) \ + x = i; /* for speed up */ \ + else { \ + inc = __ac_inc(k, mask); \ + last = i; \ + while (!__ac_isempty(h->flags, i) && \ + (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) \ + site = i; \ + i = (i + inc) & mask; \ + if (i == last) { \ + x = site; \ + break; \ + } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) \ + x = site; \ + else \ + x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present 
at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else \ + *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khuint_t x) { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, \ + __hash_equal) \ + KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, \ + __hash_equal) /* --- BEGIN OF HASH FUNCTIONS --- */ @@ -442,9 +464,8 @@ @param key The integer [khuint64_t] @return The hash value [khuint_t] */ -PANDAS_INLINE khuint_t kh_int64_hash_func(khuint64_t key) -{ - return (khuint_t)((key)>>33^(key)^(key)<<11); +static inline khuint_t kh_int64_hash_func(khuint64_t key) { + return (khuint_t)((key) >> 33 ^ (key) ^ (key) << 11); } /*! @function @abstract 64-bit integer comparison function @@ -456,11 +477,12 @@ @param s Pointer to a null terminated string @return The hash value */ -PANDAS_INLINE khuint_t __ac_X31_hash_string(const char *s) -{ - khuint_t h = *s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; - return h; +static inline khuint_t __ac_X31_hash_string(const char *s) { + khuint_t h = *s; + if (h) + for (++s; *s; ++s) + h = (h << 5) - h + *s; + return h; } /*! @function @abstract Another interface to const char* hash function @@ -473,15 +495,14 @@ */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) -PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key) -{ - key += ~(key << 15); - key ^= (key >> 10); - key += (key << 3); - key ^= (key >> 6); - key += ~(key << 11); - key ^= (key >> 16); - return key; +static inline khuint_t __ac_Wang_hash(khuint_t key) { + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; } #define kh_int_hash_func2(k) __ac_Wang_hash((khuint_t)key) @@ -531,7 +552,7 @@ @param k Key [type of keys] @param r Extra return code: 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in - the bucket has been deleted [int*] + the bucket has been deleted [int*] @return Iterator to the inserted element [khuint_t] */ #define kh_put(name, h, k, r) kh_put_##name(h, k, r) @@ -541,7 +562,8 @@ @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) is the element is absent [khuint_t] + @return Iterator to the found element, or kh_end(h) is the element is + absent [khuint_t] */ #define kh_get(name, h, k) kh_get_##name(h, k) @@ -617,81 +639,80 @@ @abstract Instantiate a hash set containing integer keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_INT(name) \ - KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) /*! 
@function @abstract Instantiate a hash map containing integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_INT(name, khval_t) \ - KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) -#define KHASH_MAP_INIT_UINT(name, khval_t) \ - KHASH_INIT(name, khuint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_UINT(name, khval_t) \ + KHASH_INIT(name, khuint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_UINT64(name) \ - KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) +#define KHASH_SET_INIT_UINT64(name) \ + KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) -#define KHASH_SET_INIT_INT64(name) \ - KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_UINT64(name, khval_t) \ - KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) - -#define KHASH_MAP_INIT_INT64(name, khval_t) \ - KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) +#define KHASH_MAP_INIT_UINT64(name, khval_t) \ + KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, \ + kh_int64_hash_equal) + +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, \ + kh_int64_hash_equal) /*! @function @abstract Instantiate a hash map containing 16bit-integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_INT16(name, khval_t) \ - KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_INT16(name, khval_t) \ + KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) -#define KHASH_MAP_INIT_UINT16(name, khval_t) \ - KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_UINT16(name, khval_t) \ + KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing 8bit-integer keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_INT8(name, khval_t) \ - KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - -#define KHASH_MAP_INIT_UINT8(name, khval_t) \ - KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - +#define KHASH_MAP_INIT_INT8(name, khval_t) \ + KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) +#define KHASH_MAP_INIT_UINT8(name, khval_t) \ + KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) typedef const char *kh_cstr_t; /*! 
@function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_STR(name) \ - KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) /*! @function @abstract Instantiate a hash map containing const char* keys @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_STR(name, khval_t) \ - KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) - +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) #define kh_exist_str(h, k) (kh_exist(h, k)) #define kh_exist_float64(h, k) (kh_exist(h, k)) @@ -715,5 +736,4 @@ KHASH_MAP_INIT_INT8(int8, size_t) KHASH_MAP_INIT_UINT8(uint8, size_t) - #endif /* __AC_KHASH_H */ diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/vendored/klib/khash_python.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/vendored/klib/khash_python.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/vendored/klib/khash_python.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/vendored/klib/khash_python.h 2024-04-10 17:42:52.000000000 +0000 @@ -1,18 +1,17 @@ -#include -#include +// Licence at LICENSES/KLIB_LICENSE +#include +#include typedef struct { - float real; - float imag; + float real; + float imag; } khcomplex64_t; typedef struct { - double real; - double imag; + double real; + double imag; } khcomplex128_t; - - // khash should report usage to tracemalloc #if PY_VERSION_HEX >= 0x03060000 #include @@ -25,43 +24,41 @@ #define PyTraceMalloc_Untrack(...) #endif - static const int KHASH_TRACE_DOMAIN = 424242; -void *traced_malloc(size_t size){ - void * ptr = malloc(size); - if(ptr!=NULL){ - PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); - } - return ptr; -} - -void *traced_calloc(size_t num, size_t size){ - void * ptr = calloc(num, size); - if(ptr!=NULL){ - PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num*size); - } - return ptr; -} - -void *traced_realloc(void* old_ptr, size_t size){ - void * ptr = realloc(old_ptr, size); - if(ptr!=NULL){ - if(old_ptr != ptr){ - PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr); - } - PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); - } - return ptr; +void *traced_malloc(size_t size) { + void *ptr = malloc(size); + if (ptr != NULL) { + PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); + } + return ptr; +} + +void *traced_calloc(size_t num, size_t size) { + void *ptr = calloc(num, size); + if (ptr != NULL) { + PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num * size); + } + return ptr; +} + +void *traced_realloc(void *old_ptr, size_t size) { + void *ptr = realloc(old_ptr, size); + if (ptr != NULL) { + if (old_ptr != ptr) { + PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr); + } + PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size); + } + return ptr; +} + +void traced_free(void *ptr) { + if (ptr != NULL) { + PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr); + } + free(ptr); } -void traced_free(void* ptr){ - if(ptr!=NULL){ - PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr); - } - free(ptr); -} - - #define KHASH_MALLOC traced_malloc #define KHASH_REALLOC traced_realloc #define KHASH_CALLOC traced_calloc @@ -72,327 +69,295 @@ // python 2.7 
https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021 // python 3.5 https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85 -// The python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x)) -// and the size of hash may be different by platform / version (long in py2, Py_ssize_t in py3). -// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t -// is 64 bits the truncation causes collision issues. Given all that, we use our own -// simple hash, viewing the double bytes as an int64 and using khash's default -// hash for 64 bit integers. -// GH 13436 showed that _Py_HashDouble doesn't work well with khash -// GH 28303 showed, that the simple xoring-version isn't good enough -// See GH 36729 for evaluation of the currently used murmur2-hash version -// An interesting alternative to expensive murmur2-hash would be to change -// the probing strategy and use e.g. the probing strategy from CPython's +// The python 3 hash function has the invariant hash(x) == hash(int(x)) == +// hash(decimal(x)) and the size of hash may be different by platform / version +// (long in py2, Py_ssize_t in py3). We don't need those invariants because +// types will be cast before hashing, and if Py_ssize_t is 64 bits the +// truncation causes collision issues. Given all that, we use our own simple +// hash, viewing the double bytes as an int64 and using khash's default hash for +// 64 bit integers. GH 13436 showed that _Py_HashDouble doesn't work well with +// khash GH 28303 showed, that the simple xoring-version isn't good enough See +// GH 36729 for evaluation of the currently used murmur2-hash version An +// interesting alternative to expensive murmur2-hash would be to change the +// probing strategy and use e.g. 
the probing strategy from CPython's // implementation of dicts, which shines for smaller sizes but is more // predisposed to superlinear running times (see GH 36729 for comparison) - -khuint64_t PANDAS_INLINE asuint64(double key) { - khuint64_t val; - memcpy(&val, &key, sizeof(double)); - return val; +static inline khuint64_t asuint64(double key) { + khuint64_t val; + memcpy(&val, &key, sizeof(double)); + return val; } -khuint32_t PANDAS_INLINE asuint32(float key) { - khuint32_t val; - memcpy(&val, &key, sizeof(float)); - return val; +static inline khuint32_t asuint32(float key) { + khuint32_t val; + memcpy(&val, &key, sizeof(float)); + return val; } #define ZERO_HASH 0 -#define NAN_HASH 0 +#define NAN_HASH 0 -khuint32_t PANDAS_INLINE kh_float64_hash_func(double val){ - // 0.0 and -0.0 should have the same hash: - if (val == 0.0){ - return ZERO_HASH; - } - // all nans should have the same hash: - if ( val!=val ){ - return NAN_HASH; - } - khuint64_t as_int = asuint64(val); - return murmur2_64to32(as_int); -} - -khuint32_t PANDAS_INLINE kh_float32_hash_func(float val){ - // 0.0 and -0.0 should have the same hash: - if (val == 0.0f){ - return ZERO_HASH; - } - // all nans should have the same hash: - if ( val!=val ){ - return NAN_HASH; - } - khuint32_t as_int = asuint32(val); - return murmur2_32to32(as_int); +static inline khuint32_t kh_float64_hash_func(double val) { + // 0.0 and -0.0 should have the same hash: + if (val == 0.0) { + return ZERO_HASH; + } + // all nans should have the same hash: + if (val != val) { + return NAN_HASH; + } + khuint64_t as_int = asuint64(val); + return murmur2_64to32(as_int); +} + +static inline khuint32_t kh_float32_hash_func(float val) { + // 0.0 and -0.0 should have the same hash: + if (val == 0.0f) { + return ZERO_HASH; + } + // all nans should have the same hash: + if (val != val) { + return NAN_HASH; + } + khuint32_t as_int = asuint32(val); + return murmur2_32to32(as_int); } #define kh_floats_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) -#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ - KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_floats_hash_equal) +#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ + KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, \ + kh_floats_hash_equal) KHASH_MAP_INIT_FLOAT64(float64, size_t) -#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \ - KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, kh_floats_hash_equal) +#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \ + KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, \ + kh_floats_hash_equal) KHASH_MAP_INIT_FLOAT32(float32, size_t) -khint32_t PANDAS_INLINE kh_complex128_hash_func(khcomplex128_t val){ - return kh_float64_hash_func(val.real)^kh_float64_hash_func(val.imag); +static inline khint32_t kh_complex128_hash_func(khcomplex128_t val) { + return kh_float64_hash_func(val.real) ^ kh_float64_hash_func(val.imag); } -khint32_t PANDAS_INLINE kh_complex64_hash_func(khcomplex64_t val){ - return kh_float32_hash_func(val.real)^kh_float32_hash_func(val.imag); +static inline khint32_t kh_complex64_hash_func(khcomplex64_t val) { + return kh_float32_hash_func(val.real) ^ kh_float32_hash_func(val.imag); } -#define kh_complex_hash_equal(a, b) \ +#define kh_complex_hash_equal(a, b) \ (kh_floats_hash_equal(a.real, b.real) && kh_floats_hash_equal(a.imag, b.imag)) - -#define KHASH_MAP_INIT_COMPLEX64(name, khval_t) \ - KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, kh_complex_hash_equal) +#define 
KHASH_MAP_INIT_COMPLEX64(name, khval_t) \ + KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, \ + kh_complex_hash_equal) KHASH_MAP_INIT_COMPLEX64(complex64, size_t) - -#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \ - KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, kh_complex_hash_equal) +#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \ + KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, \ + kh_complex_hash_equal) KHASH_MAP_INIT_COMPLEX128(complex128, size_t) - #define kh_exist_complex64(h, k) (kh_exist(h, k)) #define kh_exist_complex128(h, k) (kh_exist(h, k)) - // NaN-floats should be in the same equivalency class, see GH 22119 -int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){ - return ( - Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && - Py_IS_NAN(PyFloat_AS_DOUBLE(b)) - ) - || - ( PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b) ); +static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) { + return (Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && Py_IS_NAN(PyFloat_AS_DOUBLE(b))) || + (PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b)); } - // NaNs should be in the same equivalency class, see GH 41836 // PyObject_RichCompareBool for complexobjects has a different behavior // needs to be replaced -int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){ - return ( - Py_IS_NAN(a->cval.real) && - Py_IS_NAN(b->cval.real) && - Py_IS_NAN(a->cval.imag) && - Py_IS_NAN(b->cval.imag) - ) - || - ( - Py_IS_NAN(a->cval.real) && - Py_IS_NAN(b->cval.real) && - a->cval.imag == b->cval.imag - ) - || - ( - a->cval.real == b->cval.real && - Py_IS_NAN(a->cval.imag) && - Py_IS_NAN(b->cval.imag) - ) - || - ( - a->cval.real == b->cval.real && - a->cval.imag == b->cval.imag - ); +static inline int complexobject_cmp(PyComplexObject *a, PyComplexObject *b) { + return (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && + Py_IS_NAN(a->cval.imag) && Py_IS_NAN(b->cval.imag)) || + (Py_IS_NAN(a->cval.real) && Py_IS_NAN(b->cval.real) && + a->cval.imag == b->cval.imag) || + (a->cval.real == b->cval.real && Py_IS_NAN(a->cval.imag) && + Py_IS_NAN(b->cval.imag)) || + (a->cval.real == b->cval.real && a->cval.imag == b->cval.imag); } -int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b); - +static inline int pyobject_cmp(PyObject *a, PyObject *b); // replacing PyObject_RichCompareBool (NaN!=NaN) with pyobject_cmp (NaN==NaN), // which treats NaNs as equivalent // see GH 41836 -int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){ - Py_ssize_t i; +static inline int tupleobject_cmp(PyTupleObject *a, PyTupleObject *b) { + Py_ssize_t i; - if (Py_SIZE(a) != Py_SIZE(b)) { - return 0; - } + if (Py_SIZE(a) != Py_SIZE(b)) { + return 0; + } - for (i = 0; i < Py_SIZE(a); ++i) { - if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) { - return 0; - } + for (i = 0; i < Py_SIZE(a); ++i) { + if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) { + return 0; } - return 1; + } + return 1; } - -int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { - if (a == b) { - return 1; - } - if (Py_TYPE(a) == Py_TYPE(b)) { - // special handling for some built-in types which could have NaNs - // as we would like to have them equivalent, but the usual - // PyObject_RichCompareBool would return False - if (PyFloat_CheckExact(a)) { - return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b); - } - if (PyComplex_CheckExact(a)) { - return complexobject_cmp((PyComplexObject*)a, (PyComplexObject*)b); - } - if 
(PyTuple_CheckExact(a)) { - return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b); - } - // frozenset isn't yet supported - } - - int result = PyObject_RichCompareBool(a, b, Py_EQ); - if (result < 0) { - PyErr_Clear(); - return 0; - } - return result; +static inline int pyobject_cmp(PyObject *a, PyObject *b) { + if (a == b) { + return 1; + } + if (Py_TYPE(a) == Py_TYPE(b)) { + // special handling for some built-in types which could have NaNs + // as we would like to have them equivalent, but the usual + // PyObject_RichCompareBool would return False + if (PyFloat_CheckExact(a)) { + return floatobject_cmp((PyFloatObject *)a, (PyFloatObject *)b); + } + if (PyComplex_CheckExact(a)) { + return complexobject_cmp((PyComplexObject *)a, (PyComplexObject *)b); + } + if (PyTuple_CheckExact(a)) { + return tupleobject_cmp((PyTupleObject *)a, (PyTupleObject *)b); + } + // frozenset isn't yet supported + } + + int result = PyObject_RichCompareBool(a, b, Py_EQ); + if (result < 0) { + PyErr_Clear(); + return 0; + } + return result; } - -Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) { - //Since Python3.10, nan is no longer has hash 0 - if (Py_IS_NAN(val)) { - return 0; - } +static inline Py_hash_t _Pandas_HashDouble(double val) { + // Since Python3.10, nan is no longer has hash 0 + if (Py_IS_NAN(val)) { + return 0; + } #if PY_VERSION_HEX < 0x030A0000 - return _Py_HashDouble(val); + return _Py_HashDouble(val); #else - return _Py_HashDouble(NULL, val); + return _Py_HashDouble(NULL, val); #endif } - -Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) { - return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key)); +static inline Py_hash_t floatobject_hash(PyFloatObject *key) { + return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key)); } - #define _PandasHASH_IMAG 1000003UL // replaces _Py_HashDouble with _Pandas_HashDouble -Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) { - Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real); - Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag); - if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) { - return -1; - } - Py_uhash_t combined = realhash + _PandasHASH_IMAG * imaghash; - if (combined == (Py_uhash_t)-1) { - return -2; - } - return (Py_hash_t)combined; +static inline Py_hash_t complexobject_hash(PyComplexObject *key) { + Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real); + Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag); + if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) { + return -1; + } + Py_uhash_t combined = realhash + _PandasHASH_IMAG * imaghash; + if (combined == (Py_uhash_t)-1) { + return -2; + } + return (Py_hash_t)combined; } +static inline khuint32_t kh_python_hash_func(PyObject *key); -khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key); - -//we could use any hashing algorithm, this is the original CPython's for tuples +// we could use any hashing algorithm, this is the original CPython's for tuples #if SIZEOF_PY_UHASH_T > 4 #define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL) #define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL) #define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL) -#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */ +#define _PandasHASH_XXROTATE(x) \ + ((x << 31) | (x >> 33)) /* Rotate left 31 bits */ #else #define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL) #define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL) #define 
_PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL) -#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */ +#define _PandasHASH_XXROTATE(x) \ + ((x << 13) | (x >> 19)) /* Rotate left 13 bits */ #endif -Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) { - Py_ssize_t i, len = Py_SIZE(key); - PyObject **item = key->ob_item; - - Py_uhash_t acc = _PandasHASH_XXPRIME_5; - for (i = 0; i < len; i++) { - Py_uhash_t lane = kh_python_hash_func(item[i]); - if (lane == (Py_uhash_t)-1) { - return -1; - } - acc += lane * _PandasHASH_XXPRIME_2; - acc = _PandasHASH_XXROTATE(acc); - acc *= _PandasHASH_XXPRIME_1; - } - - /* Add input length, mangled to keep the historical value of hash(()). */ - acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL); - - if (acc == (Py_uhash_t)-1) { - return 1546275796; - } - return acc; -} +static inline Py_hash_t tupleobject_hash(PyTupleObject *key) { + Py_ssize_t i, len = Py_SIZE(key); + PyObject **item = key->ob_item; + + Py_uhash_t acc = _PandasHASH_XXPRIME_5; + for (i = 0; i < len; i++) { + Py_uhash_t lane = kh_python_hash_func(item[i]); + if (lane == (Py_uhash_t)-1) { + return -1; + } + acc += lane * _PandasHASH_XXPRIME_2; + acc = _PandasHASH_XXROTATE(acc); + acc *= _PandasHASH_XXPRIME_1; + } + + /* Add input length, mangled to keep the historical value of hash(()). */ + acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL); + + if (acc == (Py_uhash_t)-1) { + return 1546275796; + } + return acc; +} + +static inline khuint32_t kh_python_hash_func(PyObject *key) { + Py_hash_t hash; + // For PyObject_Hash holds: + // hash(0.0) == 0 == hash(-0.0) + // yet for different nan-objects different hash-values + // are possible + if (PyFloat_CheckExact(key)) { + // we cannot use kh_float64_hash_func + // because float(k) == k holds for any int-object k + // and kh_float64_hash_func doesn't respect it + hash = floatobject_hash((PyFloatObject *)key); + } else if (PyComplex_CheckExact(key)) { + // we cannot use kh_complex128_hash_func + // because complex(k,0) == k holds for any int-object k + // and kh_complex128_hash_func doesn't respect it + hash = complexobject_hash((PyComplexObject *)key); + } else if (PyTuple_CheckExact(key)) { + hash = tupleobject_hash((PyTupleObject *)key); + } else { + hash = PyObject_Hash(key); + } - -khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) { - Py_hash_t hash; - // For PyObject_Hash holds: - // hash(0.0) == 0 == hash(-0.0) - // yet for different nan-objects different hash-values - // are possible - if (PyFloat_CheckExact(key)) { - // we cannot use kh_float64_hash_func - // because float(k) == k holds for any int-object k - // and kh_float64_hash_func doesn't respect it - hash = floatobject_hash((PyFloatObject*)key); - } - else if (PyComplex_CheckExact(key)) { - // we cannot use kh_complex128_hash_func - // because complex(k,0) == k holds for any int-object k - // and kh_complex128_hash_func doesn't respect it - hash = complexobject_hash((PyComplexObject*)key); - } - else if (PyTuple_CheckExact(key)) { - hash = tupleobject_hash((PyTupleObject*)key); - } - else { - hash = PyObject_Hash(key); - } - - if (hash == -1) { - PyErr_Clear(); - return 0; - } - #if SIZEOF_PY_HASH_T == 4 - // it is already 32bit value - return hash; - #else - // for 64bit builds, - // we need information of the upper 32bits as well - // see GH 37615 - khuint64_t as_uint = (khuint64_t) hash; - // uints avoid undefined behavior of signed ints - return (as_uint>>32)^as_uint; - #endif + if (hash == -1) { + PyErr_Clear(); + return 0; + } +#if 
SIZEOF_PY_HASH_T == 4 + // it is already 32bit value + return hash; +#else + // for 64bit builds, + // we need information of the upper 32bits as well + // see GH 37615 + khuint64_t as_uint = (khuint64_t)hash; + // uints avoid undefined behavior of signed ints + return (as_uint >> 32) ^ as_uint; +#endif } - #define kh_python_hash_equal(a, b) (pyobject_cmp(a, b)) - // Python object -typedef PyObject* kh_pyobject_t; +typedef PyObject *kh_pyobject_t; -#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \ - KHASH_INIT(name, kh_pyobject_t, khval_t, 1, \ - kh_python_hash_func, kh_python_hash_equal) +#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \ + KHASH_INIT(name, kh_pyobject_t, khval_t, 1, kh_python_hash_func, \ + kh_python_hash_equal) KHASH_MAP_INIT_PYOBJECT(pymap, Py_ssize_t) -#define KHASH_SET_INIT_PYOBJECT(name) \ - KHASH_INIT(name, kh_pyobject_t, char, 0, \ - kh_python_hash_func, kh_python_hash_equal) +#define KHASH_SET_INIT_PYOBJECT(name) \ + KHASH_INIT(name, kh_pyobject_t, char, 0, kh_python_hash_func, \ + kh_python_hash_equal) KHASH_SET_INIT_PYOBJECT(pyset) @@ -402,49 +367,52 @@ KHASH_MAP_INIT_STR(strbox, kh_pyobject_t) typedef struct { - kh_str_t *table; - int starts[256]; + kh_str_t *table; + int starts[256]; } kh_str_starts_t; -typedef kh_str_starts_t* p_kh_str_starts_t; +typedef kh_str_starts_t *p_kh_str_starts_t; -p_kh_str_starts_t PANDAS_INLINE kh_init_str_starts(void) { - kh_str_starts_t *result = (kh_str_starts_t*)KHASH_CALLOC(1, sizeof(kh_str_starts_t)); - result->table = kh_init_str(); - return result; -} - -khuint_t PANDAS_INLINE kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) { - khuint_t result = kh_put_str(table->table, key, ret); - if (*ret != 0) { - table->starts[(unsigned char)key[0]] = 1; - } - return result; -} - -khuint_t PANDAS_INLINE kh_get_str_starts_item(const kh_str_starts_t* table, const char* key) { - unsigned char ch = *key; - if (table->starts[ch]) { - if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets) return 1; - } - return 0; +static inline p_kh_str_starts_t kh_init_str_starts(void) { + kh_str_starts_t *result = + (kh_str_starts_t *)KHASH_CALLOC(1, sizeof(kh_str_starts_t)); + result->table = kh_init_str(); + return result; +} + +static inline khuint_t kh_put_str_starts_item(kh_str_starts_t *table, char *key, + int *ret) { + khuint_t result = kh_put_str(table->table, key, ret); + if (*ret != 0) { + table->starts[(unsigned char)key[0]] = 1; + } + return result; +} + +static inline khuint_t kh_get_str_starts_item(const kh_str_starts_t *table, + const char *key) { + unsigned char ch = *key; + if (table->starts[ch]) { + if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets) + return 1; + } + return 0; } -void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) { - kh_destroy_str(table->table); - KHASH_FREE(table); +static inline void kh_destroy_str_starts(kh_str_starts_t *table) { + kh_destroy_str(table->table); + KHASH_FREE(table); } -void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) { - kh_resize_str(table->table, val); +static inline void kh_resize_str_starts(kh_str_starts_t *table, khuint_t val) { + kh_resize_str(table->table, val); } // utility function: given the number of elements // returns number of necessary buckets -khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){ - khuint_t candidate = n_elements; - kroundup32(candidate); - khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5); - return (upper_bound < n_elements) ? 
2*candidate : candidate; - +static inline khuint_t kh_needed_n_buckets(khuint_t n_elements) { + khuint_t candidate = n_elements; + kroundup32(candidate); + khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5); + return (upper_bound < n_elements) ? 2 * candidate : candidate; } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h 2024-04-10 17:42:52.000000000 +0000 @@ -18,44 +18,44 @@ #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#endif // NPY_NO_DEPRECATED_API +#endif // NPY_NO_DEPRECATED_API #include typedef struct { - npy_int64 days; - npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds; + npy_int64 days; + npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds; } pandas_timedeltastruct; -static const npy_datetimestruct _AS_MIN_DTS = { - 1969, 12, 31, 23, 59, 50, 776627, 963145, 224193}; -static const npy_datetimestruct _FS_MIN_DTS = { - 1969, 12, 31, 21, 26, 16, 627963, 145224, 193000}; -static const npy_datetimestruct _PS_MIN_DTS = { - 1969, 9, 16, 5, 57, 7, 963145, 224193, 0}; -static const npy_datetimestruct _NS_MIN_DTS = { - 1677, 9, 21, 0, 12, 43, 145224, 193000, 0}; -static const npy_datetimestruct _US_MIN_DTS = { - -290308, 12, 21, 19, 59, 05, 224193, 0, 0}; -static const npy_datetimestruct _MS_MIN_DTS = { - -292275055, 5, 16, 16, 47, 4, 193000, 0, 0}; +static const npy_datetimestruct _AS_MIN_DTS = {1969, 12, 31, 23, 59, + 50, 776627, 963145, 224193}; +static const npy_datetimestruct _FS_MIN_DTS = {1969, 12, 31, 21, 26, + 16, 627963, 145224, 193000}; +static const npy_datetimestruct _PS_MIN_DTS = {1969, 9, 16, 5, 57, + 7, 963145, 224193, 0}; +static const npy_datetimestruct _NS_MIN_DTS = {1677, 9, 21, 0, 12, + 43, 145224, 193000, 0}; +static const npy_datetimestruct _US_MIN_DTS = {-290308, 12, 21, 19, 59, + 05, 224193, 0, 0}; +static const npy_datetimestruct _MS_MIN_DTS = {-292275055, 5, 16, 16, 47, + 4, 193000, 0, 0}; static const npy_datetimestruct _S_MIN_DTS = { -292277022657, 1, 27, 8, 29, 53, 0, 0, 0}; static const npy_datetimestruct _M_MIN_DTS = { -17536621475646, 5, 4, 5, 53, 0, 0, 0, 0}; -static const npy_datetimestruct _AS_MAX_DTS = { - 1970, 1, 1, 0, 0, 9, 223372, 36854, 775807}; -static const npy_datetimestruct _FS_MAX_DTS = { - 1970, 1, 1, 2, 33, 43, 372036, 854775, 807000}; -static const npy_datetimestruct _PS_MAX_DTS = { - 1970, 4, 17, 18, 2, 52, 36854, 775807, 0}; -static const npy_datetimestruct _NS_MAX_DTS = { - 2262, 4, 11, 23, 47, 16, 854775, 807000, 0}; -static const npy_datetimestruct _US_MAX_DTS = { - 294247, 1, 10, 4, 0, 54, 775807, 0, 0}; -static const npy_datetimestruct _MS_MAX_DTS = { - 292278994, 8, 17, 7, 12, 55, 807000, 0, 0}; +static const npy_datetimestruct _AS_MAX_DTS = {1970, 1, 1, 0, 0, + 9, 223372, 36854, 775807}; +static const npy_datetimestruct _FS_MAX_DTS = {1970, 1, 1, 2, 33, + 43, 372036, 854775, 807000}; +static const npy_datetimestruct _PS_MAX_DTS = {1970, 4, 17, 18, 2, + 52, 36854, 775807, 0}; +static const npy_datetimestruct _NS_MAX_DTS = {2262, 4, 11, 23, 47, + 16, 854775, 807000, 0}; +static const npy_datetimestruct _US_MAX_DTS = {294247, 1, 10, 4, 0, + 54, 775807, 0, 0}; +static const npy_datetimestruct _MS_MAX_DTS = 
{292278994, 8, 17, 7, 12, + 55, 807000, 0, 0}; static const npy_datetimestruct _S_MAX_DTS = { 292277026596, 12, 4, 15, 30, 7, 0, 0, 0}; static const npy_datetimestruct _M_MAX_DTS = { @@ -72,8 +72,7 @@ void pandas_datetime_to_datetimestruct(npy_datetime val, NPY_DATETIMEUNIT fr, npy_datetimestruct *result); -void pandas_timedelta_to_timedeltastruct(npy_timedelta val, - NPY_DATETIMEUNIT fr, +void pandas_timedelta_to_timedeltastruct(npy_timedelta val, NPY_DATETIMEUNIT fr, pandas_timedeltastruct *result); extern const int days_per_month_table[2][12]; @@ -86,9 +85,7 @@ /* * Calculates the days offset from the 1970 epoch. */ -npy_int64 -get_datetimestruct_days(const npy_datetimestruct *dts); - +npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts); /* * Compares two npy_datetimestruct objects chronologically @@ -96,17 +93,14 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a, const npy_datetimestruct *b); - /* * Adjusts a datetimestruct based on a minutes offset. Assumes * the current values are valid. */ -void -add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes); +void add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes); /* * This function returns the DateTimeMetaData * contained within the provided datetime dtype. */ -PyArray_DatetimeMetaData get_datetime_metadata_from_dtype( - PyArray_Descr *dtype); +PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(PyArray_Descr *dtype); diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime_strings.h 2024-04-10 17:42:52.000000000 +0000 @@ -23,7 +23,7 @@ #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#endif // NPY_NO_DEPRECATED_API +#endif // NPY_NO_DEPRECATED_API /* 'format_requirement' can be one of three values: * * PARTIAL_MATCH : Only require a partial match with 'format'. @@ -34,11 +34,7 @@ * be able to parse it without error is '%Y-%m-%d'; * * INFER_FORMAT: parse without comparing 'format' (i.e. infer it). */ -typedef enum { - PARTIAL_MATCH, - EXACT_MATCH, - INFER_FORMAT -} FormatRequirement; +typedef enum { PARTIAL_MATCH, EXACT_MATCH, INFER_FORMAT } FormatRequirement; /* * Parses (almost) standard ISO 8601 date strings. The differences are: @@ -58,31 +54,26 @@ * 'str' must be a NULL-terminated string, and 'len' must be its length. * * 'out' gets filled with the parsed date-time. - * 'out_local' gets whether returned value contains timezone. 0 for UTC, 1 for local time. - * 'out_tzoffset' gets set to timezone offset by minutes - * if the parsed time was in local time, - * to 0 otherwise. The values 'now' and 'today' don't get counted - * as local, and neither do UTC +/-#### timezone offsets, because - * they aren't using the computer's local timezone offset. + * 'out_local' gets whether returned value contains timezone. 0 for UTC, 1 for + * local time. 'out_tzoffset' gets set to timezone offset by minutes if the + * parsed time was in local time, to 0 otherwise. The values 'now' and 'today' + * don't get counted as local, and neither do UTC +/-#### timezone offsets, + * because they aren't using the computer's local timezone offset. * * Returns 0 on success, -1 on failure. 
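A hedged usage sketch of parse_iso_8601_datetime, whose full prototype follows just below. The include paths and the NULL format argument for INFER_FORMAT are assumptions made for illustration, not something this patch specifies:

#include <stdio.h>
#include <string.h>
#include "pandas/vendored/numpy/datetime/np_datetime.h"
#include "pandas/vendored/numpy/datetime/np_datetime_strings.h"

int main(void) {
  const char *s = "2021-01-02T03:04:05+0530";
  npy_datetimestruct out;
  NPY_DATETIMEUNIT bestunit;
  int out_local = 0, out_tzoffset = 0;

  /* want_exc=0: report failure through the return value instead of raising */
  if (parse_iso_8601_datetime(s, (int)strlen(s), 0, &out, &bestunit, &out_local,
                              &out_tzoffset, NULL, 0, INFER_FORMAT) == 0) {
    printf("%d-%02d-%02d out_local=%d tzoffset=%d\n", (int)out.year, out.month,
           out.day, out_local, out_tzoffset);
  }
  return 0;
}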
*/ -int -parse_iso_8601_datetime(const char *str, int len, int want_exc, - npy_datetimestruct *out, - NPY_DATETIMEUNIT *out_bestunit, - int *out_local, - int *out_tzoffset, - const char* format, - int format_len, - FormatRequirement format_requirement); +int parse_iso_8601_datetime(const char *str, int len, int want_exc, + npy_datetimestruct *out, + NPY_DATETIMEUNIT *out_bestunit, int *out_local, + int *out_tzoffset, const char *format, + int format_len, + FormatRequirement format_requirement); /* * Provides a string length to use for converting datetime * objects with the given local and unit settings. */ -int -get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); +int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); /* * Converts an npy_datetimestruct to an (almost) ISO 8601 @@ -94,9 +85,8 @@ * Returns 0 on success, -1 on failure (for example if the output * string was too short). */ -int -make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, - int utc, NPY_DATETIMEUNIT base); +int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, size_t outlen, + int utc, NPY_DATETIMEUNIT base); /* * Converts an pandas_timedeltastruct to an ISO 8601 string. diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/vendored/ujson/lib/ultrajson.h 2024-04-10 17:42:52.000000000 +0000 @@ -16,18 +16,19 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. 
Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms @@ -51,9 +52,9 @@ #pragma once +#include "pandas/portable.h" #include #include -#include "pandas/portable.h" // Don't output any extra whitespaces when encoding #define JSON_NO_EXTRA_WHITESPACE @@ -74,7 +75,8 @@ #endif /* -Dictates and limits how much stack space for buffers UltraJSON will use before resorting to provided heap functions */ +Dictates and limits how much stack space for buffers UltraJSON will use before +resorting to provided heap functions */ #ifndef JSON_MAX_STACK_BUFFER_SIZE #define JSON_MAX_STACK_BUFFER_SIZE 131072 #endif @@ -138,23 +140,23 @@ #endif enum JSTYPES { - JT_NULL, // NULL - JT_TRUE, // boolean true - JT_FALSE, // boolean false - JT_INT, // (JSINT32 (signed 32-bit)) - JT_LONG, // (JSINT64 (signed 64-bit)) - JT_DOUBLE, // (double) - JT_BIGNUM, // integer larger than sys.maxsize - JT_UTF8, // (char 8-bit) - JT_ARRAY, // Array structure - JT_OBJECT, // Key/Value structure - JT_INVALID, // Internal, do not return nor expect - JT_POS_INF, // Positive infinity - JT_NEG_INF, // Negative infinity + JT_NULL, // NULL + JT_TRUE, // boolean true + JT_FALSE, // boolean false + JT_INT, // (JSINT32 (signed 32-bit)) + JT_LONG, // (JSINT64 (signed 64-bit)) + JT_DOUBLE, // (double) + JT_BIGNUM, // integer larger than sys.maxsize + JT_UTF8, // (char 8-bit) + JT_ARRAY, // Array structure + JT_OBJECT, // Key/Value structure + JT_INVALID, // Internal, do not return nor expect + JT_POS_INF, // Positive infinity + JT_NEG_INF, // Negative infinity }; -typedef void * JSOBJ; -typedef void * JSITER; +typedef void *JSOBJ; +typedef void *JSITER; typedef struct __JSONTypeContext { int type; @@ -183,17 +185,18 @@ JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc); double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc, - size_t *_outLen); + size_t *_outLen); /* Begin iteration of an iterable object (JS_ARRAY or JS_OBJECT) - Implementor should setup iteration state in ti->prv + Implementer should setup iteration state in ti->prv */ JSPFN_ITERBEGIN iterBegin; /* - Retrieve next object in an iteration. Should return 0 to indicate iteration has reached end or 1 if there are more items. - Implementor is responsible for keeping state of the iteration. Use ti->prv fields for this + Retrieve next object in an iteration. Should return 0 to indicate iteration + has reached end or 1 if there are more items. Implementer is responsible for + keeping state of the iteration. Use ti->prv fields for this */ JSPFN_ITERNEXT iterNext; @@ -205,19 +208,22 @@ /* Returns a reference to the value object of an iterator - The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object + The is responsible for the life-cycle of the returned string. Use + iterNext/iterEnd and ti->prv to keep track of current object */ JSPFN_ITERGETVALUE iterGetValue; /* Return name of iterator. - The is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of current object + The is responsible for the life-cycle of the returned string. Use + iterNext/iterEnd and ti->prv to keep track of current object */ JSPFN_ITERGETNAME iterGetName; /* - Release a value as indicated by setting ti->release = 1 in the previous getValue call. 
- The ti->prv array should contain the necessary context to release the value + Release a value as indicated by setting ti->release = 1 in the previous + getValue call. The ti->prv array should contain the necessary context to + release the value */ void (*releaseObject)(JSOBJ obj); @@ -228,19 +234,23 @@ JSPFN_FREE free; /* - Configuration for max recursion, set to 0 to use default (see JSON_MAX_RECURSION_DEPTH)*/ + Configuration for max recursion, set to 0 to use default (see + JSON_MAX_RECURSION_DEPTH)*/ int recursionMax; /* - Configuration for max decimals of double floating point numbers to encode (0-9) */ + Configuration for max decimals of double floating point numbers to encode + (0-9) */ int doublePrecision; /* - If true output will be ASCII with all characters above 127 encoded as \uXXXX. If false output will be UTF-8 or what ever charset strings are brought as */ + If true output will be ASCII with all characters above 127 encoded as \uXXXX. + If false output will be UTF-8 or what ever charset strings are brought as */ int forceASCII; /* - If true, '<', '>', and '&' characters will be encoded as \u003c, \u003e, and \u0026, respectively. If false, no special encoding will be used. */ + If true, '<', '>', and '&' characters will be encoded as \u003c, \u003e, and + \u0026, respectively. If false, no special encoding will be used. */ int encodeHTMLChars; /* @@ -266,18 +276,20 @@ Arguments: obj - An anonymous type representing the object enc - Function definitions for querying JSOBJ type -buffer - Preallocated buffer to store result in. If NULL function allocates own buffer -cbBuffer - Length of buffer (ignored if buffer is NULL) +buffer - Preallocated buffer to store result in. If NULL function allocates own +buffer cbBuffer - Length of buffer (ignored if buffer is NULL) Returns: Encoded JSON object as a null terminated char string. NOTE: -If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer. -Life cycle of the provided buffer must still be handled by caller. - -If the return value doesn't equal the specified buffer caller must release the memory using -JSONObjectEncoder.free or free() as specified when calling this function. +If the supplied buffer wasn't enough to hold the result the function will +allocate a new buffer. Life cycle of the provided buffer must still be handled +by caller. + +If the return value doesn't equal the specified buffer caller must release the +memory using JSONObjectEncoder.free or free() as specified when calling this +function. */ EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer); diff -Nru pandas-2.1.4+dfsg/pandas/_libs/include/pandas/vendored/ujson/python/version.h pandas-2.2.2+dfsg/pandas/_libs/include/pandas/vendored/ujson/python/version.h --- pandas-2.1.4+dfsg/pandas/_libs/include/pandas/vendored/ujson/python/version.h 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/include/pandas/vendored/ujson/python/version.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,40 +0,0 @@ -/* -Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the ESN Social Software AB nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. - -Numeric decoder derived from TCL library -https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms - * Copyright (c) 1988-1993 The Regents of the University of California. - * Copyright (c) 1994 Sun Microsystems, Inc. -*/ - -#pragma once - -#define UJSON_VERSION "1.33" diff -Nru pandas-2.1.4+dfsg/pandas/_libs/index.pyi pandas-2.2.2+dfsg/pandas/_libs/index.pyi --- pandas-2.1.4+dfsg/pandas/_libs/index.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/index.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -80,13 +80,6 @@ ) -> None: ... def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ... def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ... - def get_indexer_with_fill( - self, - target: np.ndarray, # np.ndarray[object] of tuples - values: np.ndarray, # np.ndarray[object] of tuples - method: str, - limit: int | None, - ) -> npt.NDArray[np.intp]: ... class ExtensionEngine: def __init__(self, values: ExtensionArray) -> None: ... diff -Nru pandas-2.1.4+dfsg/pandas/_libs/index.pyx pandas-2.2.2+dfsg/pandas/_libs/index.pyx --- pandas-2.1.4+dfsg/pandas/_libs/index.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/index.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,5 @@ cimport cython +from cpython.sequence cimport PySequence_GetItem import numpy as np @@ -77,7 +78,7 @@ indexer = np.empty(len(values), dtype=np.uint8) for i in range(len(values)): - item = values[i] + item = PySequence_GetItem(values, i) indexer[i] = is_matching_na(item, val) else: @@ -95,6 +96,20 @@ return indexer.view(bool) +cdef _maybe_resize_array(ndarray values, Py_ssize_t loc, Py_ssize_t max_length): + """ + Resize array if loc is out of bounds. 
+ """ + cdef: + Py_ssize_t n = len(values) + + if loc >= n: + while loc >= n: + n *= 2 + values = np.resize(values, min(n, max_length)) + return values + + # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1_000_000 @@ -280,7 +295,7 @@ values = self.values self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \ self._call_monotonic(values) - except TypeError: + except (TypeError, ValueError): self.monotonic_inc = 0 self.monotonic_dec = 0 is_strict_monotonic = 0 @@ -354,7 +369,7 @@ dict d = {} object val Py_ssize_t count = 0, count_missing = 0 - Py_ssize_t i, j, n, n_t, n_alloc, start, end + Py_ssize_t i, j, n, n_t, n_alloc, max_alloc, start, end bint check_na_values = False values = self.values @@ -364,6 +379,7 @@ n = len(values) n_t = len(targets) + max_alloc = n * n_t if n > 10_000: n_alloc = 10_000 else: @@ -375,7 +391,7 @@ # map each starget to its position in the index if ( stargets and - len(stargets) < 5 and + len(stargets) < (n / (2 * n.bit_length())) and not na_in_stargets and self.is_monotonic_increasing ): @@ -404,7 +420,7 @@ found_nas = set() for i in range(n): - val = values[i] + val = PySequence_GetItem(values, i) # GH#43870 # handle lookup for nas @@ -436,7 +452,7 @@ d[val].append(i) for i in range(n_t): - val = targets[i] + val = PySequence_GetItem(targets, i) # ensure there are nas in values before looking for a matching na if check_na_values and checknull(val): @@ -448,23 +464,18 @@ # found if val in d: key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc += 10_000 - result = np.resize(result, n_alloc) - result[count] = j count += 1 # value not found else: - - if count >= n_alloc: - n_alloc += 10_000 - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i @@ -483,22 +494,22 @@ Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1 object pval - if hi == 0 or (hi > 0 and val > values[hi]): + if hi == 0 or (hi > 0 and val > PySequence_GetItem(values, hi)): return len(values) while lo < hi: mid = (lo + hi) // 2 - pval = values[mid] + pval = PySequence_GetItem(values, mid) if val < pval: hi = mid elif val > pval: lo = mid + 1 else: - while mid > 0 and val == values[mid - 1]: + while mid > 0 and val == PySequence_GetItem(values, mid - 1): mid -= 1 return mid - if val <= values[mid]: + if val <= PySequence_GetItem(values, mid): return mid else: return mid + 1 @@ -586,7 +597,7 @@ loc = values.searchsorted(conv, side="left") - if loc == len(values) or values[loc] != conv: + if loc == len(values) or PySequence_GetItem(values, loc) != conv: raise KeyError(val) return loc @@ -748,91 +759,6 @@ """ return self._base.get_indexer(self, target) - def get_indexer_with_fill(self, ndarray target, ndarray values, - str method, object limit) -> np.ndarray: - """ - Returns an array giving the positions of each value of `target` in - `values`, where -1 represents a value in `target` which does not - appear in `values` - - If `method` is "backfill" then the position for a value in `target` - which does not appear in `values` is that of the next greater value - in `values` (if one exists), and -1 if there is no such value. - - Similarly, if the method is "pad" then the position for a value in - `target` which does not appear in `values` is that of the next smaller - value in `values` (if one exists), and -1 if there is no such value. 
- - Parameters - ---------- - target: ndarray[object] of tuples - need not be sorted, but all must have the same length, which must be - the same as the length of all tuples in `values` - values : ndarray[object] of tuples - must be sorted and all have the same length. Should be the set of - the MultiIndex's values. - method: string - "backfill" or "pad" - limit: int or None - if provided, limit the number of fills to this value - - Returns - ------- - np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`, - filled with the `method` (and optionally `limit`) specified - """ - assert method in ("backfill", "pad") - cdef: - int64_t i, j, next_code - int64_t num_values, num_target_values - ndarray[int64_t, ndim=1] target_order - ndarray[object, ndim=1] target_values - ndarray[int64_t, ndim=1] new_codes, new_target_codes - ndarray[intp_t, ndim=1] sorted_indexer - - target_order = np.argsort(target).astype("int64") - target_values = target[target_order] - num_values, num_target_values = len(values), len(target_values) - new_codes, new_target_codes = ( - np.empty((num_values,)).astype("int64"), - np.empty((num_target_values,)).astype("int64"), - ) - - # `values` and `target_values` are both sorted, so we walk through them - # and memoize the (ordered) set of indices in the (implicit) merged-and - # sorted list of the two which belong to each of them - # the effect of this is to create a factorization for the (sorted) - # merger of the index values, where `new_codes` and `new_target_codes` - # are the subset of the factors which appear in `values` and `target`, - # respectively - i, j, next_code = 0, 0, 0 - while i < num_values and j < num_target_values: - val, target_val = values[i], target_values[j] - if val <= target_val: - new_codes[i] = next_code - i += 1 - if target_val <= val: - new_target_codes[j] = next_code - j += 1 - next_code += 1 - - # at this point, at least one should have reached the end - # the remaining values of the other should be added to the end - assert i == num_values or j == num_target_values - while i < num_values: - new_codes[i] = next_code - i += 1 - next_code += 1 - while j < num_target_values: - new_target_codes[j] = next_code - j += 1 - next_code += 1 - - # get the indexer, and undo the sorting of `target.values` - algo = algos.backfill if method == "backfill" else algos.pad - sorted_indexer = algo(new_codes, new_target_codes, limit=limit) - return sorted_indexer[np.argsort(target_order)] - def get_loc(self, object key): if is_definitely_invalid_key(key): raise TypeError(f"'{key}' is an invalid key") @@ -1042,7 +968,7 @@ res = np.empty(N, dtype=np.intp) for i in range(N): - val = values[i] + val = PySequence_GetItem(values, i) try: loc = self.get_loc(val) # Because we are unique, loc should always be an integer @@ -1076,7 +1002,7 @@ # See also IntervalIndex.get_indexer_pointwise for i in range(N): - val = targets[i] + val = PySequence_GetItem(targets, i) try: locs = self.get_loc(val) @@ -1211,7 +1137,7 @@ dict d = {} object val Py_ssize_t count = 0, count_missing = 0 - Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx + Py_ssize_t i, j, n, n_t, n_alloc, max_alloc, start, end, na_idx target_vals = self._get_data(targets) target_mask = self._get_mask(targets) @@ -1224,6 +1150,7 @@ n = len(values) n_t = len(target_vals) + max_alloc = n * n_t if n > 10_000: n_alloc = 10_000 else: @@ -1255,9 +1182,9 @@ na_pos = [] for i in range(n): - val = values[i] + val = PySequence_GetItem(values, i) - if mask[i]: + if PySequence_GetItem(mask, i): na_pos.append(i) 
else: @@ -1267,16 +1194,16 @@ d[val].append(i) for i in range(n_t): - val = target_vals[i] + val = PySequence_GetItem(target_vals, i) - if target_mask[i]: + if PySequence_GetItem(target_mask, i): if na_pos: + result = _maybe_resize_array( + result, + count + len(na_pos) - 1, + max_alloc, + ) for na_idx in na_pos: - # realloc if needed - if count >= n_alloc: - n_alloc += 10_000 - result = np.resize(result, n_alloc) - result[count] = na_idx count += 1 continue @@ -1284,22 +1211,18 @@ elif val in d: # found key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc, + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc += 10_000 - result = np.resize(result, n_alloc) - result[count] = j count += 1 continue # value not found - if count >= n_alloc: - n_alloc += 10_000 - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i diff -Nru pandas-2.1.4+dfsg/pandas/_libs/internals.pyi pandas-2.2.2+dfsg/pandas/_libs/internals.pyi --- pandas-2.1.4+dfsg/pandas/_libs/internals.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/internals.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -15,7 +15,6 @@ ) from pandas import Index -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.internals.blocks import Block as B def slice_len(slc: slice, objlen: int = ...) -> int: ... @@ -60,7 +59,7 @@ def append(self, others: list[BlockPlacement]) -> BlockPlacement: ... def tile_for_unstack(self, factor: int) -> npt.NDArray[np.intp]: ... -class SharedBlock: +class Block: _mgr_locs: BlockPlacement ndim: int values: ArrayLike @@ -72,19 +71,8 @@ ndim: int, refs: BlockValuesRefs | None = ..., ) -> None: ... - -class NumpyBlock(SharedBlock): - values: np.ndarray - @final - def slice_block_rows(self, slicer: slice) -> Self: ... - -class NDArrayBackedBlock(SharedBlock): - values: NDArrayBackedExtensionArray - @final def slice_block_rows(self, slicer: slice) -> Self: ... -class Block(SharedBlock): ... - class BlockManager: blocks: tuple[B, ...] axes: list[Index] @@ -100,7 +88,7 @@ class BlockValuesRefs: referenced_blocks: list[weakref.ref] - def __init__(self, blk: SharedBlock | None = ...) -> None: ... - def add_reference(self, blk: SharedBlock) -> None: ... + def __init__(self, blk: Block | None = ...) -> None: ... + def add_reference(self, blk: Block) -> None: ... def add_index_reference(self, index: Index) -> None: ... def has_reference(self) -> bool: ... diff -Nru pandas-2.1.4+dfsg/pandas/_libs/internals.pyx pandas-2.2.2+dfsg/pandas/_libs/internals.pyx --- pandas-2.1.4+dfsg/pandas/_libs/internals.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/internals.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -2,14 +2,10 @@ import weakref cimport cython +from cpython.pyport cimport PY_SSIZE_T_MAX from cpython.slice cimport PySlice_GetIndicesEx from cython cimport Py_ssize_t - -cdef extern from "Python.h": - # TODO(cython3): from cpython.pyport cimport PY_SSIZE_T_MAX - Py_ssize_t PY_SSIZE_T_MAX - import numpy as np cimport numpy as cnp @@ -24,7 +20,6 @@ from pandas._libs.algos import ensure_int64 -from pandas._libs.arrays cimport NDArrayBacked from pandas._libs.util cimport ( is_array, is_integer_object, @@ -639,7 +634,7 @@ @cython.freelist(64) -cdef class SharedBlock: +cdef class Block: """ Defining __init__ in a cython class significantly improves performance. 
""" @@ -647,6 +642,11 @@ public BlockPlacement _mgr_locs public BlockValuesRefs refs readonly int ndim + # 2023-08-15 no apparent performance improvement from declaring values + # as ndarray in a type-special subclass (similar for NDArrayBacked). + # This might change if slice_block_rows can be optimized with something + # like https://github.com/numpy/numpy/issues/23934 + public object values def __cinit__( self, @@ -666,6 +666,8 @@ refs: BlockValuesRefs, optional Ref tracking object or None if block does not have any refs. """ + self.values = values + self._mgr_locs = placement self.ndim = ndim if refs is None: @@ -699,51 +701,7 @@ ndim = maybe_infer_ndim(self.values, self.mgr_locs) self.ndim = ndim - -cdef class NumpyBlock(SharedBlock): - cdef: - public ndarray values - - def __cinit__( - self, - ndarray values, - BlockPlacement placement, - int ndim, - refs: BlockValuesRefs | None = None, - ): - # set values here; the (implicit) call to SharedBlock.__cinit__ will - # set placement, ndim and refs - self.values = values - - cpdef NumpyBlock slice_block_rows(self, slice slicer): - """ - Perform __getitem__-like specialized to slicing along index. - - Assumes self.ndim == 2 - """ - new_values = self.values[..., slicer] - return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs) - - -cdef class NDArrayBackedBlock(SharedBlock): - """ - Block backed by NDArrayBackedExtensionArray - """ - cdef public: - NDArrayBacked values - - def __cinit__( - self, - NDArrayBacked values, - BlockPlacement placement, - int ndim, - refs: BlockValuesRefs | None = None, - ): - # set values here; the (implicit) call to SharedBlock.__cinit__ will - # set placement, ndim and refs - self.values = values - - cpdef NDArrayBackedBlock slice_block_rows(self, slice slicer): + cpdef Block slice_block_rows(self, slice slicer): """ Perform __getitem__-like specialized to slicing along index. @@ -753,22 +711,6 @@ return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs) -cdef class Block(SharedBlock): - cdef: - public object values - - def __cinit__( - self, - object values, - BlockPlacement placement, - int ndim, - refs: BlockValuesRefs | None = None, - ): - # set values here; the (implicit) call to SharedBlock.__cinit__ will - # set placement, ndim and refs - self.values = values - - @cython.freelist(64) cdef class BlockManager: cdef: @@ -811,7 +753,7 @@ cdef: intp_t blkno, i, j cnp.npy_intp length = self.shape[0] - SharedBlock blk + Block blk BlockPlacement bp ndarray[intp_t, ndim=1] new_blknos, new_blklocs @@ -901,7 +843,7 @@ cdef BlockManager _slice_mgr_rows(self, slice slobj): cdef: - SharedBlock blk, nb + Block blk, nb BlockManager mgr ndarray blknos, blklocs @@ -946,7 +888,7 @@ public list referenced_blocks public int clear_counter - def __cinit__(self, blk: SharedBlock | None = None) -> None: + def __cinit__(self, blk: Block | None = None) -> None: if blk is not None: self.referenced_blocks = [weakref.ref(blk)] else: @@ -968,12 +910,12 @@ elif nr_of_refs > self.clear_counter: self.clear_counter = max(self.clear_counter * 2, nr_of_refs) - def add_reference(self, blk: SharedBlock) -> None: + def add_reference(self, blk: Block) -> None: """Adds a new reference to our reference collection. Parameters ---------- - blk: SharedBlock + blk : Block The block that the new references should point to. 
""" self._clear_dead_references() diff -Nru pandas-2.1.4+dfsg/pandas/_libs/interval.pyx pandas-2.2.2+dfsg/pandas/_libs/interval.pyx --- pandas-2.1.4+dfsg/pandas/_libs/interval.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/interval.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -39,7 +39,6 @@ from pandas._libs.tslibs.util cimport ( is_float_object, is_integer_object, - is_timedelta64_object, ) VALID_CLOSED = frozenset(["left", "right", "both", "neither"]) @@ -478,57 +477,31 @@ args = (self.left, self.right, self.closed) return (type(self), args) - def _repr_base(self): - left = self.left - right = self.right - - # TODO: need more general formatting methodology here - if isinstance(left, _Timestamp) and isinstance(right, _Timestamp): - left = left._short_repr - right = right._short_repr - - return left, right - def __repr__(self) -> str: - - left, right = self._repr_base() - disp = str if isinstance(left, np.generic) else repr + disp = str if isinstance(self.left, (np.generic, _Timestamp)) else repr name = type(self).__name__ - repr_str = f"{name}({disp(left)}, {disp(right)}, closed={repr(self.closed)})" + repr_str = f"{name}({disp(self.left)}, {disp(self.right)}, closed={repr(self.closed)})" # noqa: E501 return repr_str def __str__(self) -> str: - - left, right = self._repr_base() start_symbol = "[" if self.closed_left else "(" end_symbol = "]" if self.closed_right else ")" - return f"{start_symbol}{left}, {right}{end_symbol}" + return f"{start_symbol}{self.left}, {self.right}{end_symbol}" def __add__(self, y): if ( isinstance(y, numbers.Number) or PyDelta_Check(y) - or is_timedelta64_object(y) + or cnp.is_timedelta64_object(y) ): return Interval(self.left + y, self.right + y, closed=self.closed) - elif ( - # __radd__ pattern - # TODO(cython3): remove this - isinstance(y, Interval) - and ( - isinstance(self, numbers.Number) - or PyDelta_Check(self) - or is_timedelta64_object(self) - ) - ): - return Interval(y.left + self, y.right + self, closed=y.closed) return NotImplemented def __radd__(self, other): if ( isinstance(other, numbers.Number) or PyDelta_Check(other) - or is_timedelta64_object(other) + or cnp.is_timedelta64_object(other) ): return Interval(self.left + other, self.right + other, closed=self.closed) return NotImplemented @@ -537,7 +510,7 @@ if ( isinstance(y, numbers.Number) or PyDelta_Check(y) - or is_timedelta64_object(y) + or cnp.is_timedelta64_object(y) ): return Interval(self.left - y, self.right - y, closed=self.closed) return NotImplemented @@ -545,10 +518,6 @@ def __mul__(self, y): if isinstance(y, numbers.Number): return Interval(self.left * y, self.right * y, closed=self.closed) - elif isinstance(y, Interval) and isinstance(self, numbers.Number): - # __radd__ semantics - # TODO(cython3): remove this - return Interval(y.left * self, y.right * self, closed=y.closed) return NotImplemented def __rmul__(self, other): diff -Nru pandas-2.1.4+dfsg/pandas/_libs/intervaltree.pxi.in pandas-2.2.2+dfsg/pandas/_libs/intervaltree.pxi.in --- pandas-2.1.4+dfsg/pandas/_libs/intervaltree.pxi.in 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/intervaltree.pxi.in 2024-04-10 17:42:52.000000000 +0000 @@ -391,6 +391,11 @@ """Recursively query this node and its sub-nodes for intervals that overlap with the query point. 
""" + + # GH 51826: ensures nan is handled properly during reindexing + if np.isnan(point): + return + cdef: int64_t[:] indices {{dtype}}_t[:] values diff -Nru pandas-2.1.4+dfsg/pandas/_libs/join.pyi pandas-2.2.2+dfsg/pandas/_libs/join.pyi --- pandas-2.1.4+dfsg/pandas/_libs/join.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/join.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -6,6 +6,7 @@ left: np.ndarray, # const intp_t[:] right: np.ndarray, # const intp_t[:] max_groups: int, + sort: bool = ..., ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... def left_outer_join( left: np.ndarray, # const intp_t[:] @@ -52,8 +53,8 @@ def asof_join_backward_on_X_by_Y( left_values: np.ndarray, # ndarray[numeric_t] right_values: np.ndarray, # ndarray[numeric_t] - left_by_values: np.ndarray, # ndarray[by_t] - right_by_values: np.ndarray, # ndarray[by_t] + left_by_values: np.ndarray, # const int64_t[:] + right_by_values: np.ndarray, # const int64_t[:] allow_exact_matches: bool = ..., tolerance: np.number | float | None = ..., use_hashtable: bool = ..., @@ -61,8 +62,8 @@ def asof_join_forward_on_X_by_Y( left_values: np.ndarray, # ndarray[numeric_t] right_values: np.ndarray, # ndarray[numeric_t] - left_by_values: np.ndarray, # ndarray[by_t] - right_by_values: np.ndarray, # ndarray[by_t] + left_by_values: np.ndarray, # const int64_t[:] + right_by_values: np.ndarray, # const int64_t[:] allow_exact_matches: bool = ..., tolerance: np.number | float | None = ..., use_hashtable: bool = ..., @@ -70,8 +71,8 @@ def asof_join_nearest_on_X_by_Y( left_values: np.ndarray, # ndarray[numeric_t] right_values: np.ndarray, # ndarray[numeric_t] - left_by_values: np.ndarray, # ndarray[by_t] - right_by_values: np.ndarray, # ndarray[by_t] + left_by_values: np.ndarray, # const int64_t[:] + right_by_values: np.ndarray, # const int64_t[:] allow_exact_matches: bool = ..., tolerance: np.number | float | None = ..., use_hashtable: bool = ..., diff -Nru pandas-2.1.4+dfsg/pandas/_libs/join.pyx pandas-2.2.2+dfsg/pandas/_libs/join.pyx --- pandas-2.1.4+dfsg/pandas/_libs/join.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/join.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -7,7 +7,6 @@ int64_t, intp_t, ndarray, - uint64_t, ) cnp.import_array() @@ -23,7 +22,7 @@ @cython.wraparound(False) @cython.boundscheck(False) def inner_join(const intp_t[:] left, const intp_t[:] right, - Py_ssize_t max_groups): + Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 intp_t[::1] left_sorter, right_sorter @@ -70,7 +69,20 @@ _get_result_indexer(left_sorter, left_indexer) _get_result_indexer(right_sorter, right_indexer) - return np.asarray(left_indexer), np.asarray(right_indexer) + if not sort: + # if not asked to sort, revert to original order + if len(left) == len(left_indexer): + # no multiple matches for any row on the left + # this is a short-cut to avoid groupsort_indexer + # otherwise, the `else` path also works in this case + rev = np.empty(len(left), dtype=np.intp) + rev.put(np.asarray(left_sorter), np.arange(len(left))) + else: + rev, _ = groupsort_indexer(left_indexer, len(left)) + + return np.asarray(left_indexer).take(rev), np.asarray(right_indexer).take(rev) + else: + return np.asarray(left_indexer), np.asarray(right_indexer) @cython.wraparound(False) @@ -666,23 +678,13 @@ # asof_join_by # ---------------------------------------------------------------------- -from pandas._libs.hashtable cimport ( - HashTable, - Int64HashTable, - PyObjectHashTable, - UInt64HashTable, -) 
- -ctypedef fused by_t: - object - int64_t - uint64_t +from pandas._libs.hashtable cimport Int64HashTable def asof_join_backward_on_X_by_Y(ndarray[numeric_t] left_values, ndarray[numeric_t] right_values, - ndarray[by_t] left_by_values, - ndarray[by_t] right_by_values, + const int64_t[:] left_by_values, + const int64_t[:] right_by_values, bint allow_exact_matches=True, tolerance=None, bint use_hashtable=True): @@ -693,8 +695,7 @@ bint has_tolerance = False numeric_t tolerance_ = 0 numeric_t diff = 0 - HashTable hash_table - by_t by_value + Int64HashTable hash_table # if we are using tolerance, set our objects if tolerance is not None: @@ -708,12 +709,7 @@ right_indexer = np.empty(left_size, dtype=np.intp) if use_hashtable: - if by_t is object: - hash_table = PyObjectHashTable(right_size) - elif by_t is int64_t: - hash_table = Int64HashTable(right_size) - elif by_t is uint64_t: - hash_table = UInt64HashTable(right_size) + hash_table = Int64HashTable(right_size) right_pos = 0 for left_pos in range(left_size): @@ -758,8 +754,8 @@ def asof_join_forward_on_X_by_Y(ndarray[numeric_t] left_values, ndarray[numeric_t] right_values, - ndarray[by_t] left_by_values, - ndarray[by_t] right_by_values, + const int64_t[:] left_by_values, + const int64_t[:] right_by_values, bint allow_exact_matches=1, tolerance=None, bint use_hashtable=True): @@ -770,8 +766,7 @@ bint has_tolerance = False numeric_t tolerance_ = 0 numeric_t diff = 0 - HashTable hash_table - by_t by_value + Int64HashTable hash_table # if we are using tolerance, set our objects if tolerance is not None: @@ -785,12 +780,7 @@ right_indexer = np.empty(left_size, dtype=np.intp) if use_hashtable: - if by_t is object: - hash_table = PyObjectHashTable(right_size) - elif by_t is int64_t: - hash_table = Int64HashTable(right_size) - elif by_t is uint64_t: - hash_table = UInt64HashTable(right_size) + hash_table = Int64HashTable(right_size) right_pos = right_size - 1 for left_pos in range(left_size - 1, -1, -1): @@ -836,8 +826,8 @@ def asof_join_nearest_on_X_by_Y(ndarray[numeric_t] left_values, ndarray[numeric_t] right_values, - ndarray[by_t] left_by_values, - ndarray[by_t] right_by_values, + const int64_t[:] left_by_values, + const int64_t[:] right_by_values, bint allow_exact_matches=True, tolerance=None, bint use_hashtable=True): diff -Nru pandas-2.1.4+dfsg/pandas/_libs/lib.pyi pandas-2.2.2+dfsg/pandas/_libs/lib.pyi --- pandas-2.1.4+dfsg/pandas/_libs/lib.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/lib.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -45,25 +45,31 @@ def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... def is_pyarrow_array(obj: object) -> bool: ... def is_period(val: object) -> TypeGuard[Period]: ... -def is_interval(val: object) -> TypeGuard[Interval]: ... -def is_decimal(val: object) -> TypeGuard[Decimal]: ... -def is_complex(val: object) -> TypeGuard[complex]: ... -def is_bool(val: object) -> TypeGuard[bool | np.bool_]: ... -def is_integer(val: object) -> TypeGuard[int | np.integer]: ... +def is_interval(obj: object) -> TypeGuard[Interval]: ... +def is_decimal(obj: object) -> TypeGuard[Decimal]: ... +def is_complex(obj: object) -> TypeGuard[complex]: ... +def is_bool(obj: object) -> TypeGuard[bool | np.bool_]: ... +def is_integer(obj: object) -> TypeGuard[int | np.integer]: ... def is_int_or_none(obj) -> bool: ... -def is_float(val: object) -> TypeGuard[float]: ... +def is_float(obj: object) -> TypeGuard[float]: ... def is_interval_array(values: np.ndarray) -> bool: ... 
-def is_datetime64_array(values: np.ndarray) -> bool: ... -def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ... +def is_datetime64_array(values: np.ndarray, skipna: bool = True) -> bool: ... +def is_timedelta_or_timedelta64_array( + values: np.ndarray, skipna: bool = True +) -> bool: ... def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ... def is_time_array(values: np.ndarray, skipna: bool = ...): ... def is_date_array(values: np.ndarray, skipna: bool = ...): ... def is_datetime_array(values: np.ndarray, skipna: bool = ...): ... def is_string_array(values: np.ndarray, skipna: bool = ...): ... -def is_float_array(values: np.ndarray, skipna: bool = ...): ... +def is_float_array(values: np.ndarray): ... def is_integer_array(values: np.ndarray, skipna: bool = ...): ... def is_bool_array(values: np.ndarray, skipna: bool = ...): ... -def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ... +def fast_multiget( + mapping: dict, + keys: np.ndarray, # object[:] + default=..., +) -> np.ndarray: ... def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ... def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ... def map_infer( @@ -181,7 +187,7 @@ max_bin: int, ) -> np.ndarray: ... # np.ndarray[np.int64, ndim=2] def get_level_sorter( - label: np.ndarray, # const int64_t[:] + codes: np.ndarray, # const int64_t[:] starts: np.ndarray, # const intp_t[:] ) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1] def generate_bins_dt64( diff -Nru pandas-2.1.4+dfsg/pandas/_libs/lib.pyx pandas-2.2.2+dfsg/pandas/_libs/lib.pyx --- pandas-2.1.4+dfsg/pandas/_libs/lib.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/lib.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -25,7 +25,6 @@ Py_EQ, PyObject, PyObject_RichCompareBool, - PyTypeObject, ) from cpython.ref cimport Py_INCREF from cpython.sequence cimport PySequence_Check @@ -66,34 +65,8 @@ ) cnp.import_array() +from pandas._libs.interval import Interval -cdef extern from "Python.h": - # Note: importing extern-style allows us to declare these as nogil - # functions, whereas `from cpython cimport` does not. - bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil - -cdef extern from "numpy/arrayobject.h": - # cython's numpy.dtype specification is incorrect, which leads to - # errors in issubclass(self.dtype.type, np.bool_), so we directly - # include the correct version - # https://github.com/cython/cython/issues/2022 - - ctypedef class numpy.dtype [object PyArray_Descr]: - # Use PyDataType_* macros when possible, however there are no macros - # for accessing some of the fields, so some are defined. Please - # ask on cython-dev if you need more. 
- cdef: - int type_num - int itemsize "elsize" - char byteorder - object fields - tuple names - - PyTypeObject PySignedIntegerArrType_Type - PyTypeObject PyUnsignedIntegerArrType_Type - -cdef extern from "numpy/ndarrayobject.h": - bint PyArray_CheckScalar(obj) nogil cdef extern from "pandas/parser/pd_parser.h": int floatify(object, float64_t *result, int *maybe_int) except -1 @@ -102,6 +75,7 @@ PandasParser_IMPORT from pandas._libs cimport util +from pandas._libs.dtypes cimport uint8_int64_object_t from pandas._libs.util cimport ( INT64_MAX, INT64_MIN, @@ -255,7 +229,7 @@ # Note: PyNumber_Check check includes Decimal, Fraction, numbers.Number return (PyNumber_Check(val) or is_period_object(val) - or is_interval(val) + or isinstance(val, Interval) or is_offset_object(val)) @@ -271,7 +245,7 @@ ------- is_ndarray : bool """ - if PyArray_CheckScalar(val): + if cnp.PyArray_CheckScalar(val): return cnp.PyArray_DescrFromScalar(val).itemsize else: return -1 @@ -512,8 +486,7 @@ @cython.wraparound(False) @cython.boundscheck(False) -# TODO(cython3): Can add const once cython#1772 is resolved -def has_infs(floating[:] arr) -> bool: +def has_infs(const floating[:] arr) -> bool: cdef: Py_ssize_t i, n = len(arr) floating inf, neginf, val @@ -776,6 +749,7 @@ cdef: Py_ssize_t i = 0, n = len(arr) bint already_copied = True + ndarray[object] newarr if hasattr(arr, "to_numpy"): @@ -785,13 +759,14 @@ out = arr.astype(str).astype(object) out[arr.isna()] = na_value return out - arr = arr.to_numpy() + arr = arr.to_numpy(dtype=object) elif not util.is_array(arr): arr = np.array(arr, dtype="object") result = np.asarray(arr, dtype="object") - if copy and result is arr: + if copy and (result is arr or np.shares_memory(arr, result)): + # GH#54654 result = result.copy() elif not copy and result is arr: already_copied = False @@ -800,8 +775,30 @@ # short-circuit, all elements are str return result + if arr.dtype.kind == "f": # non-optimized path + for i in range(n): + val = arr[i] + + if not already_copied: + result = result.copy() + already_copied = True + + if not checknull(val): + # f"{val}" is not always equivalent to str(val) for floats + result[i] = str(val) + else: + if convert_na_value: + val = na_value + if skipna: + result[i] = val + else: + result[i] = f"{val}" + + return result + + newarr = np.asarray(arr, dtype=object) for i in range(n): - val = arr[i] + val = newarr[i] if isinstance(val, str): continue @@ -1165,6 +1162,17 @@ cpdef bint is_interval(object obj): + import warnings + + from pandas.util._exceptions import find_stack_level + + warnings.warn( + # GH#55264 + "is_interval is deprecated and will be removed in a future version. " + "Use isinstance(obj, pd.Interval) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return getattr(obj, "_typ", "_typ") == "interval" @@ -1176,6 +1184,17 @@ ------- bool """ + import warnings + + from pandas.util._exceptions import find_stack_level + + warnings.warn( + # GH#55264 + "is_period is deprecated and will be removed in a future version. 
" + "Use isinstance(obj, pd.Period) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return is_period_object(val) @@ -1415,14 +1434,12 @@ self.sint_ = ( self.sint_ or (oINT64_MIN <= val < 0) - # Cython equivalent of `isinstance(val, np.signedinteger)` - or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type) + or isinstance(val, cnp.signedinteger) ) self.uint_ = ( self.uint_ or (oINT64_MAX < val <= oUINT64_MAX) - # Cython equivalent of `isinstance(val, np.unsignedinteger)` - or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type) + or isinstance(val, cnp.unsignedinteger) ) @property @@ -1632,7 +1649,7 @@ if seen_val is False and skipna: return "empty" - if util.is_datetime64_object(val): + if cnp.is_datetime64_object(val): if is_datetime64_array(values, skipna=skipna): return "datetime64" @@ -1700,7 +1717,7 @@ if is_period_array(values, skipna=skipna): return "period" - elif is_interval(val): + elif isinstance(val, Interval): if is_interval_array(values): return "interval" @@ -1716,7 +1733,7 @@ cdef bint is_timedelta(object o): - return PyDelta_Check(o) or util.is_timedelta64_object(o) + return PyDelta_Check(o) or cnp.is_timedelta64_object(o) @cython.internal @@ -1724,10 +1741,10 @@ cdef: Py_ssize_t n - dtype dtype + cnp.dtype dtype bint skipna - def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_), + def __cinit__(self, Py_ssize_t n, cnp.dtype dtype=np.dtype(np.object_), bint skipna=False): self.n = n self.dtype = dtype @@ -1808,7 +1825,7 @@ return util.is_bool_object(value) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.bool_) + return cnp.PyDataType_ISBOOL(self.dtype) cpdef bint is_bool_array(ndarray values, bint skipna=False): @@ -1825,7 +1842,7 @@ return util.is_integer_object(value) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.integer) + return cnp.PyDataType_ISINTEGER(self.dtype) # Note: only python-exposed for tests @@ -1857,7 +1874,7 @@ return util.is_integer_object(value) or util.is_float_object(value) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.integer) + return cnp.PyDataType_ISINTEGER(self.dtype) cdef bint is_integer_float_array(ndarray values, bint skipna=True): @@ -1874,7 +1891,7 @@ return util.is_float_object(value) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.floating) + return cnp.PyDataType_ISFLOAT(self.dtype) # Note: only python-exposed for tests @@ -1893,7 +1910,7 @@ ) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.complexfloating) + return cnp.PyDataType_ISCOMPLEX(self.dtype) cdef bint is_complex_array(ndarray values): @@ -1922,7 +1939,7 @@ return isinstance(value, str) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.str_) + return self.dtype.type_num == cnp.NPY_UNICODE cpdef bint is_string_array(ndarray values, bint skipna=False): @@ -1939,7 +1956,7 @@ return isinstance(value, bytes) cdef bint is_array_typed(self) except -1: - return issubclass(self.dtype.type, np.bytes_) + return self.dtype.type_num == cnp.NPY_STRING cdef bint is_bytes_array(ndarray values, bint skipna=False): @@ -1954,7 +1971,7 @@ cdef: bint all_generic_na - def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_), + def __cinit__(self, Py_ssize_t n, cnp.dtype dtype=np.dtype(np.object_), bint skipna=False): self.n = n self.dtype = dtype @@ -2003,7 +2020,7 @@ @cython.internal cdef class Datetime64Validator(DatetimeValidator): cdef bint 
is_value_typed(self, object value) except -1: - return util.is_datetime64_object(value) + return cnp.is_datetime64_object(value) # Note: only python-exposed for tests @@ -2017,7 +2034,7 @@ @cython.internal cdef class AnyDatetimeValidator(DatetimeValidator): cdef bint is_value_typed(self, object value) except -1: - return util.is_datetime64_object(value) or ( + return cnp.is_datetime64_object(value) or ( PyDateTime_Check(value) and value.tzinfo is None ) @@ -2175,7 +2192,7 @@ for i in range(n): val = values[i] - if is_interval(val): + if isinstance(val, Interval): if closed is None: closed = val.closed numeric = ( @@ -2501,6 +2518,7 @@ ndarray[int64_t] ints ndarray[uint64_t] uints ndarray[uint8_t] bools + ndarray[uint8_t] mask Seen seen = Seen() object val _TSObject tsobj @@ -2600,7 +2618,7 @@ seen.complex_ = True if not convert_numeric: break - elif PyDateTime_Check(val) or util.is_datetime64_object(val): + elif PyDateTime_Check(val) or cnp.is_datetime64_object(val): # if we have an tz's attached then return the objects if convert_non_numeric: @@ -2613,6 +2631,7 @@ tsobj = convert_to_tsobject(val, None, None, 0, 0) tsobj.ensure_reso(NPY_FR_ns) except OutOfBoundsDatetime: + # e.g. test_out_of_s_bounds_datetime64 seen.object_ = True break else: @@ -2634,7 +2653,7 @@ except (ValueError, TypeError): seen.object_ = True break - elif is_interval(val): + elif isinstance(val, Interval): if convert_non_numeric: seen.interval_ = True break @@ -2737,8 +2756,11 @@ res[:] = NPY_NAT return res elif dtype is not None: - # EA, we don't expect to get here, but _could_ implement - raise NotImplementedError(dtype) + # i.e. PeriodDtype, DatetimeTZDtype + cls = dtype.construct_array_type() + obj = cls._from_sequence([], dtype=dtype) + taker = -np.ones((objects).shape, dtype=np.intp) + return obj.take(taker, allow_fill=True) else: # we don't guess seen.object_ = True @@ -2838,11 +2860,14 @@ NoDefault = Literal[_NoDefault.no_default] -@cython.boundscheck(False) -@cython.wraparound(False) -def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True, - object na_value=no_default, cnp.dtype dtype=np.dtype(object) - ) -> np.ndarray: +def map_infer_mask( + ndarray[object] arr, + object f, + const uint8_t[:] mask, + bint convert=True, + object na_value=no_default, + cnp.dtype dtype=np.dtype(object) +) -> np.ndarray: """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2853,10 +2878,10 @@ mask : ndarray uint8 dtype ndarray indicating values not to apply `f` to. convert : bool, default True - Whether to call `maybe_convert_objects` on the resulting ndarray + Whether to call `maybe_convert_objects` on the resulting ndarray. na_value : Any, optional The result value to use for masked values. By default, the - input value is used + input value is used. dtype : numpy.dtype The numpy dtype to use for the result ndarray. @@ -2864,13 +2889,39 @@ ------- np.ndarray """ + cdef Py_ssize_t n = len(arr) + result = np.empty(n, dtype=dtype) + + _map_infer_mask( + result, + arr, + f, + mask, + na_value, + ) + if convert: + return maybe_convert_objects(result) + else: + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def _map_infer_mask( + ndarray[uint8_int64_object_t] out, + ndarray[object] arr, + object f, + const uint8_t[:] mask, + object na_value=no_default, +) -> None: + """ + Helper for map_infer_mask, split off to use fused types based on the result. 
+ """ cdef: Py_ssize_t i, n - ndarray result object val n = len(arr) - result = np.empty(n, dtype=dtype) for i in range(n): if mask[i]: if na_value is no_default: @@ -2884,12 +2935,7 @@ # unbox 0-dim arrays, GH#690 val = val.item() - result[i] = val - - if convert: - return maybe_convert_objects(result) - else: - return result + out[i] = val @cython.boundscheck(False) @@ -3045,7 +3091,7 @@ @cython.wraparound(False) @cython.boundscheck(False) -def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray: +def fast_multiget(dict mapping, object[:] keys, default=np.nan) -> np.ndarray: cdef: Py_ssize_t i, n = len(keys) object val diff -Nru pandas-2.1.4+dfsg/pandas/_libs/meson.build pandas-2.2.2+dfsg/pandas/_libs/meson.build --- pandas-2.1.4+dfsg/pandas/_libs/meson.build 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/meson.build 2024-04-10 17:42:52.000000000 +0000 @@ -61,12 +61,12 @@ libs_sources = { # Dict of extension name -> dict of {sources, include_dirs, and deps} # numpy include dir is implicitly included - 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper, _khash_primitive_helper]}, + 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep}, 'arrays': {'sources': ['arrays.pyx']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, - 'hashtable': {'sources': ['hashtable.pyx', _khash_primitive_helper, _hashtable_class_helper, _hashtable_func_helper]}, - 'index': {'sources': ['index.pyx', _index_class_helper]}, + 'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep}, + 'index': {'sources': ['index.pyx', _index_class_helper], 'deps': _khash_primitive_helper_dep}, 'indexing': {'sources': ['indexing.pyx']}, 'internals': {'sources': ['internals.pyx']}, 'interval': {'sources': ['interval.pyx', _intervaltree_helper], @@ -101,12 +101,20 @@ 'writers': {'sources': ['writers.pyx']} } +cython_args = [ + '--include-dir', + meson.current_build_dir(), + '-X always_allow_keywords=true' +] +if get_option('buildtype') == 'debug' + cython_args += ['--gdb'] +endif foreach ext_name, ext_dict : libs_sources py.extension_module( ext_name, ext_dict.get('sources'), - cython_args: ['--include-dir', meson.current_build_dir(), '-X always_allow_keywords=true'], + cython_args: cython_args, include_directories: [inc_np, inc_pd], dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs', diff -Nru pandas-2.1.4+dfsg/pandas/_libs/missing.pyi pandas-2.2.2+dfsg/pandas/_libs/missing.pyi --- pandas-2.1.4+dfsg/pandas/_libs/missing.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/missing.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -14,4 +14,3 @@ def checknull(val: object, inf_as_na: bool = ...) -> bool: ... def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ... def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... -def is_float_nan(values: np.ndarray) -> npt.NDArray[np.bool_]: ... 
diff -Nru pandas-2.1.4+dfsg/pandas/_libs/missing.pyx pandas-2.2.2+dfsg/pandas/_libs/missing.pyx --- pandas-2.1.4+dfsg/pandas/_libs/missing.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/missing.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -32,8 +32,6 @@ ) from pandas._libs.tslibs.np_datetime cimport ( get_datetime64_unit, - get_datetime64_value, - get_timedelta64_value, import_pandas_datetime, ) @@ -120,18 +118,18 @@ and util.is_complex_object(right) and util.is_nan(right) ) - elif util.is_datetime64_object(left): + elif cnp.is_datetime64_object(left): return ( - get_datetime64_value(left) == NPY_NAT - and util.is_datetime64_object(right) - and get_datetime64_value(right) == NPY_NAT + cnp.get_datetime64_value(left) == NPY_NAT + and cnp.is_datetime64_object(right) + and cnp.get_datetime64_value(right) == NPY_NAT and get_datetime64_unit(left) == get_datetime64_unit(right) ) - elif util.is_timedelta64_object(left): + elif cnp.is_timedelta64_object(left): return ( - get_timedelta64_value(left) == NPY_NAT - and util.is_timedelta64_object(right) - and get_timedelta64_value(right) == NPY_NAT + cnp.get_timedelta64_value(left) == NPY_NAT + and cnp.is_timedelta64_object(right) + and cnp.get_timedelta64_value(right) == NPY_NAT and get_datetime64_unit(left) == get_datetime64_unit(right) ) elif is_decimal_na(left): @@ -169,10 +167,10 @@ elif inf_as_na: return val == INF or val == NEGINF return False - elif util.is_timedelta64_object(val): - return get_timedelta64_value(val) == NPY_NAT - elif util.is_datetime64_object(val): - return get_datetime64_value(val) == NPY_NAT + elif cnp.is_timedelta64_object(val): + return cnp.get_timedelta64_value(val) == NPY_NAT + elif cnp.is_datetime64_object(val): + return cnp.get_datetime64_value(val) == NPY_NAT else: return is_decimal_na(val) @@ -256,31 +254,6 @@ @cython.wraparound(False) -@cython.boundscheck(False) -def is_float_nan(values: ndarray) -> ndarray: - """ - True for elements which correspond to a float nan - - Returns - ------- - ndarray[bool] - """ - cdef: - ndarray[uint8_t] result - Py_ssize_t i, N - object val - - N = len(values) - result = np.zeros(N, dtype=np.uint8) - - for i in range(N): - val = values[i] - if util.is_nan(val): - result[i] = True - return result.view(bool) - - -@cython.wraparound(False) @cython.boundscheck(False) def is_numeric_na(values: ndarray) -> ndarray: """ diff -Nru pandas-2.1.4+dfsg/pandas/_libs/ops.pyi pandas-2.2.2+dfsg/pandas/_libs/ops.pyi --- pandas-2.1.4+dfsg/pandas/_libs/ops.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/ops.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -37,8 +37,8 @@ @overload def maybe_convert_bool( arr: npt.NDArray[np.object_], - true_values: Iterable = ..., - false_values: Iterable = ..., + true_values: Iterable | None = None, + false_values: Iterable | None = None, convert_to_masked_nullable: Literal[False] = ..., ) -> tuple[np.ndarray, None]: ... @overload diff -Nru pandas-2.1.4+dfsg/pandas/_libs/ops.pyx pandas-2.2.2+dfsg/pandas/_libs/ops.pyx --- pandas-2.1.4+dfsg/pandas/_libs/ops.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/ops.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -29,7 +29,7 @@ @cython.wraparound(False) @cython.boundscheck(False) -def scalar_compare(object[:] values, object val, object op) -> ndarray: +def scalar_compare(ndarray[object] values, object val, object op) -> ndarray: """ Compare each element of `values` array with the scalar `val`, with the comparison operation described by `op`. 
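The is_matching_na branches touched above keep the rule that two NaT values only match when their datetime64/timedelta64 resolutions agree; the patch merely switches to the cnp helpers for reading the value and unit. A small NumPy illustration of that rule (nat_matches is a hypothetical helper, not pandas API):

import numpy as np

def nat_matches(left, right):
    # Both values must be NaT *and* carry the same resolution.
    return (
        np.isnat(left)
        and np.isnat(right)
        and np.datetime_data(left.dtype) == np.datetime_data(right.dtype)
    )

print(nat_matches(np.datetime64("NaT", "ns"), np.datetime64("NaT", "ns")))  # True
print(nat_matches(np.datetime64("NaT", "ns"), np.datetime64("NaT", "s")))   # False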
diff -Nru pandas-2.1.4+dfsg/pandas/_libs/parsers.pyx pandas-2.2.2+dfsg/pandas/_libs/parsers.pyx --- pandas-2.1.4+dfsg/pandas/_libs/parsers.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/parsers.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -6,7 +6,6 @@ QUOTE_NONE, QUOTE_NONNUMERIC, ) -import sys import time import warnings @@ -35,6 +34,7 @@ PyUnicode_AsUTF8String, PyUnicode_Decode, PyUnicode_DecodeUTF8, + PyUnicode_FromString, ) from cython cimport Py_ssize_t from libc.stdlib cimport free @@ -45,11 +45,6 @@ ) -cdef extern from "Python.h": - # TODO(cython3): get this from cpython.unicode - object PyUnicode_FromString(char *v) - - import numpy as np cimport numpy as cnp @@ -157,9 +152,9 @@ WARN, SKIP - ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, + ctypedef char* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) - ctypedef int (*io_cleanup)(void *src) + ctypedef void (*io_cleanup)(void *src) ctypedef struct parser_t: void *source @@ -229,7 +224,7 @@ # pick one, depending on whether the converter requires GIL double (*double_converter)(const char *, char **, char, char, char, - int, int *, int *) nogil + int, int *, int *) noexcept nogil # error handling char *warn_msg @@ -252,9 +247,9 @@ cdef extern from "pandas/parser/pd_parser.h": void *new_rd_source(object obj) except NULL - int del_rd_source(void *src) + void del_rd_source(void *src) - void* buffer_rd_bytes(void *source, size_t nbytes, + char* buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) void uint_state_init(uint_state *self) @@ -271,7 +266,7 @@ void parser_del(parser_t *self) nogil int parser_add_skiprow(parser_t *self, int64_t row) - int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) + void parser_set_skipfirstnrows(parser_t *self, int64_t nrows) void parser_set_default_options(parser_t *self) @@ -323,13 +318,13 @@ return round_trip(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int) -cdef void* buffer_rd_bytes_wrapper(void *source, size_t nbytes, +cdef char* buffer_rd_bytes_wrapper(void *source, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) noexcept: return buffer_rd_bytes(source, nbytes, bytes_read, status, encoding_errors) -cdef int del_rd_source_wrapper(void *src) noexcept: - return del_rd_source(src) +cdef void del_rd_source_wrapper(void *src) noexcept: + del_rd_source(src) cdef class TextReader: @@ -880,9 +875,15 @@ cdef _check_tokenize_status(self, int status): if self.parser.warn_msg != NULL: - print(PyUnicode_DecodeUTF8( - self.parser.warn_msg, strlen(self.parser.warn_msg), - self.encoding_errors), file=sys.stderr) + warnings.warn( + PyUnicode_DecodeUTF8( + self.parser.warn_msg, + strlen(self.parser.warn_msg), + self.encoding_errors + ), + ParserWarning, + stacklevel=find_stack_level() + ) free(self.parser.warn_msg) self.parser.warn_msg = NULL @@ -988,7 +989,7 @@ missing_usecols = [col for col in self.usecols if col >= num_cols] if missing_usecols: raise ParserError( - "Defining usecols without of bounds indices is not allowed. " + "Defining usecols with out-of-bounds indices is not allowed. 
" f"{missing_usecols} are out of bounds.", ) @@ -1470,13 +1471,15 @@ elif arr.dtype == np.object_: if use_dtype_backend: - arr = StringDtype().construct_array_type()._from_sequence(arr) + dtype = StringDtype() + cls = dtype.construct_array_type() + arr = cls._from_sequence(arr, dtype=dtype) if use_dtype_backend and dtype_backend == "pyarrow": import pyarrow as pa if isinstance(arr, IntegerArray) and arr.isna().all(): # use null instead of int64 in pyarrow - arr = arr.to_numpy() + arr = arr.to_numpy(na_value=None) arr = ArrowExtensionArray(pa.array(arr, from_pandas=True)) return arr @@ -1600,7 +1603,7 @@ # -> ndarray[f'|S{width}'] cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, - int64_t line_end, int64_t width): + int64_t line_end, int64_t width) noexcept: cdef: char *data ndarray result @@ -1616,7 +1619,7 @@ cdef void _to_fw_string_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - size_t width, char *data) nogil: + size_t width, char *data) noexcept nogil: cdef: int64_t i coliter_t it @@ -1672,7 +1675,7 @@ cdef int _try_double_nogil(parser_t *parser, float64_t (*double_converter)( const char *, char **, char, - char, char, int, int *, int *) nogil, + char, char, int, int *, int *) noexcept nogil, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset, bint use_na_flist, diff -Nru pandas-2.1.4+dfsg/pandas/_libs/sas.pyx pandas-2.2.2+dfsg/pandas/_libs/sas.pyx --- pandas-2.1.4+dfsg/pandas/_libs/sas.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/sas.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -62,6 +62,7 @@ # algorithm. It is partially documented here: # # https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf +# Licence at LICENSES/SAS7BDAT_LICENSE cdef int rle_decompress(Buffer inbuff, Buffer outbuff) except? 0: cdef: diff -Nru pandas-2.1.4+dfsg/pandas/_libs/sparse.pyi pandas-2.2.2+dfsg/pandas/_libs/sparse.pyi --- pandas-2.1.4+dfsg/pandas/_libs/sparse.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/sparse.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -39,6 +39,10 @@ self, length: int, blocs: np.ndarray, blengths: np.ndarray ) -> None: ... + # Override to have correct parameters + def intersect(self, other: SparseIndex) -> Self: ... + def make_union(self, y: SparseIndex) -> Self: ... + def make_mask_object_ndarray( arr: npt.NDArray[np.object_], fill_value ) -> npt.NDArray[np.bool_]: ... diff -Nru pandas-2.1.4+dfsg/pandas/_libs/src/datetime/date_conversions.c pandas-2.2.2+dfsg/pandas/_libs/src/datetime/date_conversions.c --- pandas-2.1.4+dfsg/pandas/_libs/src/datetime/date_conversions.c 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/src/datetime/date_conversions.c 2024-04-10 17:42:52.000000000 +0000 @@ -20,84 +20,77 @@ * * Mutates the provided value directly. Returns 0 on success, non-zero on error. 
*/ -int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) { - switch (unit) { - case NPY_FR_ns: - break; - case NPY_FR_us: - *value /= 1000LL; - break; - case NPY_FR_ms: - *value /= 1000000LL; - break; - case NPY_FR_s: - *value /= 1000000000LL; - break; - default: - return -1; - } +int scaleNanosecToUnit(int64_t *value, NPY_DATETIMEUNIT unit) { + switch (unit) { + case NPY_FR_ns: + break; + case NPY_FR_us: + *value /= 1000LL; + break; + case NPY_FR_ms: + *value /= 1000000LL; + break; + case NPY_FR_s: + *value /= 1000000000LL; + break; + default: + return -1; + } - return 0; + return 0; } /* Converts the int64_t representation of a datetime to ISO; mutates len */ -char *int64ToIso(int64_t value, - NPY_DATETIMEUNIT valueUnit, - NPY_DATETIMEUNIT base, - size_t *len) { - npy_datetimestruct dts; - int ret_code; - - pandas_datetime_to_datetimestruct(value, valueUnit, &dts); - - *len = (size_t)get_datetime_iso_8601_strlen(0, base); - char *result = PyObject_Malloc(*len); - - if (result == NULL) { - PyErr_NoMemory(); - return NULL; - } - // datetime64 is always naive - ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base); - if (ret_code != 0) { - PyErr_SetString(PyExc_ValueError, - "Could not convert datetime value to string"); - PyObject_Free(result); - } - - // Note that get_datetime_iso_8601_strlen just gives a generic size - // for ISO string conversion, not the actual size used - *len = strlen(result); - return result; -} - -npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { - scaleNanosecToUnit(&dt, base); - return dt; +char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit, + NPY_DATETIMEUNIT base, size_t *len) { + npy_datetimestruct dts; + int ret_code; + + pandas_datetime_to_datetimestruct(value, valueUnit, &dts); + + *len = (size_t)get_datetime_iso_8601_strlen(0, base); + char *result = PyObject_Malloc(*len); + + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } + // datetime64 is always naive + ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base); + if (ret_code != 0) { + PyErr_SetString(PyExc_ValueError, + "Could not convert datetime value to string"); + PyObject_Free(result); + } + + // Note that get_datetime_iso_8601_strlen just gives a generic size + // for ISO string conversion, not the actual size used + *len = strlen(result); + return result; } /* Converts the int64_t representation of a duration to ISO; mutates len */ char *int64ToIsoDuration(int64_t value, size_t *len) { - pandas_timedeltastruct tds; - int ret_code; + pandas_timedeltastruct tds; + int ret_code; - pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); + pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); - // Max theoretical length of ISO Duration with 64 bit day - // as the largest unit is 70 characters + 1 for a null terminator - char *result = PyObject_Malloc(71); - if (result == NULL) { - PyErr_NoMemory(); - return NULL; - } - - ret_code = make_iso_8601_timedelta(&tds, result, len); - if (ret_code == -1) { - PyErr_SetString(PyExc_ValueError, - "Could not convert timedelta value to string"); - PyObject_Free(result); - return NULL; - } + // Max theoretical length of ISO Duration with 64 bit day + // as the largest unit is 70 characters + 1 for a null terminator + char *result = PyObject_Malloc(71); + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } + + ret_code = make_iso_8601_timedelta(&tds, result, len); + if (ret_code == -1) { + PyErr_SetString(PyExc_ValueError, + "Could not convert timedelta value to string"); + 
PyObject_Free(result); + return NULL; + } - return result; + return result; } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/src/datetime/pd_datetime.c pandas-2.2.2+dfsg/pandas/_libs/src/datetime/pd_datetime.c --- pandas-2.1.4+dfsg/pandas/_libs/src/datetime/pd_datetime.c 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/src/datetime/pd_datetime.c 2024-04-10 17:42:52.000000000 +0000 @@ -20,8 +20,11 @@ #include #include "datetime.h" +/* Need to import_array for np_datetime.c (for NumPy 1.x support only) */ +#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY +#include "numpy/ndarrayobject.h" #include "pandas/datetime/pd_datetime.h" - +#include "pandas/portable.h" static void pandas_datetime_destructor(PyObject *op) { void *ptr = PyCapsule_GetPointer(op, PandasDateTime_CAPSULE_NAME); @@ -42,77 +45,77 @@ * if obj doesn't have the needed date or datetime attributes. */ static int convert_pydatetime_to_datetimestruct(PyObject *dtobj, - npy_datetimestruct *out) { - // Assumes that obj is a valid datetime object - PyObject *tmp; - PyObject *obj = (PyObject*)dtobj; - - /* Initialize the output to all zeros */ - memset(out, 0, sizeof(npy_datetimestruct)); - out->month = 1; - out->day = 1; - - out->year = PyLong_AsLong(PyObject_GetAttrString(obj, "year")); - out->month = PyLong_AsLong(PyObject_GetAttrString(obj, "month")); - out->day = PyLong_AsLong(PyObject_GetAttrString(obj, "day")); - - // TODO(anyone): If we can get PyDateTime_IMPORT to work, we could use - // PyDateTime_Check here, and less verbose attribute lookups. - - /* Check for time attributes (if not there, return success as a date) */ - if (!PyObject_HasAttrString(obj, "hour") || - !PyObject_HasAttrString(obj, "minute") || - !PyObject_HasAttrString(obj, "second") || - !PyObject_HasAttrString(obj, "microsecond")) { - return 0; - } + npy_datetimestruct *out) { + // Assumes that obj is a valid datetime object + PyObject *tmp; + PyObject *obj = (PyObject *)dtobj; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(npy_datetimestruct)); + out->month = 1; + out->day = 1; + + out->year = PyLong_AsLong(PyObject_GetAttrString(obj, "year")); + out->month = PyLong_AsLong(PyObject_GetAttrString(obj, "month")); + out->day = PyLong_AsLong(PyObject_GetAttrString(obj, "day")); + + // TODO(anyone): If we can get PyDateTime_IMPORT to work, we could use + // PyDateTime_Check here, and less verbose attribute lookups. + + /* Check for time attributes (if not there, return success as a date) */ + if (!PyObject_HasAttrString(obj, "hour") || + !PyObject_HasAttrString(obj, "minute") || + !PyObject_HasAttrString(obj, "second") || + !PyObject_HasAttrString(obj, "microsecond")) { + return 0; + } - out->hour = PyLong_AsLong(PyObject_GetAttrString(obj, "hour")); - out->min = PyLong_AsLong(PyObject_GetAttrString(obj, "minute")); - out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second")); - out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond")); - - if (PyObject_HasAttrString(obj, "tzinfo")) { - PyObject *offset = extract_utc_offset(obj); - /* Apply the time zone offset if datetime obj is tz-aware */ - if (offset != NULL) { - if (offset == Py_None) { - Py_DECREF(offset); - return 0; - } - PyObject *tmp_int; - int seconds_offset, minutes_offset; - /* - * The timedelta should have a function "total_seconds" - * which contains the value we want. 
- */ - tmp = PyObject_CallMethod(offset, "total_seconds", ""); - Py_DECREF(offset); - if (tmp == NULL) { - return -1; - } - tmp_int = PyNumber_Long(tmp); - if (tmp_int == NULL) { - Py_DECREF(tmp); - return -1; - } - seconds_offset = PyLong_AsLong(tmp_int); - if (seconds_offset == -1 && PyErr_Occurred()) { - Py_DECREF(tmp_int); - Py_DECREF(tmp); - return -1; - } - Py_DECREF(tmp_int); - Py_DECREF(tmp); + out->hour = PyLong_AsLong(PyObject_GetAttrString(obj, "hour")); + out->min = PyLong_AsLong(PyObject_GetAttrString(obj, "minute")); + out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second")); + out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond")); + + if (PyObject_HasAttrString(obj, "tzinfo")) { + PyObject *offset = extract_utc_offset(obj); + /* Apply the time zone offset if datetime obj is tz-aware */ + if (offset != NULL) { + if (offset == Py_None) { + Py_DECREF(offset); + return 0; + } + PyObject *tmp_int; + int seconds_offset, minutes_offset; + /* + * The timedelta should have a function "total_seconds" + * which contains the value we want. + */ + tmp = PyObject_CallMethod(offset, "total_seconds", ""); + Py_DECREF(offset); + if (tmp == NULL) { + return -1; + } + tmp_int = PyNumber_Long(tmp); + if (tmp_int == NULL) { + Py_DECREF(tmp); + return -1; + } + seconds_offset = PyLong_AsLong(tmp_int); + if (seconds_offset == -1 && PyErr_Occurred()) { + Py_DECREF(tmp_int); + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp_int); + Py_DECREF(tmp); - /* Convert to a minutes offset and apply it */ - minutes_offset = seconds_offset / 60; + /* Convert to a minutes offset and apply it */ + minutes_offset = seconds_offset / 60; - add_minutes_to_datetimestruct(out, -minutes_offset); - } + add_minutes_to_datetimestruct(out, -minutes_offset); } + } - return 0; + return 0; } // Converts a Python object representing a Date / Datetime to ISO format @@ -120,69 +123,76 @@ // while base="ns" yields "2020-01-01T00:00:00.000000000Z" // len is mutated to save the length of the returned string static char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, - size_t *len) { - npy_datetimestruct dts; - int ret; - - ret = convert_pydatetime_to_datetimestruct(obj, &dts); - if (ret != 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "Could not convert PyDateTime to numpy datetime"); - } - return NULL; + size_t *len) { + npy_datetimestruct dts; + int ret; + + ret = convert_pydatetime_to_datetimestruct(obj, &dts); + if (ret != 0) { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "Could not convert PyDateTime to numpy datetime"); } + return NULL; + } - *len = (size_t)get_datetime_iso_8601_strlen(0, base); - char *result = PyObject_Malloc(*len); - // Check to see if PyDateTime has a timezone. - // Don't convert to UTC if it doesn't. - int is_tz_aware = 0; - if (PyObject_HasAttrString(obj, "tzinfo")) { - PyObject *offset = extract_utc_offset(obj); - if (offset == NULL) { - PyObject_Free(result); - return NULL; - } - is_tz_aware = offset != Py_None; - Py_DECREF(offset); + *len = (size_t)get_datetime_iso_8601_strlen(0, base); + char *result = PyObject_Malloc(*len); + // Check to see if PyDateTime has a timezone. + // Don't convert to UTC if it doesn't. 
+ int is_tz_aware = 0; + if (PyObject_HasAttrString(obj, "tzinfo")) { + PyObject *offset = extract_utc_offset(obj); + if (offset == NULL) { + PyObject_Free(result); + return NULL; } - ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base); + is_tz_aware = offset != Py_None; + Py_DECREF(offset); + } + ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base); - if (ret != 0) { - PyErr_SetString(PyExc_ValueError, - "Could not convert datetime value to string"); - PyObject_Free(result); - return NULL; - } + if (ret != 0) { + PyErr_SetString(PyExc_ValueError, + "Could not convert datetime value to string"); + PyObject_Free(result); + return NULL; + } - // Note that get_datetime_iso_8601_strlen just gives a generic size - // for ISO string conversion, not the actual size used - *len = strlen(result); - return result; + // Note that get_datetime_iso_8601_strlen just gives a generic size + // for ISO string conversion, not the actual size used + *len = strlen(result); + return result; } // Convert a Python Date/Datetime to Unix epoch with resolution base static npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) { - npy_datetimestruct dts; - int ret; + npy_datetimestruct dts; + int ret; - ret = convert_pydatetime_to_datetimestruct(dt, &dts); - if (ret != 0) { - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, - "Could not convert PyDateTime to numpy datetime"); - } - // TODO(username): is setting errMsg required? - // ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - // return NULL; + ret = convert_pydatetime_to_datetimestruct(dt, &dts); + if (ret != 0) { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, + "Could not convert PyDateTime to numpy datetime"); + + return -1; } + } + + int64_t npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); + if (scaleNanosecToUnit(&npy_dt, base) == -1) { + PyErr_Format(PyExc_ValueError, + "Call to scaleNanosecToUnit with value %" NPY_DATETIME_FMT + " and base %d failed", + npy_dt, base); - npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); - return NpyDateTimeToEpoch(npy_dt, base); + return -1; + } + return npy_dt; } -static int pandas_datetime_exec(PyObject *module) { +static int pandas_datetime_exec(PyObject *Py_UNUSED(module)) { PyDateTime_IMPORT; PandasDateTime_CAPI *capi = PyMem_Malloc(sizeof(PandasDateTime_CAPI)); if (capi == NULL) { @@ -192,7 +202,6 @@ capi->npy_datetimestruct_to_datetime = npy_datetimestruct_to_datetime; capi->scaleNanosecToUnit = scaleNanosecToUnit; capi->int64ToIso = int64ToIso; - capi->NpyDateTimeToEpoch = NpyDateTimeToEpoch; capi->PyDateTimeToIso = PyDateTimeToIso; capi->PyDateTimeToEpoch = PyDateTimeToEpoch; capi->int64ToIsoDuration = int64ToIsoDuration; @@ -249,5 +258,6 @@ PyMODINIT_FUNC PyInit_pandas_datetime(void) { PyDateTime_IMPORT; + import_array(); return PyModuleDef_Init(&pandas_datetimemodule); } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/src/parser/io.c pandas-2.2.2+dfsg/pandas/_libs/src/parser/io.c --- pandas-2.1.4+dfsg/pandas/_libs/src/parser/io.c 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/src/parser/io.c 2024-04-10 17:42:52.000000000 +0000 @@ -14,19 +14,19 @@ */ void *new_rd_source(PyObject *obj) { - rd_source *rds = (rd_source *)malloc(sizeof(rd_source)); + rd_source *rds = (rd_source *)malloc(sizeof(rd_source)); - if (rds == NULL) { - PyErr_NoMemory(); - return NULL; - } - /* hold on to this object */ - Py_INCREF(obj); - rds->obj = obj; - rds->buffer = NULL; - rds->position = 0; + if (rds == NULL) { + 
PyErr_NoMemory(); + return NULL; + } + /* hold on to this object */ + Py_INCREF(obj); + rds->obj = obj; + rds->buffer = NULL; + rds->position = 0; - return (void *)rds; + return (void *)rds; } /* @@ -35,12 +35,10 @@ */ -int del_rd_source(void *rds) { - Py_XDECREF(RDS(rds)->obj); - Py_XDECREF(RDS(rds)->buffer); - free(rds); - - return 0; +void del_rd_source(void *rds) { + Py_XDECREF(RDS(rds)->obj); + Py_XDECREF(RDS(rds)->buffer); + free(rds); } /* @@ -49,59 +47,53 @@ */ -void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, +char *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) { - PyGILState_STATE state; - PyObject *result, *func, *args, *tmp; + rd_source *src = RDS(source); + PyGILState_STATE state = PyGILState_Ensure(); - void *retval; + /* delete old object */ + Py_XDECREF(src->buffer); + src->buffer = NULL; + PyObject *args = Py_BuildValue("(i)", nbytes); + + PyObject *func = PyObject_GetAttrString(src->obj, "read"); + + /* Note: PyObject_CallObject requires the GIL */ + PyObject *result = PyObject_CallObject(func, args); + Py_XDECREF(args); + Py_XDECREF(func); - size_t length; - rd_source *src = RDS(source); - state = PyGILState_Ensure(); - - /* delete old object */ - Py_XDECREF(src->buffer); - src->buffer = NULL; - args = Py_BuildValue("(i)", nbytes); - - func = PyObject_GetAttrString(src->obj, "read"); - - /* Note: PyObject_CallObject requires the GIL */ - result = PyObject_CallObject(func, args); - Py_XDECREF(args); - Py_XDECREF(func); - - if (result == NULL) { - PyGILState_Release(state); - *bytes_read = 0; - *status = CALLING_READ_FAILED; - return NULL; - } else if (!PyBytes_Check(result)) { - tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors); - Py_DECREF(result); - if (tmp == NULL) { - PyGILState_Release(state); - return NULL; - } - result = tmp; + if (result == NULL) { + PyGILState_Release(state); + *bytes_read = 0; + *status = CALLING_READ_FAILED; + return NULL; + } else if (!PyBytes_Check(result)) { + PyObject *tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors); + Py_DECREF(result); + if (tmp == NULL) { + PyGILState_Release(state); + return NULL; } + result = tmp; + } - length = PySequence_Length(result); + const size_t length = PySequence_Length(result); - if (length == 0) - *status = REACHED_EOF; - else - *status = 0; - - /* hang on to the Python object */ - src->buffer = result; - retval = (void *)PyBytes_AsString(result); + if (length == 0) + *status = REACHED_EOF; + else + *status = 0; - PyGILState_Release(state); + /* hang on to the Python object */ + src->buffer = result; + char *retval = PyBytes_AsString(result); + + PyGILState_Release(state); - /* TODO: more error handling */ - *bytes_read = length; + /* TODO: more error handling */ + *bytes_read = length; - return retval; + return retval; } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/src/parser/pd_parser.c pandas-2.2.2+dfsg/pandas/_libs/src/parser/pd_parser.c --- pandas-2.1.4+dfsg/pandas/_libs/src/parser/pd_parser.c 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/src/parser/pd_parser.c 2024-04-10 17:42:52.000000000 +0000 @@ -10,9 +10,10 @@ #include "pandas/parser/pd_parser.h" #include "pandas/parser/io.h" +#include "pandas/portable.h" static int to_double(char *item, double *p_value, char sci, char decimal, - int *maybe_int) { + int *maybe_int) { char *p_end = NULL; int error = 0; @@ -24,7 +25,6 @@ } static int floatify(PyObject *str, double *result, int *maybe_int) { - int status; char 
*data; PyObject *tmp = NULL; const char sci = 'E'; @@ -43,7 +43,7 @@ return -1; } - status = to_double(data, result, sci, dec, maybe_int); + const int status = to_double(data, result, sci, dec, maybe_int); if (!status) { /* handle inf/-inf infinity/-infinity */ @@ -95,13 +95,12 @@ return -1; } - static void pandas_parser_destructor(PyObject *op) { void *ptr = PyCapsule_GetPointer(op, PandasParser_CAPSULE_NAME); PyMem_Free(ptr); } -static int pandas_parser_exec(PyObject *module) { +static int pandas_parser_exec(PyObject *Py_UNUSED(module)) { PandasParser_CAPI *capi = PyMem_Malloc(sizeof(PandasParser_CAPI)); if (capi == NULL) { PyErr_NoMemory(); diff -Nru pandas-2.1.4+dfsg/pandas/_libs/src/parser/tokenizer.c pandas-2.2.2+dfsg/pandas/_libs/src/parser/tokenizer.c --- pandas-2.1.4+dfsg/pandas/_libs/src/parser/tokenizer.c 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/src/parser/tokenizer.c 2024-04-10 17:42:52.000000000 +0000 @@ -16,29 +16,31 @@ GitHub. See Python Software Foundation License and BSD licenses for these. */ - #include "pandas/parser/tokenizer.h" +#include "pandas/portable.h" #include #include #include +#include #include "pandas/portable.h" +#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start) { - // column i, starting at 0 - self->words = parser->words; - self->col = i; - self->line_start = parser->line_start + start; + // column i, starting at 0 + self->words = parser->words; + self->col = i; + self->line_start = parser->line_start + start; } static void free_if_not_null(void **ptr) { - TRACE(("free_if_not_null %p\n", *ptr)) - if (*ptr != NULL) { - free(*ptr); - *ptr = NULL; - } + TRACE(("free_if_not_null %p\n", *ptr)) + if (*ptr != NULL) { + free(*ptr); + *ptr = NULL; + } } /* @@ -49,542 +51,507 @@ static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity, int64_t space, int64_t elsize, int *error) { - uint64_t cap = *capacity; - void *newbuffer = buffer; - - // Can we fit potentially nbytes tokens (+ null terminators) in the stream? - while ((length + space >= cap) && (newbuffer != NULL)) { - cap = cap ? cap << 1 : 2; - buffer = newbuffer; - newbuffer = realloc(newbuffer, elsize * cap); - } + uint64_t cap = *capacity; + void *newbuffer = buffer; - if (newbuffer == NULL) { - // realloc failed so don't change *capacity, set *error to errno - // and return the last good realloc'd buffer so it can be freed - *error = errno; - newbuffer = buffer; - } else { - // realloc worked, update *capacity and set *error to 0 - // sigh, multiple return values - *capacity = cap; - *error = 0; - } - return newbuffer; + // Can we fit potentially nbytes tokens (+ null terminators) in the stream? + while ((length + space >= cap) && (newbuffer != NULL)) { + cap = cap ? 
cap << 1 : 2; + buffer = newbuffer; + newbuffer = realloc(newbuffer, elsize * cap); + } + + if (newbuffer == NULL) { + // realloc failed so don't change *capacity, set *error to errno + // and return the last good realloc'd buffer so it can be freed + *error = errno; + newbuffer = buffer; + } else { + // realloc worked, update *capacity and set *error to 0 + // sigh, multiple return values + *capacity = cap; + *error = 0; + } + return newbuffer; } void parser_set_default_options(parser_t *self) { - self->decimal = '.'; - self->sci = 'E'; + self->decimal = '.'; + self->sci = 'E'; - // For tokenization - self->state = START_RECORD; + // For tokenization + self->state = START_RECORD; - self->delimiter = ','; // XXX - self->delim_whitespace = 0; + self->delimiter = ','; // XXX + self->delim_whitespace = 0; - self->doublequote = 0; - self->quotechar = '"'; - self->escapechar = 0; + self->doublequote = 0; + self->quotechar = '"'; + self->escapechar = 0; - self->lineterminator = '\0'; /* NUL->standard logic */ + self->lineterminator = '\0'; /* NUL->standard logic */ - self->skipinitialspace = 0; - self->quoting = QUOTE_MINIMAL; - self->allow_embedded_newline = 1; + self->skipinitialspace = 0; + self->quoting = QUOTE_MINIMAL; + self->allow_embedded_newline = 1; - self->expected_fields = -1; - self->on_bad_lines = ERROR; + self->expected_fields = -1; + self->on_bad_lines = ERROR; - self->commentchar = '#'; - self->thousands = '\0'; + self->commentchar = '#'; + self->thousands = '\0'; - self->skipset = NULL; - self->skipfunc = NULL; - self->skip_first_N_rows = -1; - self->skip_footer = 0; + self->skipset = NULL; + self->skipfunc = NULL; + self->skip_first_N_rows = -1; + self->skip_footer = 0; } parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); } -int parser_clear_data_buffers(parser_t *self) { - free_if_not_null((void *)&self->stream); - free_if_not_null((void *)&self->words); - free_if_not_null((void *)&self->word_starts); - free_if_not_null((void *)&self->line_start); - free_if_not_null((void *)&self->line_fields); - return 0; +static void parser_clear_data_buffers(parser_t *self) { + free_if_not_null((void *)&self->stream); + free_if_not_null((void *)&self->words); + free_if_not_null((void *)&self->word_starts); + free_if_not_null((void *)&self->line_start); + free_if_not_null((void *)&self->line_fields); } -int parser_cleanup(parser_t *self) { - int status = 0; +static void parser_cleanup(parser_t *self) { + // XXX where to put this + free_if_not_null((void *)&self->error_msg); + free_if_not_null((void *)&self->warn_msg); - // XXX where to put this - free_if_not_null((void *)&self->error_msg); - free_if_not_null((void *)&self->warn_msg); - - if (self->skipset != NULL) { - kh_destroy_int64((kh_int64_t *)self->skipset); - self->skipset = NULL; - } - - if (parser_clear_data_buffers(self) < 0) { - status = -1; - } - - if (self->cb_cleanup != NULL) { - if (self->cb_cleanup(self->source) < 0) { - status = -1; - } - self->cb_cleanup = NULL; - } + if (self->skipset != NULL) { + kh_destroy_int64((kh_int64_t *)self->skipset); + self->skipset = NULL; + } - return status; + parser_clear_data_buffers(self); + if (self->cb_cleanup != NULL) { + self->cb_cleanup(self->source); + self->cb_cleanup = NULL; + } } int parser_init(parser_t *self) { - int64_t sz; - - /* - Initialize data buffers - */ - - self->stream = NULL; - self->words = NULL; - self->word_starts = NULL; - self->line_start = NULL; - self->line_fields = NULL; - self->error_msg = NULL; - self->warn_msg = NULL; - - // token 
stream - self->stream = malloc(STREAM_INIT_SIZE * sizeof(char)); - if (self->stream == NULL) { - parser_cleanup(self); - return PARSER_OUT_OF_MEMORY; - } - self->stream_cap = STREAM_INIT_SIZE; - self->stream_len = 0; - - // word pointers and metadata - sz = STREAM_INIT_SIZE / 10; - sz = sz ? sz : 1; - self->words = malloc(sz * sizeof(char *)); - self->word_starts = malloc(sz * sizeof(int64_t)); - self->max_words_cap = sz; - self->words_cap = sz; - self->words_len = 0; - - // line pointers and metadata - self->line_start = malloc(sz * sizeof(int64_t)); - - self->line_fields = malloc(sz * sizeof(int64_t)); - - self->lines_cap = sz; - self->lines = 0; - self->file_lines = 0; - - if (self->stream == NULL || self->words == NULL || - self->word_starts == NULL || self->line_start == NULL || - self->line_fields == NULL) { - parser_cleanup(self); + /* + Initialize data buffers + */ + + self->stream = NULL; + self->words = NULL; + self->word_starts = NULL; + self->line_start = NULL; + self->line_fields = NULL; + self->error_msg = NULL; + self->warn_msg = NULL; + + // token stream + self->stream = malloc(STREAM_INIT_SIZE * sizeof(char)); + if (self->stream == NULL) { + parser_cleanup(self); + return PARSER_OUT_OF_MEMORY; + } + self->stream_cap = STREAM_INIT_SIZE; + self->stream_len = 0; + + // word pointers and metadata + _Static_assert(STREAM_INIT_SIZE / 10 > 0, + "STREAM_INIT_SIZE must be defined and >= 10"); + const int64_t sz = STREAM_INIT_SIZE / 10; + self->words = malloc(sz * sizeof(char *)); + self->word_starts = malloc(sz * sizeof(int64_t)); + self->max_words_cap = sz; + self->words_cap = sz; + self->words_len = 0; + + // line pointers and metadata + self->line_start = malloc(sz * sizeof(int64_t)); + + self->line_fields = malloc(sz * sizeof(int64_t)); + + self->lines_cap = sz; + self->lines = 0; + self->file_lines = 0; + + if (self->stream == NULL || self->words == NULL || + self->word_starts == NULL || self->line_start == NULL || + self->line_fields == NULL) { + parser_cleanup(self); - return PARSER_OUT_OF_MEMORY; - } + return PARSER_OUT_OF_MEMORY; + } - /* amount of bytes buffered */ - self->datalen = 0; - self->datapos = 0; + /* amount of bytes buffered */ + self->datalen = 0; + self->datapos = 0; - self->line_start[0] = 0; - self->line_fields[0] = 0; + self->line_start[0] = 0; + self->line_fields[0] = 0; - self->pword_start = self->stream; - self->word_start = 0; + self->pword_start = self->stream; + self->word_start = 0; - self->state = START_RECORD; + self->state = START_RECORD; - self->error_msg = NULL; - self->warn_msg = NULL; + self->error_msg = NULL; + self->warn_msg = NULL; - self->commentchar = '\0'; + self->commentchar = '\0'; - return 0; + return 0; } void parser_free(parser_t *self) { - // opposite of parser_init - parser_cleanup(self); + // opposite of parser_init + parser_cleanup(self); } -void parser_del(parser_t *self) { - free(self); -} +void parser_del(parser_t *self) { free(self); } static int make_stream_space(parser_t *self, size_t nbytes) { - uint64_t i, cap, length; - int status; - void *orig_ptr, *newptr; - - // Can we fit potentially nbytes tokens (+ null terminators) in the stream? - - /* - TOKEN STREAM - */ - - orig_ptr = (void *)self->stream; - TRACE( - ("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", + // Can we fit potentially nbytes tokens (+ null terminators) in the stream? + + /* + TOKEN STREAM + */ + + int status; + char *orig_ptr = (void *)self->stream; + TRACE(("\n\nmake_stream_space: nbytes = %zu. 
grow_buffer(self->stream...)\n", nbytes)) - self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len, - &self->stream_cap, nbytes * 2, - sizeof(char), &status); - TRACE( - ("make_stream_space: self->stream=%p, self->stream_len = %zu, " + self->stream = + (char *)grow_buffer((void *)self->stream, self->stream_len, + &self->stream_cap, nbytes * 2, sizeof(char), &status); + TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, " "self->stream_cap=%zu, status=%zu\n", self->stream, self->stream_len, self->stream_cap, status)) - if (status != 0) { - return PARSER_OUT_OF_MEMORY; - } - - // realloc sets errno when moving buffer? - if (self->stream != orig_ptr) { - self->pword_start = self->stream + self->word_start; - - for (i = 0; i < self->words_len; ++i) { - self->words[i] = self->stream + self->word_starts[i]; - } - } - - /* - WORD VECTORS - */ - - cap = self->words_cap; - - /** - * If we are reading in chunks, we need to be aware of the maximum number - * of words we have seen in previous chunks (self->max_words_cap), so - * that way, we can properly allocate when reading subsequent ones. - * - * Otherwise, we risk a buffer overflow if we mistakenly under-allocate - * just because a recent chunk did not have as many words. - */ - if (self->words_len + nbytes < self->max_words_cap) { - length = self->max_words_cap - nbytes - 1; - } else { - length = self->words_len; - } - - self->words = - (char **)grow_buffer((void *)self->words, length, - &self->words_cap, nbytes, - sizeof(char *), &status); - TRACE( - ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " + if (status != 0) { + return PARSER_OUT_OF_MEMORY; + } + + // realloc sets errno when moving buffer? + if (self->stream != orig_ptr) { + self->pword_start = self->stream + self->word_start; + + for (uint64_t i = 0; i < self->words_len; ++i) { + self->words[i] = self->stream + self->word_starts[i]; + } + } + + /* + WORD VECTORS + */ + + const uint64_t words_cap = self->words_cap; + + /** + * If we are reading in chunks, we need to be aware of the maximum number + * of words we have seen in previous chunks (self->max_words_cap), so + * that way, we can properly allocate when reading subsequent ones. + * + * Otherwise, we risk a buffer overflow if we mistakenly under-allocate + * just because a recent chunk did not have as many words. + */ + const uint64_t length = self->words_len + nbytes < self->max_words_cap + ? 
self->max_words_cap - nbytes - 1 + : self->words_len; + + self->words = + (char **)grow_buffer((void *)self->words, length, &self->words_cap, + nbytes, sizeof(char *), &status); + TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, " "%d)\n", self->words_len, self->words_cap, nbytes, status)) - if (status != 0) { - return PARSER_OUT_OF_MEMORY; - } - - // realloc took place - if (cap != self->words_cap) { - TRACE( - ("make_stream_space: cap != self->words_cap, nbytes = %d, " - "self->words_cap=%d\n", - nbytes, self->words_cap)) - newptr = realloc((void *)self->word_starts, - sizeof(int64_t) * self->words_cap); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->word_starts = (int64_t *)newptr; - } + if (status != 0) { + return PARSER_OUT_OF_MEMORY; + } + + // realloc took place + if (words_cap != self->words_cap) { + TRACE(("make_stream_space: cap != self->words_cap, nbytes = %d, " + "self->words_cap=%d\n", + nbytes, self->words_cap)) + int64_t *newptr = (int64_t *)realloc(self->word_starts, + sizeof(int64_t) * self->words_cap); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + self->word_starts = newptr; } + } - /* - LINE VECTORS - */ - cap = self->lines_cap; - self->line_start = - (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1, - &self->lines_cap, nbytes, - sizeof(int64_t), &status); - TRACE(( - "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", - self->lines + 1, self->lines_cap, nbytes, status)) - if (status != 0) { - return PARSER_OUT_OF_MEMORY; - } - - // realloc took place - if (cap != self->lines_cap) { - TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", - nbytes)) - newptr = realloc((void *)self->line_fields, - sizeof(int64_t) * self->lines_cap); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->line_fields = (int64_t *)newptr; - } + /* + LINE VECTORS + */ + const uint64_t lines_cap = self->lines_cap; + self->line_start = (int64_t *)grow_buffer((void *)self->line_start, + self->lines + 1, &self->lines_cap, + nbytes, sizeof(int64_t), &status); + TRACE( + ("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", + self->lines + 1, self->lines_cap, nbytes, status)) + if (status != 0) { + return PARSER_OUT_OF_MEMORY; + } + + // realloc took place + if (lines_cap != self->lines_cap) { + TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) + int64_t *newptr = (int64_t *)realloc(self->line_fields, + sizeof(int64_t) * self->lines_cap); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + self->line_fields = newptr; } + } - return 0; + return 0; } static int push_char(parser_t *self, char c) { - TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", - self->stream_len + 1, c, self->stream_cap)) - if (self->stream_len >= self->stream_cap) { - TRACE( - ("push_char: ERROR!!! self->stream_len(%d) >= " - "self->stream_cap(%d)\n", - self->stream_len, self->stream_cap)) - int64_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, - "Buffer overflow caught - possible malformed input file.\n"); - return PARSER_OUT_OF_MEMORY; - } - self->stream[self->stream_len++] = c; - return 0; -} - -int PANDAS_INLINE end_field(parser_t *self) { - // XXX cruft - if (self->words_len >= self->words_cap) { - TRACE( - ("end_field: ERROR!!! 
self->words_len(%zu) >= " - "self->words_cap(%zu)\n", - self->words_len, self->words_cap)) - int64_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, - "Buffer overflow caught - possible malformed input file.\n"); - return PARSER_OUT_OF_MEMORY; - } - - // null terminate token - push_char(self, '\0'); - - // set pointer and metadata - self->words[self->words_len] = self->pword_start; - - TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0])); - - TRACE(("end_field: Saw word %s at: %d. Total: %d\n", self->pword_start, - self->word_start, self->words_len + 1)) + TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", + self->stream_len + 1, c, self->stream_cap)) + if (self->stream_len >= self->stream_cap) { + TRACE(("push_char: ERROR!!! self->stream_len(%d) >= " + "self->stream_cap(%d)\n", + self->stream_len, self->stream_cap)) + const size_t bufsize = 100; + self->error_msg = malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Buffer overflow caught - possible malformed input file.\n"); + return PARSER_OUT_OF_MEMORY; + } + self->stream[self->stream_len++] = c; + return 0; +} + +static inline int end_field(parser_t *self) { + // XXX cruft + if (self->words_len >= self->words_cap) { + TRACE(("end_field: ERROR!!! self->words_len(%zu) >= " + "self->words_cap(%zu)\n", + self->words_len, self->words_cap)) + const size_t bufsize = 100; + self->error_msg = malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Buffer overflow caught - possible malformed input file.\n"); + return PARSER_OUT_OF_MEMORY; + } + + // null terminate token + push_char(self, '\0'); + + // set pointer and metadata + self->words[self->words_len] = self->pword_start; + + TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0])); + + TRACE(("end_field: Saw word %s at: %d. 
Total: %d\n", self->pword_start, + self->word_start, self->words_len + 1)) + + self->word_starts[self->words_len] = self->word_start; + self->words_len++; + + // increment line field count + self->line_fields[self->lines]++; + + // New field begin in stream + self->pword_start = self->stream + self->stream_len; + self->word_start = self->stream_len; - self->word_starts[self->words_len] = self->word_start; - self->words_len++; - - // increment line field count - self->line_fields[self->lines]++; - - // New field begin in stream - self->pword_start = self->stream + self->stream_len; - self->word_start = self->stream_len; - - return 0; + return 0; } static void append_warning(parser_t *self, const char *msg) { - int64_t ex_length; - int64_t length = strlen(msg); - void *newptr; - - if (self->warn_msg == NULL) { - self->warn_msg = malloc(length + 1); - snprintf(self->warn_msg, length + 1, "%s", msg); - } else { - ex_length = strlen(self->warn_msg); - newptr = realloc(self->warn_msg, ex_length + length + 1); - if (newptr != NULL) { - self->warn_msg = (char *)newptr; - snprintf(self->warn_msg + ex_length, length + 1, "%s", msg); - } + const int64_t length = strlen(msg); + + if (self->warn_msg == NULL) { + self->warn_msg = malloc(length + 1); + snprintf(self->warn_msg, length + 1, "%s", msg); + } else { + const int64_t ex_length = strlen(self->warn_msg); + char *newptr = (char *)realloc(self->warn_msg, ex_length + length + 1); + if (newptr != NULL) { + self->warn_msg = newptr; + snprintf(self->warn_msg + ex_length, length + 1, "%s", msg); } + } } static int end_line(parser_t *self) { - char *msg; - int64_t fields; - int64_t ex_fields = self->expected_fields; - int64_t bufsize = 100; // for error or warning messages - - fields = self->line_fields[self->lines]; - - TRACE(("end_line: Line end, nfields: %d\n", fields)); - - TRACE(("end_line: lines: %d\n", self->lines)); - if (self->lines > 0) { - if (self->expected_fields >= 0) { - ex_fields = self->expected_fields; - } else { - ex_fields = self->line_fields[self->lines - 1]; - } - } - TRACE(("end_line: ex_fields: %d\n", ex_fields)); + int64_t ex_fields = self->expected_fields; + int64_t fields = self->line_fields[self->lines]; - if (self->state == START_FIELD_IN_SKIP_LINE || - self->state == IN_FIELD_IN_SKIP_LINE || - self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || - self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) { - TRACE(("end_line: Skipping row %d\n", self->file_lines)); - // increment file line count - self->file_lines++; - - // skip the tokens from this bad line - self->line_start[self->lines] += fields; + TRACE(("end_line: Line end, nfields: %d\n", fields)); - // reset field count - self->line_fields[self->lines] = 0; - return 0; + TRACE(("end_line: lines: %d\n", self->lines)); + if (self->lines > 0) { + if (self->expected_fields >= 0) { + ex_fields = self->expected_fields; + } else { + ex_fields = self->line_fields[self->lines - 1]; } + } + TRACE(("end_line: ex_fields: %d\n", ex_fields)); - if (!(self->lines <= self->header_end + 1) && - (fields > ex_fields) && !(self->usecols)) { - // increment file line count - self->file_lines++; + if (self->state == START_FIELD_IN_SKIP_LINE || + self->state == IN_FIELD_IN_SKIP_LINE || + self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || + self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) { + TRACE(("end_line: Skipping row %d\n", self->file_lines)); + // increment file line count + self->file_lines++; - // skip the tokens from this bad line - self->line_start[self->lines] += fields; + // skip the tokens 
from this bad line + self->line_start[self->lines] += fields; - // reset field count - self->line_fields[self->lines] = 0; + // reset field count + self->line_fields[self->lines] = 0; + return 0; + } - // file_lines is now the actual file line number (starting at 1) - if (self->on_bad_lines == ERROR) { - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, - "Expected %" PRId64 " fields in line %" PRIu64 ", saw %" - PRId64 "\n", ex_fields, self->file_lines, fields); + if (!(self->lines <= self->header_end + 1) && (fields > ex_fields) && + !(self->usecols)) { + // increment file line count + self->file_lines++; + + // skip the tokens from this bad line + self->line_start[self->lines] += fields; + + // reset field count + self->line_fields[self->lines] = 0; + + // file_lines is now the actual file line number (starting at 1) + if (self->on_bad_lines == ERROR) { + const size_t bufsize = 100; + self->error_msg = malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Expected %" PRId64 " fields in line %" PRIu64 ", saw %" PRId64 + "\n", + ex_fields, self->file_lines, fields); - TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); + TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); - return -1; - } else { - // simply skip bad lines - if (self->on_bad_lines == WARN) { - // pass up error message - msg = malloc(bufsize); - snprintf(msg, bufsize, - "Skipping line %" PRIu64 ": expected %" PRId64 - " fields, saw %" PRId64 "\n", - self->file_lines, ex_fields, fields); - append_warning(self, msg); - free(msg); - } - } + return -1; } else { - // missing trailing delimiters - if ((self->lines >= self->header_end + 1) && - fields < ex_fields) { - // might overrun the buffer when closing fields - if (make_stream_space(self, ex_fields - fields) < 0) { - int64_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, "out of memory"); - return -1; - } - - while (fields < ex_fields) { - end_field(self); - fields++; - } - } - - // increment both line counts - self->file_lines++; - self->lines++; + // simply skip bad lines + if (self->on_bad_lines == WARN) { + // pass up error message + const size_t bufsize = 100; + char *msg = (char *)malloc(bufsize); + snprintf(msg, bufsize, + "Skipping line %" PRIu64 ": expected %" PRId64 + " fields, saw %" PRId64 "\n", + self->file_lines, ex_fields, fields); + append_warning(self, msg); + free(msg); + } + } + } else { + // missing trailing delimiters + if ((self->lines >= self->header_end + 1) && fields < ex_fields) { + // might overrun the buffer when closing fields + if (make_stream_space(self, ex_fields - fields) < 0) { + const size_t bufsize = 100; + self->error_msg = malloc(bufsize); + snprintf(self->error_msg, bufsize, "out of memory"); + return -1; + } - // good line, set new start point - if (self->lines >= self->lines_cap) { - TRACE(( - "end_line: ERROR!!! 
self->lines(%zu) >= self->lines_cap(%zu)\n", - self->lines, self->lines_cap)) - int64_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, - "Buffer overflow caught - " - "possible malformed input file.\n"); - return PARSER_OUT_OF_MEMORY; - } - self->line_start[self->lines] = - (self->line_start[self->lines - 1] + fields); + while (fields < ex_fields) { + end_field(self); + fields++; + } + } - TRACE( - ("end_line: new line start: %d\n", self->line_start[self->lines])); + // increment both line counts + self->file_lines++; + self->lines++; - // new line start with 0 fields - self->line_fields[self->lines] = 0; + // good line, set new start point + if (self->lines >= self->lines_cap) { + TRACE(("end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", + self->lines, self->lines_cap)) + const size_t bufsize = 100; + self->error_msg = malloc(bufsize); + snprintf(self->error_msg, bufsize, + "Buffer overflow caught - " + "possible malformed input file.\n"); + return PARSER_OUT_OF_MEMORY; } + self->line_start[self->lines] = + (self->line_start[self->lines - 1] + fields); - TRACE(("end_line: Finished line, at %d\n", self->lines)); + TRACE(("end_line: new line start: %d\n", self->line_start[self->lines])); - return 0; + // new line start with 0 fields + self->line_fields[self->lines] = 0; + } + + TRACE(("end_line: Finished line, at %d\n", self->lines)); + + return 0; } int parser_add_skiprow(parser_t *self, int64_t row) { - khiter_t k; - kh_int64_t *set; - int ret = 0; + khiter_t k; + kh_int64_t *set; + int ret = 0; - if (self->skipset == NULL) { - self->skipset = (void *)kh_init_int64(); - } + if (self->skipset == NULL) { + self->skipset = (void *)kh_init_int64(); + } - set = (kh_int64_t *)self->skipset; + set = (kh_int64_t *)self->skipset; - k = kh_put_int64(set, row, &ret); - set->keys[k] = row; + k = kh_put_int64(set, row, &ret); + set->keys[k] = row; - return 0; + return 0; } -int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { - // self->file_lines is zero based so subtract 1 from nrows - if (nrows > 0) { - self->skip_first_N_rows = nrows - 1; - } - - return 0; +void parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { + // self->file_lines is zero based so subtract 1 from nrows + if (nrows > 0) { + self->skip_first_N_rows = nrows - 1; + } } static int parser_buffer_bytes(parser_t *self, size_t nbytes, const char *encoding_errors) { - int status; - size_t bytes_read; - - status = 0; - self->datapos = 0; - self->data = self->cb_io(self->source, nbytes, &bytes_read, &status, - encoding_errors); - TRACE(( - "parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", - nbytes, bytes_read, status)); - self->datalen = bytes_read; + int status; + size_t bytes_read; - if (status != REACHED_EOF && self->data == NULL) { - int64_t bufsize = 200; - self->error_msg = malloc(bufsize); - - if (status == CALLING_READ_FAILED) { - snprintf(self->error_msg, bufsize, - "Calling read(nbytes) on source failed. 
" - "Try engine='python'."); - } else { - snprintf(self->error_msg, bufsize, "Unknown error in IO callback"); - } - return -1; + status = 0; + self->datapos = 0; + self->data = + self->cb_io(self->source, nbytes, &bytes_read, &status, encoding_errors); + TRACE( + ("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", + nbytes, bytes_read, status)); + self->datalen = bytes_read; + + if (status != REACHED_EOF && self->data == NULL) { + const size_t bufsize = 200; + self->error_msg = malloc(bufsize); + + if (status == CALLING_READ_FAILED) { + snprintf(self->error_msg, bufsize, + "Calling read(nbytes) on source failed. " + "Try engine='python'."); + } else { + snprintf(self->error_msg, bufsize, "Unknown error in IO callback"); } + return -1; + } - TRACE(("datalen: %d\n", self->datalen)); + TRACE(("datalen: %d\n", self->datalen)); - return status; + return status; } /* @@ -593,63 +560,61 @@ */ -#define PUSH_CHAR(c) \ - TRACE( \ - ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ - c, slen, self->stream_cap, self->stream_len)) \ - if (slen >= self->stream_cap) { \ - TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ - self->stream_cap)) \ - int64_t bufsize = 100; \ - self->error_msg = malloc(bufsize); \ - snprintf(self->error_msg, bufsize, \ - "Buffer overflow caught - possible malformed input file.\n");\ - return PARSER_OUT_OF_MEMORY; \ - } \ - *stream++ = c; \ - slen++; +#define PUSH_CHAR(c) \ + TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \ + c, slen, self->stream_cap, self->stream_len)) \ + if (slen >= self->stream_cap) { \ + TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ + self->stream_cap)) \ + const size_t bufsize = 100; \ + self->error_msg = malloc(bufsize); \ + snprintf(self->error_msg, bufsize, \ + "Buffer overflow caught - possible malformed input file.\n"); \ + return PARSER_OUT_OF_MEMORY; \ + } \ + *stream++ = c; \ + slen++; // This is a little bit of a hack but works for now -#define END_FIELD() \ - self->stream_len = slen; \ - if (end_field(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; - -#define END_LINE_STATE(STATE) \ - self->stream_len = slen; \ - if (end_line(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; \ - self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ - goto linelimit; \ - } - -#define END_LINE_AND_FIELD_STATE(STATE) \ - self->stream_len = slen; \ - if (end_line(self) < 0) { \ - goto parsingerror; \ - } \ - if (end_field(self) < 0) { \ - goto parsingerror; \ - } \ - stream = self->stream + self->stream_len; \ - slen = self->stream_len; \ - self->state = STATE; \ - if (line_limit > 0 && self->lines == start_lines + line_limit) { \ - goto linelimit; \ - } +#define END_FIELD() \ + self->stream_len = slen; \ + if (end_field(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; + +#define END_LINE_STATE(STATE) \ + self->stream_len = slen; \ + if (end_line(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; \ + self->state = STATE; \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + goto linelimit; \ + } + +#define END_LINE_AND_FIELD_STATE(STATE) \ + self->stream_len = slen; \ + if (end_line(self) < 0) { \ + goto parsingerror; \ + } \ + if 
(end_field(self) < 0) { \ + goto parsingerror; \ + } \ + stream = self->stream + self->stream_len; \ + slen = self->stream_len; \ + self->state = STATE; \ + if (line_limit > 0 && self->lines == start_lines + line_limit) { \ + goto linelimit; \ + } #define END_LINE() END_LINE_STATE(START_RECORD) -#define IS_TERMINATOR(c) \ - (c == lineterminator) +#define IS_TERMINATOR(c) (c == lineterminator) #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) @@ -660,678 +625,655 @@ #define IS_ESCAPE_CHAR(c) (c == escape_symbol) -#define IS_SKIPPABLE_SPACE(c) \ - ((!self->delim_whitespace && c == ' ' && self->skipinitialspace)) +#define IS_SKIPPABLE_SPACE(c) \ + ((!self->delim_whitespace && c == ' ' && self->skipinitialspace)) // applied when in a field -#define IS_DELIMITER(c) \ - ((!delim_whitespace && c == delimiter) || (delim_whitespace && isblank(c))) +#define IS_DELIMITER(c) \ + ((!delim_whitespace && c == delimiter) || (delim_whitespace && isblank(c))) -#define _TOKEN_CLEANUP() \ - self->stream_len = slen; \ - self->datapos = i; \ - TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \ - self->datalen)); - -#define CHECK_FOR_BOM() \ - if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \ - buf += 3; \ - self->datapos += 3; \ - } - -int skip_this_line(parser_t *self, int64_t rownum) { - int should_skip; - PyObject *result; - PyGILState_STATE state; - - if (self->skipfunc != NULL) { - state = PyGILState_Ensure(); - result = PyObject_CallFunction(self->skipfunc, "i", rownum); - - // Error occurred. It will be processed - // and caught at the Cython level. - if (result == NULL) { - should_skip = -1; - } else { - should_skip = PyObject_IsTrue(result); - } +#define _TOKEN_CLEANUP() \ + self->stream_len = slen; \ + self->datapos = i; \ + TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \ + self->datalen)); + +#define CHECK_FOR_BOM() \ + if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \ + buf += 3; \ + self->datapos += 3; \ + } + +static int skip_this_line(parser_t *self, int64_t rownum) { + if (self->skipfunc != NULL) { + PyGILState_STATE state = PyGILState_Ensure(); + PyObject *result = PyObject_CallFunction(self->skipfunc, "i", rownum); + + // Error occurred. It will be processed + // and caught at the Cython level. + const int should_skip = result == NULL ? -1 : PyObject_IsTrue(result); + + Py_XDECREF(result); + PyGILState_Release(state); + + return should_skip; + } else if (self->skipset != NULL) { + return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) != + ((kh_int64_t *)self->skipset)->n_buckets); + } else { + return (rownum <= self->skip_first_N_rows); + } +} + +static int tokenize_bytes(parser_t *self, size_t line_limit, + uint64_t start_lines) { + char *buf = self->data + self->datapos; + + const char lineterminator = + (self->lineterminator == '\0') ? '\n' : self->lineterminator; + + const int delim_whitespace = self->delim_whitespace; + const char delimiter = self->delimiter; + + // 1000 is something that couldn't fit in "char" + // thus comparing a char to it would always be "false" + const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000; + const int comment_symbol = + (self->commentchar != '\0') ? self->commentchar : 1000; + const int escape_symbol = + (self->escapechar != '\0') ? 
self->escapechar : 1000; + + if (make_stream_space(self, self->datalen - self->datapos) < 0) { + const size_t bufsize = 100; + self->error_msg = malloc(bufsize); + snprintf(self->error_msg, bufsize, "out of memory"); + return -1; + } - Py_XDECREF(result); - PyGILState_Release(state); + char *stream = self->stream + self->stream_len; + uint64_t slen = self->stream_len; - return should_skip; - } else if (self->skipset != NULL) { - return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) != - ((kh_int64_t *)self->skipset)->n_buckets); - } else { - return (rownum <= self->skip_first_N_rows); - } -} + TRACE(("%s\n", buf)); -int tokenize_bytes(parser_t *self, - size_t line_limit, uint64_t start_lines) { - int64_t i; - uint64_t slen; - int should_skip; - char c; - char *stream; - char *buf = self->data + self->datapos; - - const char lineterminator = (self->lineterminator == '\0') ? - '\n' : self->lineterminator; - - const int delim_whitespace = self->delim_whitespace; - const char delimiter = self->delimiter; - - // 1000 is something that couldn't fit in "char" - // thus comparing a char to it would always be "false" - const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000; - const int comment_symbol = (self->commentchar != '\0') ? - self->commentchar : 1000; - const int escape_symbol = (self->escapechar != '\0') ? - self->escapechar : 1000; + if (self->file_lines == 0) { + CHECK_FOR_BOM(); + } + + char c; + int64_t i; + for (i = self->datapos; i < self->datalen; ++i) { + // next character in file + c = *buf++; + + TRACE(("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, " + "state %d\n", + i, c, self->file_lines + 1, self->line_fields[self->lines], + self->state)); - if (make_stream_space(self, self->datalen - self->datapos) < 0) { - int64_t bufsize = 100; - self->error_msg = malloc(bufsize); - snprintf(self->error_msg, bufsize, "out of memory"); - return -1; - } + switch (self->state) { + case START_FIELD_IN_SKIP_LINE: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_QUOTE(c)) { + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else if (IS_DELIMITER(c)) { + // Do nothing, we're starting a new field again. 
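The tokenize_bytes hunk above keeps the trick its own comment describes: when a custom line terminator, comment character, or escape character is disabled, the marker is stored as the int 1000, which no char can ever equal, so the per-character comparison in the hot loop is always false without a separate "is this feature enabled" branch. A minimal standalone sketch of that idea, with hypothetical names (not pandas code):

    #include <stdio.h>

    /* Sketch: a disabled feature becomes a sentinel outside the char range,
     * so the comparison in the scanning loop is simply always false. */
    static int count_comments(const char *buf, char commentchar) {
      const int comment_symbol = (commentchar != '\0') ? commentchar : 1000;
      int n = 0;
      for (const char *p = buf; *p != '\0'; ++p) {
        if (*p == comment_symbol) /* never true when the feature is off */
          n++;
      }
      return n;
    }

    int main(void) {
      printf("%d\n", count_comments("a#b#c", '#'));  /* 2 */
      printf("%d\n", count_comments("a#b#c", '\0')); /* 0: disabled */
      return 0;
    }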
+ } else { + self->state = IN_FIELD_IN_SKIP_LINE; + } + break; + + case IN_FIELD_IN_SKIP_LINE: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; + } + break; + + case IN_QUOTED_FIELD_IN_SKIP_LINE: + if (IS_QUOTE(c)) { + if (self->doublequote) { + self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; + } else { + self->state = IN_FIELD_IN_SKIP_LINE; + } + } + break; - stream = self->stream + self->stream_len; - slen = self->stream_len; + case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: + if (IS_QUOTE(c)) { + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; + } else { + self->state = IN_FIELD_IN_SKIP_LINE; + } + break; - TRACE(("%s\n", buf)); + case WHITESPACE_LINE: + if (IS_TERMINATOR(c)) { + self->file_lines++; + self->state = START_RECORD; + break; + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + break; + } else if (!self->delim_whitespace) { + if (isblank(c) && c != self->delimiter) { + } else { // backtrack + // use i + 1 because buf has been incremented but not i + do { + --buf; + --i; + } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf)); + + // reached a newline rather than the beginning + if (IS_TERMINATOR(*buf)) { + ++buf; // move pointer to first char after newline + ++i; + } + self->state = START_FIELD; + } + break; + } + // fall through + + case EAT_WHITESPACE: + if (IS_TERMINATOR(c)) { + END_LINE(); + self->state = START_RECORD; + break; + } else if (IS_CARRIAGE(c)) { + self->state = EAT_CRNL; + break; + } else if (IS_COMMENT_CHAR(c)) { + self->state = EAT_COMMENT; + break; + } else if (!isblank(c)) { + self->state = START_FIELD; + PD_FALLTHROUGH; // fall through to subsequent state + } else { + // if whitespace char, keep slurping + break; + } + + case START_RECORD: { + // start of record + const int should_skip = skip_this_line(self, self->file_lines); + + if (should_skip == -1) { + goto parsingerror; + } else if (should_skip) { + if (IS_QUOTE(c)) { + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else { + self->state = IN_FIELD_IN_SKIP_LINE; - if (self->file_lines == 0) { - CHECK_FOR_BOM(); - } + if (IS_TERMINATOR(c)) { + END_LINE(); + } + } + break; + } else if (IS_TERMINATOR(c)) { + // \n\r possible? 
+ if (self->skip_empty_lines) { + self->file_lines++; + } else { + END_LINE(); + } + break; + } else if (IS_CARRIAGE(c)) { + if (self->skip_empty_lines) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else { + self->state = EAT_CRNL; + } + break; + } else if (IS_COMMENT_CHAR(c)) { + self->state = EAT_LINE_COMMENT; + break; + } else if (isblank(c)) { + if (self->delim_whitespace) { + if (self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + } else { + self->state = EAT_WHITESPACE; + } + break; + } else if (c != self->delimiter && self->skip_empty_lines) { + self->state = WHITESPACE_LINE; + break; + } + } + + // normal character - fall through + // to handle as START_FIELD + self->state = START_FIELD; + PD_FALLTHROUGH; + } + case START_FIELD: + // expecting field + if (IS_TERMINATOR(c)) { + END_FIELD(); + END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else if (IS_QUOTE(c)) { + // start quoted field + self->state = IN_QUOTED_FIELD; + } else if (IS_ESCAPE_CHAR(c)) { + // possible escaped character + self->state = ESCAPED_CHAR; + } else if (IS_SKIPPABLE_SPACE(c)) { + // ignore space at start of field + } else if (IS_DELIMITER(c)) { + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + // save empty field + END_FIELD(); + } + } else if (IS_COMMENT_CHAR(c)) { + END_FIELD(); + self->state = EAT_COMMENT; + } else { + // begin new unquoted field + PUSH_CHAR(c); + self->state = IN_FIELD; + } + break; + + case ESCAPED_CHAR: + PUSH_CHAR(c); + self->state = IN_FIELD; + break; + + case EAT_LINE_COMMENT: + if (IS_TERMINATOR(c)) { + self->file_lines++; + self->state = START_RECORD; + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } + break; + + case IN_FIELD: + // in unquoted field + if (IS_TERMINATOR(c)) { + END_FIELD(); + END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else if (IS_ESCAPE_CHAR(c)) { + // possible escaped character + self->state = ESCAPED_CHAR; + } else if (IS_DELIMITER(c)) { + // end of field - end of line not reached yet + END_FIELD(); + + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + self->state = START_FIELD; + } + } else if (IS_COMMENT_CHAR(c)) { + END_FIELD(); + self->state = EAT_COMMENT; + } else { + // normal character - save in field + PUSH_CHAR(c); + } + break; + + case IN_QUOTED_FIELD: + // in quoted field + if (IS_ESCAPE_CHAR(c)) { + // possible escape character + self->state = ESCAPE_IN_QUOTED_FIELD; + } else if (IS_QUOTE(c)) { + if (self->doublequote) { + // double quote - " represented by "" + self->state = QUOTE_IN_QUOTED_FIELD; + } else { + // end of quote part of field + self->state = IN_FIELD; + } + } else { + // normal character - save in field + PUSH_CHAR(c); + } + break; + + case ESCAPE_IN_QUOTED_FIELD: + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + break; + + case QUOTE_IN_QUOTED_FIELD: + // double quote - seen a quote in an quoted field + if (IS_QUOTE(c)) { + // save "" as " + + PUSH_CHAR(c); + self->state = IN_QUOTED_FIELD; + } else if (IS_DELIMITER(c)) { + // end of field - end of line not reached yet + END_FIELD(); - for (i = self->datapos; i < self->datalen; ++i) { - // next character in file - c = *buf++; - - TRACE( - ("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, " - "state %d\n", - i, c, self->file_lines + 1, self->line_fields[self->lines], - self->state)); - - switch (self->state) { - case START_FIELD_IN_SKIP_LINE: - if (IS_TERMINATOR(c)) { - END_LINE(); - 
} else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else if (IS_DELIMITER(c)) { - // Do nothing, we're starting a new field again. - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - break; - - case IN_FIELD_IN_SKIP_LINE: - if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_DELIMITER(c)) { - self->state = START_FIELD_IN_SKIP_LINE; - } - break; - - case IN_QUOTED_FIELD_IN_SKIP_LINE: - if (IS_QUOTE(c)) { - if (self->doublequote) { - self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - } - break; - - case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: - if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else if (IS_TERMINATOR(c)) { - END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else if (IS_DELIMITER(c)) { - self->state = START_FIELD_IN_SKIP_LINE; - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - } - break; - - case WHITESPACE_LINE: - if (IS_TERMINATOR(c)) { - self->file_lines++; - self->state = START_RECORD; - break; - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - break; - } else if (!self->delim_whitespace) { - if (isblank(c) && c != self->delimiter) { - } else { // backtrack - // use i + 1 because buf has been incremented but not i - do { - --buf; - --i; - } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf)); - - // reached a newline rather than the beginning - if (IS_TERMINATOR(*buf)) { - ++buf; // move pointer to first char after newline - ++i; - } - self->state = START_FIELD; - } - break; - } - // fall through - - case EAT_WHITESPACE: - if (IS_TERMINATOR(c)) { - END_LINE(); - self->state = START_RECORD; - break; - } else if (IS_CARRIAGE(c)) { - self->state = EAT_CRNL; - break; - } else if (IS_COMMENT_CHAR(c)) { - self->state = EAT_COMMENT; - break; - } else if (!isblank(c)) { - self->state = START_FIELD; - // fall through to subsequent state - } else { - // if whitespace char, keep slurping - break; - } - - case START_RECORD: - // start of record - should_skip = skip_this_line(self, self->file_lines); - - if (should_skip == -1) { - goto parsingerror; - } else if (should_skip) { - if (IS_QUOTE(c)) { - self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; - } else { - self->state = IN_FIELD_IN_SKIP_LINE; - - if (IS_TERMINATOR(c)) { - END_LINE(); - } - } - break; - } else if (IS_TERMINATOR(c)) { - // \n\r possible? 
- if (self->skip_empty_lines) { - self->file_lines++; - } else { - END_LINE(); - } - break; - } else if (IS_CARRIAGE(c)) { - if (self->skip_empty_lines) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } else { - self->state = EAT_CRNL; - } - break; - } else if (IS_COMMENT_CHAR(c)) { - self->state = EAT_LINE_COMMENT; - break; - } else if (isblank(c)) { - if (self->delim_whitespace) { - if (self->skip_empty_lines) { - self->state = WHITESPACE_LINE; - } else { - self->state = EAT_WHITESPACE; - } - break; - } else if (c != self->delimiter && self->skip_empty_lines) { - self->state = WHITESPACE_LINE; - break; - } - // fall through - } - - // normal character - fall through - // to handle as START_FIELD - self->state = START_FIELD; - - case START_FIELD: - // expecting field - if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else if (IS_QUOTE(c)) { - // start quoted field - self->state = IN_QUOTED_FIELD; - } else if (IS_ESCAPE_CHAR(c)) { - // possible escaped character - self->state = ESCAPED_CHAR; - } else if (IS_SKIPPABLE_SPACE(c)) { - // ignore space at start of field - } else if (IS_DELIMITER(c)) { - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - // save empty field - END_FIELD(); - } - } else if (IS_COMMENT_CHAR(c)) { - END_FIELD(); - self->state = EAT_COMMENT; - } else { - // begin new unquoted field - PUSH_CHAR(c); - self->state = IN_FIELD; - } - break; - - case ESCAPED_CHAR: - PUSH_CHAR(c); - self->state = IN_FIELD; - break; - - case EAT_LINE_COMMENT: - if (IS_TERMINATOR(c)) { - self->file_lines++; - self->state = START_RECORD; - } else if (IS_CARRIAGE(c)) { - self->file_lines++; - self->state = EAT_CRNL_NOP; - } - break; - - case IN_FIELD: - // in unquoted field - if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else if (IS_ESCAPE_CHAR(c)) { - // possible escaped character - self->state = ESCAPED_CHAR; - } else if (IS_DELIMITER(c)) { - // end of field - end of line not reached yet - END_FIELD(); - - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - self->state = START_FIELD; - } - } else if (IS_COMMENT_CHAR(c)) { - END_FIELD(); - self->state = EAT_COMMENT; - } else { - // normal character - save in field - PUSH_CHAR(c); - } - break; - - case IN_QUOTED_FIELD: - // in quoted field - if (IS_ESCAPE_CHAR(c)) { - // possible escape character - self->state = ESCAPE_IN_QUOTED_FIELD; - } else if (IS_QUOTE(c)) { - if (self->doublequote) { - // double quote - " represented by "" - self->state = QUOTE_IN_QUOTED_FIELD; - } else { - // end of quote part of field - self->state = IN_FIELD; - } - } else { - // normal character - save in field - PUSH_CHAR(c); - } - break; - - case ESCAPE_IN_QUOTED_FIELD: - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - break; - - case QUOTE_IN_QUOTED_FIELD: - // double quote - seen a quote in an quoted field - if (IS_QUOTE(c)) { - // save "" as " - - PUSH_CHAR(c); - self->state = IN_QUOTED_FIELD; - } else if (IS_DELIMITER(c)) { - // end of field - end of line not reached yet - END_FIELD(); - - if (self->delim_whitespace) { - self->state = EAT_WHITESPACE; - } else { - self->state = START_FIELD; - } - } else if (IS_TERMINATOR(c)) { - END_FIELD(); - END_LINE(); - } else if (IS_CARRIAGE(c)) { - END_FIELD(); - self->state = EAT_CRNL; - } else { - PUSH_CHAR(c); - self->state = IN_FIELD; - } - break; - - case EAT_COMMENT: - if (IS_TERMINATOR(c)) { - 
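The QUOTE_IN_QUOTED_FIELD transitions implement the usual CSV convention that a doubled quote inside a quoted field stands for one literal quote (the diff's own comment: save "" as "). A small self-contained sketch of just that rule, with hypothetical helper names and no buffer management:

    #include <stdio.h>
    #include <string.h>

    enum qstate { UNQUOTED, IN_QUOTES, QUOTE_SEEN };

    /* Copy one quoted CSV field from src into dst, turning "" into ".
     * Returns the number of bytes written (sketch only; no bounds checks). */
    static size_t read_quoted_field(const char *src, char *dst) {
      enum qstate st = UNQUOTED;
      size_t n = 0;
      for (const char *p = src; *p != '\0'; ++p) {
        switch (st) {
        case UNQUOTED:
          if (*p == '"')
            st = IN_QUOTES;
          break;
        case IN_QUOTES:
          if (*p == '"')
            st = QUOTE_SEEN; /* closing quote, or first of a "" pair */
          else
            dst[n++] = *p;
          break;
        case QUOTE_SEEN:
          if (*p == '"') {
            dst[n++] = '"'; /* "" -> literal quote */
            st = IN_QUOTES;
          } else {
            dst[n] = '\0';
            return n; /* the field ended at the closing quote */
          }
          break;
        }
      }
      dst[n] = '\0';
      return n;
    }

    int main(void) {
      char out[64];
      read_quoted_field("\"say \"\"hi\"\"\",next", out);
      printf("%s\n", out); /* say "hi" */
      return 0;
    }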
END_LINE(); - } else if (IS_CARRIAGE(c)) { - self->state = EAT_CRNL; - } - break; - - // only occurs with non-custom line terminator, - // which is why we directly check for '\n' - case EAT_CRNL: - if (c == '\n') { - END_LINE(); - } else if (IS_DELIMITER(c)) { - if (self->delim_whitespace) { - END_LINE_STATE(EAT_WHITESPACE); - } else { - // Handle \r-delimited files - END_LINE_AND_FIELD_STATE(START_FIELD); - } - } else { - if (self->delim_whitespace) { - /* XXX - * first character of a new record--need to back up and - * reread - * to handle properly... - */ - i--; - buf--; // back up one character (HACK!) - END_LINE_STATE(START_RECORD); - } else { - // \r line terminator - // UGH. we don't actually want - // to consume the token. fix this later - self->stream_len = slen; - if (end_line(self) < 0) { - goto parsingerror; - } - - stream = self->stream + self->stream_len; - slen = self->stream_len; - self->state = START_RECORD; - - --i; - buf--; // let's try this character again (HACK!) - if (line_limit > 0 && - self->lines == start_lines + line_limit) { - goto linelimit; - } - } - } - break; - - // only occurs with non-custom line terminator, - // which is why we directly check for '\n' - case EAT_CRNL_NOP: // inside an ignored comment line - self->state = START_RECORD; - // \r line terminator -- parse this character again - if (c != '\n' && !IS_DELIMITER(c)) { - --i; - --buf; - } - break; - default: - break; + if (self->delim_whitespace) { + self->state = EAT_WHITESPACE; + } else { + self->state = START_FIELD; } + } else if (IS_TERMINATOR(c)) { + END_FIELD(); + END_LINE(); + } else if (IS_CARRIAGE(c)) { + END_FIELD(); + self->state = EAT_CRNL; + } else { + PUSH_CHAR(c); + self->state = IN_FIELD; + } + break; + + case EAT_COMMENT: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->state = EAT_CRNL; + } + break; + + // only occurs with non-custom line terminator, + // which is why we directly check for '\n' + case EAT_CRNL: + if (c == '\n') { + END_LINE(); + } else if (IS_DELIMITER(c)) { + if (self->delim_whitespace) { + END_LINE_STATE(EAT_WHITESPACE); + } else { + // Handle \r-delimited files + END_LINE_AND_FIELD_STATE(START_FIELD); + } + } else { + if (self->delim_whitespace) { + /* XXX + * first character of a new record--need to back up and + * reread + * to handle properly... + */ + i--; + buf--; // back up one character (HACK!) + END_LINE_STATE(START_RECORD); + } else { + // \r line terminator + // UGH. we don't actually want + // to consume the token. fix this later + self->stream_len = slen; + if (end_line(self) < 0) { + goto parsingerror; + } + + stream = self->stream + self->stream_len; + slen = self->stream_len; + self->state = START_RECORD; + + --i; + buf--; // let's try this character again (HACK!) 
+ if (line_limit > 0 && self->lines == start_lines + line_limit) { + goto linelimit; + } + } + } + break; + + // only occurs with non-custom line terminator, + // which is why we directly check for '\n' + case EAT_CRNL_NOP: // inside an ignored comment line + self->state = START_RECORD; + // \r line terminator -- parse this character again + if (c != '\n' && !IS_DELIMITER(c)) { + --i; + --buf; + } + break; + default: + break; } + } - _TOKEN_CLEANUP(); + _TOKEN_CLEANUP(); - TRACE(("Finished tokenizing input\n")) + TRACE(("Finished tokenizing input\n")) - return 0; + return 0; parsingerror: - i++; - _TOKEN_CLEANUP(); + i++; + _TOKEN_CLEANUP(); - return -1; + return -1; linelimit: - i++; - _TOKEN_CLEANUP(); + i++; + _TOKEN_CLEANUP(); - return 0; + return 0; } static int parser_handle_eof(parser_t *self) { - int64_t bufsize = 100; + const size_t bufsize = 100; - TRACE( - ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) + TRACE(("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state)) - if (self->datalen != 0) return -1; + if (self->datalen != 0) + return -1; - switch (self->state) { - case START_RECORD: - case WHITESPACE_LINE: - case EAT_CRNL_NOP: - case EAT_LINE_COMMENT: - return 0; - - case ESCAPE_IN_QUOTED_FIELD: - case IN_QUOTED_FIELD: - self->error_msg = (char *)malloc(bufsize); - snprintf(self->error_msg, bufsize, - "EOF inside string starting at row %" PRIu64, - self->file_lines); - return -1; - - case ESCAPED_CHAR: - self->error_msg = (char *)malloc(bufsize); - snprintf(self->error_msg, bufsize, - "EOF following escape character"); - return -1; - - case IN_FIELD: - case START_FIELD: - case QUOTE_IN_QUOTED_FIELD: - if (end_field(self) < 0) return -1; - break; + switch (self->state) { + case START_RECORD: + case WHITESPACE_LINE: + case EAT_CRNL_NOP: + case EAT_LINE_COMMENT: + return 0; - default: - break; - } + case ESCAPE_IN_QUOTED_FIELD: + case IN_QUOTED_FIELD: + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, + "EOF inside string starting at row %" PRIu64, self->file_lines); + return -1; - if (end_line(self) < 0) - return -1; - else - return 0; + case ESCAPED_CHAR: + self->error_msg = (char *)malloc(bufsize); + snprintf(self->error_msg, bufsize, "EOF following escape character"); + return -1; + + case IN_FIELD: + case START_FIELD: + case QUOTE_IN_QUOTED_FIELD: + if (end_field(self) < 0) + return -1; + break; + + default: + break; + } + + if (end_line(self) < 0) + return -1; + else + return 0; } int parser_consume_rows(parser_t *self, size_t nrows) { - int64_t offset, word_deletions; - uint64_t char_count, i; + if (nrows > self->lines) { + nrows = self->lines; + } - if (nrows > self->lines) { - nrows = self->lines; - } + /* do nothing */ + if (nrows == 0) + return 0; - /* do nothing */ - if (nrows == 0) return 0; + /* cannot guarantee that nrows + 1 has been observed */ + const int64_t word_deletions = + self->line_start[nrows - 1] + self->line_fields[nrows - 1]; + + /* if word_deletions == 0 (i.e. this case) then char_count must + * be 0 too, as no data needs to be skipped */ + const uint64_t char_count = + word_deletions >= 1 ? 
(self->word_starts[word_deletions - 1] + + strlen(self->words[word_deletions - 1]) + 1) + : 0; + + TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, + char_count)); + + /* move stream, only if something to move */ + if (char_count < self->stream_len) { + memmove(self->stream, (self->stream + char_count), + self->stream_len - char_count); + } + /* buffer counts */ + self->stream_len -= char_count; + + /* move token metadata */ + // Note: We should always have words_len < word_deletions, so this + // subtraction will remain appropriately-typed. + int64_t offset; + for (uint64_t i = 0; i < self->words_len - word_deletions; ++i) { + offset = i + word_deletions; + + self->words[i] = self->words[offset] - char_count; + self->word_starts[i] = self->word_starts[offset] - char_count; + } + self->words_len -= word_deletions; + + /* move current word pointer to stream */ + self->pword_start -= char_count; + self->word_start -= char_count; + + /* move line metadata */ + // Note: We should always have self->lines - nrows + 1 >= 0, so this + // subtraction will remain appropriately-typed. + for (uint64_t i = 0; i < self->lines - nrows + 1; ++i) { + offset = i + nrows; + self->line_start[i] = self->line_start[offset] - word_deletions; + self->line_fields[i] = self->line_fields[offset]; + } + self->lines -= nrows; - /* cannot guarantee that nrows + 1 has been observed */ - word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1]; - if (word_deletions >= 1) { - char_count = (self->word_starts[word_deletions - 1] + - strlen(self->words[word_deletions - 1]) + 1); - } else { - /* if word_deletions == 0 (i.e. this case) then char_count must - * be 0 too, as no data needs to be skipped */ - char_count = 0; - } - - TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions, - char_count)); - - /* move stream, only if something to move */ - if (char_count < self->stream_len) { - memmove(self->stream, (self->stream + char_count), - self->stream_len - char_count); - } - /* buffer counts */ - self->stream_len -= char_count; - - /* move token metadata */ - // Note: We should always have words_len < word_deletions, so this - // subtraction will remain appropriately-typed. - for (i = 0; i < self->words_len - word_deletions; ++i) { - offset = i + word_deletions; - - self->words[i] = self->words[offset] - char_count; - self->word_starts[i] = self->word_starts[offset] - char_count; - } - self->words_len -= word_deletions; - - /* move current word pointer to stream */ - self->pword_start -= char_count; - self->word_start -= char_count; - - /* move line metadata */ - // Note: We should always have self->lines - nrows + 1 >= 0, so this - // subtraction will remain appropriately-typed. - for (i = 0; i < self->lines - nrows + 1; ++i) { - offset = i + nrows; - self->line_start[i] = self->line_start[offset] - word_deletions; - self->line_fields[i] = self->line_fields[offset]; - } - self->lines -= nrows; - - return 0; + return 0; } static size_t _next_pow2(size_t sz) { - size_t result = 1; - while (result < sz) result *= 2; - return result; + size_t result = 1; + while (result < sz) + result *= 2; + return result; } int parser_trim_buffers(parser_t *self) { - /* - Free memory - */ - size_t new_cap; - void *newptr; - - uint64_t i; - - /** - * Before we free up space and trim, we should - * save how many words we saw when parsing, if - * it exceeds the maximum number we saw before. 
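parser_trim_buffers shrinks each buffer to _next_pow2(used) + 1, i.e. one slot more than the smallest power of two that can hold the used length. A standalone copy of that helper, for reference:

    #include <stdio.h>

    /* Smallest power of two >= sz (same loop as the diff's _next_pow2). */
    static size_t next_pow2(size_t sz) {
      size_t result = 1;
      while (result < sz)
        result *= 2;
      return result;
    }

    int main(void) {
      printf("%zu %zu %zu\n", next_pow2(0), next_pow2(5), next_pow2(64));
      /* prints: 1 8 64 */
      return 0;
    }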
- * - * This is important for when we read in chunks, - * so that we can inform subsequent chunk parsing - * as to how many words we could possibly see. - */ - if (self->words_cap > self->max_words_cap) { - self->max_words_cap = self->words_cap; - } - - /* trim words, word_starts */ - new_cap = _next_pow2(self->words_len) + 1; - if (new_cap < self->words_cap) { - TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); - self->words = realloc(self->words, new_cap * sizeof(char *)); - if (self->words == NULL) { - return PARSER_OUT_OF_MEMORY; - } - self->word_starts = realloc(self->word_starts, - new_cap * sizeof(int64_t)); - if (self->word_starts == NULL) { - return PARSER_OUT_OF_MEMORY; - } - self->words_cap = new_cap; - } - - /* trim stream */ - new_cap = _next_pow2(self->stream_len) + 1; - TRACE( - ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = " + /* + Free memory + */ + + /** + * Before we free up space and trim, we should + * save how many words we saw when parsing, if + * it exceeds the maximum number we saw before. + * + * This is important for when we read in chunks, + * so that we can inform subsequent chunk parsing + * as to how many words we could possibly see. + */ + if (self->words_cap > self->max_words_cap) { + self->max_words_cap = self->words_cap; + } + + /* trim words, word_starts */ + size_t new_cap = _next_pow2(self->words_len) + 1; + if (new_cap < self->words_cap) { + TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); + self->words = realloc(self->words, new_cap * sizeof(char *)); + if (self->words == NULL) { + return PARSER_OUT_OF_MEMORY; + } + self->word_starts = realloc(self->word_starts, new_cap * sizeof(int64_t)); + if (self->word_starts == NULL) { + return PARSER_OUT_OF_MEMORY; + } + self->words_cap = new_cap; + } + + /* trim stream */ + new_cap = _next_pow2(self->stream_len) + 1; + TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = " "%zu\n", new_cap, self->stream_cap, self->lines_cap)); - if (new_cap < self->stream_cap) { - TRACE( - ("parser_trim_buffers: new_cap < self->stream_cap, calling " - "realloc\n")); - newptr = realloc(self->stream, new_cap); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - // Update the pointers in the self->words array (char **) if - // `realloc` - // moved the `self->stream` buffer. This block mirrors a similar - // block in - // `make_stream_space`. - if (self->stream != newptr) { - self->pword_start = (char *)newptr + self->word_start; - - for (i = 0; i < self->words_len; ++i) { - self->words[i] = (char *)newptr + self->word_starts[i]; - } - } - - self->stream = newptr; - self->stream_cap = new_cap; - } + if (new_cap < self->stream_cap) { + TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling " + "realloc\n")); + void *newptr = realloc(self->stream, new_cap); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + // Update the pointers in the self->words array (char **) if + // `realloc` + // moved the `self->stream` buffer. This block mirrors a similar + // block in + // `make_stream_space`. 
+ if (self->stream != newptr) { + self->pword_start = (char *)newptr + self->word_start; + + for (uint64_t i = 0; i < self->words_len; ++i) { + self->words[i] = (char *)newptr + self->word_starts[i]; + } + } + + self->stream = newptr; + self->stream_cap = new_cap; + } + } + + /* trim line_start, line_fields */ + new_cap = _next_pow2(self->lines) + 1; + if (new_cap < self->lines_cap) { + TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); + void *newptr = realloc(self->line_start, new_cap * sizeof(int64_t)); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + self->line_start = newptr; } - - /* trim line_start, line_fields */ - new_cap = _next_pow2(self->lines) + 1; - if (new_cap < self->lines_cap) { - TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = realloc(self->line_start, - new_cap * sizeof(int64_t)); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->line_start = newptr; - } - newptr = realloc(self->line_fields, - new_cap * sizeof(int64_t)); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->line_fields = newptr; - self->lines_cap = new_cap; - } + newptr = realloc(self->line_fields, new_cap * sizeof(int64_t)); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + self->line_fields = newptr; + self->lines_cap = new_cap; } + } - return 0; + return 0; } /* @@ -1339,65 +1281,61 @@ all : tokenize all the data vs. certain number of rows */ -int _tokenize_helper(parser_t *self, size_t nrows, int all, - const char *encoding_errors) { - int status = 0; - uint64_t start_lines = self->lines; +static int _tokenize_helper(parser_t *self, size_t nrows, int all, + const char *encoding_errors) { + int status = 0; + const uint64_t start_lines = self->lines; - if (self->state == FINISHED) { - return 0; - } - - TRACE(( - "_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n", - nrows, self->datapos, self->datalen)); - - while (1) { - if (!all && self->lines - start_lines >= nrows) break; - - if (self->datapos == self->datalen) { - status = parser_buffer_bytes(self, self->chunksize, - encoding_errors); - - if (status == REACHED_EOF) { - // close out last line - status = parser_handle_eof(self); - self->state = FINISHED; - break; - } else if (status != 0) { - return status; - } - } + if (self->state == FINISHED) { + return 0; + } - TRACE( - ("_tokenize_helper: Trying to process %d bytes, datalen=%d, " - "datapos= %d\n", - self->datalen - self->datapos, self->datalen, self->datapos)); - - status = tokenize_bytes(self, nrows, start_lines); - - if (status < 0) { - // XXX - TRACE( - ("_tokenize_helper: Status %d returned from tokenize_bytes, " - "breaking\n", - status)); - status = -1; - break; - } - } - TRACE(("leaving tokenize_helper\n")); - return status; + TRACE( + ("_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n", + nrows, self->datapos, self->datalen)); + + while (1) { + if (!all && self->lines - start_lines >= nrows) + break; + + if (self->datapos == self->datalen) { + status = parser_buffer_bytes(self, self->chunksize, encoding_errors); + + if (status == REACHED_EOF) { + // close out last line + status = parser_handle_eof(self); + self->state = FINISHED; + break; + } else if (status != 0) { + return status; + } + } + + TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, " + "datapos= %d\n", + self->datalen - self->datapos, self->datalen, self->datapos)); + + status = tokenize_bytes(self, nrows, start_lines); + + if (status < 0) { + // XXX + 
TRACE(("_tokenize_helper: Status %d returned from tokenize_bytes, " + "breaking\n", + status)); + status = -1; + break; + } + } + TRACE(("leaving tokenize_helper\n")); + return status; } int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) { - int status = _tokenize_helper(self, nrows, 0, encoding_errors); - return status; + return _tokenize_helper(self, nrows, 0, encoding_errors); } int tokenize_all_rows(parser_t *self, const char *encoding_errors) { - int status = _tokenize_helper(self, -1, 1, encoding_errors); - return status; + return _tokenize_helper(self, -1, 1, encoding_errors); } /* @@ -1415,15 +1353,15 @@ * leaves the value of *val unmodified. */ int to_boolean(const char *item, uint8_t *val) { - if (strcasecmp(item, "TRUE") == 0) { - *val = 1; - return 0; - } else if (strcasecmp(item, "FALSE") == 0) { - *val = 0; - return 0; - } + if (strcasecmp(item, "TRUE") == 0) { + *val = 1; + return 0; + } else if (strcasecmp(item, "FALSE") == 0) { + *val = 0; + return 0; + } - return -1; + return -1; } // --------------------------------------------------------------------------- @@ -1473,307 +1411,321 @@ // * Add tsep argument for thousands separator // -// pessimistic but quick assessment, -// assuming that each decimal digit requires 4 bits to store -const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4; - double xstrtod(const char *str, char **endptr, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { - double number; - unsigned int i_number = 0; - int exponent; - int negative; - char *p = (char *)str; - double p10; - int n; - int num_digits; - int num_decimals; - - if (maybe_int != NULL) *maybe_int = 1; - // Skip leading whitespace. - while (isspace_ascii(*p)) p++; + const char *p = str; + if (maybe_int != NULL) + *maybe_int = 1; + // Skip leading whitespace. + while (isspace_ascii(*p)) + p++; + + // Handle optional sign. + int negative = 0; + switch (*p) { + case '-': + negative = 1; + PD_FALLTHROUGH; // Fall through to increment position. + case '+': + p++; + break; + } + + int exponent = 0; + int num_digits = 0; + int num_decimals = 0; + + // pessimistic but quick assessment, + // assuming that each decimal digit requires 4 bits to store + // TODO: C23 has UINT64_WIDTH macro that can be used at compile time + const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4; + + // Process string of digits. + unsigned int i_number = 0; + while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) { + i_number = i_number * 10 + (*p - '0'); + p++; + num_digits++; + + p += (tsep != '\0' && *p == tsep); + } + double number = i_number; + + if (num_digits > max_int_decimal_digits) { + // process what's left as double + while (isdigit_ascii(*p)) { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + + p += (tsep != '\0' && *p == tsep); + } + } + + // Process decimal part. + if (*p == decimal) { + if (maybe_int != NULL) + *maybe_int = 0; + p++; + + while (isdigit_ascii(*p)) { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + num_decimals++; + } + + exponent -= num_decimals; + } + + if (num_digits == 0) { + *error = ERANGE; + return 0.0; + } + + // Correct for sign. + if (negative) + number = -number; + + // Process an exponent string. + if (toupper_ascii(*p) == toupper_ascii(sci)) { + if (maybe_int != NULL) + *maybe_int = 0; // Handle optional sign. negative = 0; - switch (*p) { - case '-': - negative = 1; // Fall through to increment position. 
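The rewritten sign handling in xstrtod marks the '-' case with PD_FALLTHROUGH instead of a bare "fall through" comment, which keeps -Wimplicit-fallthrough quiet. PD_FALLTHROUGH is defined elsewhere in the pandas headers (not shown in this hunk); a minimal sketch of the same pattern using the GCC/Clang statement attribute directly:

    #include <stdio.h>

    #if defined(__GNUC__) || defined(__clang__)
    #define FALLTHROUGH __attribute__((fallthrough))
    #else
    #define FALLTHROUGH /* nothing */
    #endif

    /* Parse an optional sign, returning 1 for negative, 0 otherwise,
     * and advancing *pp past the sign character if one was present. */
    static int parse_sign(const char **pp) {
      int negative = 0;
      switch (**pp) {
      case '-':
        negative = 1;
        FALLTHROUGH; /* fall through to consume the character */
      case '+':
        (*pp)++;
        break;
      }
      return negative;
    }

    int main(void) {
      const char *s = "-42";
      const int neg = parse_sign(&s);
      printf("neg=%d rest=%s\n", neg, s); /* neg=1 rest=42 */
      return 0;
    }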
- case '+': - p++; + switch (*++p) { + case '-': + negative = 1; + PD_FALLTHROUGH; // Fall through to increment position. + case '+': + p++; + break; } - exponent = 0; - num_digits = 0; - num_decimals = 0; - // Process string of digits. - while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) { - i_number = i_number * 10 + (*p - '0'); - p++; - num_digits++; - - p += (tsep != '\0' && *p == tsep); - } - number = i_number; - - if (num_digits > max_int_decimal_digits) { - // process what's left as double - while (isdigit_ascii(*p)) { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - - p += (tsep != '\0' && *p == tsep); - } + num_digits = 0; + int n = 0; + while (isdigit_ascii(*p)) { + n = n * 10 + (*p - '0'); + num_digits++; + p++; } - // Process decimal part. - if (*p == decimal) { - if (maybe_int != NULL) *maybe_int = 0; - p++; - - while (isdigit_ascii(*p)) { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - num_decimals++; - } - - exponent -= num_decimals; - } + if (negative) + exponent -= n; + else + exponent += n; - if (num_digits == 0) { - *error = ERANGE; - return 0.0; + // If no digits, after the 'e'/'E', un-consume it + if (num_digits == 0) + p--; + } + + if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) { + *error = ERANGE; + return HUGE_VAL; + } + + // Scale the result. + double p10 = 10.; + int n = exponent; + if (n < 0) + n = -n; + while (n) { + if (n & 1) { + if (exponent < 0) + number /= p10; + else + number *= p10; + } + n >>= 1; + p10 *= p10; + } + + if (number == HUGE_VAL) { + *error = ERANGE; + } + + if (skip_trailing) { + // Skip trailing whitespace. + while (isspace_ascii(*p)) + p++; + } + + if (endptr) + *endptr = (char *)p; + return number; +} + +double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, + char tsep, int skip_trailing, int *error, + int *maybe_int) { + const char *p = str; + const int max_digits = 17; + + if (maybe_int != NULL) + *maybe_int = 1; + // Cache powers of 10 in memory. 
+ static double e[] = { + 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, + 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, + 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, + 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, + 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, + 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, + 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, + 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, + 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, + 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, + 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, + 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, + 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, + 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, + 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, + 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, + 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, + 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, + 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, + 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, + 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, + 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, + 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, + 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, + 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, + 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, + 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, + 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, + 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, + 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, + 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; + + // Skip leading whitespace. + while (isspace_ascii(*p)) + p++; + + // Handle optional sign. + int negative = 0; + switch (*p) { + case '-': + negative = 1; + PD_FALLTHROUGH; // Fall through to increment position. + case '+': + p++; + break; + } + + double number = 0.; + int exponent = 0; + int num_digits = 0; + int num_decimals = 0; + + // Process string of digits. + while (isdigit_ascii(*p)) { + if (num_digits < max_digits) { + number = number * 10. + (*p - '0'); + num_digits++; + } else { + ++exponent; } - // Correct for sign. - if (negative) number = -number; - - // Process an exponent string. - if (toupper_ascii(*p) == toupper_ascii(sci)) { - if (maybe_int != NULL) *maybe_int = 0; - - // Handle optional sign. - negative = 0; - switch (*++p) { - case '-': - negative = 1; // Fall through to increment pos. - case '+': - p++; - } + p++; + p += (tsep != '\0' && *p == tsep); + } - // Process string of digits. - num_digits = 0; - n = 0; - while (isdigit_ascii(*p)) { - n = n * 10 + (*p - '0'); - num_digits++; - p++; - } + // Process decimal part + if (*p == decimal) { + if (maybe_int != NULL) + *maybe_int = 0; + p++; - if (negative) - exponent -= n; - else - exponent += n; - - // If no digits, after the 'e'/'E', un-consume it - if (num_digits == 0) p--; - } - - if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) { - *error = ERANGE; - return HUGE_VAL; - } - - // Scale the result. 
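xstrtod scales the mantissa by 10^|exponent| with the square-and-multiply loop over p10 and n shown here, whereas precise_xstrtod instead indexes its static table of powers of ten. A standalone sketch of the square-and-multiply scaling:

    #include <stdio.h>

    /* Multiply or divide value by 10^|exponent| using binary exponentiation,
     * mirroring the p10/n loop in xstrtod. */
    static double scale_by_pow10(double value, int exponent) {
      double p10 = 10.;
      int n = exponent < 0 ? -exponent : exponent;
      while (n) {
        if (n & 1) {
          if (exponent < 0)
            value /= p10;
          else
            value *= p10;
        }
        n >>= 1;
        p10 *= p10;
      }
      return value;
    }

    int main(void) {
      printf("%g\n", scale_by_pow10(1.5, 3));  /* 1500 */
      printf("%g\n", scale_by_pow10(1.5, -2)); /* 0.015 */
      return 0;
    }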
- p10 = 10.; - n = exponent; - if (n < 0) n = -n; - while (n) { - if (n & 1) { - if (exponent < 0) - number /= p10; - else - number *= p10; - } - n >>= 1; - p10 *= p10; + while (num_digits < max_digits && isdigit_ascii(*p)) { + number = number * 10. + (*p - '0'); + p++; + num_digits++; + num_decimals++; } - if (number == HUGE_VAL) { - *error = ERANGE; - } + if (num_digits >= max_digits) // Consume extra decimal digits. + while (isdigit_ascii(*p)) + ++p; - if (skip_trailing) { - // Skip trailing whitespace. - while (isspace_ascii(*p)) p++; - } - - if (endptr) *endptr = p; - return number; -} - -double precise_xstrtod(const char *str, char **endptr, char decimal, - char sci, char tsep, int skip_trailing, - int *error, int *maybe_int) { - double number; - int exponent; - int negative; - char *p = (char *)str; - int num_digits; - int num_decimals; - int max_digits = 17; - int n; - - if (maybe_int != NULL) *maybe_int = 1; - // Cache powers of 10 in memory. - static double e[] = { - 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, - 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, - 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, - 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, - 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, - 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, - 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, - 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, - 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, - 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, - 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, - 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, - 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, - 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, - 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, - 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, - 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, - 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, - 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, - 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, - 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, - 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, - 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, - 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, - 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, - 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, - 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, - 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, - 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, - 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, - 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; + exponent -= num_decimals; + } - // Skip leading whitespace. - while (isspace_ascii(*p)) p++; + if (num_digits == 0) { + *error = ERANGE; + return 0.0; + } + + // Correct for sign. + if (negative) + number = -number; + + // Process an exponent string. + if (toupper_ascii(*p) == toupper_ascii(sci)) { + if (maybe_int != NULL) + *maybe_int = 0; - // Handle optional sign. 
+ // Handle optional sign negative = 0; - switch (*p) { - case '-': - negative = 1; // Fall through to increment position. - case '+': - p++; + switch (*++p) { + case '-': + negative = 1; + PD_FALLTHROUGH; // Fall through to increment position. + case '+': + p++; + break; } - number = 0.; - exponent = 0; - num_digits = 0; - num_decimals = 0; - // Process string of digits. - while (isdigit_ascii(*p)) { - if (num_digits < max_digits) { - number = number * 10. + (*p - '0'); - num_digits++; - } else { - ++exponent; - } - - p++; - p += (tsep != '\0' && *p == tsep); - } - - // Process decimal part - if (*p == decimal) { - if (maybe_int != NULL) *maybe_int = 0; - p++; - - while (num_digits < max_digits && isdigit_ascii(*p)) { - number = number * 10. + (*p - '0'); - p++; - num_digits++; - num_decimals++; - } - - if (num_digits >= max_digits) // Consume extra decimal digits. - while (isdigit_ascii(*p)) ++p; - - exponent -= num_decimals; - } - - if (num_digits == 0) { - *error = ERANGE; - return 0.0; + num_digits = 0; + int n = 0; + while (num_digits < max_digits && isdigit_ascii(*p)) { + n = n * 10 + (*p - '0'); + num_digits++; + p++; } - // Correct for sign. - if (negative) number = -number; - - // Process an exponent string. - if (toupper_ascii(*p) == toupper_ascii(sci)) { - if (maybe_int != NULL) *maybe_int = 0; - - // Handle optional sign - negative = 0; - switch (*++p) { - case '-': - negative = 1; // Fall through to increment pos. - case '+': - p++; - } - - // Process string of digits. - num_digits = 0; - n = 0; - while (num_digits < max_digits && isdigit_ascii(*p)) { - n = n * 10 + (*p - '0'); - num_digits++; - p++; - } - - if (negative) - exponent -= n; - else - exponent += n; - - // If no digits after the 'e'/'E', un-consume it. - if (num_digits == 0) p--; - } - - if (exponent > 308) { - *error = ERANGE; - return HUGE_VAL; - } else if (exponent > 0) { - number *= e[exponent]; - } else if (exponent < -308) { // Subnormal - if (exponent < -616) { // Prevent invalid array access. - number = 0.; - } else { - number /= e[-308 - exponent]; - number /= e[308]; - } + if (negative) + exponent -= n; + else + exponent += n; + // If no digits after the 'e'/'E', un-consume it. + if (num_digits == 0) + p--; + } + + if (exponent > 308) { + *error = ERANGE; + return HUGE_VAL; + } else if (exponent > 0) { + number *= e[exponent]; + } else if (exponent < -308) { // Subnormal + if (exponent < -616) { // Prevent invalid array access. + number = 0.; } else { - number /= e[-exponent]; + number /= e[-308 - exponent]; + number /= e[308]; } - if (number == HUGE_VAL || number == -HUGE_VAL) *error = ERANGE; - - if (skip_trailing) { - // Skip trailing whitespace. - while (isspace_ascii(*p)) p++; - } - - if (endptr) *endptr = p; - return number; + } else { + number /= e[-exponent]; + } + + if (number == HUGE_VAL || number == -HUGE_VAL) + *error = ERANGE; + + if (skip_trailing) { + // Skip trailing whitespace. + while (isspace_ascii(*p)) + p++; + } + + if (endptr) + *endptr = (char *)p; + return number; } /* copy a decimal number string with `decimal`, `tsep` as decimal point @@ -1782,306 +1734,302 @@ with a call to `free`. */ -char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, - char tsep) { - const char *p = s; - size_t length = strlen(s); - char *s_copy = malloc(length + 1); - char *dst = s_copy; - // Skip leading whitespace. 
- while (isspace_ascii(*p)) p++; - // Copy Leading sign +static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, + char tsep) { + const char *p = s; + const size_t length = strlen(s); + char *s_copy = malloc(length + 1); + char *dst = s_copy; + // Skip leading whitespace. + while (isspace_ascii(*p)) + p++; + // Copy Leading sign + if (*p == '+' || *p == '-') { + *dst++ = *p++; + } + // Copy integer part dropping `tsep` + while (isdigit_ascii(*p)) { + *dst++ = *p++; + p += (tsep != '\0' && *p == tsep); + } + // Replace `decimal` with '.' + if (*p == decimal) { + *dst++ = '.'; + p++; + } + // Copy fractional part after decimal (if any) + while (isdigit_ascii(*p)) { + *dst++ = *p++; + } + // Copy exponent if any + if (toupper_ascii(*p) == toupper_ascii('E')) { + *dst++ = *p++; + // Copy leading exponent sign (if any) if (*p == '+' || *p == '-') { - *dst++ = *p++; - } - // Copy integer part dropping `tsep` - while (isdigit_ascii(*p)) { - *dst++ = *p++; - p += (tsep != '\0' && *p == tsep); + *dst++ = *p++; } - // Replace `decimal` with '.' - if (*p == decimal) { - *dst++ = '.'; - p++; - } - // Copy fractional part after decimal (if any) + // Copy exponent digits while (isdigit_ascii(*p)) { - *dst++ = *p++; - } - // Copy exponent if any - if (toupper_ascii(*p) == toupper_ascii('E')) { - *dst++ = *p++; - // Copy leading exponent sign (if any) - if (*p == '+' || *p == '-') { - *dst++ = *p++; - } - // Copy exponent digits - while (isdigit_ascii(*p)) { - *dst++ = *p++; - } - } - *dst++ = '\0'; // terminate - if (endpos != NULL) - *endpos = (char *)p; - return s_copy; -} - - -double round_trip(const char *p, char **q, char decimal, char sci, char tsep, - int skip_trailing, int *error, int *maybe_int) { - // 'normalize' representation to C-locale; replace decimal with '.' and - // remove thousands separator. - char *endptr; - char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); - // This is called from a nogil block in parsers.pyx - // so need to explicitly get GIL before Python calls - PyGILState_STATE gstate; - gstate = PyGILState_Ensure(); - char *endpc; - double r = PyOS_string_to_double(pc, &endpc, 0); - // PyOS_string_to_double needs to consume the whole string - if (endpc == pc + strlen(pc)) { - if (q != NULL) { - // report endptr from source string (p) - *q = endptr; - } - } else { - *error = -1; - if (q != NULL) { - // p and pc are different len due to tsep removal. Can't report - // how much it has consumed of p. Just rewind to beginning. - *q = (char *)p; // TODO(willayd): this could be undefined behavior - } + *dst++ = *p++; } - if (maybe_int != NULL) *maybe_int = 0; - if (PyErr_Occurred() != NULL) *error = -1; - else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; - PyErr_Clear(); - - PyGILState_Release(gstate); - free(pc); - if (skip_trailing && q != NULL && *q != p) { - while (isspace_ascii(**q)) { - (*q)++; - } + } + *dst++ = '\0'; // terminate + if (endpos != NULL) + *endpos = (char *)p; + return s_copy; +} + +double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci), + char tsep, int skip_trailing, int *error, int *maybe_int) { + // 'normalize' representation to C-locale; replace decimal with '.' and + // remove thousands separator. 
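round_trip first rewrites the number into C-locale form via _str_copy_decimal_str_c (the caller-supplied decimal character becomes '.', thousands separators are dropped) and only then calls PyOS_string_to_double. A Python-free, deliberately simplified sketch of that normalization step (hypothetical helper; it ignores signs and exponents):

    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Copy a number written with `decimal` and `tsep` into C-locale form.
     * Caller frees the result. Simplified: digits, tsep and decimal only. */
    static char *normalize_decimal(const char *s, char decimal, char tsep) {
      char *out = malloc(strlen(s) + 1);
      char *dst = out;
      for (const char *p = s; *p != '\0'; ++p) {
        if (*p == tsep)
          continue; /* drop the thousands separator */
        if (*p == decimal)
          *dst++ = '.'; /* C-locale decimal point */
        else if (isdigit((unsigned char)*p))
          *dst++ = *p;
        else
          break; /* stop at anything else in this sketch */
      }
      *dst = '\0';
      return out;
    }

    int main(void) {
      char *c = normalize_decimal("1.234.567,89", ',', '.');
      printf("%s -> %s (%f)\n", "1.234.567,89", c, strtod(c, NULL));
      free(c);
      return 0;
    }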
+ char *endptr; + char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); + // This is called from a nogil block in parsers.pyx + // so need to explicitly get GIL before Python calls + PyGILState_STATE gstate = PyGILState_Ensure(); + char *endpc; + const double r = PyOS_string_to_double(pc, &endpc, 0); + // PyOS_string_to_double needs to consume the whole string + if (endpc == pc + strlen(pc)) { + if (q != NULL) { + // report endptr from source string (p) + *q = endptr; + } + } else { + *error = -1; + if (q != NULL) { + // p and pc are different len due to tsep removal. Can't report + // how much it has consumed of p. Just rewind to beginning. + *q = (char *)p; // TODO(willayd): this could be undefined behavior + } + } + if (maybe_int != NULL) + *maybe_int = 0; + if (PyErr_Occurred() != NULL) + *error = -1; + else if (r == Py_HUGE_VAL) + *error = (int)Py_HUGE_VAL; + PyErr_Clear(); + + PyGILState_Release(gstate); + free(pc); + if (skip_trailing && q != NULL && *q != p) { + while (isspace_ascii(**q)) { + (*q)++; } - return r; + } + return r; } // End of xstrtod code // --------------------------------------------------------------------------- void uint_state_init(uint_state *self) { - self->seen_sint = 0; - self->seen_uint = 0; - self->seen_null = 0; + self->seen_sint = 0; + self->seen_uint = 0; + self->seen_null = 0; } int uint64_conflict(uint_state *self) { - return self->seen_uint && (self->seen_sint || self->seen_null); + return self->seen_uint && (self->seen_sint || self->seen_null); } int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { - const char *p = p_item; - int isneg = 0; - int64_t number = 0; - int d; + const char *p = p_item; + // Skip leading spaces. + while (isspace_ascii(*p)) { + ++p; + } + + // Handle sign. + const bool isneg = *p == '-' ? true : false; + // Handle sign. + if (isneg || (*p == '+')) { + p++; + } + + // Check that there is a first digit. + if (!isdigit_ascii(*p)) { + // Error... + *error = ERROR_NO_DIGITS; + return 0; + } - // Skip leading spaces. - while (isspace_ascii(*p)) { - ++p; - } + int64_t number = 0; + if (isneg) { + // If number is greater than pre_min, at least one more digit + // can be processed without overflowing. + int dig_pre_min = -(int_min % 10); + int64_t pre_min = int_min / 10; - // Handle sign. - if (*p == '-') { - isneg = 1; - ++p; - } else if (*p == '+') { - p++; + // Process the digits. + char d = *p; + if (tsep != '\0') { + while (1) { + if (d == tsep) { + d = *++p; + continue; + } else if (!isdigit_ascii(d)) { + break; + } + if ((number > pre_min) || + ((number == pre_min) && (d - '0' <= dig_pre_min))) { + number = number * 10 - (d - '0'); + d = *++p; + } else { + *error = ERROR_OVERFLOW; + return 0; + } + } + } else { + while (isdigit_ascii(d)) { + if ((number > pre_min) || + ((number == pre_min) && (d - '0' <= dig_pre_min))) { + number = number * 10 - (d - '0'); + d = *++p; + } else { + *error = ERROR_OVERFLOW; + return 0; + } + } } + } else { + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + int64_t pre_max = int_max / 10; + int dig_pre_max = int_max % 10; - // Check that there is a first digit. - if (!isdigit_ascii(*p)) { - // Error... - *error = ERROR_NO_DIGITS; - return 0; - } + // Process the digits. 
+ char d = *p; + if (tsep != '\0') { + while (1) { + if (d == tsep) { + d = *++p; + continue; + } else if (!isdigit_ascii(d)) { + break; + } + if ((number < pre_max) || + ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; - if (isneg) { - // If number is greater than pre_min, at least one more digit - // can be processed without overflowing. - int dig_pre_min = -(int_min % 10); - int64_t pre_min = int_min / 10; - - // Process the digits. - d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number > pre_min) || - ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); - d = *++p; - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } } else { - while (isdigit_ascii(d)) { - if ((number > pre_min) || - ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); - d = *++p; - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } + *error = ERROR_OVERFLOW; + return 0; } + } } else { - // If number is less than pre_max, at least one more digit - // can be processed without overflowing. - int64_t pre_max = int_max / 10; - int dig_pre_max = int_max % 10; - - // Process the digits. - d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } + while (isdigit_ascii(d)) { + if ((number < pre_max) || + ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + } else { - while (isdigit_ascii(d)) { - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } + *error = ERROR_OVERFLOW; + return 0; } + } } + } - // Skip trailing spaces. - while (isspace_ascii(*p)) { - ++p; - } + // Skip trailing spaces. + while (isspace_ascii(*p)) { + ++p; + } - // Did we use up all the characters? - if (*p) { - *error = ERROR_INVALID_CHARS; - return 0; - } + // Did we use up all the characters? + if (*p) { + *error = ERROR_INVALID_CHARS; + return 0; + } - *error = 0; - return number; + *error = 0; + return number; } uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) { - const char *p = p_item; - uint64_t pre_max = uint_max / 10; - int dig_pre_max = uint_max % 10; - uint64_t number = 0; - int d; + const char *p = p_item; + // Skip leading spaces. + while (isspace_ascii(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + state->seen_sint = 1; + *error = 0; + return 0; + } else if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit_ascii(*p)) { + // Error... + *error = ERROR_NO_DIGITS; + return 0; + } - // Skip leading spaces. - while (isspace_ascii(*p)) { - ++p; - } + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + // + // Process the digits. 
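Both integer parsers guard each step with pre_max = max / 10 and dig_pre_max = max % 10, accepting another digit only when the multiply-and-add provably cannot overflow. A minimal sketch of the same guard for an unsigned parse (hypothetical helper, no thousands-separator handling):

    #include <stdint.h>
    #include <stdio.h>

    /* Parse decimal digits into *out; return 0 on success, -1 on overflow.
     * Same guard as str_to_uint64: take a digit only if it must still fit. */
    static int parse_u64(const char *p, uint64_t *out) {
      const uint64_t pre_max = UINT64_MAX / 10;
      const uint64_t dig_pre_max = UINT64_MAX % 10;
      uint64_t number = 0;
      for (; *p >= '0' && *p <= '9'; ++p) {
        const uint64_t d = (uint64_t)(*p - '0');
        if (number < pre_max || (number == pre_max && d <= dig_pre_max))
          number = number * 10 + d;
        else
          return -1; /* would overflow */
      }
      *out = number;
      return 0;
    }

    int main(void) {
      uint64_t v;
      printf("%d\n", parse_u64("18446744073709551615", &v)); /* 0: fits  */
      printf("%d\n", parse_u64("18446744073709551616", &v)); /* -1: over */
      return 0;
    }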
+ uint64_t number = 0; + const uint64_t pre_max = uint_max / 10; + const uint64_t dig_pre_max = uint_max % 10; + char d = *p; + if (tsep != '\0') { + while (1) { + if (d == tsep) { + d = *++p; + continue; + } else if (!isdigit_ascii(d)) { + break; + } + if ((number < pre_max) || + ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; - // Handle sign. - if (*p == '-') { - state->seen_sint = 1; - *error = 0; + } else { + *error = ERROR_OVERFLOW; return 0; - } else if (*p == '+') { - p++; + } } + } else { + while (isdigit_ascii(d)) { + if ((number < pre_max) || + ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; - // Check that there is a first digit. - if (!isdigit_ascii(*p)) { - // Error... - *error = ERROR_NO_DIGITS; + } else { + *error = ERROR_OVERFLOW; return 0; + } } + } - // If number is less than pre_max, at least one more digit - // can be processed without overflowing. - // - // Process the digits. - d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } else { - while (isdigit_ascii(d)) { - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } - - // Skip trailing spaces. - while (isspace_ascii(*p)) { - ++p; - } - - // Did we use up all the characters? - if (*p) { - *error = ERROR_INVALID_CHARS; - return 0; - } + // Skip trailing spaces. + while (isspace_ascii(*p)) { + ++p; + } + + // Did we use up all the characters? 
+ if (*p) { + *error = ERROR_INVALID_CHARS; + return 0; + } - if (number > (uint64_t)int_max) { - state->seen_uint = 1; - } + if (number > (uint64_t)int_max) { + state->seen_uint = 1; + } - *error = 0; - return number; + *error = 0; + return number; } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c pandas-2.2.2+dfsg/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c --- pandas-2.1.4+dfsg/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c 2024-04-10 17:42:52.000000000 +0000 @@ -14,19 +14,60 @@ */ -#define NO_IMPORT +// Licence at LICENSES/NUMPY_LICENSE #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#endif // NPY_NO_DEPRECATED_API +#endif // NPY_NO_DEPRECATED_API #include -#include -#include -#include #include "pandas/vendored/numpy/datetime/np_datetime.h" +#define NO_IMPORT_ARRAY +#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY +#include +#include + +#if defined(_WIN32) +#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS +#define ENABLE_INTSAFE_SIGNED_FUNCTIONS +#endif +#include +#define checked_int64_add(a, b, res) LongLongAdd(a, b, res) +#define checked_int64_sub(a, b, res) LongLongSub(a, b, res) +#define checked_int64_mul(a, b, res) LongLongMult(a, b, res) +#else +#if defined __has_builtin +#if __has_builtin(__builtin_add_overflow) +#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) +#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) +#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) +#else +_Static_assert(0, + "Overflow checking not detected; please try a newer compiler"); +#endif +// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment +// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that +#elif __GNUC__ > 7 +#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) +#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) +#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) +#else +_Static_assert(0, "__has_builtin not detected; please try a newer compiler"); +#endif +#endif + +#define PD_CHECK_OVERFLOW(FUNC) \ + do { \ + if ((FUNC) != 0) { \ + PyGILState_STATE gstate = PyGILState_Ensure(); \ + PyErr_SetString(PyExc_OverflowError, \ + "Overflow occurred in npy_datetimestruct_to_datetime"); \ + PyGILState_Release(gstate); \ + return -1; \ + } \ + } while (0) const int days_per_month_table[2][12] = { {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, @@ -36,8 +77,8 @@ * Returns 1 if the given year is a leap year, 0 otherwise. 
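A minimal standalone sketch of what the checked_int64_* wrappers above expand to on GCC/Clang (the Windows branch maps to the intsafe.h LongLong* helpers instead); the variable names are illustrative only. The builtin returns nonzero when the exact result does not fit, which PD_CHECK_OVERFLOW turns into a Python OverflowError.

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int64_t days = INT64_MAX / 24 + 1; /* one day too many to express as hours */
  int64_t hours;
  if (__builtin_mul_overflow(days, (int64_t)24, &hours)) {
    puts("overflow detected");
  } else {
    printf("%lld hours\n", (long long)hours);
  }
  return 0;
}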
*/ int is_leapyear(npy_int64 year) { - return (year & 0x3) == 0 && /* year % 4 == 0 */ - ((year % 100) != 0 || (year % 400) == 0); + return (year & 0x3) == 0 && /* year % 4 == 0 */ + ((year % 100) != 0 || (year % 400) == 0); } /* @@ -45,108 +86,108 @@ * the current values are valid.g */ void add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes) { - int isleap; + int isleap; - /* MINUTES */ - dts->min += minutes; - while (dts->min < 0) { - dts->min += 60; - dts->hour--; - } - while (dts->min >= 60) { - dts->min -= 60; - dts->hour++; - } - - /* HOURS */ - while (dts->hour < 0) { - dts->hour += 24; - dts->day--; - } - while (dts->hour >= 24) { - dts->hour -= 24; - dts->day++; - } - - /* DAYS */ - if (dts->day < 1) { - dts->month--; - if (dts->month < 1) { - dts->year--; - dts->month = 12; - } - isleap = is_leapyear(dts->year); - dts->day += days_per_month_table[isleap][dts->month - 1]; - } else if (dts->day > 28) { - isleap = is_leapyear(dts->year); - if (dts->day > days_per_month_table[isleap][dts->month - 1]) { - dts->day -= days_per_month_table[isleap][dts->month - 1]; - dts->month++; - if (dts->month > 12) { - dts->year++; - dts->month = 1; - } - } + /* MINUTES */ + dts->min += minutes; + while (dts->min < 0) { + dts->min += 60; + dts->hour--; + } + while (dts->min >= 60) { + dts->min -= 60; + dts->hour++; + } + + /* HOURS */ + while (dts->hour < 0) { + dts->hour += 24; + dts->day--; + } + while (dts->hour >= 24) { + dts->hour -= 24; + dts->day++; + } + + /* DAYS */ + if (dts->day < 1) { + dts->month--; + if (dts->month < 1) { + dts->year--; + dts->month = 12; + } + isleap = is_leapyear(dts->year); + dts->day += days_per_month_table[isleap][dts->month - 1]; + } else if (dts->day > 28) { + isleap = is_leapyear(dts->year); + if (dts->day > days_per_month_table[isleap][dts->month - 1]) { + dts->day -= days_per_month_table[isleap][dts->month - 1]; + dts->month++; + if (dts->month > 12) { + dts->year++; + dts->month = 1; + } } + } } /* * Calculates the days offset from the 1970 epoch. */ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { - int i, month; - npy_int64 year, days = 0; - const int *month_lengths; - - year = dts->year - 1970; - days = year * 365; - - /* Adjust for leap years */ - if (days >= 0) { - /* - * 1968 is the closest leap year before 1970. - * Exclude the current year, so add 1. - */ - year += 1; - /* Add one day for each 4 years */ - days += year / 4; - /* 1900 is the closest previous year divisible by 100 */ - year += 68; - /* Subtract one day for each 100 years */ - days -= year / 100; - /* 1600 is the closest previous year divisible by 400 */ - year += 300; - /* Add one day for each 400 years */ - days += year / 400; - } else { - /* - * 1972 is the closest later year after 1970. - * Include the current year, so subtract 2. 
- */ - year -= 2; - /* Subtract one day for each 4 years */ - days += year / 4; - /* 2000 is the closest later year divisible by 100 */ - year -= 28; - /* Add one day for each 100 years */ - days -= year / 100; - /* 2000 is also the closest later year divisible by 400 */ - /* Subtract one day for each 400 years */ - days += year / 400; - } - - month_lengths = days_per_month_table[is_leapyear(dts->year)]; - month = dts->month - 1; - - /* Add the months */ - for (i = 0; i < month; ++i) { - days += month_lengths[i]; - } + int i, month; + npy_int64 year, days = 0; + const int *month_lengths; - /* Add the days */ - days += dts->day - 1; + year = dts->year - 1970; + days = year * 365; - return days; + /* Adjust for leap years */ + if (days >= 0) { + /* + * 1968 is the closest leap year before 1970. + * Exclude the current year, so add 1. + */ + year += 1; + /* Add one day for each 4 years */ + days += year / 4; + /* 1900 is the closest previous year divisible by 100 */ + year += 68; + /* Subtract one day for each 100 years */ + days -= year / 100; + /* 1600 is the closest previous year divisible by 400 */ + year += 300; + /* Add one day for each 400 years */ + days += year / 400; + } else { + /* + * 1972 is the closest later year after 1970. + * Include the current year, so subtract 2. + */ + year -= 2; + /* Subtract one day for each 4 years */ + days += year / 4; + /* 2000 is the closest later year divisible by 100 */ + year -= 28; + /* Add one day for each 100 years */ + days -= year / 100; + /* 2000 is also the closest later year divisible by 400 */ + /* Subtract one day for each 400 years */ + days += year / 400; + } + + month_lengths = days_per_month_table[is_leapyear(dts->year)]; + month = dts->month - 1; + + /* Add the months */ + for (i = 0; i < month; ++i) { + days += month_lengths[i]; + } + + /* Add the days */ + days += dts->day - 1; + + return days; } /* @@ -154,62 +195,61 @@ * and returns the year. 
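As a cross-check of the epoch-day arithmetic above (a standalone sketch, not part of the patch), a plain per-year count of Gregorian leap days reproduces the +year/4 - year/100 + year/400 corrections: 2000-01-01 comes out as day 10957 from 1970-01-01, the same 365*30 + 7 constant that days_to_yearsdays below uses to re-base onto the year 2000.

#include <stdio.h>

static int leap(int y) { return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0); }

int main(void) {
  long days = 0;
  for (int y = 1970; y < 2000; ++y) {
    days += leap(y) ? 366 : 365; /* 7 leap years: 1972, 1976, ..., 1996 */
  }
  printf("%ld\n", days); /* prints 10957 */
  return 0;
}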
*/ static npy_int64 days_to_yearsdays(npy_int64 *days_) { - const npy_int64 days_per_400years = (400 * 365 + 100 - 4 + 1); - /* Adjust so it's relative to the year 2000 (divisible by 400) */ - npy_int64 days = (*days_) - (365 * 30 + 7); - npy_int64 year; - - /* Break down the 400 year cycle to get the year and day within the year */ - if (days >= 0) { - year = 400 * (days / days_per_400years); - days = days % days_per_400years; - } else { - year = 400 * ((days - (days_per_400years - 1)) / days_per_400years); - days = days % days_per_400years; - if (days < 0) { - days += days_per_400years; - } - } - - /* Work out the year/day within the 400 year cycle */ - if (days >= 366) { - year += 100 * ((days - 1) / (100 * 365 + 25 - 1)); - days = (days - 1) % (100 * 365 + 25 - 1); - if (days >= 365) { - year += 4 * ((days + 1) / (4 * 365 + 1)); - days = (days + 1) % (4 * 365 + 1); - if (days >= 366) { - year += (days - 1) / 365; - days = (days - 1) % 365; - } - } + const npy_int64 days_per_400years = (400 * 365 + 100 - 4 + 1); + /* Adjust so it's relative to the year 2000 (divisible by 400) */ + npy_int64 days = (*days_) - (365 * 30 + 7); + npy_int64 year; + + /* Break down the 400 year cycle to get the year and day within the year */ + if (days >= 0) { + year = 400 * (days / days_per_400years); + days = days % days_per_400years; + } else { + year = 400 * ((days - (days_per_400years - 1)) / days_per_400years); + days = days % days_per_400years; + if (days < 0) { + days += days_per_400years; + } + } + + /* Work out the year/day within the 400 year cycle */ + if (days >= 366) { + year += 100 * ((days - 1) / (100 * 365 + 25 - 1)); + days = (days - 1) % (100 * 365 + 25 - 1); + if (days >= 365) { + year += 4 * ((days + 1) / (4 * 365 + 1)); + days = (days + 1) % (4 * 365 + 1); + if (days >= 366) { + year += (days - 1) / 365; + days = (days - 1) % 365; + } } + } - *days_ = days; - return year + 2000; + *days_ = days; + return year + 2000; } - /* * Fills in the year, month, day in 'dts' based on the days * offset from 1970. 
*/ static void set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) { - const int *month_lengths; - int i; + const int *month_lengths; + int i; - dts->year = days_to_yearsdays(&days); - month_lengths = days_per_month_table[is_leapyear(dts->year)]; + dts->year = days_to_yearsdays(&days); + month_lengths = days_per_month_table[is_leapyear(dts->year)]; - for (i = 0; i < 12; ++i) { - if (days < month_lengths[i]) { - dts->month = i + 1; - dts->day = days + 1; - return; - } else { - days -= month_lengths[i]; - } + for (i = 0; i < 12; ++i) { + if (days < month_lengths[i]) { + dts->month = i + 1; + dts->day = (npy_int32)days + 1; + return; + } else { + days -= month_lengths[i]; } + } } /* @@ -217,186 +257,281 @@ */ int cmp_npy_datetimestruct(const npy_datetimestruct *a, const npy_datetimestruct *b) { - if (a->year > b->year) { - return 1; - } else if (a->year < b->year) { - return -1; - } + if (a->year > b->year) { + return 1; + } else if (a->year < b->year) { + return -1; + } + + if (a->month > b->month) { + return 1; + } else if (a->month < b->month) { + return -1; + } + + if (a->day > b->day) { + return 1; + } else if (a->day < b->day) { + return -1; + } + + if (a->hour > b->hour) { + return 1; + } else if (a->hour < b->hour) { + return -1; + } + + if (a->min > b->min) { + return 1; + } else if (a->min < b->min) { + return -1; + } + + if (a->sec > b->sec) { + return 1; + } else if (a->sec < b->sec) { + return -1; + } + + if (a->us > b->us) { + return 1; + } else if (a->us < b->us) { + return -1; + } + + if (a->ps > b->ps) { + return 1; + } else if (a->ps < b->ps) { + return -1; + } + + if (a->as > b->as) { + return 1; + } else if (a->as < b->as) { + return -1; + } - if (a->month > b->month) { - return 1; - } else if (a->month < b->month) { - return -1; - } + return 0; +} +/* + * Returns the offset from utc of the timezone as a timedelta. + * The caller is responsible for ensuring that the tzinfo + * attribute exists on the datetime object. + * + * If the passed object is timezone naive, Py_None is returned. + * If extraction of the offset fails, NULL is returned. + * + * NOTE: This function is not vendored from numpy. 
+ */ +PyObject *extract_utc_offset(PyObject *obj) { + PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo"); + if (tmp == NULL) { + return NULL; + } + if (tmp != Py_None) { + PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); + if (offset == NULL) { + Py_DECREF(tmp); + return NULL; + } + return offset; + } + return tmp; +} - if (a->day > b->day) { - return 1; - } else if (a->day < b->day) { - return -1; - } +static inline int scaleYearToEpoch(int64_t year, int64_t *result) { + return checked_int64_sub(year, 1970, result); +} - if (a->hour > b->hour) { - return 1; - } else if (a->hour < b->hour) { - return -1; - } +static inline int scaleYearsToMonths(int64_t years, int64_t *result) { + return checked_int64_mul(years, 12, result); +} - if (a->min > b->min) { - return 1; - } else if (a->min < b->min) { - return -1; +static inline int scaleDaysToWeeks(int64_t days, int64_t *result) { + if (days >= 0) { + *result = days / 7; + return 0; + } else { + int res; + int64_t checked_days; + if ((res = checked_int64_sub(days, 6, &checked_days))) { + return res; } - if (a->sec > b->sec) { - return 1; - } else if (a->sec < b->sec) { - return -1; - } + *result = checked_days / 7; + return 0; + } +} - if (a->us > b->us) { - return 1; - } else if (a->us < b->us) { - return -1; - } +static inline int scaleDaysToHours(int64_t days, int64_t *result) { + return checked_int64_mul(days, 24, result); +} - if (a->ps > b->ps) { - return 1; - } else if (a->ps < b->ps) { - return -1; - } +static inline int scaleHoursToMinutes(int64_t hours, int64_t *result) { + return checked_int64_mul(hours, 60, result); +} - if (a->as > b->as) { - return 1; - } else if (a->as < b->as) { - return -1; - } +static inline int scaleMinutesToSeconds(int64_t minutes, int64_t *result) { + return checked_int64_mul(minutes, 60, result); +} - return 0; +static inline int scaleSecondsToMilliseconds(int64_t seconds, int64_t *result) { + return checked_int64_mul(seconds, 1000, result); } -/* -* Returns the offset from utc of the timezone as a timedelta. -* The caller is responsible for ensuring that the tzinfo -* attribute exists on the datetime object. -* -* If the passed object is timezone naive, Py_None is returned. -* If extraction of the offset fails, NULL is returned. -* -* NOTE: This function is not vendored from numpy. -*/ -PyObject *extract_utc_offset(PyObject *obj) { - PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo"); - if (tmp == NULL) { - return NULL; - } - if (tmp != Py_None) { - PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); - if (offset == NULL) { - Py_DECREF(tmp); - return NULL; - } - return offset; - } - return tmp; + +static inline int scaleSecondsToMicroseconds(int64_t seconds, int64_t *result) { + return checked_int64_mul(seconds, 1000000, result); +} + +static inline int scaleMicrosecondsToNanoseconds(int64_t microseconds, + int64_t *result) { + return checked_int64_mul(microseconds, 1000, result); +} + +static inline int scaleMicrosecondsToPicoseconds(int64_t microseconds, + int64_t *result) { + return checked_int64_mul(microseconds, 1000000, result); +} + +static inline int64_t scalePicosecondsToFemtoseconds(int64_t picoseconds, + int64_t *result) { + return checked_int64_mul(picoseconds, 1000, result); +} + +static inline int64_t scalePicosecondsToAttoseconds(int64_t picoseconds, + int64_t *result) { + return checked_int64_mul(picoseconds, 1000000, result); } /* * Converts a datetime from a datetimestruct to a datetime based - * on a metadata unit. 
The date is assumed to be valid. + * on a metadata unit. Returns -1 on and sets PyErr on error. */ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, const npy_datetimestruct *dts) { - npy_datetime ret; + if ((base == NPY_FR_Y) || (base == NPY_FR_M)) { + int64_t years; + PD_CHECK_OVERFLOW(scaleYearToEpoch(dts->year, &years)); if (base == NPY_FR_Y) { - /* Truncate to the year */ - ret = dts->year - 1970; - } else if (base == NPY_FR_M) { - /* Truncate to the month */ - ret = 12 * (dts->year - 1970) + (dts->month - 1); - } else { - /* Otherwise calculate the number of days to start */ - npy_int64 days = get_datetimestruct_days(dts); - - switch (base) { - case NPY_FR_W: - /* Truncate to weeks */ - if (days >= 0) { - ret = days / 7; - } else { - ret = (days - 6) / 7; - } - break; - case NPY_FR_D: - ret = days; - break; - case NPY_FR_h: - ret = days * 24 + dts->hour; - break; - case NPY_FR_m: - ret = (days * 24 + dts->hour) * 60 + dts->min; - break; - case NPY_FR_s: - ret = ((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec; - break; - case NPY_FR_ms: - ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + - dts->sec) * - 1000 + - dts->us / 1000; - break; - case NPY_FR_us: - ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + - dts->sec) * - 1000000 + - dts->us; - break; - case NPY_FR_ns: - ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + - dts->sec) * - 1000000 + - dts->us) * - 1000 + - dts->ps / 1000; - break; - case NPY_FR_ps: - ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + - dts->sec) * - 1000000 + - dts->us) * - 1000000 + - dts->ps; - break; - case NPY_FR_fs: - /* only 2.6 hours */ - ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + - dts->sec) * - 1000000 + - dts->us) * - 1000000 + - dts->ps) * - 1000 + - dts->as / 1000; - break; - case NPY_FR_as: - /* only 9.2 secs */ - ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + - dts->sec) * - 1000000 + - dts->us) * - 1000000 + - dts->ps) * - 1000000 + - dts->as; - break; - default: - /* Something got corrupted */ - PyErr_SetString( - PyExc_ValueError, - "NumPy datetime metadata with corrupt unit value"); - return -1; - } + return years; + } + + int64_t months; + PD_CHECK_OVERFLOW(scaleYearsToMonths(years, &months)); + + int64_t months_adder; + PD_CHECK_OVERFLOW(checked_int64_sub(dts->month, 1, &months_adder)); + PD_CHECK_OVERFLOW(checked_int64_add(months, months_adder, &months)); + + if (base == NPY_FR_M) { + return months; } - return ret; + } + + const int64_t days = get_datetimestruct_days(dts); + if (base == NPY_FR_D) { + return days; + } + + if (base == NPY_FR_W) { + int64_t weeks; + PD_CHECK_OVERFLOW(scaleDaysToWeeks(days, &weeks)); + return weeks; + } + + int64_t hours; + PD_CHECK_OVERFLOW(scaleDaysToHours(days, &hours)); + PD_CHECK_OVERFLOW(checked_int64_add(hours, dts->hour, &hours)); + + if (base == NPY_FR_h) { + return hours; + } + + int64_t minutes; + PD_CHECK_OVERFLOW(scaleHoursToMinutes(hours, &minutes)); + PD_CHECK_OVERFLOW(checked_int64_add(minutes, dts->min, &minutes)); + + if (base == NPY_FR_m) { + return minutes; + } + + int64_t seconds; + PD_CHECK_OVERFLOW(scaleMinutesToSeconds(minutes, &seconds)); + PD_CHECK_OVERFLOW(checked_int64_add(seconds, dts->sec, &seconds)); + + if (base == NPY_FR_s) { + return seconds; + } + + if (base == NPY_FR_ms) { + int64_t milliseconds; + PD_CHECK_OVERFLOW(scaleSecondsToMilliseconds(seconds, &milliseconds)); + PD_CHECK_OVERFLOW( + checked_int64_add(milliseconds, dts->us / 1000, &milliseconds)); + + return milliseconds; + } + + int64_t 
microseconds;
+  PD_CHECK_OVERFLOW(scaleSecondsToMicroseconds(seconds, &microseconds));
+  PD_CHECK_OVERFLOW(checked_int64_add(microseconds, dts->us, &microseconds));
+
+  if (base == NPY_FR_us) {
+    return microseconds;
+  }
+
+  if (base == NPY_FR_ns) {
+    int64_t nanoseconds;
+
+    // Minimum valid timestamp in nanoseconds (1677-09-21 00:12:43.145224193).
+    const int64_t min_nanoseconds = NPY_MIN_INT64 + 1;
+    if (microseconds == min_nanoseconds / 1000 - 1) {
+      // For values within one microsecond of min_nanoseconds, use it as base
+      // and offset it with nanosecond delta to avoid overflow during scaling.
+      PD_CHECK_OVERFLOW(checked_int64_add(
+          min_nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds));
+    } else {
+      PD_CHECK_OVERFLOW(
+          scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds));
+      PD_CHECK_OVERFLOW(
+          checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds));
+    }
+
+    return nanoseconds;
+  }
+
+  int64_t picoseconds;
+  PD_CHECK_OVERFLOW(scaleMicrosecondsToPicoseconds(microseconds, &picoseconds));
+  PD_CHECK_OVERFLOW(checked_int64_add(picoseconds, dts->ps, &picoseconds));
+
+  if (base == NPY_FR_ps) {
+    return picoseconds;
+  }
+
+  if (base == NPY_FR_fs) {
+    int64_t femtoseconds;
+    PD_CHECK_OVERFLOW(
+        scalePicosecondsToFemtoseconds(picoseconds, &femtoseconds));
+    PD_CHECK_OVERFLOW(
+        checked_int64_add(femtoseconds, dts->as / 1000, &femtoseconds));
+    return femtoseconds;
+  }
+
+  if (base == NPY_FR_as) {
+    int64_t attoseconds;
+    PD_CHECK_OVERFLOW(scalePicosecondsToAttoseconds(picoseconds, &attoseconds));
+    PD_CHECK_OVERFLOW(checked_int64_add(attoseconds, dts->as, &attoseconds));
+    return attoseconds;
+  }
+
+  /* Something got corrupted */
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  PyErr_SetString(PyExc_ValueError,
+                  "NumPy datetime metadata with corrupt unit value");
+  PyGILState_Release(gstate);
+
+  return -1;
 }

 /*
@@ -408,164 +543,162 @@
  * for subsequent calls to this command - it is able to deduce that `*d >= 0`.
  */
 npy_int64 extract_unit(npy_datetime *d, npy_datetime unit) {
-    assert(unit > 0);
-    npy_int64 div = *d / unit;
-    npy_int64 mod = *d % unit;
-    if (mod < 0) {
-        mod += unit;
-        div -= 1;
-    }
-    assert(mod >= 0);
-    *d = mod;
-    return div;
+  assert(unit > 0);
+  npy_int64 div = *d / unit;
+  npy_int64 mod = *d % unit;
+  if (mod < 0) {
+    mod += unit;
+    div -= 1;
+  }
+  assert(mod >= 0);
+  *d = mod;
+  return div;
 }

 /*
  * Converts a datetime based on the given metadata into a datetimestruct
  */
-void pandas_datetime_to_datetimestruct(npy_datetime dt,
-                                       NPY_DATETIMEUNIT base,
+void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base,
                                        npy_datetimestruct *out) {
-    npy_int64 perday;
-
-    /* Initialize the output to all zeros */
-    memset(out, 0, sizeof(npy_datetimestruct));
-    out->year = 1970;
-    out->month = 1;
-    out->day = 1;
+  npy_int64 perday;

-    /*
-     * Note that care must be taken with the / and % operators
-     * for negative values.
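A standalone sketch (hypothetical names) of the floored division that extract_unit above performs: C's / and % truncate toward zero, so the quotient/remainder pair is corrected to keep the remainder in [0, unit), which is what calendar decomposition of pre-epoch values needs.

#include <stdio.h>

static long long extract(long long *d, long long unit) {
  long long div = *d / unit;
  long long mod = *d % unit;
  if (mod < 0) { /* pull the remainder back into [0, unit) */
    mod += unit;
    div -= 1;
  }
  *d = mod;
  return div;
}

int main(void) {
  long long t = -90; /* e.g. 90 seconds before the epoch */
  long long minutes = extract(&t, 60);
  printf("%lld min, %lld s\n", minutes, t); /* -2 min, 30 s */
  return 0;
}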
- */ - switch (base) { - case NPY_FR_Y: - out->year = 1970 + dt; - break; - - case NPY_FR_M: - out->year = 1970 + extract_unit(&dt, 12); - out->month = dt + 1; - break; - - case NPY_FR_W: - /* A week is 7 days */ - set_datetimestruct_days(dt * 7, out); - break; - - case NPY_FR_D: - set_datetimestruct_days(dt, out); - break; - - case NPY_FR_h: - perday = 24LL; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = dt; - break; - - case NPY_FR_m: - perday = 24LL * 60; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 60); - out->min = (int)dt; - break; - - case NPY_FR_s: - perday = 24LL * 60 * 60; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 60 * 60); - out->min = (int)extract_unit(&dt, 60); - out->sec = (int)dt; - break; - - case NPY_FR_ms: - perday = 24LL * 60 * 60 * 1000; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 60); - out->sec = (int)extract_unit(&dt, 1000LL); - out->us = (int)(dt * 1000); - break; - - case NPY_FR_us: - perday = 24LL * 60LL * 60LL * 1000LL * 1000LL; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000); - out->us = (int)dt; - break; - - case NPY_FR_ns: - perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); - out->us = (int)extract_unit(&dt, 1000LL); - out->ps = (int)(dt * 1000); - break; - - case NPY_FR_ps: - perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; - - set_datetimestruct_days(extract_unit(&dt, perday), out); - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000); - out->us = (int)extract_unit(&dt, 1000LL); - out->ps = (int)(dt * 1000); - break; - - case NPY_FR_fs: - /* entire range is only +- 2.6 hours */ - out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * - 1000 * 60 * 60); - if (out->hour < 0) { - out->year = 1969; - out->month = 12; - out->day = 31; - out->hour += 24; - assert(out->hour >= 0); - } - out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * - 1000 * 60); - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * - 1000); - out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000); - out->ps = (int)extract_unit(&dt, 1000LL); - out->as = (int)(dt * 1000); - break; - - case NPY_FR_as: - /* entire range is only +- 9.2 seconds */ - out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * - 1000 * 1000); - if (out->sec < 0) { - out->year = 1969; - out->month = 12; - out->day = 31; - out->hour = 23; - out->min = 59; - out->sec += 60; - assert(out->sec >= 0); - } - out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); - out->ps = (int)extract_unit(&dt, 1000LL * 1000); - out->as = (int)dt; - break; - - default: - PyErr_SetString(PyExc_RuntimeError, - "NumPy datetime metadata is corrupted with invalid " - "base unit"); - } + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(npy_datetimestruct)); 
+ out->year = 1970; + out->month = 1; + out->day = 1; + + /* + * Note that care must be taken with the / and % operators + * for negative values. + */ + switch (base) { + case NPY_FR_Y: + out->year = 1970 + dt; + break; + + case NPY_FR_M: + out->year = 1970 + extract_unit(&dt, 12); + out->month = (npy_int32)dt + 1; + break; + + case NPY_FR_W: + /* A week is 7 days */ + set_datetimestruct_days(dt * 7, out); + break; + + case NPY_FR_D: + set_datetimestruct_days(dt, out); + break; + + case NPY_FR_h: + perday = 24LL; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (npy_int32)dt; + break; + + case NPY_FR_m: + perday = 24LL * 60; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (npy_int32)extract_unit(&dt, 60); + out->min = (npy_int32)dt; + break; + + case NPY_FR_s: + perday = 24LL * 60 * 60; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (npy_int32)extract_unit(&dt, 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 60); + out->sec = (npy_int32)dt; + break; + + case NPY_FR_ms: + perday = 24LL * 60 * 60 * 1000; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL); + out->us = (npy_int32)(dt * 1000); + break; + + case NPY_FR_us: + perday = 24LL * 60LL * 60LL * 1000LL * 1000LL; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000); + out->us = (npy_int32)dt; + break; + + case NPY_FR_ns: + perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); + out->us = (npy_int32)extract_unit(&dt, 1000LL); + out->ps = (npy_int32)(dt * 1000); + break; + + case NPY_FR_ps: + perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; + + set_datetimestruct_days(extract_unit(&dt, perday), out); + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60); + out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); + out->us = (npy_int32)extract_unit(&dt, 1000LL); + out->ps = (npy_int32)(dt * 1000); + break; + + case NPY_FR_fs: + /* entire range is only +- 2.6 hours */ + out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * + 1000 * 60 * 60); + if (out->hour < 0) { + out->year = 1969; + out->month = 12; + out->day = 31; + out->hour += 24; + assert(out->hour >= 0); + } + out->min = + (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 60); + out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000); + out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000); + out->ps = (npy_int32)extract_unit(&dt, 1000LL); + out->as = (npy_int32)(dt * 1000); + break; + + case NPY_FR_as: + /* entire range is only +- 9.2 seconds */ + out->sec = + (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 1000 * 1000); + if (out->sec < 0) { + out->year = 1969; + out->month = 12; + out->day = 31; + out->hour = 23; + out->min = 59; + out->sec += 60; + assert(out->sec >= 0); + } + out->us = 
(npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000); + out->ps = (npy_int32)extract_unit(&dt, 1000LL * 1000); + out->as = (npy_int32)dt; + break; + + default: + PyErr_SetString(PyExc_RuntimeError, + "NumPy datetime metadata is corrupted with invalid " + "base unit"); + } } /* @@ -577,363 +710,358 @@ void pandas_timedelta_to_timedeltastruct(npy_timedelta td, NPY_DATETIMEUNIT base, pandas_timedeltastruct *out) { - npy_int64 frac; - npy_int64 sfrac; - npy_int64 ifrac; - int sign; - npy_int64 per_day; - npy_int64 per_sec; - - /* Initialize the output to all zeros */ - memset(out, 0, sizeof(pandas_timedeltastruct)); - - switch (base) { - case NPY_FR_ns: - - per_day = 86400000000000LL; - per_sec = 1000LL * 1000LL * 1000LL; - - // put frac in seconds - if (td < 0 && td % per_sec != 0) - frac = td / per_sec - 1; - else - frac = td / per_sec; - - if (frac < 0) { - sign = -1; - - // even fraction - if ((-frac % 86400LL) != 0) { - out->days = -frac / 86400LL + 1; - frac += 86400LL * out->days; - } else { - frac = -frac; - } - } else { - sign = 1; - out->days = 0; - } - - if (frac >= 86400) { - out->days += frac / 86400LL; - frac -= out->days * 86400LL; - } - - if (frac >= 3600) { - out->hrs = frac / 3600LL; - frac -= out->hrs * 3600LL; - } else { - out->hrs = 0; - } - - if (frac >= 60) { - out->min = frac / 60LL; - frac -= out->min * 60LL; - } else { - out->min = 0; - } - - if (frac >= 0) { - out->sec = frac; - frac -= out->sec; - } else { - out->sec = 0; - } - - sfrac = (out->hrs * 3600LL + out->min * 60LL - + out->sec) * per_sec; - - if (sign < 0) - out->days = -out->days; - - ifrac = td - (out->days * per_day + sfrac); - - if (ifrac != 0) { - out->ms = ifrac / (1000LL * 1000LL); - ifrac -= out->ms * 1000LL * 1000LL; - out->us = ifrac / 1000LL; - ifrac -= out->us * 1000LL; - out->ns = ifrac; - } else { - out->ms = 0; - out->us = 0; - out->ns = 0; - } - break; - - case NPY_FR_us: - - per_day = 86400000000LL; - per_sec = 1000LL * 1000LL; - - // put frac in seconds - if (td < 0 && td % per_sec != 0) - frac = td / per_sec - 1; - else - frac = td / per_sec; - - if (frac < 0) { - sign = -1; - - // even fraction - if ((-frac % 86400LL) != 0) { - out->days = -frac / 86400LL + 1; - frac += 86400LL * out->days; - } else { - frac = -frac; - } - } else { - sign = 1; - out->days = 0; - } - - if (frac >= 86400) { - out->days += frac / 86400LL; - frac -= out->days * 86400LL; - } - - if (frac >= 3600) { - out->hrs = frac / 3600LL; - frac -= out->hrs * 3600LL; - } else { - out->hrs = 0; - } - - if (frac >= 60) { - out->min = frac / 60LL; - frac -= out->min * 60LL; - } else { - out->min = 0; - } - - if (frac >= 0) { - out->sec = frac; - frac -= out->sec; - } else { - out->sec = 0; - } - - sfrac = (out->hrs * 3600LL + out->min * 60LL - + out->sec) * per_sec; - - if (sign < 0) - out->days = -out->days; - - ifrac = td - (out->days * per_day + sfrac); - - if (ifrac != 0) { - out->ms = ifrac / 1000LL; - ifrac -= out->ms * 1000LL; - out->us = ifrac / 1L; - ifrac -= out->us * 1L; - out->ns = ifrac; - } else { - out->ms = 0; - out->us = 0; - out->ns = 0; - } - break; - - case NPY_FR_ms: - - per_day = 86400000LL; - per_sec = 1000LL; - - // put frac in seconds - if (td < 0 && td % per_sec != 0) - frac = td / per_sec - 1; - else - frac = td / per_sec; - - if (frac < 0) { - sign = -1; - - // even fraction - if ((-frac % 86400LL) != 0) { - out->days = -frac / 86400LL + 1; - frac += 86400LL * out->days; - } else { - frac = -frac; - } - } else { - sign = 1; - out->days = 0; - } - - if (frac >= 86400) { - out->days += frac / 
86400LL; - frac -= out->days * 86400LL; - } - - if (frac >= 3600) { - out->hrs = frac / 3600LL; - frac -= out->hrs * 3600LL; - } else { - out->hrs = 0; - } - - if (frac >= 60) { - out->min = frac / 60LL; - frac -= out->min * 60LL; - } else { - out->min = 0; - } - - if (frac >= 0) { - out->sec = frac; - frac -= out->sec; - } else { - out->sec = 0; - } - - sfrac = (out->hrs * 3600LL + out->min * 60LL - + out->sec) * per_sec; - - if (sign < 0) - out->days = -out->days; - - ifrac = td - (out->days * per_day + sfrac); - - if (ifrac != 0) { - out->ms = ifrac; - out->us = 0; - out->ns = 0; - } else { - out->ms = 0; - out->us = 0; - out->ns = 0; - } - break; - - case NPY_FR_s: - // special case where we can simplify many expressions bc per_sec=1 - - per_day = 86400LL; - per_sec = 1L; - - // put frac in seconds - if (td < 0 && td % per_sec != 0) - frac = td / per_sec - 1; - else - frac = td / per_sec; - - if (frac < 0) { - sign = -1; - - // even fraction - if ((-frac % 86400LL) != 0) { - out->days = -frac / 86400LL + 1; - frac += 86400LL * out->days; - } else { - frac = -frac; - } - } else { - sign = 1; - out->days = 0; - } - - if (frac >= 86400) { - out->days += frac / 86400LL; - frac -= out->days * 86400LL; - } - - if (frac >= 3600) { - out->hrs = frac / 3600LL; - frac -= out->hrs * 3600LL; - } else { - out->hrs = 0; - } - - if (frac >= 60) { - out->min = frac / 60LL; - frac -= out->min * 60LL; - } else { - out->min = 0; - } - - if (frac >= 0) { - out->sec = frac; - frac -= out->sec; - } else { - out->sec = 0; - } - - sfrac = (out->hrs * 3600LL + out->min * 60LL - + out->sec) * per_sec; - - if (sign < 0) - out->days = -out->days; - - ifrac = td - (out->days * per_day + sfrac); - - if (ifrac != 0) { - out->ms = 0; - out->us = 0; - out->ns = 0; - } else { - out->ms = 0; - out->us = 0; - out->ns = 0; - } - break; - - case NPY_FR_m: - - out->days = td / 1440LL; - td -= out->days * 1440LL; - out->hrs = td / 60LL; - td -= out->hrs * 60LL; - out->min = td; - - out->sec = 0; - out->ms = 0; - out->us = 0; - out->ns = 0; - break; - - case NPY_FR_h: - out->days = td / 24LL; - td -= out->days * 24LL; - out->hrs = td; - - out->min = 0; - out->sec = 0; - out->ms = 0; - out->us = 0; - out->ns = 0; - break; - - case NPY_FR_D: - out->days = td; - out->hrs = 0; - out->min = 0; - out->sec = 0; - out->ms = 0; - out->us = 0; - out->ns = 0; - break; - - case NPY_FR_W: - out->days = 7 * td; - out->hrs = 0; - out->min = 0; - out->sec = 0; - out->ms = 0; - out->us = 0; - out->ns = 0; - break; - - default: - PyErr_SetString(PyExc_RuntimeError, - "NumPy timedelta metadata is corrupted with " - "invalid base unit"); - } - - out->seconds = out->hrs * 3600 + out->min * 60 + out->sec; - out->microseconds = out->ms * 1000 + out->us; - out->nanoseconds = out->ns; -} + npy_int64 frac; + npy_int64 sfrac; + npy_int64 ifrac; + int sign; + npy_int64 per_day; + npy_int64 per_sec; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(pandas_timedeltastruct)); + + switch (base) { + case NPY_FR_ns: + + per_day = 86400000000000LL; + per_sec = 1000LL * 1000LL * 1000LL; + + // put frac in seconds + if (td < 0 && td % per_sec != 0) + frac = td / per_sec - 1; + else + frac = td / per_sec; + + if (frac < 0) { + sign = -1; + + // even fraction + if ((-frac % 86400LL) != 0) { + out->days = -frac / 86400LL + 1; + frac += 86400LL * out->days; + } else { + frac = -frac; + } + } else { + sign = 1; + out->days = 0; + } + if (frac >= 86400) { + out->days += frac / 86400LL; + frac -= out->days * 86400LL; + } + + if (frac >= 3600) { + 
out->hrs = (npy_int32)(frac / 3600LL); + frac -= out->hrs * 3600LL; + } else { + out->hrs = 0; + } + + if (frac >= 60) { + out->min = (npy_int32)(frac / 60LL); + frac -= out->min * 60LL; + } else { + out->min = 0; + } + + if (frac >= 0) { + out->sec = (npy_int32)frac; + frac -= out->sec; + } else { + out->sec = 0; + } + + sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec; + + if (sign < 0) + out->days = -out->days; + + ifrac = td - (out->days * per_day + sfrac); + + if (ifrac != 0) { + out->ms = (npy_int32)(ifrac / (1000LL * 1000LL)); + ifrac -= out->ms * 1000LL * 1000LL; + out->us = (npy_int32)(ifrac / 1000LL); + ifrac -= out->us * 1000LL; + out->ns = (npy_int32)ifrac; + } else { + out->ms = 0; + out->us = 0; + out->ns = 0; + } + break; + + case NPY_FR_us: + + per_day = 86400000000LL; + per_sec = 1000LL * 1000LL; + + // put frac in seconds + if (td < 0 && td % per_sec != 0) + frac = td / per_sec - 1; + else + frac = td / per_sec; + + if (frac < 0) { + sign = -1; + + // even fraction + if ((-frac % 86400LL) != 0) { + out->days = -frac / 86400LL + 1; + frac += 86400LL * out->days; + } else { + frac = -frac; + } + } else { + sign = 1; + out->days = 0; + } + + if (frac >= 86400) { + out->days += frac / 86400LL; + frac -= out->days * 86400LL; + } + + if (frac >= 3600) { + out->hrs = (npy_int32)(frac / 3600LL); + frac -= out->hrs * 3600LL; + } else { + out->hrs = 0; + } + + if (frac >= 60) { + out->min = (npy_int32)(frac / 60LL); + frac -= out->min * 60LL; + } else { + out->min = 0; + } + + if (frac >= 0) { + out->sec = (npy_int32)frac; + frac -= out->sec; + } else { + out->sec = 0; + } + + sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec; + + if (sign < 0) + out->days = -out->days; + + ifrac = td - (out->days * per_day + sfrac); + + if (ifrac != 0) { + out->ms = (npy_int32)(ifrac / 1000LL); + ifrac -= out->ms * 1000LL; + out->us = (npy_int32)(ifrac / 1L); + ifrac -= out->us * 1L; + out->ns = (npy_int32)ifrac; + } else { + out->ms = 0; + out->us = 0; + out->ns = 0; + } + break; + + case NPY_FR_ms: + + per_day = 86400000LL; + per_sec = 1000LL; + + // put frac in seconds + if (td < 0 && td % per_sec != 0) + frac = td / per_sec - 1; + else + frac = td / per_sec; + + if (frac < 0) { + sign = -1; + + // even fraction + if ((-frac % 86400LL) != 0) { + out->days = -frac / 86400LL + 1; + frac += 86400LL * out->days; + } else { + frac = -frac; + } + } else { + sign = 1; + out->days = 0; + } + + if (frac >= 86400) { + out->days += frac / 86400LL; + frac -= out->days * 86400LL; + } + + if (frac >= 3600) { + out->hrs = (npy_int32)(frac / 3600LL); + frac -= out->hrs * 3600LL; + } else { + out->hrs = 0; + } + + if (frac >= 60) { + out->min = (npy_int32)(frac / 60LL); + frac -= out->min * 60LL; + } else { + out->min = 0; + } + + if (frac >= 0) { + out->sec = (npy_int32)frac; + frac -= out->sec; + } else { + out->sec = 0; + } + + sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec; + + if (sign < 0) + out->days = -out->days; + + ifrac = td - (out->days * per_day + sfrac); + + if (ifrac != 0) { + out->ms = (npy_int32)ifrac; + out->us = 0; + out->ns = 0; + } else { + out->ms = 0; + out->us = 0; + out->ns = 0; + } + break; + + case NPY_FR_s: + // special case where we can simplify many expressions bc per_sec=1 + + per_day = 86400LL; + per_sec = 1L; + + // put frac in seconds + if (td < 0 && td % per_sec != 0) + frac = td / per_sec - 1; + else + frac = td / per_sec; + + if (frac < 0) { + sign = -1; + + // even fraction + if ((-frac % 86400LL) != 0) { + out->days = 
-frac / 86400LL + 1; + frac += 86400LL * out->days; + } else { + frac = -frac; + } + } else { + sign = 1; + out->days = 0; + } + + if (frac >= 86400) { + out->days += frac / 86400LL; + frac -= out->days * 86400LL; + } + + if (frac >= 3600) { + out->hrs = (npy_int32)(frac / 3600LL); + frac -= out->hrs * 3600LL; + } else { + out->hrs = 0; + } + + if (frac >= 60) { + out->min = (npy_int32)(frac / 60LL); + frac -= out->min * 60LL; + } else { + out->min = 0; + } + + if (frac >= 0) { + out->sec = (npy_int32)frac; + frac -= out->sec; + } else { + out->sec = 0; + } + + sfrac = (out->hrs * 3600LL + out->min * 60LL + out->sec) * per_sec; + + if (sign < 0) + out->days = -out->days; + + ifrac = td - (out->days * per_day + sfrac); + + if (ifrac != 0) { + out->ms = 0; + out->us = 0; + out->ns = 0; + } else { + out->ms = 0; + out->us = 0; + out->ns = 0; + } + break; + + case NPY_FR_m: + + out->days = td / 1440LL; + td -= out->days * 1440LL; + out->hrs = (npy_int32)(td / 60LL); + td -= out->hrs * 60LL; + out->min = (npy_int32)td; + + out->sec = 0; + out->ms = 0; + out->us = 0; + out->ns = 0; + break; + + case NPY_FR_h: + out->days = td / 24LL; + td -= out->days * 24LL; + out->hrs = (npy_int32)td; + + out->min = 0; + out->sec = 0; + out->ms = 0; + out->us = 0; + out->ns = 0; + break; + + case NPY_FR_D: + out->days = td; + out->hrs = 0; + out->min = 0; + out->sec = 0; + out->ms = 0; + out->us = 0; + out->ns = 0; + break; + + case NPY_FR_W: + out->days = 7 * td; + out->hrs = 0; + out->min = 0; + out->sec = 0; + out->ms = 0; + out->us = 0; + out->ns = 0; + break; + + default: + PyErr_SetString(PyExc_RuntimeError, + "NumPy timedelta metadata is corrupted with " + "invalid base unit"); + } + + out->seconds = out->hrs * 3600 + out->min * 60 + out->sec; + out->microseconds = out->ms * 1000 + out->us; + out->nanoseconds = out->ns; +} /* * This function returns a pointer to the DateTimeMetaData @@ -943,5 +1071,8 @@ */ PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(PyArray_Descr *dtype) { - return (((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta); +#if NPY_ABI_VERSION < 0x02000000 +#define PyDataType_C_METADATA(dtype) ((dtype)->c_metadata) +#endif + return ((PyArray_DatetimeDTypeMetaData *)PyDataType_C_METADATA(dtype))->meta; } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c pandas-2.2.2+dfsg/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c --- pandas-2.1.4+dfsg/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/src/vendored/numpy/datetime/np_datetime_strings.c 2024-04-10 17:42:52.000000000 +0000 @@ -19,25 +19,26 @@ */ +// LICENSES/NUMPY_LICENSE + #define PY_SSIZE_T_CLEAN #define NO_IMPORT #ifndef NPY_NO_DEPRECATED_API #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#endif // NPY_NO_DEPRECATED_API +#endif // NPY_NO_DEPRECATED_API #include #include -#include -#include #include +#include +#include "pandas/portable.h" #include "pandas/vendored/numpy/datetime/np_datetime.h" #include "pandas/vendored/numpy/datetime/np_datetime_strings.h" - /* * Parses (almost) standard ISO 8601 date strings. 
The differences are: * @@ -68,22 +69,19 @@ */ typedef enum { - COMPARISON_SUCCESS, - COMPLETED_PARTIAL_MATCH, - COMPARISON_ERROR + COMPARISON_SUCCESS, + COMPLETED_PARTIAL_MATCH, + COMPARISON_ERROR } DatetimePartParseResult; // This function will advance the pointer on format // and decrement characters_remaining by n on success // On failure will return COMPARISON_ERROR without incrementing // If `format_requirement` is PARTIAL_MATCH, and the `format` string has // been exhausted, then return COMPLETED_PARTIAL_MATCH. -static DatetimePartParseResult compare_format( - const char **format, - int *characters_remaining, - const char *compare_to, - int n, - const FormatRequirement format_requirement -) { +static DatetimePartParseResult +compare_format(const char **format, int *characters_remaining, + const char *compare_to, int n, + const FormatRequirement format_requirement) { if (format_requirement == INFER_FORMAT) { return COMPARISON_SUCCESS; } @@ -111,636 +109,651 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, - NPY_DATETIMEUNIT *out_bestunit, - int *out_local, int *out_tzoffset, - const char* format, int format_len, + NPY_DATETIMEUNIT *out_bestunit, int *out_local, + int *out_tzoffset, const char *format, + int format_len, FormatRequirement format_requirement) { - if (len < 0 || format_len < 0) - goto parse_error; - int year_leap = 0; - int i, numdigits; - const char *substr; - int sublen; - NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC; - DatetimePartParseResult comparison; - - /* If year-month-day are separated by a valid separator, - * months/days without leading zeroes will be parsed - * (though not iso8601). If the components aren't separated, - * 4 (YYYY) or 8 (YYYYMMDD) digits are expected. 6 digits are - * forbidden here (but parsed as YYMMDD elsewhere). - */ - int has_ymd_sep = 0; - char ymd_sep = '\0'; - char valid_ymd_sep[] = {'-', '.', '/', '\\', ' '}; - int valid_ymd_sep_len = sizeof(valid_ymd_sep); - - /* hour-minute-second may or may not separated by ':'. If not, then - * each component must be 2 digits. */ - int has_hms_sep = 0; - int hour_was_2_digits = 0; - - /* Initialize the output to all zeros */ - memset(out, 0, sizeof(npy_datetimestruct)); - out->month = 1; - out->day = 1; - - substr = str; - sublen = len; + if (len < 0 || format_len < 0) + goto parse_error; + int year_leap = 0; + int i, numdigits; + const char *substr; + int sublen; + NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC; + DatetimePartParseResult comparison; + + /* If year-month-day are separated by a valid separator, + * months/days without leading zeroes will be parsed + * (though not iso8601). If the components aren't separated, + * 4 (YYYY) or 8 (YYYYMMDD) digits are expected. 6 digits are + * forbidden here (but parsed as YYMMDD elsewhere). + */ + int has_ymd_sep = 0; + char ymd_sep = '\0'; + char valid_ymd_sep[] = {'-', '.', '/', '\\', ' '}; + int valid_ymd_sep_len = sizeof(valid_ymd_sep); + + /* hour-minute-second may or may not separated by ':'. If not, then + * each component must be 2 digits. 
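A rough standalone sketch of the lock-step check that compare_format above performs (names here are hypothetical): each consumed piece of the datetime string must match the next part of the user-supplied format, and exhausting the format under a partial-match policy ends the parse early instead of raising an error.

#include <stdio.h>
#include <string.h>

enum result { MATCH = 0, PARTIAL = 1, MISMATCH = 2 };

static enum result expect(const char **fmt, int *left, const char *piece, int n) {
  if (*left == 0)
    return PARTIAL; /* format exhausted: stop parsing, not an error */
  if (*left < n || strncmp(*fmt, piece, (size_t)n) != 0)
    return MISMATCH; /* datetime string disagrees with the format */
  *fmt += n;
  *left -= n;
  return MATCH;
}

int main(void) {
  const char *fmt = "%Y-%m";
  int left = (int)strlen(fmt);
  printf("%d\n", expect(&fmt, &left, "%Y", 2)); /* 0: year directive consumed */
  printf("%d\n", expect(&fmt, &left, "-", 1));  /* 0: separator consumed */
  printf("%d\n", expect(&fmt, &left, "%m", 2)); /* 0: month directive consumed */
  printf("%d\n", expect(&fmt, &left, "-", 1));  /* 1: format exhausted (partial match) */
  return 0;
}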
*/ + int has_hms_sep = 0; + int hour_was_2_digits = 0; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(npy_datetimestruct)); + out->month = 1; + out->day = 1; - /* Skip leading whitespace */ - while (sublen > 0 && isspace(*substr)) { - ++substr; - --sublen; - comparison = compare_format(&format, &format_len, " ", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - } - - /* Leading '-' sign for negative year */ - if (*substr == '-') { - ++substr; - --sublen; - } + substr = str; + sublen = len; - if (sublen == 0) { - goto parse_error; - } - - /* PARSE THE YEAR (4 digits) */ - comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement); + /* Skip leading whitespace */ + while (sublen > 0 && isspace(*substr)) { + ++substr; + --sublen; + comparison = + compare_format(&format, &format_len, " ", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + goto finish; } + } - out->year = 0; - if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && - isdigit(substr[2]) && isdigit(substr[3])) { - out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') + - 10 * (substr[2] - '0') + (substr[3] - '0'); + /* Leading '-' sign for negative year */ + if (*substr == '-') { + ++substr; + --sublen; + } - substr += 4; - sublen -= 4; - } + if (sublen == 0) { + goto parse_error; + } - /* Negate the year if necessary */ - if (str[0] == '-') { - out->year = -out->year; - } - /* Check whether it's a leap-year */ - year_leap = is_leapyear(out->year); + /* PARSE THE YEAR (4 digits) */ + comparison = + compare_format(&format, &format_len, "%Y", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } - /* Next character must be a separator, start of month, or end of string */ - if (sublen == 0) { - if (out_local != NULL) { - *out_local = 0; - } - if (format_len) { - goto parse_error; - } - bestunit = NPY_FR_Y; - goto finish; - } + out->year = 0; + if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && + isdigit(substr[2]) && isdigit(substr[3])) { + out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') + + 10 * (substr[2] - '0') + (substr[3] - '0'); - if (!isdigit(*substr)) { - for (i = 0; i < valid_ymd_sep_len; ++i) { - if (*substr == valid_ymd_sep[i]) { - break; - } - } - if (i == valid_ymd_sep_len) { - goto parse_error; - } - has_ymd_sep = 1; - ymd_sep = valid_ymd_sep[i]; - ++substr; - --sublen; + substr += 4; + sublen -= 4; + } - comparison = compare_format(&format, &format_len, &ymd_sep, 1, - format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* Cannot have trailing separator */ - if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; - } - } + /* Negate the year if necessary */ + if (str[0] == '-') { + out->year = -out->year; + } + /* Check whether it's a leap-year */ + year_leap = is_leapyear(out->year); - /* PARSE THE MONTH */ - comparison = compare_format(&format, &format_len, "%m", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + /* Next character must be a separator, start of month, or end of string */ + if (sublen 
== 0) { + if (out_local != NULL) { + *out_local = 0; } - /* First digit required */ - out->month = (*substr - '0'); - ++substr; - --sublen; - /* Second digit optional if there was a separator */ - if (isdigit(*substr)) { - out->month = 10 * out->month + (*substr - '0'); - ++substr; - --sublen; - } else if (!has_ymd_sep) { - goto parse_error; - } - if (out->month < 1 || out->month > 12) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Month out of range in datetime string \"%s\"", str); - } - goto error; + if (format_len) { + goto parse_error; } + bestunit = NPY_FR_Y; + goto finish; + } - /* Next character must be the separator, start of day, or end of string */ - if (sublen == 0) { - bestunit = NPY_FR_M; - /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */ - if (!has_ymd_sep) { - goto parse_error; - } - if (format_len) { - goto parse_error; - } - if (out_local != NULL) { - *out_local = 0; - } - goto finish; + if (!isdigit(*substr)) { + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (*substr == valid_ymd_sep[i]) { + break; + } } - - if (has_ymd_sep) { - /* Must have separator, but cannot be trailing */ - if (*substr != ymd_sep || sublen == 1) { - goto parse_error; - } - ++substr; - --sublen; - comparison = compare_format(&format, &format_len, &ymd_sep, 1, - format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } + if (i == valid_ymd_sep_len) { + goto parse_error; } + has_ymd_sep = 1; + ymd_sep = valid_ymd_sep[i]; + ++substr; + --sublen; - /* PARSE THE DAY */ - comparison = compare_format(&format, &format_len, "%d", 2, format_requirement); + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + goto finish; } - /* First digit required */ - if (!isdigit(*substr)) { - goto parse_error; + /* Cannot have trailing separator */ + if (sublen == 0 || !isdigit(*substr)) { + goto parse_error; } - out->day = (*substr - '0'); + } + + /* PARSE THE MONTH */ + comparison = + compare_format(&format, &format_len, "%m", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* First digit required */ + out->month = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional if there was a separator */ + if (isdigit(*substr)) { + out->month = 10 * out->month + (*substr - '0'); ++substr; --sublen; - /* Second digit optional if there was a separator */ - if (isdigit(*substr)) { - out->day = 10 * out->day + (*substr - '0'); - ++substr; - --sublen; - } else if (!has_ymd_sep) { - goto parse_error; - } - if (out->day < 1 || - out->day > days_per_month_table[year_leap][out->month - 1]) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Day out of range in datetime string \"%s\"", str); - } - goto error; + } else if (!has_ymd_sep) { + goto parse_error; + } + if (out->month < 1 || out->month > 12) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Month out of range in datetime string \"%s\"", str); } + goto error; + } - /* Next character must be a 'T', ' ', or end of string */ - if (sublen == 0) { - if (out_local != NULL) { - *out_local = 0; - } - if (format_len) { - goto parse_error; - } - bestunit = NPY_FR_D; - goto finish; + /* Next character must be the separator, start of day, or end of string */ + if 
(sublen == 0) { + bestunit = NPY_FR_M; + /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */ + if (!has_ymd_sep) { + goto parse_error; + } + if (format_len) { + goto parse_error; } + if (out_local != NULL) { + *out_local = 0; + } + goto finish; + } - if ((*substr != 'T' && *substr != ' ') || sublen == 1) { - goto parse_error; + if (has_ymd_sep) { + /* Must have separator, but cannot be trailing */ + if (*substr != ymd_sep || sublen == 1) { + goto parse_error; } - comparison = compare_format(&format, &format_len, substr, 1, format_requirement); + ++substr; + --sublen; + comparison = + compare_format(&format, &format_len, &ymd_sep, 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + goto finish; } + } + + /* PARSE THE DAY */ + comparison = + compare_format(&format, &format_len, "%d", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* First digit required */ + if (!isdigit(*substr)) { + goto parse_error; + } + out->day = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional if there was a separator */ + if (isdigit(*substr)) { + out->day = 10 * out->day + (*substr - '0'); ++substr; --sublen; + } else if (!has_ymd_sep) { + goto parse_error; + } + if (out->day < 1 || + out->day > days_per_month_table[year_leap][out->month - 1]) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Day out of range in datetime string \"%s\"", str); + } + goto error; + } - /* PARSE THE HOURS */ - comparison = compare_format(&format, &format_len, "%H", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + /* Next character must be a 'T', ' ', or end of string */ + if (sublen == 0) { + if (out_local != NULL) { + *out_local = 0; } - /* First digit required */ - if (!isdigit(*substr)) { - goto parse_error; + if (format_len) { + goto parse_error; } - out->hour = (*substr - '0'); + bestunit = NPY_FR_D; + goto finish; + } + + if ((*substr != 'T' && *substr != ' ') || sublen == 1) { + goto parse_error; + } + comparison = + compare_format(&format, &format_len, substr, 1, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + ++substr; + --sublen; + + /* PARSE THE HOURS */ + comparison = + compare_format(&format, &format_len, "%H", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* First digit required */ + if (!isdigit(*substr)) { + goto parse_error; + } + out->hour = (*substr - '0'); + bestunit = NPY_FR_h; + ++substr; + --sublen; + /* Second digit optional */ + if (isdigit(*substr)) { + hour_was_2_digits = 1; + out->hour = 10 * out->hour + (*substr - '0'); ++substr; --sublen; - /* Second digit optional */ - if (isdigit(*substr)) { - hour_was_2_digits = 1; - out->hour = 10 * out->hour + (*substr - '0'); - ++substr; - --sublen; - if (out->hour >= 24) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Hours out of range in datetime string \"%s\"", - str); - } - goto error; - } + if (out->hour >= 24) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Hours out of range in datetime string \"%s\"", str); + } + goto error; } + } - /* Next character must be a 
':' or the end of the string */ - if (sublen == 0) { - if (!hour_was_2_digits) { - goto parse_error; - } - if (format_len) { - goto parse_error; - } - bestunit = NPY_FR_h; - goto finish; + /* Next character must be a ':' or the end of the string */ + if (sublen == 0) { + if (!hour_was_2_digits) { + goto parse_error; } - - if (*substr == ':') { - has_hms_sep = 1; - ++substr; - --sublen; - /* Cannot have a trailing separator */ - if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; - } - comparison = compare_format(&format, &format_len, ":", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - } else if (!isdigit(*substr)) { - if (!hour_was_2_digits) { - goto parse_error; - } - goto parse_timezone; + if (format_len) { + goto parse_error; } + bestunit = NPY_FR_h; + goto finish; + } - /* PARSE THE MINUTES */ - comparison = compare_format(&format, &format_len, "%M", 2, format_requirement); + if (*substr == ':') { + has_hms_sep = 1; + ++substr; + --sublen; + /* Cannot have a trailing separator */ + if (sublen == 0 || !isdigit(*substr)) { + goto parse_error; + } + comparison = + compare_format(&format, &format_len, ":", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + goto finish; + } + } else if (!isdigit(*substr)) { + if (!hour_was_2_digits) { + goto parse_error; } - /* First digit required */ - out->min = (*substr - '0'); + goto parse_timezone; + } + + /* PARSE THE MINUTES */ + comparison = + compare_format(&format, &format_len, "%M", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* First digit required */ + out->min = (*substr - '0'); + bestunit = NPY_FR_m; + ++substr; + --sublen; + /* Second digit optional if there was a separator */ + if (isdigit(*substr)) { + out->min = 10 * out->min + (*substr - '0'); ++substr; --sublen; - /* Second digit optional if there was a separator */ - if (isdigit(*substr)) { - out->min = 10 * out->min + (*substr - '0'); - ++substr; - --sublen; - if (out->min >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Minutes out of range in datetime string \"%s\"", - str); - } - goto error; - } - } else if (!has_hms_sep) { - goto parse_error; + if (out->min >= 60) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Minutes out of range in datetime string \"%s\"", str); + } + goto error; } + } else if (!has_hms_sep) { + goto parse_error; + } - if (sublen == 0) { - bestunit = NPY_FR_m; - if (format_len) { - goto parse_error; - } - goto finish; - } - - /* If we make it through this condition block, then the next - * character is a digit. 
*/ - if (has_hms_sep && *substr == ':') { - comparison = compare_format(&format, &format_len, ":", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - ++substr; - --sublen; - /* Cannot have a trailing ':' */ - if (sublen == 0 || !isdigit(*substr)) { - goto parse_error; - } - } else if (!has_hms_sep && isdigit(*substr)) { - } else { - goto parse_timezone; + if (sublen == 0) { + bestunit = NPY_FR_m; + if (format_len) { + goto parse_error; } + goto finish; + } - /* PARSE THE SECONDS */ - comparison = compare_format(&format, &format_len, "%S", 2, format_requirement); + /* If we make it through this condition block, then the next + * character is a digit. */ + if (has_hms_sep && *substr == ':') { + comparison = + compare_format(&format, &format_len, ":", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + goto finish; } - /* First digit required */ - out->sec = (*substr - '0'); ++substr; --sublen; - /* Second digit optional if there was a separator */ - if (isdigit(*substr)) { - out->sec = 10 * out->sec + (*substr - '0'); - ++substr; - --sublen; - if (out->sec >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Seconds out of range in datetime string \"%s\"", - str); - } - goto error; - } - } else if (!has_hms_sep) { - goto parse_error; + /* Cannot have a trailing ':' */ + if (sublen == 0 || !isdigit(*substr)) { + goto parse_error; } + } else if (!has_hms_sep && isdigit(*substr)) { + } else { + goto parse_timezone; + } - /* Next character may be a '.' indicating fractional seconds */ - if (sublen > 0 && *substr == '.') { - ++substr; - --sublen; - comparison = compare_format(&format, &format_len, ".", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - } else { - bestunit = NPY_FR_s; - goto parse_timezone; + /* PARSE THE SECONDS */ + comparison = + compare_format(&format, &format_len, "%S", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* First digit required */ + out->sec = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional if there was a separator */ + if (isdigit(*substr)) { + out->sec = 10 * out->sec + (*substr - '0'); + ++substr; + --sublen; + if (out->sec >= 60) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Seconds out of range in datetime string \"%s\"", str); + } + goto error; } + } else if (!has_hms_sep) { + goto parse_error; + } - /* PARSE THE MICROSECONDS (0 to 6 digits) */ - comparison = compare_format(&format, &format_len, "%f", 2, format_requirement); + /* Next character may be a '.' 
indicating fractional seconds */ + if (sublen > 0 && *substr == '.') { + ++substr; + --sublen; + comparison = + compare_format(&format, &format_len, ".", 1, format_requirement); if (comparison == COMPARISON_ERROR) { - goto parse_error; + goto parse_error; } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - numdigits = 0; - for (i = 0; i < 6; ++i) { - out->us *= 10; - if (sublen > 0 && isdigit(*substr)) { - out->us += (*substr - '0'); - ++substr; - --sublen; - ++numdigits; - } - } - - if (sublen == 0 || !isdigit(*substr)) { - if (numdigits > 3) { - bestunit = NPY_FR_us; - } else { - bestunit = NPY_FR_ms; - } - goto parse_timezone; + goto finish; } + } else { + bestunit = NPY_FR_s; + goto parse_timezone; + } - /* PARSE THE PICOSECONDS (0 to 6 digits) */ - numdigits = 0; - for (i = 0; i < 6; ++i) { - out->ps *= 10; - if (sublen > 0 && isdigit(*substr)) { - out->ps += (*substr - '0'); - ++substr; - --sublen; - ++numdigits; - } + /* PARSE THE MICROSECONDS (0 to 6 digits) */ + comparison = + compare_format(&format, &format_len, "%f", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + numdigits = 0; + for (i = 0; i < 6; ++i) { + out->us *= 10; + if (sublen > 0 && isdigit(*substr)) { + out->us += (*substr - '0'); + ++substr; + --sublen; + ++numdigits; } + } - if (sublen == 0 || !isdigit(*substr)) { - if (numdigits > 3) { - bestunit = NPY_FR_ps; - } else { - bestunit = NPY_FR_ns; - } - goto parse_timezone; + if (sublen == 0 || !isdigit(*substr)) { + if (numdigits > 3) { + bestunit = NPY_FR_us; + } else { + bestunit = NPY_FR_ms; } + goto parse_timezone; + } - /* PARSE THE ATTOSECONDS (0 to 6 digits) */ - numdigits = 0; - for (i = 0; i < 6; ++i) { - out->as *= 10; - if (sublen > 0 && isdigit(*substr)) { - out->as += (*substr - '0'); - ++substr; - --sublen; - ++numdigits; - } + /* PARSE THE PICOSECONDS (0 to 6 digits) */ + numdigits = 0; + for (i = 0; i < 6; ++i) { + out->ps *= 10; + if (sublen > 0 && isdigit(*substr)) { + out->ps += (*substr - '0'); + ++substr; + --sublen; + ++numdigits; } + } + if (sublen == 0 || !isdigit(*substr)) { if (numdigits > 3) { - bestunit = NPY_FR_as; + bestunit = NPY_FR_ps; } else { - bestunit = NPY_FR_fs; + bestunit = NPY_FR_ns; } + goto parse_timezone; + } -parse_timezone: - /* trim any whitespace between time/timezone */ - while (sublen > 0 && isspace(*substr)) { - ++substr; - --sublen; - comparison = compare_format(&format, &format_len, " ", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } + /* PARSE THE ATTOSECONDS (0 to 6 digits) */ + numdigits = 0; + for (i = 0; i < 6; ++i) { + out->as *= 10; + if (sublen > 0 && isdigit(*substr)) { + out->as += (*substr - '0'); + ++substr; + --sublen; + ++numdigits; } + } - if (sublen == 0) { - // Unlike NumPy, treating no time zone as naive - if (format_len > 0) { - goto parse_error; - } - goto finish; - } + if (numdigits > 3) { + bestunit = NPY_FR_as; + } else { + bestunit = NPY_FR_fs; + } - /* UTC specifier */ - if (*substr == 'Z') { - comparison = compare_format(&format, &format_len, "%z", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* "Z" should be equivalent to tz offset "+00:00" */ - if (out_local != NULL) { - *out_local = 1; - } +parse_timezone: + /* trim any whitespace between 
time/timezone */ + while (sublen > 0 && isspace(*substr)) { + ++substr; + --sublen; + comparison = + compare_format(&format, &format_len, " ", 1, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + } - if (out_tzoffset != NULL) { - *out_tzoffset = 0; - } + if (sublen == 0) { + // Unlike NumPy, treating no time zone as naive + if (format_len > 0) { + goto parse_error; + } + goto finish; + } - if (sublen == 1) { - if (format_len > 0) { - goto parse_error; - } - goto finish; - } else { - ++substr; - --sublen; - } - } else if (*substr == '-' || *substr == '+') { - comparison = compare_format(&format, &format_len, "%z", 2, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; - } - /* Time zone offset */ - int offset_neg = 0, offset_hour = 0, offset_minute = 0; + /* UTC specifier */ + if (*substr == 'Z') { + comparison = + compare_format(&format, &format_len, "%z", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* "Z" should be equivalent to tz offset "+00:00" */ + if (out_local != NULL) { + *out_local = 1; + } - /* - * Since "local" means local with respect to the current - * machine, we say this is non-local. - */ + if (out_tzoffset != NULL) { + *out_tzoffset = 0; + } - if (*substr == '-') { - offset_neg = 1; - } - ++substr; - --sublen; + if (sublen == 1) { + if (format_len > 0) { + goto parse_error; + } + goto finish; + } else { + ++substr; + --sublen; + } + } else if (*substr == '-' || *substr == '+') { + comparison = + compare_format(&format, &format_len, "%z", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } + /* Time zone offset */ + int offset_neg = 0, offset_hour = 0, offset_minute = 0; - /* The hours offset */ - if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { - offset_hour = 10 * (substr[0] - '0') + (substr[1] - '0'); - substr += 2; - sublen -= 2; - if (offset_hour >= 24) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Timezone hours offset out of range " - "in datetime string \"%s\"", - str); - } - goto error; - } - } else if (sublen >= 1 && isdigit(substr[0])) { - offset_hour = substr[0] - '0'; - ++substr; - --sublen; - } else { - goto parse_error; - } + /* + * Since "local" means local with respect to the current + * machine, we say this is non-local. 
+ */ - /* The minutes offset is optional */ - if (sublen > 0) { - /* Optional ':' */ - if (*substr == ':') { - ++substr; - --sublen; - } - - /* The minutes offset (at the end of the string) */ - if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { - offset_minute = 10 * (substr[0] - '0') + (substr[1] - '0'); - substr += 2; - sublen -= 2; - if (offset_minute >= 60) { - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Timezone minutes offset out of range " - "in datetime string \"%s\"", - str); - } - goto error; - } - } else if (sublen >= 1 && isdigit(substr[0])) { - offset_minute = substr[0] - '0'; - ++substr; - --sublen; - } else { - goto parse_error; - } - } + if (*substr == '-') { + offset_neg = 1; + } + ++substr; + --sublen; - /* Apply the time zone offset */ - if (offset_neg) { - offset_hour = -offset_hour; - offset_minute = -offset_minute; - } - if (out_local != NULL) { - *out_local = 1; - // Unlike NumPy, do not change internal value to local time - *out_tzoffset = 60 * offset_hour + offset_minute; + /* The hours offset */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + offset_hour = 10 * (substr[0] - '0') + (substr[1] - '0'); + substr += 2; + sublen -= 2; + if (offset_hour >= 24) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Timezone hours offset out of range " + "in datetime string \"%s\"", + str); } + goto error; + } + } else if (sublen >= 1 && isdigit(substr[0])) { + offset_hour = substr[0] - '0'; + ++substr; + --sublen; + } else { + goto parse_error; } - /* Skip trailing whitespace */ - while (sublen > 0 && isspace(*substr)) { + /* The minutes offset is optional */ + if (sublen > 0) { + /* Optional ':' */ + if (*substr == ':') { ++substr; --sublen; - comparison = compare_format(&format, &format_len, " ", 1, format_requirement); - if (comparison == COMPARISON_ERROR) { - goto parse_error; - } else if (comparison == COMPLETED_PARTIAL_MATCH) { - goto finish; + } + + /* The minutes offset (at the end of the string) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + offset_minute = 10 * (substr[0] - '0') + (substr[1] - '0'); + substr += 2; + sublen -= 2; + if (offset_minute >= 60) { + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Timezone minutes offset out of range " + "in datetime string \"%s\"", + str); + } + goto error; } + } else if (sublen >= 1 && isdigit(substr[0])) { + offset_minute = substr[0] - '0'; + ++substr; + --sublen; + } else { + goto parse_error; + } } - if ((sublen != 0) || (format_len != 0)) { - goto parse_error; + /* Apply the time zone offset */ + if (offset_neg) { + offset_hour = -offset_hour; + offset_minute = -offset_minute; + } + if (out_local != NULL) { + *out_local = 1; + // Unlike NumPy, do not change internal value to local time + *out_tzoffset = 60 * offset_hour + offset_minute; } + } -finish: - if (out_bestunit != NULL) { - *out_bestunit = bestunit; + /* Skip trailing whitespace */ + while (sublen > 0 && isspace(*substr)) { + ++substr; + --sublen; + comparison = + compare_format(&format, &format_len, " ", 1, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } - return 0; + } + + if ((sublen != 0) || (format_len != 0)) { + goto parse_error; + } + +finish: + if (out_bestunit != NULL) { + *out_bestunit = bestunit; + } + return 0; parse_error: - if (want_exc) { - PyErr_Format(PyExc_ValueError, - "Error parsing datetime string \"%s\" at position %d", str, - (int)(substr - str)); - } - return -1; 
+ if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Error parsing datetime string \"%s\" at position %d", str, + (int)(substr - str)); + } + return -1; error: - return -1; + return -1; } /* @@ -748,56 +761,66 @@ * objects with the given local and unit settings. */ int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) { - int len = 0; + int len = 0; - switch (base) { - /* Generic units can only be used to represent NaT */ - /* return 4;*/ - case NPY_FR_as: - len += 3; /* "###" */ - case NPY_FR_fs: - len += 3; /* "###" */ - case NPY_FR_ps: - len += 3; /* "###" */ - case NPY_FR_ns: - len += 3; /* "###" */ - case NPY_FR_us: - len += 3; /* "###" */ - case NPY_FR_ms: - len += 4; /* ".###" */ - case NPY_FR_s: - len += 3; /* ":##" */ - case NPY_FR_m: - len += 3; /* ":##" */ - case NPY_FR_h: - len += 3; /* "T##" */ - case NPY_FR_D: - case NPY_FR_W: - len += 3; /* "-##" */ - case NPY_FR_M: - len += 3; /* "-##" */ - case NPY_FR_Y: - len += 21; /* 64-bit year */ - break; - default: - len += 3; /* handle the now defunct NPY_FR_B */ - break; - } - - if (base >= NPY_FR_h) { - if (local) { - len += 5; /* "+####" or "-####" */ - } else { - len += 1; /* "Z" */ - } + switch (base) { + /* Generic units can only be used to represent NaT */ + /* return 4;*/ + case NPY_FR_as: + len += 3; /* "###" */ + PD_FALLTHROUGH; + case NPY_FR_fs: + len += 3; /* "###" */ + PD_FALLTHROUGH; + case NPY_FR_ps: + len += 3; /* "###" */ + PD_FALLTHROUGH; + case NPY_FR_ns: + len += 3; /* "###" */ + PD_FALLTHROUGH; + case NPY_FR_us: + len += 3; /* "###" */ + PD_FALLTHROUGH; + case NPY_FR_ms: + len += 4; /* ".###" */ + PD_FALLTHROUGH; + case NPY_FR_s: + len += 3; /* ":##" */ + PD_FALLTHROUGH; + case NPY_FR_m: + len += 3; /* ":##" */ + PD_FALLTHROUGH; + case NPY_FR_h: + len += 3; /* "T##" */ + PD_FALLTHROUGH; + case NPY_FR_D: + case NPY_FR_W: + len += 3; /* "-##" */ + PD_FALLTHROUGH; + case NPY_FR_M: + len += 3; /* "-##" */ + PD_FALLTHROUGH; + case NPY_FR_Y: + len += 21; /* 64-bit year */ + break; + default: + len += 3; /* handle the now defunct NPY_FR_B */ + break; + } + + if (base >= NPY_FR_h) { + if (local) { + len += 5; /* "+####" or "-####" */ + } else { + len += 1; /* "Z" */ } + } - len += 1; /* NULL terminator */ + len += 1; /* NULL terminator */ - return len; + return len; } - /* * Converts an npy_datetimestruct to an (almost) ISO 8601 * NULL-terminated string using timezone Z (UTC). If the string fits in @@ -812,21 +835,21 @@ * Returns 0 on success, -1 on failure (for example if the output * string was too short). */ -int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, +int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, size_t outlen, int utc, NPY_DATETIMEUNIT base) { - char *substr = outstr; - int sublen = outlen; - int tmplen; - - /* - * Print weeks with the same precision as days. - * - * TODO: Could print weeks with YYYY-Www format if the week - * epoch is a Monday. - */ - if (base == NPY_FR_W) { - base = NPY_FR_D; - } + char *substr = outstr; + size_t sublen = outlen; + int tmplen; + + /* + * Print weeks with the same precision as days. + * + * TODO: Could print weeks with YYYY-Www format if the week + * epoch is a Monday. + */ + if (base == NPY_FR_W) { + base = NPY_FR_D; + } /* YEAR */ /* @@ -835,314 +858,309 @@ * to have data all the way to the end of the buffer. 
*/ #ifdef _WIN32 - tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); + tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); #else - tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); -#endif // _WIN32 - /* If it ran out of space or there isn't space for the NULL terminator */ - if (tmplen < 0 || tmplen > sublen) { - goto string_too_short; - } - substr += tmplen; - sublen -= tmplen; - - /* Stop if the unit is years */ - if (base == NPY_FR_Y) { - if (sublen > 0) { - *substr = '\0'; - } - return 0; - } - - /* MONTH */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = '-'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->month / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->month % 10) + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is months */ - if (base == NPY_FR_M) { - if (sublen > 0) { - *substr = '\0'; - } - return 0; - } - - /* DAY */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = '-'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->day / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->day % 10) + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is days */ - if (base == NPY_FR_D) { - if (sublen > 0) { - *substr = '\0'; - } - return 0; - } - - /* HOUR */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = 'T'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->hour / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->hour % 10) + '0'); - substr += 3; - sublen -= 3; - - /* Stop if the unit is hours */ - if (base == NPY_FR_h) { - goto add_time_zone; - } - - /* MINUTE */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = ':'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->min / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->min % 10) + '0'); - substr += 3; - sublen -= 3; + tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); +#endif // _WIN32 + /* If it ran out of space or there isn't space for the NULL terminator */ + if (tmplen < 0 || (size_t)tmplen > sublen) { + goto string_too_short; + } + substr += tmplen; + sublen -= tmplen; - /* Stop if the unit is minutes */ - if (base == NPY_FR_m) { - goto add_time_zone; + /* Stop if the unit is years */ + if (base == NPY_FR_Y) { + if (sublen > 0) { + *substr = '\0'; } + return 0; + } - /* SECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = ':'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->sec / 10) + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->sec % 10) + '0'); - substr += 3; - sublen -= 3; + /* MONTH */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = '-'; + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->month / 10) + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->month % 10) + '0'); + substr += 3; + sublen -= 3; - /* Stop if the unit is seconds */ - if (base == NPY_FR_s) { - goto add_time_zone; + /* Stop if the unit is months */ + if (base == NPY_FR_M) { + if (sublen > 0) { + *substr = '\0'; } + return 0; + } - /* MILLISECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = '.'; - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->us / 
100000) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->us / 10000) % 10 + '0'); - if (sublen < 4) { - goto string_too_short; - } - substr[3] = (char)((dts->us / 1000) % 10 + '0'); - substr += 4; - sublen -= 4; + /* DAY */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = '-'; + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->day / 10) + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->day % 10) + '0'); + substr += 3; + sublen -= 3; - /* Stop if the unit is milliseconds */ - if (base == NPY_FR_ms) { - goto add_time_zone; + /* Stop if the unit is days */ + if (base == NPY_FR_D) { + if (sublen > 0) { + *substr = '\0'; } + return 0; + } - /* MICROSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->us / 100) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->us / 10) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)(dts->us % 10 + '0'); - substr += 3; - sublen -= 3; + /* HOUR */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = 'T'; + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->hour / 10) + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->hour % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is hours */ + if (base == NPY_FR_h) { + goto add_time_zone; + } - /* Stop if the unit is microseconds */ - if (base == NPY_FR_us) { - goto add_time_zone; - } + /* MINUTE */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = ':'; + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->min / 10) + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->min % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is minutes */ + if (base == NPY_FR_m) { + goto add_time_zone; + } - /* NANOSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->ps / 100000) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->ps / 10000) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->ps / 1000) % 10 + '0'); - substr += 3; - sublen -= 3; + /* SECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = ':'; + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->sec / 10) + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->sec % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is seconds */ + if (base == NPY_FR_s) { + goto add_time_zone; + } - /* Stop if the unit is nanoseconds */ - if (base == NPY_FR_ns) { - goto add_time_zone; - } + /* MILLISECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = '.'; + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->us / 100000) % 10 + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->us / 10000) % 10 + '0'); + if (sublen < 4) { + goto string_too_short; + } + substr[3] = (char)((dts->us / 1000) % 10 + '0'); + substr += 4; + sublen -= 4; + + /* Stop if the unit is milliseconds */ + if (base == NPY_FR_ms) { + goto add_time_zone; + } - /* PICOSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->ps / 100) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->ps / 10) % 10 + '0'); - if (sublen < 3) 
{ - goto string_too_short; - } - substr[2] = (char)(dts->ps % 10 + '0'); - substr += 3; - sublen -= 3; + /* MICROSECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = (char)((dts->us / 100) % 10 + '0'); + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->us / 10) % 10 + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)(dts->us % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is microseconds */ + if (base == NPY_FR_us) { + goto add_time_zone; + } - /* Stop if the unit is picoseconds */ - if (base == NPY_FR_ps) { - goto add_time_zone; - } + /* NANOSECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = (char)((dts->ps / 100000) % 10 + '0'); + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->ps / 10000) % 10 + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->ps / 1000) % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is nanoseconds */ + if (base == NPY_FR_ns) { + goto add_time_zone; + } - /* FEMTOSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->as / 100000) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->as / 10000) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)((dts->as / 1000) % 10 + '0'); - substr += 3; - sublen -= 3; + /* PICOSECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = (char)((dts->ps / 100) % 10 + '0'); + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->ps / 10) % 10 + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)(dts->ps % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is picoseconds */ + if (base == NPY_FR_ps) { + goto add_time_zone; + } - /* Stop if the unit is femtoseconds */ - if (base == NPY_FR_fs) { - goto add_time_zone; - } + /* FEMTOSECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = (char)((dts->as / 100000) % 10 + '0'); + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->as / 10000) % 10 + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)((dts->as / 1000) % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is femtoseconds */ + if (base == NPY_FR_fs) { + goto add_time_zone; + } - /* ATTOSECOND */ - if (sublen < 1) { - goto string_too_short; - } - substr[0] = (char)((dts->as / 100) % 10 + '0'); - if (sublen < 2) { - goto string_too_short; - } - substr[1] = (char)((dts->as / 10) % 10 + '0'); - if (sublen < 3) { - goto string_too_short; - } - substr[2] = (char)(dts->as % 10 + '0'); - substr += 3; - sublen -= 3; + /* ATTOSECOND */ + if (sublen < 1) { + goto string_too_short; + } + substr[0] = (char)((dts->as / 100) % 10 + '0'); + if (sublen < 2) { + goto string_too_short; + } + substr[1] = (char)((dts->as / 10) % 10 + '0'); + if (sublen < 3) { + goto string_too_short; + } + substr[2] = (char)(dts->as % 10 + '0'); + substr += 3; + sublen -= 3; add_time_zone: - /* UTC "Zulu" time */ - if (utc) { - if (sublen < 1) { - goto string_too_short; - } - substr[0] = 'Z'; - substr += 1; - sublen -= 1; - } - /* Add a NULL terminator, and return */ - if (sublen > 0) { - substr[0] = '\0'; + /* UTC "Zulu" time */ + if (utc) { + if (sublen < 1) { + goto string_too_short; } + substr[0] = 'Z'; + substr += 1; + sublen -= 1; + } + /* Add a NULL terminator, and return */ + if (sublen > 0) { + substr[0] = '\0'; + } - return 0; + 
return 0; string_too_short: - PyErr_Format(PyExc_RuntimeError, - "The string provided for NumPy ISO datetime formatting " - "was too short, with length %d", - outlen); - return -1; + PyErr_Format(PyExc_RuntimeError, + "The string provided for NumPy ISO datetime formatting " + "was too short, with length %d", + outlen); + return -1; } - -int make_iso_8601_timedelta(pandas_timedeltastruct *tds, - char *outstr, size_t *outlen) { +int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, + size_t *outlen) { *outlen = 0; - *outlen += snprintf(outstr, 60, // NOLINT - "P%" NPY_INT64_FMT - "DT%" NPY_INT32_FMT - "H%" NPY_INT32_FMT - "M%" NPY_INT32_FMT, - tds->days, tds->hrs, tds->min, tds->sec); + *outlen += snprintf(outstr, 60, // NOLINT + "P%" NPY_INT64_FMT "DT%" NPY_INT32_FMT "H%" NPY_INT32_FMT + "M%" NPY_INT32_FMT, + tds->days, tds->hrs, tds->min, tds->sec); outstr += *outlen; if (tds->ns != 0) { - *outlen += snprintf(outstr, 12, // NOLINT - ".%03" NPY_INT32_FMT - "%03" NPY_INT32_FMT - "%03" NPY_INT32_FMT - "S", tds->ms, tds->us, tds->ns); + *outlen += snprintf(outstr, 12, // NOLINT + ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT + "%03" NPY_INT32_FMT "S", + tds->ms, tds->us, tds->ns); } else if (tds->us != 0) { - *outlen += snprintf(outstr, 9, // NOLINT - ".%03" NPY_INT32_FMT - "%03" NPY_INT32_FMT - "S", tds->ms, tds->us); + *outlen += snprintf(outstr, 9, // NOLINT + ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, + tds->us); } else if (tds->ms != 0) { - *outlen += snprintf(outstr, 6, // NOLINT + *outlen += snprintf(outstr, 6, // NOLINT ".%03" NPY_INT32_FMT "S", tds->ms); } else { - *outlen += snprintf(outstr, 2, // NOLINT + *outlen += snprintf(outstr, 2, // NOLINT "%s", "S"); } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c pandas-2.2.2+dfsg/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c --- pandas-2.1.4+dfsg/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/src/vendored/ujson/lib/ultrajsondec.c 2024-04-10 17:42:52.000000000 +0000 @@ -38,7 +38,9 @@ * Copyright (c) 1994 Sun Microsystems, Inc. 
*/ -#include +// Licence at LICENSES/ULTRAJSON_LICENSE + +#include "pandas/vendored/ujson/lib/ultrajson.h" #include #include #include @@ -46,7 +48,6 @@ #include #include #include -#include "pandas/vendored/ujson/lib/ultrajson.h" #ifndef TRUE #define TRUE 1 @@ -57,15 +58,15 @@ #endif struct DecoderState { - char *start; - char *end; - wchar_t *escStart; - wchar_t *escEnd; - int escHeap; - int lastType; - JSUINT32 objDepth; - void *prv; - JSONObjectDecoder *dec; + char *start; + char *end; + wchar_t *escStart; + wchar_t *escEnd; + int escHeap; + int lastType; + JSUINT32 objDepth; + void *prv; + JSONObjectDecoder *dec; }; JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds); @@ -73,349 +74,372 @@ static JSOBJ SetError(struct DecoderState *ds, int offset, const char *message) { - ds->dec->errorOffset = ds->start + offset; - ds->dec->errorStr = (char *)message; - return NULL; + ds->dec->errorOffset = ds->start + offset; + ds->dec->errorStr = (char *)message; + return NULL; } double createDouble(double intNeg, double intValue, double frcValue, int frcDecimalCount) { - static const double g_pow10[] = {1.0, - 0.1, - 0.01, - 0.001, - 0.0001, - 0.00001, - 0.000001, - 0.0000001, - 0.00000001, - 0.000000001, - 0.0000000001, - 0.00000000001, - 0.000000000001, - 0.0000000000001, - 0.00000000000001, - 0.000000000000001}; - return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg; + static const double g_pow10[] = {1.0, + 0.1, + 0.01, + 0.001, + 0.0001, + 0.00001, + 0.000001, + 0.0000001, + 0.00000001, + 0.000000001, + 0.0000000001, + 0.00000000001, + 0.000000000001, + 0.0000000000001, + 0.00000000000001, + 0.000000000000001}; + return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg; } JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) { - char *end; - double value; - errno = 0; - - value = strtod(ds->start, &end); + char *end; + double value; + errno = 0; + + value = strtod(ds->start, &end); + + if (errno == ERANGE) { + return SetError(ds, -1, "Range error when decoding numeric as double"); + } - if (errno == ERANGE) { - return SetError(ds, -1, "Range error when decoding numeric as double"); - } - - ds->start = end; - return ds->dec->newDouble(ds->prv, value); + ds->start = end; + return ds->dec->newDouble(ds->prv, value); } JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { - int intNeg = 1; - JSUINT64 intValue; - JSUINT64 prevIntValue; - int chr; - int decimalCount = 0; - double frcValue = 0.0; - double expNeg; - double expValue; - char *offset = ds->start; - - JSUINT64 overflowLimit = LLONG_MAX; - + int intNeg = 1; + JSUINT64 intValue; + JSUINT64 prevIntValue; + int chr; + int decimalCount = 0; + double frcValue = 0.0; + double expNeg; + double expValue; + char *offset = ds->start; + + JSUINT64 overflowLimit = LLONG_MAX; + + if (*(offset) == 'I') { + goto DECODE_INF; + } else if (*(offset) == 'N') { + goto DECODE_NAN; + } else if (*(offset) == '-') { + offset++; + intNeg = -1; + overflowLimit = LLONG_MIN; if (*(offset) == 'I') { goto DECODE_INF; - } else if (*(offset) == 'N') { - goto DECODE_NAN; - } else if (*(offset) == '-') { - offset++; - intNeg = -1; - overflowLimit = LLONG_MIN; - if (*(offset) == 'I') { - goto DECODE_INF; - } } + } - // Scan integer part - intValue = 0; + // Scan integer part + intValue = 0; - while (1) { - chr = (int)(unsigned char)*(offset); + while (1) { + chr = (int)(unsigned char)*(offset); - switch (chr) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': 
{ - // PERF: Don't do 64-bit arithmetic here unless we have to - prevIntValue = intValue; - intValue = intValue * 10ULL + (JSLONG) (chr - 48); - - if (intNeg == 1 && prevIntValue > intValue) { - return SetError(ds, -1, "Value is too big!"); - } else if (intNeg == -1 && intValue > overflowLimit) { - return SetError(ds, -1, overflowLimit == LLONG_MAX ? - "Value is too big!" : "Value is too small"); - } - - offset++; - break; - } - case '.': { - offset++; - goto DECODE_FRACTION; - break; - } - case 'e': - case 'E': { - offset++; - goto DECODE_EXPONENT; - break; - } - - default: { - goto BREAK_INT_LOOP; - break; - } - } + switch (chr) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + // PERF: Don't do 64-bit arithmetic here unless we have to + prevIntValue = intValue; + intValue = intValue * 10ULL + (JSLONG)(chr - 48); + + if (intNeg == 1 && prevIntValue > intValue) { + return SetError(ds, -1, "Value is too big!"); + } else if (intNeg == -1 && intValue > overflowLimit) { + return SetError(ds, -1, + overflowLimit == LLONG_MAX ? "Value is too big!" + : "Value is too small"); + } + + offset++; + break; + } + case '.': { + offset++; + goto DECODE_FRACTION; + break; + } + case 'e': + case 'E': { + offset++; + goto DECODE_EXPONENT; + break; + } + + default: { + goto BREAK_INT_LOOP; + break; + } } + } BREAK_INT_LOOP: - ds->lastType = JT_INT; - ds->start = offset; + ds->lastType = JT_INT; + ds->start = offset; - if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0) - return ds->dec->newUnsignedLong(ds->prv, intValue); - else if ((intValue >> 31)) - return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg)); - else - return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg)); + if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0) + return ds->dec->newUnsignedLong(ds->prv, intValue); + else if ((intValue >> 31)) + return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg)); + else + return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg)); DECODE_FRACTION: - if (ds->dec->preciseFloat) { - return decodePreciseFloat(ds); + if (ds->dec->preciseFloat) { + return decodePreciseFloat(ds); + } + + // Scan fraction part + frcValue = 0.0; + for (;;) { + chr = (int)(unsigned char)*(offset); + + switch (chr) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) { + frcValue = frcValue * 10.0 + (double)(chr - 48); + decimalCount++; + } + offset++; + break; + } + case 'e': + case 'E': { + offset++; + goto DECODE_EXPONENT; + break; + } + default: { + goto BREAK_FRC_LOOP; } - - // Scan fraction part - frcValue = 0.0; - for (;;) { - chr = (int)(unsigned char)*(offset); - - switch (chr) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) { - frcValue = frcValue * 10.0 + (double)(chr - 48); - decimalCount++; - } - offset++; - break; - } - case 'e': - case 'E': { - offset++; - goto DECODE_EXPONENT; - break; - } - default: { goto BREAK_FRC_LOOP; } - } } + } BREAK_FRC_LOOP: - // FIXME: Check for arithmetic overflow here - ds->lastType = JT_DOUBLE; - ds->start = offset; - return ds->dec->newDouble( - ds->prv, - createDouble((double)intNeg, (double)intValue, frcValue, decimalCount)); + // FIXME: Check for arithmetic overflow here + ds->lastType = JT_DOUBLE; + ds->start = 
offset; + return ds->dec->newDouble( + ds->prv, + createDouble((double)intNeg, (double)intValue, frcValue, decimalCount)); DECODE_EXPONENT: - if (ds->dec->preciseFloat) { - return decodePreciseFloat(ds); - } + if (ds->dec->preciseFloat) { + return decodePreciseFloat(ds); + } - expNeg = 1.0; + expNeg = 1.0; - if (*(offset) == '-') { - expNeg = -1.0; - offset++; - } else if (*(offset) == '+') { - expNeg = +1.0; - offset++; - } - - expValue = 0.0; - - for (;;) { - chr = (int)(unsigned char)*(offset); - - switch (chr) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - expValue = expValue * 10.0 + (double)(chr - 48); - offset++; - break; - } - default: { goto BREAK_EXP_LOOP; } - } + if (*(offset) == '-') { + expNeg = -1.0; + offset++; + } else if (*(offset) == '+') { + expNeg = +1.0; + offset++; + } + + expValue = 0.0; + + for (;;) { + chr = (int)(unsigned char)*(offset); + + switch (chr) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + expValue = expValue * 10.0 + (double)(chr - 48); + offset++; + break; } + default: { + goto BREAK_EXP_LOOP; + } + } + } DECODE_NAN: - offset++; - if (*(offset++) != 'a') goto SET_NAN_ERROR; - if (*(offset++) != 'N') goto SET_NAN_ERROR; - - ds->lastType = JT_NULL; - ds->start = offset; - return ds->dec->newNull(ds->prv); + offset++; + if (*(offset++) != 'a') + goto SET_NAN_ERROR; + if (*(offset++) != 'N') + goto SET_NAN_ERROR; + + ds->lastType = JT_NULL; + ds->start = offset; + return ds->dec->newNull(ds->prv); SET_NAN_ERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'NaN'"); + return SetError(ds, -1, "Unexpected character found when decoding 'NaN'"); DECODE_INF: - offset++; - if (*(offset++) != 'n') goto SET_INF_ERROR; - if (*(offset++) != 'f') goto SET_INF_ERROR; - if (*(offset++) != 'i') goto SET_INF_ERROR; - if (*(offset++) != 'n') goto SET_INF_ERROR; - if (*(offset++) != 'i') goto SET_INF_ERROR; - if (*(offset++) != 't') goto SET_INF_ERROR; - if (*(offset++) != 'y') goto SET_INF_ERROR; - - ds->start = offset; - - if (intNeg == 1) { - ds->lastType = JT_POS_INF; - return ds->dec->newPosInf(ds->prv); - } else { - ds->lastType = JT_NEG_INF; - return ds->dec->newNegInf(ds->prv); - } + offset++; + if (*(offset++) != 'n') + goto SET_INF_ERROR; + if (*(offset++) != 'f') + goto SET_INF_ERROR; + if (*(offset++) != 'i') + goto SET_INF_ERROR; + if (*(offset++) != 'n') + goto SET_INF_ERROR; + if (*(offset++) != 'i') + goto SET_INF_ERROR; + if (*(offset++) != 't') + goto SET_INF_ERROR; + if (*(offset++) != 'y') + goto SET_INF_ERROR; + + ds->start = offset; + + if (intNeg == 1) { + ds->lastType = JT_POS_INF; + return ds->dec->newPosInf(ds->prv); + } else { + ds->lastType = JT_NEG_INF; + return ds->dec->newNegInf(ds->prv); + } SET_INF_ERROR: - if (intNeg == 1) { - const char *msg = "Unexpected character found when decoding 'Infinity'"; - return SetError(ds, -1, msg); - } else { - const char *msg = "Unexpected character found when decoding '-Infinity'"; - return SetError(ds, -1, msg); - } - + if (intNeg == 1) { + const char *msg = "Unexpected character found when decoding 'Infinity'"; + return SetError(ds, -1, msg); + } else { + const char *msg = "Unexpected character found when decoding '-Infinity'"; + return SetError(ds, -1, msg); + } BREAK_EXP_LOOP: - // FIXME: Check for arithmetic overflow here - ds->lastType = JT_DOUBLE; - ds->start = offset; - return ds->dec->newDouble( - ds->prv, - 
createDouble((double)intNeg, (double)intValue, frcValue, decimalCount) * - pow(10.0, expValue * expNeg)); + // FIXME: Check for arithmetic overflow here + ds->lastType = JT_DOUBLE; + ds->start = offset; + return ds->dec->newDouble( + ds->prv, + createDouble((double)intNeg, (double)intValue, frcValue, decimalCount) * + pow(10.0, expValue * expNeg)); } JSOBJ FASTCALL_MSVC decode_true(struct DecoderState *ds) { - char *offset = ds->start; - offset++; + char *offset = ds->start; + offset++; - if (*(offset++) != 'r') goto SETERROR; - if (*(offset++) != 'u') goto SETERROR; - if (*(offset++) != 'e') goto SETERROR; - - ds->lastType = JT_TRUE; - ds->start = offset; - return ds->dec->newTrue(ds->prv); + if (*(offset++) != 'r') + goto SETERROR; + if (*(offset++) != 'u') + goto SETERROR; + if (*(offset++) != 'e') + goto SETERROR; + + ds->lastType = JT_TRUE; + ds->start = offset; + return ds->dec->newTrue(ds->prv); SETERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'true'"); + return SetError(ds, -1, "Unexpected character found when decoding 'true'"); } JSOBJ FASTCALL_MSVC decode_false(struct DecoderState *ds) { - char *offset = ds->start; - offset++; + char *offset = ds->start; + offset++; - if (*(offset++) != 'a') goto SETERROR; - if (*(offset++) != 'l') goto SETERROR; - if (*(offset++) != 's') goto SETERROR; - if (*(offset++) != 'e') goto SETERROR; - - ds->lastType = JT_FALSE; - ds->start = offset; - return ds->dec->newFalse(ds->prv); + if (*(offset++) != 'a') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + if (*(offset++) != 's') + goto SETERROR; + if (*(offset++) != 'e') + goto SETERROR; + + ds->lastType = JT_FALSE; + ds->start = offset; + return ds->dec->newFalse(ds->prv); SETERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'false'"); + return SetError(ds, -1, "Unexpected character found when decoding 'false'"); } JSOBJ FASTCALL_MSVC decode_null(struct DecoderState *ds) { - char *offset = ds->start; - offset++; + char *offset = ds->start; + offset++; - if (*(offset++) != 'u') goto SETERROR; - if (*(offset++) != 'l') goto SETERROR; - if (*(offset++) != 'l') goto SETERROR; - - ds->lastType = JT_NULL; - ds->start = offset; - return ds->dec->newNull(ds->prv); + if (*(offset++) != 'u') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + if (*(offset++) != 'l') + goto SETERROR; + + ds->lastType = JT_NULL; + ds->start = offset; + return ds->dec->newNull(ds->prv); SETERROR: - return SetError(ds, -1, "Unexpected character found when decoding 'null'"); + return SetError(ds, -1, "Unexpected character found when decoding 'null'"); } void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) { - char *offset; - - for (offset = ds->start; (ds->end - offset) > 0; offset++) { - switch (*offset) { - case ' ': - case '\t': - case '\r': - case '\n': - break; - - default: - ds->start = offset; - return; - } - } + char *offset; - if (offset == ds->end) { - ds->start = ds->end; - } + for (offset = ds->start; (ds->end - offset) > 0; offset++) { + switch (*offset) { + case ' ': + case '\t': + case '\r': + case '\n': + break; + + default: + ds->start = offset; + return; + } + } + + if (offset == ds->end) { + ds->start = ds->end; + } } enum DECODESTRINGSTATE { - DS_ISNULL = 0x32, - DS_ISQUOTE, - DS_ISESCAPE, - DS_UTFLENERROR, + DS_ISNULL = 0x32, + DS_ISQUOTE, + DS_ISESCAPE, + DS_UTFLENERROR, }; static const JSUINT8 g_decoderLookup[256] = { @@ -678,531 +702,520 @@ }; JSOBJ FASTCALL_MSVC decode_string(struct DecoderState *ds) { - JSUTF16 sur[2] = 
{0}; - int iSur = 0; - int index; - wchar_t *escOffset; - wchar_t *escStart; - size_t escLen = (ds->escEnd - ds->escStart); - JSUINT8 *inputOffset; - JSUINT8 oct; - JSUTF32 ucs; - ds->lastType = JT_INVALID; - ds->start++; + JSUTF16 sur[2] = {0}; + int iSur = 0; + int index; + wchar_t *escOffset; + wchar_t *escStart; + size_t escLen = (ds->escEnd - ds->escStart); + JSUINT8 *inputOffset; + JSUINT8 oct; + JSUTF32 ucs; + ds->lastType = JT_INVALID; + ds->start++; + + if ((size_t)(ds->end - ds->start) > escLen) { + size_t newSize = (ds->end - ds->start); + + if (ds->escHeap) { + if (newSize > (SIZE_MAX / sizeof(wchar_t))) { + return SetError(ds, -1, "Could not reserve memory block"); + } + escStart = + (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t)); + if (!escStart) { + ds->dec->free(ds->escStart); + return SetError(ds, -1, "Could not reserve memory block"); + } + ds->escStart = escStart; + } else { + wchar_t *oldStart = ds->escStart; + if (newSize > (SIZE_MAX / sizeof(wchar_t))) { + return SetError(ds, -1, "Could not reserve memory block"); + } + ds->escStart = (wchar_t *)ds->dec->malloc(newSize * sizeof(wchar_t)); + if (!ds->escStart) { + return SetError(ds, -1, "Could not reserve memory block"); + } + ds->escHeap = 1; + memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t)); + } + + ds->escEnd = ds->escStart + newSize; + } + + escOffset = ds->escStart; + inputOffset = (JSUINT8 *)ds->start; + + for (;;) { + switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) { + case DS_ISNULL: { + return SetError(ds, -1, "Unmatched ''\"' when when decoding 'string'"); + } + case DS_ISQUOTE: { + ds->lastType = JT_UTF8; + inputOffset++; + ds->start += ((char *)inputOffset - (ds->start)); + return ds->dec->newString(ds->prv, ds->escStart, escOffset); + } + case DS_UTFLENERROR: { + return SetError(ds, -1, + "Invalid UTF-8 sequence length when decoding 'string'"); + } + case DS_ISESCAPE: + inputOffset++; + switch (*inputOffset) { + case '\\': + *(escOffset++) = L'\\'; + inputOffset++; + continue; + case '\"': + *(escOffset++) = L'\"'; + inputOffset++; + continue; + case '/': + *(escOffset++) = L'/'; + inputOffset++; + continue; + case 'b': + *(escOffset++) = L'\b'; + inputOffset++; + continue; + case 'f': + *(escOffset++) = L'\f'; + inputOffset++; + continue; + case 'n': + *(escOffset++) = L'\n'; + inputOffset++; + continue; + case 'r': + *(escOffset++) = L'\r'; + inputOffset++; + continue; + case 't': + *(escOffset++) = L'\t'; + inputOffset++; + continue; + + case 'u': { + int index; + inputOffset++; + + for (index = 0; index < 4; index++) { + switch (*inputOffset) { + case '\0': + return SetError(ds, -1, + "Unterminated unicode " + "escape sequence when " + "decoding 'string'"); + default: + return SetError(ds, -1, + "Unexpected character in " + "unicode escape sequence " + "when decoding 'string'"); + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + sur[iSur] = (sur[iSur] << 4) + (JSUTF16)(*inputOffset - '0'); + break; + + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16)(*inputOffset - 'a'); + break; + + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16)(*inputOffset - 'A'); + break; + } - if ((size_t)(ds->end - ds->start) > escLen) { - size_t newSize = (ds->end - ds->start); + inputOffset++; + } - if (ds->escHeap) { - if (newSize > (SIZE_MAX / sizeof(wchar_t))) { - return 
SetError(ds, -1, "Could not reserve memory block"); - } - escStart = (wchar_t *)ds->dec->realloc(ds->escStart, - newSize * sizeof(wchar_t)); - if (!escStart) { - ds->dec->free(ds->escStart); - return SetError(ds, -1, "Could not reserve memory block"); - } - ds->escStart = escStart; + if (iSur == 0) { + if ((sur[iSur] & 0xfc00) == 0xd800) { + // First of a surrogate pair, continue parsing + iSur++; + break; + } + (*escOffset++) = (wchar_t)sur[iSur]; + iSur = 0; } else { - wchar_t *oldStart = ds->escStart; - if (newSize > (SIZE_MAX / sizeof(wchar_t))) { - return SetError(ds, -1, "Could not reserve memory block"); - } - ds->escStart = - (wchar_t *)ds->dec->malloc(newSize * sizeof(wchar_t)); - if (!ds->escStart) { - return SetError(ds, -1, "Could not reserve memory block"); - } - ds->escHeap = 1; - memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t)); + // Decode pair + if ((sur[1] & 0xfc00) != 0xdc00) { + return SetError(ds, -1, + "Unpaired high surrogate when " + "decoding 'string'"); + } +#if WCHAR_MAX == 0xffff + (*escOffset++) = (wchar_t)sur[0]; + (*escOffset++) = (wchar_t)sur[1]; +#else + (*escOffset++) = (wchar_t)0x10000 + + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); +#endif + iSur = 0; } + break; + } - ds->escEnd = ds->escStart + newSize; - } + case '\0': + return SetError(ds, -1, + "Unterminated escape sequence when " + "decoding 'string'"); + default: + return SetError(ds, -1, + "Unrecognized escape sequence when " + "decoding 'string'"); + } + break; + + case 1: { + *(escOffset++) = (wchar_t)(*inputOffset++); + break; + } + + case 2: { + ucs = (*inputOffset++) & 0x1f; + ucs <<= 6; + if (((*inputOffset) & 0x80) != 0x80) { + return SetError(ds, -1, + "Invalid octet in UTF-8 sequence when " + "decoding 'string'"); + } + ucs |= (*inputOffset++) & 0x3f; + if (ucs < 0x80) + return SetError(ds, -1, + "Overlong 2 byte UTF-8 sequence detected " + "when decoding 'string'"); + *(escOffset++) = (wchar_t)ucs; + break; + } + + case 3: { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x0f; + + for (index = 0; index < 2; index++) { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) { + return SetError(ds, -1, + "Invalid octet in UTF-8 sequence when " + "decoding 'string'"); + } - escOffset = ds->escStart; - inputOffset = (JSUINT8 *)ds->start; + ucs |= oct & 0x3f; + } - for (;;) { - switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) { - case DS_ISNULL: { - return SetError(ds, -1, - "Unmatched ''\"' when when decoding 'string'"); - } - case DS_ISQUOTE: { - ds->lastType = JT_UTF8; - inputOffset++; - ds->start += ((char *)inputOffset - (ds->start)); - return ds->dec->newString(ds->prv, ds->escStart, escOffset); - } - case DS_UTFLENERROR: { - return SetError( - ds, -1, - "Invalid UTF-8 sequence length when decoding 'string'"); - } - case DS_ISESCAPE: - inputOffset++; - switch (*inputOffset) { - case '\\': - *(escOffset++) = L'\\'; - inputOffset++; - continue; - case '\"': - *(escOffset++) = L'\"'; - inputOffset++; - continue; - case '/': - *(escOffset++) = L'/'; - inputOffset++; - continue; - case 'b': - *(escOffset++) = L'\b'; - inputOffset++; - continue; - case 'f': - *(escOffset++) = L'\f'; - inputOffset++; - continue; - case 'n': - *(escOffset++) = L'\n'; - inputOffset++; - continue; - case 'r': - *(escOffset++) = L'\r'; - inputOffset++; - continue; - case 't': - *(escOffset++) = L'\t'; - inputOffset++; - continue; - - case 'u': { - int index; - inputOffset++; - - for (index = 0; index < 4; index++) { - switch (*inputOffset) { - case '\0': - return SetError(ds, -1, - 
"Unterminated unicode " - "escape sequence when " - "decoding 'string'"); - default: - return SetError(ds, -1, - "Unexpected character in " - "unicode escape sequence " - "when decoding 'string'"); - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - sur[iSur] = (sur[iSur] << 4) + - (JSUTF16)(*inputOffset - '0'); - break; - - case 'a': - case 'b': - case 'c': - case 'd': - case 'e': - case 'f': - sur[iSur] = (sur[iSur] << 4) + 10 + - (JSUTF16)(*inputOffset - 'a'); - break; - - case 'A': - case 'B': - case 'C': - case 'D': - case 'E': - case 'F': - sur[iSur] = (sur[iSur] << 4) + 10 + - (JSUTF16)(*inputOffset - 'A'); - break; - } - - inputOffset++; - } - - if (iSur == 0) { - if ((sur[iSur] & 0xfc00) == 0xd800) { - // First of a surrogate pair, continue parsing - iSur++; - break; - } - (*escOffset++) = (wchar_t)sur[iSur]; - iSur = 0; - } else { - // Decode pair - if ((sur[1] & 0xfc00) != 0xdc00) { - return SetError(ds, -1, - "Unpaired high surrogate when " - "decoding 'string'"); - } -#if WCHAR_MAX == 0xffff - (*escOffset++) = (wchar_t)sur[0]; - (*escOffset++) = (wchar_t)sur[1]; -#else - (*escOffset++) = - (wchar_t)0x10000 + - (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); -#endif - iSur = 0; - } - break; - } - - case '\0': - return SetError(ds, -1, - "Unterminated escape sequence when " - "decoding 'string'"); - default: - return SetError(ds, -1, - "Unrecognized escape sequence when " - "decoding 'string'"); - } - break; - - case 1: { - *(escOffset++) = (wchar_t)(*inputOffset++); - break; - } - - case 2: { - ucs = (*inputOffset++) & 0x1f; - ucs <<= 6; - if (((*inputOffset) & 0x80) != 0x80) { - return SetError(ds, -1, - "Invalid octet in UTF-8 sequence when " - "decoding 'string'"); - } - ucs |= (*inputOffset++) & 0x3f; - if (ucs < 0x80) - return SetError(ds, -1, - "Overlong 2 byte UTF-8 sequence detected " - "when decoding 'string'"); - *(escOffset++) = (wchar_t)ucs; - break; - } - - case 3: { - JSUTF32 ucs = 0; - ucs |= (*inputOffset++) & 0x0f; - - for (index = 0; index < 2; index++) { - ucs <<= 6; - oct = (*inputOffset++); - - if ((oct & 0x80) != 0x80) { - return SetError(ds, -1, - "Invalid octet in UTF-8 sequence when " - "decoding 'string'"); - } - - ucs |= oct & 0x3f; - } - - if (ucs < 0x800) - return SetError(ds, -1, - "Overlong 3 byte UTF-8 sequence detected " - "when encoding string"); - *(escOffset++) = (wchar_t)ucs; - break; - } - - case 4: { - JSUTF32 ucs = 0; - ucs |= (*inputOffset++) & 0x07; - - for (index = 0; index < 3; index++) { - ucs <<= 6; - oct = (*inputOffset++); - - if ((oct & 0x80) != 0x80) { - return SetError(ds, -1, - "Invalid octet in UTF-8 sequence when " - "decoding 'string'"); - } - - ucs |= oct & 0x3f; - } - - if (ucs < 0x10000) - return SetError(ds, -1, - "Overlong 4 byte UTF-8 sequence detected " - "when decoding 'string'"); + if (ucs < 0x800) + return SetError(ds, -1, + "Overlong 3 byte UTF-8 sequence detected " + "when encoding string"); + *(escOffset++) = (wchar_t)ucs; + break; + } + + case 4: { + JSUTF32 ucs = 0; + ucs |= (*inputOffset++) & 0x07; + + for (index = 0; index < 3; index++) { + ucs <<= 6; + oct = (*inputOffset++); + + if ((oct & 0x80) != 0x80) { + return SetError(ds, -1, + "Invalid octet in UTF-8 sequence when " + "decoding 'string'"); + } + + ucs |= oct & 0x3f; + } + + if (ucs < 0x10000) + return SetError(ds, -1, + "Overlong 4 byte UTF-8 sequence detected " + "when decoding 'string'"); #if WCHAR_MAX == 0xffff - if (ucs >= 0x10000) { - ucs -= 0x10000; - *(escOffset++) = 
(wchar_t)(ucs >> 10) + 0xd800; - *(escOffset++) = (wchar_t)(ucs & 0x3ff) + 0xdc00; - } else { - *(escOffset++) = (wchar_t)ucs; - } + if (ucs >= 0x10000) { + ucs -= 0x10000; + *(escOffset++) = (wchar_t)(ucs >> 10) + 0xd800; + *(escOffset++) = (wchar_t)(ucs & 0x3ff) + 0xdc00; + } else { + *(escOffset++) = (wchar_t)ucs; + } #else - *(escOffset++) = (wchar_t)ucs; + *(escOffset++) = (wchar_t)ucs; #endif - break; - } - } + break; + } } + } } JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) { - JSOBJ itemValue; - JSOBJ newObj; - int len; - ds->objDepth++; - if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { - return SetError(ds, -1, "Reached object decoding depth limit"); - } - - newObj = ds->dec->newArray(ds->prv, ds->dec); - len = 0; - - ds->lastType = JT_INVALID; - ds->start++; + JSOBJ itemValue; + JSOBJ newObj; + int len; + ds->objDepth++; + if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { + return SetError(ds, -1, "Reached object decoding depth limit"); + } - for (;;) { - SkipWhitespace(ds); + newObj = ds->dec->newArray(ds->prv, ds->dec); + len = 0; - if ((*ds->start) == ']') { - ds->objDepth--; - if (len == 0) { - ds->start++; - return ds->dec->endArray(ds->prv, newObj); - } - - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return SetError( - ds, -1, - "Unexpected character found when decoding array value (1)"); - } + ds->lastType = JT_INVALID; + ds->start++; - itemValue = decode_any(ds); + for (;;) { + SkipWhitespace(ds); - if (itemValue == NULL) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return NULL; - } + if ((*ds->start) == ']') { + ds->objDepth--; + if (len == 0) { + ds->start++; + return ds->dec->endArray(ds->prv, newObj); + } - if (!ds->dec->arrayAddItem(ds->prv, newObj, itemValue)) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return NULL; - } + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return SetError( + ds, -1, "Unexpected character found when decoding array value (1)"); + } - SkipWhitespace(ds); + itemValue = decode_any(ds); - switch (*(ds->start++)) { - case ']': { - ds->objDepth--; - return ds->dec->endArray(ds->prv, newObj); - } - case ',': - break; - - default: - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return SetError( - ds, -1, - "Unexpected character found when decoding array value (2)"); - } + if (itemValue == NULL) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return NULL; + } - len++; + if (!ds->dec->arrayAddItem(ds->prv, newObj, itemValue)) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return NULL; } -} -JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) { - JSOBJ itemName; - JSOBJ itemValue; - JSOBJ newObj; + SkipWhitespace(ds); - ds->objDepth++; - if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { - return SetError(ds, -1, "Reached object decoding depth limit"); + switch (*(ds->start++)) { + case ']': { + ds->objDepth--; + return ds->dec->endArray(ds->prv, newObj); } + case ',': + break; - newObj = ds->dec->newObject(ds->prv, ds->dec); + default: + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return SetError( + ds, -1, "Unexpected character found when decoding array value (2)"); + } - ds->start++; + len++; + } +} - for (;;) { - SkipWhitespace(ds); +JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) { + JSOBJ itemName; + JSOBJ itemValue; + JSOBJ newObj; + + ds->objDepth++; + if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) { + return SetError(ds, -1, "Reached object decoding depth limit"); + } + + newObj = ds->dec->newObject(ds->prv, ds->dec); + + ds->start++; + + for (;;) { + 
SkipWhitespace(ds); + + if ((*ds->start) == '}') { + ds->objDepth--; + ds->start++; + return ds->dec->endObject(ds->prv, newObj); + } - if ((*ds->start) == '}') { - ds->objDepth--; - ds->start++; - return ds->dec->endObject(ds->prv, newObj); - } + ds->lastType = JT_INVALID; + itemName = decode_any(ds); - ds->lastType = JT_INVALID; - itemName = decode_any(ds); + if (itemName == NULL) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return NULL; + } - if (itemName == NULL) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return NULL; - } + if (ds->lastType != JT_UTF8) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + return SetError( + ds, -1, "Key name of object must be 'string' when decoding 'object'"); + } - if (ds->lastType != JT_UTF8) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - return SetError( - ds, -1, - "Key name of object must be 'string' when decoding 'object'"); - } + SkipWhitespace(ds); - SkipWhitespace(ds); + if (*(ds->start++) != ':') { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + return SetError(ds, -1, "No ':' found when decoding object value"); + } - if (*(ds->start++) != ':') { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - return SetError(ds, -1, "No ':' found when decoding object value"); - } + SkipWhitespace(ds); - SkipWhitespace(ds); + itemValue = decode_any(ds); - itemValue = decode_any(ds); + if (itemValue == NULL) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + return NULL; + } - if (itemValue == NULL) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - return NULL; - } + if (!ds->dec->objectAddKey(ds->prv, newObj, itemName, itemValue)) { + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + ds->dec->releaseObject(ds->prv, itemName, ds->dec); + ds->dec->releaseObject(ds->prv, itemValue, ds->dec); + return NULL; + } - if (!ds->dec->objectAddKey(ds->prv, newObj, itemName, itemValue)) { - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - ds->dec->releaseObject(ds->prv, itemName, ds->dec); - ds->dec->releaseObject(ds->prv, itemValue, ds->dec); - return NULL; - } + SkipWhitespace(ds); - SkipWhitespace(ds); + switch (*(ds->start++)) { + case '}': { + ds->objDepth--; + return ds->dec->endObject(ds->prv, newObj); + } + case ',': + break; - switch (*(ds->start++)) { - case '}': { - ds->objDepth--; - return ds->dec->endObject(ds->prv, newObj); - } - case ',': - break; - - default: - ds->dec->releaseObject(ds->prv, newObj, ds->dec); - return SetError( - ds, -1, - "Unexpected character found when decoding object value"); - } + default: + ds->dec->releaseObject(ds->prv, newObj, ds->dec); + return SetError(ds, -1, + "Unexpected character found when decoding object value"); } + } } JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) { - for (;;) { - switch (*ds->start) { - case '\"': - return decode_string(ds); - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case 'I': - case 'N': - case '-': - return decode_numeric(ds); - - case '[': - return decode_array(ds); - case '{': - return decode_object(ds); - case 't': - return decode_true(ds); - case 'f': - return decode_false(ds); - case 'n': - return decode_null(ds); - - case ' ': 
- case '\t': - case '\r': - case '\n': - // White space - ds->start++; - break; + for (;;) { + switch (*ds->start) { + case '\"': + return decode_string(ds); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case 'I': + case 'N': + case '-': + return decode_numeric(ds); + + case '[': + return decode_array(ds); + case '{': + return decode_object(ds); + case 't': + return decode_true(ds); + case 'f': + return decode_false(ds); + case 'n': + return decode_null(ds); + + case ' ': + case '\t': + case '\r': + case '\n': + // White space + ds->start++; + break; - default: - return SetError(ds, -1, "Expected object or value"); - } + default: + return SetError(ds, -1, "Expected object or value"); } + } } JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer) { - /* - FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode - escaping doesn't run into the wall each time */ - char *locale; - struct DecoderState ds; - wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; - JSOBJ ret; - - ds.start = (char *)buffer; - ds.end = ds.start + cbBuffer; - - ds.escStart = escBuffer; - ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); - ds.escHeap = 0; - ds.prv = dec->prv; - ds.dec = dec; - ds.dec->errorStr = NULL; - ds.dec->errorOffset = NULL; - ds.objDepth = 0; - - ds.dec = dec; - - locale = setlocale(LC_NUMERIC, NULL); - if (!locale) { - return SetError(&ds, -1, "setlocale call failed"); - } - - if (strcmp(locale, "C")) { - size_t len = strlen(locale) + 1; - char *saved_locale = malloc(len); - if (saved_locale == NULL) { - return SetError(&ds, -1, "Could not reserve memory block"); - } - memcpy(saved_locale, locale, len); - setlocale(LC_NUMERIC, "C"); - ret = decode_any(&ds); - setlocale(LC_NUMERIC, saved_locale); - free(saved_locale); - } else { - ret = decode_any(&ds); - } - - if (ds.escHeap) { - dec->free(ds.escStart); - } - - SkipWhitespace(&ds); - - if (ds.start != ds.end && ret) { - dec->releaseObject(ds.prv, ret, ds.dec); - return SetError(&ds, -1, "Trailing data"); - } + /* + FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode + escaping doesn't run into the wall each time */ + char *locale; + struct DecoderState ds; + wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; + JSOBJ ret; + + ds.start = (char *)buffer; + ds.end = ds.start + cbBuffer; + + ds.escStart = escBuffer; + ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); + ds.escHeap = 0; + ds.prv = dec->prv; + ds.dec = dec; + ds.dec->errorStr = NULL; + ds.dec->errorOffset = NULL; + ds.objDepth = 0; + + ds.dec = dec; + + locale = setlocale(LC_NUMERIC, NULL); + if (!locale) { + return SetError(&ds, -1, "setlocale call failed"); + } + + if (strcmp(locale, "C")) { + size_t len = strlen(locale) + 1; + char *saved_locale = malloc(len); + if (saved_locale == NULL) { + return SetError(&ds, -1, "Could not reserve memory block"); + } + memcpy(saved_locale, locale, len); + setlocale(LC_NUMERIC, "C"); + ret = decode_any(&ds); + setlocale(LC_NUMERIC, saved_locale); + free(saved_locale); + } else { + ret = decode_any(&ds); + } + + if (ds.escHeap) { + dec->free(ds.escStart); + } + + SkipWhitespace(&ds); + + if (ds.start != ds.end && ret) { + dec->releaseObject(ds.prv, ret, ds.dec); + return SetError(&ds, -1, "Trailing data"); + } - return ret; + return ret; } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c 
pandas-2.2.2+dfsg/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c --- pandas-2.1.4+dfsg/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c 2024-04-10 17:42:52.000000000 +0000 @@ -38,14 +38,16 @@ * Copyright (c) 1994 Sun Microsystems, Inc. */ -#include -#include +// Licence at LICENSES/ULTRAJSON_LICENSE + +#include "pandas/portable.h" +#include "pandas/vendored/ujson/lib/ultrajson.h" #include #include +#include #include #include #include -#include "pandas/vendored/ujson/lib/ultrajson.h" #ifndef TRUE #define TRUE 1 @@ -69,7 +71,7 @@ The extra 2 bytes are for the quotes around the string */ -#define RESERVE_STRING(_len) (2 + ((_len)*6)) +#define RESERVE_STRING(_len) (2 + ((_len) * 6)) static const double g_pow10[] = {1, 10, @@ -356,8 +358,8 @@ 1}; static void SetError(JSOBJ obj, JSONObjectEncoder *enc, const char *message) { - enc->errorMsg = message; - enc->errorObj = obj; + enc->errorMsg = message; + enc->errorObj = obj; } /* @@ -365,371 +367,364 @@ make an estimate That way we won't run our head into the wall each call */ void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded) { - size_t curSize = enc->end - enc->start; - size_t newSize = curSize * 2; - size_t offset = enc->offset - enc->start; - - while (newSize < curSize + cbNeeded) { - newSize *= 2; - } + size_t curSize = enc->end - enc->start; + size_t newSize = curSize * 2; + size_t offset = enc->offset - enc->start; - if (enc->heap) { - enc->start = (char *)enc->realloc(enc->start, newSize); - if (!enc->start) { - SetError(NULL, enc, "Could not reserve memory block"); - return; - } - } else { - char *oldStart = enc->start; - enc->heap = 1; - enc->start = (char *)enc->malloc(newSize); - if (!enc->start) { - SetError(NULL, enc, "Could not reserve memory block"); - return; - } - memcpy(enc->start, oldStart, offset); + while (newSize < curSize + cbNeeded) { + newSize *= 2; + } + + if (enc->heap) { + enc->start = (char *)enc->realloc(enc->start, newSize); + if (!enc->start) { + SetError(NULL, enc, "Could not reserve memory block"); + return; + } + } else { + char *oldStart = enc->start; + enc->heap = 1; + enc->start = (char *)enc->malloc(newSize); + if (!enc->start) { + SetError(NULL, enc, "Could not reserve memory block"); + return; } - enc->offset = enc->start + offset; - enc->end = enc->start + newSize; + memcpy(enc->start, oldStart, offset); + } + enc->offset = enc->start + offset; + enc->end = enc->start + newSize; } INLINE_PREFIX void FASTCALL_MSVC Buffer_AppendShortHexUnchecked(char *outputOffset, unsigned short value) { - *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12]; - *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8]; - *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4]; - *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; + *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12]; + *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8]; + *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4]; + *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0]; } int Buffer_EscapeStringUnvalidated(JSONObjectEncoder *enc, const char *io, const char *end) { - char *of = (char *)enc->offset; + char *of = (char *)enc->offset; - for (;;) { - switch (*io) { - case 0x00: { - if (io < end) { - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - break; - } else { - enc->offset += (of - enc->offset); - return TRUE; - } - } - case '\"': - (*of++) = '\\'; - (*of++) = 
'\"'; - break; - case '\\': - (*of++) = '\\'; - (*of++) = '\\'; - break; - case '/': - (*of++) = '\\'; - (*of++) = '/'; - break; - case '\b': - (*of++) = '\\'; - (*of++) = 'b'; - break; - case '\f': - (*of++) = '\\'; - (*of++) = 'f'; - break; - case '\n': - (*of++) = '\\'; - (*of++) = 'n'; - break; - case '\r': - (*of++) = '\\'; - (*of++) = 'r'; - break; - case '\t': - (*of++) = '\\'; - (*of++) = 't'; - break; - - case 0x26: // '/' - case 0x3c: // '<' - case 0x3e: // '>' - { - if (enc->encodeHTMLChars) { - // Fall through to \u00XX case below. - } else { - // Same as default case below. - (*of++) = (*io); - break; - } - } - case 0x01: - case 0x02: - case 0x03: - case 0x04: - case 0x05: - case 0x06: - case 0x07: - case 0x0b: - case 0x0e: - case 0x0f: - case 0x10: - case 0x11: - case 0x12: - case 0x13: - case 0x14: - case 0x15: - case 0x16: - case 0x17: - case 0x18: - case 0x19: - case 0x1a: - case 0x1b: - case 0x1c: - case 0x1d: - case 0x1e: - case 0x1f: { - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)]; - *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)]; - break; - } - default: - (*of++) = (*io); - break; - } - io++; + for (;;) { + switch (*io) { + case 0x00: { + if (io < end) { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + break; + } else { + enc->offset += (of - enc->offset); + return TRUE; + } + } + case '\"': + (*of++) = '\\'; + (*of++) = '\"'; + break; + case '\\': + (*of++) = '\\'; + (*of++) = '\\'; + break; + case '/': + (*of++) = '\\'; + (*of++) = '/'; + break; + case '\b': + (*of++) = '\\'; + (*of++) = 'b'; + break; + case '\f': + (*of++) = '\\'; + (*of++) = 'f'; + break; + case '\n': + (*of++) = '\\'; + (*of++) = 'n'; + break; + case '\r': + (*of++) = '\\'; + (*of++) = 'r'; + break; + case '\t': + (*of++) = '\\'; + (*of++) = 't'; + break; + + case 0x26: // '/' + case 0x3c: // '<' + case 0x3e: // '>' + { + if (enc->encodeHTMLChars) { + // Fall through to \u00XX case below. + PD_FALLTHROUGH; + } else { + // Same as default case below. 
+ (*of++) = (*io); + break; + } + } + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x0b: + case 0x0e: + case 0x0f: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1a: + case 0x1b: + case 0x1c: + case 0x1d: + case 0x1e: + case 0x1f: { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)]; + break; + } + default: + (*of++) = (*io); + break; } + io++; + } } int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, const char *io, const char *end) { - JSUTF32 ucs; - char *of = (char *)enc->offset; + JSUTF32 ucs; + char *of = (char *)enc->offset; + + for (;;) { + JSUINT8 utflen = g_asciiOutputTable[(unsigned char)*io]; + + switch (utflen) { + case 0: { + if (io < end) { + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = '0'; + io++; + continue; + } else { + enc->offset += (of - enc->offset); + return TRUE; + } + } - for (;;) { - JSUINT8 utflen = g_asciiOutputTable[(unsigned char)*io]; + case 1: { + *(of++) = (*io++); + continue; + } + + case 2: { + JSUTF32 in; + JSUTF16 in16; - switch (utflen) { - case 0: { - if (io < end) { - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = '0'; - io++; - continue; - } else { - enc->offset += (of - enc->offset); - return TRUE; - } - } - - case 1: { - *(of++) = (*io++); - continue; - } - - case 2: { - JSUTF32 in; - JSUTF16 in16; - - if (end - io < 1) { - enc->offset += (of - enc->offset); - SetError( - obj, enc, - "Unterminated UTF-8 sequence when encoding string"); - return FALSE; - } + if (end - io < 1) { + enc->offset += (of - enc->offset); + SetError(obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } - memcpy(&in16, io, sizeof(JSUTF16)); - in = (JSUTF32)in16; + memcpy(&in16, io, sizeof(JSUTF16)); + in = (JSUTF32)in16; #ifdef __LITTLE_ENDIAN__ - ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f); + ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f); #else - ucs = ((in & 0x1f00) >> 2) | (in & 0x3f); + ucs = ((in & 0x1f00) >> 2) | (in & 0x3f); #endif - if (ucs < 0x80) { - enc->offset += (of - enc->offset); - SetError(obj, enc, - "Overlong 2 byte UTF-8 sequence detected when " - "encoding string"); - return FALSE; - } - - io += 2; - break; - } - - case 3: { - JSUTF32 in; - JSUTF16 in16; - JSUINT8 in8; - - if (end - io < 2) { - enc->offset += (of - enc->offset); - SetError( - obj, enc, - "Unterminated UTF-8 sequence when encoding string"); - return FALSE; - } + if (ucs < 0x80) { + enc->offset += (of - enc->offset); + SetError(obj, enc, + "Overlong 2 byte UTF-8 sequence detected when " + "encoding string"); + return FALSE; + } + + io += 2; + break; + } + + case 3: { + JSUTF32 in; + JSUTF16 in16; + JSUINT8 in8; + + if (end - io < 2) { + enc->offset += (of - enc->offset); + SetError(obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } - memcpy(&in16, io, sizeof(JSUTF16)); - memcpy(&in8, io + 2, sizeof(JSUINT8)); + memcpy(&in16, io, sizeof(JSUTF16)); + memcpy(&in8, io + 2, sizeof(JSUINT8)); #ifdef __LITTLE_ENDIAN__ - in = (JSUTF32)in16; - in |= in8 << 16; - ucs = ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) | - ((in & 0x3f0000) >> 16); + in = (JSUTF32)in16; + in |= in8 << 16; + ucs = + ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) | ((in & 0x3f0000) 
>> 16); #else - in = in16 << 8; - in |= in8; - ucs = - ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); + in = in16 << 8; + in |= in8; + ucs = ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f); #endif - if (ucs < 0x800) { - enc->offset += (of - enc->offset); - SetError(obj, enc, - "Overlong 3 byte UTF-8 sequence detected when " - "encoding string"); - return FALSE; - } - - io += 3; - break; - } - case 4: { - JSUTF32 in; - - if (end - io < 3) { - enc->offset += (of - enc->offset); - SetError( - obj, enc, - "Unterminated UTF-8 sequence when encoding string"); - return FALSE; - } + if (ucs < 0x800) { + enc->offset += (of - enc->offset); + SetError(obj, enc, + "Overlong 3 byte UTF-8 sequence detected when " + "encoding string"); + return FALSE; + } + + io += 3; + break; + } + case 4: { + JSUTF32 in; + + if (end - io < 3) { + enc->offset += (of - enc->offset); + SetError(obj, enc, "Unterminated UTF-8 sequence when encoding string"); + return FALSE; + } - memcpy(&in, io, sizeof(JSUTF32)); + memcpy(&in, io, sizeof(JSUTF32)); #ifdef __LITTLE_ENDIAN__ - ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) | - ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24); + ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) | + ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24); #else - ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) | - ((in & 0x3f00) >> 2) | (in & 0x3f); + ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) | + ((in & 0x3f00) >> 2) | (in & 0x3f); #endif - if (ucs < 0x10000) { - enc->offset += (of - enc->offset); - SetError(obj, enc, - "Overlong 4 byte UTF-8 sequence detected when " - "encoding string"); - return FALSE; - } - - io += 4; - break; - } - - case 5: - case 6: { - enc->offset += (of - enc->offset); - SetError( - obj, enc, - "Unsupported UTF-8 sequence length when encoding string"); - return FALSE; - } - - case 29: { - if (enc->encodeHTMLChars) { - // Fall through to \u00XX case 30 below. - } else { - // Same as case 1 above. - *(of++) = (*io++); - continue; - } - } - - case 30: { - // \uXXXX encode - *(of++) = '\\'; - *(of++) = 'u'; - *(of++) = '0'; - *(of++) = '0'; - *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)]; - *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)]; - io++; - continue; - } - case 10: - case 12: - case 14: - case 16: - case 18: - case 20: - case 22: - case 24: { - *(of++) = *((char *)(g_escapeChars + utflen + 0)); - *(of++) = *((char *)(g_escapeChars + utflen + 1)); - io++; - continue; - } - // This can never happen, it's here to make L4 VC++ happy - default: { - ucs = 0; - break; - } - } - - /* - If the character is a UTF8 sequence of length > 1 we end up here */ - if (ucs >= 0x10000) { - ucs -= 0x10000; - *(of++) = '\\'; - *(of++) = 'u'; - Buffer_AppendShortHexUnchecked( - of, (unsigned short)(ucs >> 10) + 0xd800); - of += 4; - - *(of++) = '\\'; - *(of++) = 'u'; - Buffer_AppendShortHexUnchecked( - of, (unsigned short)(ucs & 0x3ff) + 0xdc00); - of += 4; - } else { - *(of++) = '\\'; - *(of++) = 'u'; - Buffer_AppendShortHexUnchecked(of, (unsigned short)ucs); - of += 4; - } + if (ucs < 0x10000) { + enc->offset += (of - enc->offset); + SetError(obj, enc, + "Overlong 4 byte UTF-8 sequence detected when " + "encoding string"); + return FALSE; + } + + io += 4; + break; + } + + case 5: + case 6: { + enc->offset += (of - enc->offset); + SetError(obj, enc, + "Unsupported UTF-8 sequence length when encoding string"); + return FALSE; + } + + case 29: { + if (enc->encodeHTMLChars) { + // Fall through to \u00XX case 30 below. 
+ PD_FALLTHROUGH; + } else { + // Same as case 1 above. + *(of++) = (*io++); + continue; + } + } + + case 30: { + // \uXXXX encode + *(of++) = '\\'; + *(of++) = 'u'; + *(of++) = '0'; + *(of++) = '0'; + *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)]; + *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)]; + io++; + continue; + } + case 10: + case 12: + case 14: + case 16: + case 18: + case 20: + case 22: + case 24: { + *(of++) = *((char *)(g_escapeChars + utflen + 0)); + *(of++) = *((char *)(g_escapeChars + utflen + 1)); + io++; + continue; + } + // This can never happen, it's here to make L4 VC++ happy + default: { + ucs = 0; + break; + } } + + /* + If the character is a UTF8 sequence of length > 1 we end up here */ + if (ucs >= 0x10000) { + ucs -= 0x10000; + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (unsigned short)(ucs >> 10) + 0xd800); + of += 4; + + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, + (unsigned short)(ucs & 0x3ff) + 0xdc00); + of += 4; + } else { + *(of++) = '\\'; + *(of++) = 'u'; + Buffer_AppendShortHexUnchecked(of, (unsigned short)ucs); + of += 4; + } + } } -#define Buffer_Reserve(__enc, __len) \ - if ( (size_t) ((__enc)->end - (__enc)->offset) < (size_t) (__len)) \ - { \ - Buffer_Realloc((__enc), (__len));\ - } \ +#define Buffer_Reserve(__enc, __len) \ + if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ + Buffer_Realloc((__enc), (__len)); \ + } #define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr; -INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, - char *end) { - char aux; - while (end > begin) aux = *end, *end-- = *begin, *begin++ = aux; +INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, char *end) { + char aux; + while (end > begin) + aux = *end, *end-- = *begin, *begin++ = aux; } void Buffer_AppendIndentNewlineUnchecked(JSONObjectEncoder *enc) { - if (enc->indent > 0) Buffer_AppendCharUnchecked(enc, '\n'); + if (enc->indent > 0) + Buffer_AppendCharUnchecked(enc, '\n'); } // This function could be refactored to only accept enc as an argument, @@ -744,167 +739,174 @@ } void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) { - char *wstr; - JSUINT32 uvalue = (value < 0) ? -value : value; - wstr = enc->offset; - - // Conversion. Number is reversed. - do { - *wstr++ = (char)(48 + (uvalue % 10)); - } while (uvalue /= 10); - if (value < 0) *wstr++ = '-'; - - // Reverse string - strreverse(enc->offset, wstr - 1); - enc->offset += (wstr - (enc->offset)); + char *wstr; + JSUINT32 uvalue = (value < 0) ? -value : value; + wstr = enc->offset; + + // Conversion. Number is reversed. + do { + *wstr++ = (char)(48 + (uvalue % 10)); + } while (uvalue /= 10); + if (value < 0) + *wstr++ = '-'; + + // Reverse string + strreverse(enc->offset, wstr - 1); + enc->offset += (wstr - (enc->offset)); } void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) { - char *wstr; - JSUINT64 uvalue = (value < 0) ? -value : value; + char *wstr; + JSUINT64 uvalue; + if (value == INT64_MIN) { + uvalue = INT64_MAX + UINT64_C(1); + } else { + uvalue = (value < 0) ? -value : value; + } - wstr = enc->offset; - // Conversion. Number is reversed. + wstr = enc->offset; + // Conversion. Number is reversed. 
- do { - *wstr++ = (char)(48 + (uvalue % 10ULL)); - } while (uvalue /= 10ULL); - if (value < 0) *wstr++ = '-'; - - // Reverse string - strreverse(enc->offset, wstr - 1); - enc->offset += (wstr - (enc->offset)); + do { + *wstr++ = (char)(48 + (uvalue % 10ULL)); + } while (uvalue /= 10ULL); + if (value < 0) + *wstr++ = '-'; + + // Reverse string + strreverse(enc->offset, wstr - 1); + enc->offset += (wstr - (enc->offset)); } int Buffer_AppendDoubleUnchecked(JSOBJ obj, JSONObjectEncoder *enc, double value) { - /* if input is beyond the thresholds, revert to exponential */ - const double thres_max = (double)1e16 - 1; - const double thres_min = (double)1e-15; - char precision_str[20]; - int count; - double diff = 0.0; - char *str = enc->offset; - char *wstr = str; - unsigned long long whole; - double tmp; - unsigned long long frac; - int neg; - double pow10; - - if (value == HUGE_VAL || value == -HUGE_VAL) { - SetError(obj, enc, "Invalid Inf value when encoding double"); - return FALSE; - } + /* if input is beyond the thresholds, revert to exponential */ + const double thres_max = (double)1e16 - 1; + const double thres_min = (double)1e-15; + char precision_str[20]; + int count; + double diff = 0.0; + char *str = enc->offset; + char *wstr = str; + unsigned long long whole; + double tmp; + unsigned long long frac; + int neg; + double pow10; + + if (value == HUGE_VAL || value == -HUGE_VAL) { + SetError(obj, enc, "Invalid Inf value when encoding double"); + return FALSE; + } - if (!(value == value)) { - SetError(obj, enc, "Invalid Nan value when encoding double"); - return FALSE; - } + if (!(value == value)) { + SetError(obj, enc, "Invalid Nan value when encoding double"); + return FALSE; + } - /* we'll work in positive values and deal with the - negative sign issue later */ - neg = 0; - if (value < 0) { - neg = 1; - value = -value; - } + /* we'll work in positive values and deal with the + negative sign issue later */ + neg = 0; + if (value < 0) { + neg = 1; + value = -value; + } - /* - for very large or small numbers switch back to native sprintf for - exponentials. anyone want to write code to replace this? */ - if (value > thres_max || (value != 0.0 && fabs(value) < thres_min)) { - precision_str[0] = '%'; - precision_str[1] = '.'; + /* + for very large or small numbers switch back to native sprintf for + exponentials. anyone want to write code to replace this? */ + if (value > thres_max || (value != 0.0 && fabs(value) < thres_min)) { + precision_str[0] = '%'; + precision_str[1] = '.'; #if defined(_WIN32) && defined(_MSC_VER) - sprintf_s(precision_str + 2, sizeof(precision_str) - 2, "%ug", - enc->doublePrecision); - enc->offset += sprintf_s(str, enc->end - enc->offset, precision_str, - neg ? -value : value); + sprintf_s(precision_str + 2, sizeof(precision_str) - 2, "%ug", + enc->doublePrecision); + enc->offset += sprintf_s(str, enc->end - enc->offset, precision_str, + neg ? -value : value); #else - snprintf(precision_str + 2, sizeof(precision_str) - 2, "%ug", - enc->doublePrecision); - enc->offset += snprintf(str, enc->end - enc->offset, precision_str, - neg ? -value : value); + snprintf(precision_str + 2, sizeof(precision_str) - 2, "%ug", + enc->doublePrecision); + enc->offset += snprintf(str, enc->end - enc->offset, precision_str, + neg ? 
-value : value); #endif - return TRUE; - } + return TRUE; + } - pow10 = g_pow10[enc->doublePrecision]; + pow10 = g_pow10[enc->doublePrecision]; - whole = (unsigned long long)value; - tmp = (value - whole) * pow10; - frac = (unsigned long long)(tmp); - diff = tmp - frac; + whole = (unsigned long long)value; + tmp = (value - whole) * pow10; + frac = (unsigned long long)(tmp); + diff = tmp - frac; + + if (diff > 0.5) { + ++frac; + } else if (diff == 0.5 && ((frac == 0) || (frac & 1))) { + /* if halfway, round up if odd, OR + if last digit is 0. That last part is strange */ + ++frac; + } + + // handle rollover, e.g. + // case 0.99 with prec 1 is 1.0 and case 0.95 with prec is 1.0 as well + if (frac >= pow10) { + frac = 0; + ++whole; + } + + if (enc->doublePrecision == 0) { + diff = value - whole; if (diff > 0.5) { - ++frac; - } else if (diff == 0.5 && ((frac == 0) || (frac & 1))) { - /* if halfway, round up if odd, OR - if last digit is 0. That last part is strange */ - ++frac; - } - - // handle rollover, e.g. - // case 0.99 with prec 1 is 1.0 and case 0.95 with prec is 1.0 as well - if (frac >= pow10) { - frac = 0; - ++whole; - } - - if (enc->doublePrecision == 0) { - diff = value - whole; - - if (diff > 0.5) { - /* greater than 0.5, round up, e.g. 1.6 -> 2 */ - ++whole; - } else if (diff == 0.5 && (whole & 1)) { - /* exactly 0.5 and ODD, then round up */ - /* 1.5 -> 2, but 2.5 -> 2 */ - ++whole; - } - - // vvvvvvvvvvvvvvvvvvv Diff from modp_dto2 - } else if (frac) { - count = enc->doublePrecision; - // now do fractional part, as an unsigned number - // we know it is not 0 but we can have leading zeros, these - // should be removed - while (!(frac % 10)) { - --count; - frac /= 10; - } - //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2 - - // now do fractional part, as an unsigned number - do { - --count; - *wstr++ = (char)(48 + (frac % 10)); - } while (frac /= 10); - // add extra 0s - while (count-- > 0) { - *wstr++ = '0'; - } - // add decimal - *wstr++ = '.'; - } else { - *wstr++ = '0'; - *wstr++ = '.'; + /* greater than 0.5, round up, e.g. 1.6 -> 2 */ + ++whole; + } else if (diff == 0.5 && (whole & 1)) { + /* exactly 0.5 and ODD, then round up */ + /* 1.5 -> 2, but 2.5 -> 2 */ + ++whole; + } + + // vvvvvvvvvvvvvvvvvvv Diff from modp_dto2 + } else if (frac) { + count = enc->doublePrecision; + // now do fractional part, as an unsigned number + // we know it is not 0 but we can have leading zeros, these + // should be removed + while (!(frac % 10)) { + --count; + frac /= 10; } + //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2 - // Do whole part. Take care of sign - // conversion. Number is reversed. + // now do fractional part, as an unsigned number do { - *wstr++ = (char)(48 + (whole % 10)); - } while (whole /= 10); + --count; + *wstr++ = (char)(48 + (frac % 10)); + } while (frac /= 10); + // add extra 0s + while (count-- > 0) { + *wstr++ = '0'; + } + // add decimal + *wstr++ = '.'; + } else { + *wstr++ = '0'; + *wstr++ = '.'; + } - if (neg) { - *wstr++ = '-'; - } - strreverse(str, wstr - 1); - enc->offset += (wstr - (enc->offset)); + // Do whole part. Take care of sign + // conversion. Number is reversed. 
+ do { + *wstr++ = (char)(48 + (whole % 10)); + } while (whole /= 10); - return TRUE; + if (neg) { + *wstr++ = '-'; + } + strreverse(str, wstr - 1); + enc->offset += (wstr - (enc->offset)); + + return TRUE; } /* @@ -917,291 +919,287 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t cbName) { - const char *value; - char *objName; - int count; - JSOBJ iterObj; - size_t szlen; - JSONTypeContext tc; - tc.encoder = enc; + const char *value; + char *objName; + int count; + JSOBJ iterObj; + size_t szlen; + JSONTypeContext tc; + tc.encoder = enc; + + if (enc->level > enc->recursionMax) { + SetError(obj, enc, "Maximum recursion level reached"); + return; + } - if (enc->level > enc->recursionMax) { - SetError(obj, enc, "Maximum recursion level reached"); - return; - } + /* + This reservation must hold - /* - This reservation must hold + length of _name as encoded worst case + + maxLength of double to string OR maxLength of JSLONG to string + */ + + Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName)); + if (enc->errorMsg) { + return; + } - length of _name as encoded worst case + - maxLength of double to string OR maxLength of JSLONG to string - */ + if (name) { + Buffer_AppendCharUnchecked(enc, '\"'); - Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName)); - if (enc->errorMsg) { + if (enc->forceASCII) { + if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) { return; + } + } else { + if (!Buffer_EscapeStringUnvalidated(enc, name, name + cbName)) { + return; + } } - if (name) { - Buffer_AppendCharUnchecked(enc, '\"'); - - if (enc->forceASCII) { - if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) { - return; - } - } else { - if (!Buffer_EscapeStringUnvalidated(enc, name, name + cbName)) { - return; - } - } + Buffer_AppendCharUnchecked(enc, '\"'); - Buffer_AppendCharUnchecked(enc, '\"'); - - Buffer_AppendCharUnchecked(enc, ':'); + Buffer_AppendCharUnchecked(enc, ':'); #ifndef JSON_NO_EXTRA_WHITESPACE - Buffer_AppendCharUnchecked(enc, ' '); + Buffer_AppendCharUnchecked(enc, ' '); #endif - } + } - enc->beginTypeContext(obj, &tc); + enc->beginTypeContext(obj, &tc); + + switch (tc.type) { + case JT_INVALID: { + return; + } - switch (tc.type) { - case JT_INVALID: { - return; - } - - case JT_ARRAY: { - count = 0; - enc->iterBegin(obj, &tc); - - Buffer_AppendCharUnchecked(enc, '['); - Buffer_AppendIndentNewlineUnchecked(enc); - - while (enc->iterNext(obj, &tc)) { - if (count > 0) { - Buffer_AppendCharUnchecked(enc, ','); + case JT_ARRAY: { + count = 0; + enc->iterBegin(obj, &tc); + + Buffer_AppendCharUnchecked(enc, '['); + Buffer_AppendIndentNewlineUnchecked(enc); + + while (enc->iterNext(obj, &tc)) { + if (count > 0) { + Buffer_AppendCharUnchecked(enc, ','); #ifndef JSON_NO_EXTRA_WHITESPACE - Buffer_AppendCharUnchecked(buffer, ' '); + Buffer_AppendCharUnchecked(buffer, ' '); #endif - Buffer_AppendIndentNewlineUnchecked(enc); - } + Buffer_AppendIndentNewlineUnchecked(enc); + } - iterObj = enc->iterGetValue(obj, &tc); + iterObj = enc->iterGetValue(obj, &tc); - enc->level++; - Buffer_AppendIndentUnchecked(enc, enc->level); - encode(iterObj, enc, NULL, 0); - count++; - } - - enc->iterEnd(obj, &tc); - Buffer_AppendIndentNewlineUnchecked(enc); - Buffer_AppendIndentUnchecked(enc, enc->level); - Buffer_AppendCharUnchecked(enc, ']'); - break; - } - - case JT_OBJECT: { - count = 0; - enc->iterBegin(obj, &tc); - - Buffer_AppendCharUnchecked(enc, '{'); - Buffer_AppendIndentNewlineUnchecked(enc); - - while (enc->iterNext(obj, &tc)) { - if (count > 0) { - 
Buffer_AppendCharUnchecked(enc, ','); + enc->level++; + Buffer_AppendIndentUnchecked(enc, enc->level); + encode(iterObj, enc, NULL, 0); + count++; + } + + enc->iterEnd(obj, &tc); + Buffer_AppendIndentNewlineUnchecked(enc); + Buffer_AppendIndentUnchecked(enc, enc->level); + Buffer_AppendCharUnchecked(enc, ']'); + break; + } + + case JT_OBJECT: { + count = 0; + enc->iterBegin(obj, &tc); + + Buffer_AppendCharUnchecked(enc, '{'); + Buffer_AppendIndentNewlineUnchecked(enc); + + while (enc->iterNext(obj, &tc)) { + if (count > 0) { + Buffer_AppendCharUnchecked(enc, ','); #ifndef JSON_NO_EXTRA_WHITESPACE - Buffer_AppendCharUnchecked(enc, ' '); + Buffer_AppendCharUnchecked(enc, ' '); #endif - Buffer_AppendIndentNewlineUnchecked(enc); - } + Buffer_AppendIndentNewlineUnchecked(enc); + } - iterObj = enc->iterGetValue(obj, &tc); - objName = enc->iterGetName(obj, &tc, &szlen); + iterObj = enc->iterGetValue(obj, &tc); + objName = enc->iterGetName(obj, &tc, &szlen); - enc->level++; - Buffer_AppendIndentUnchecked(enc, enc->level); - encode(iterObj, enc, objName, szlen); - count++; - } - - enc->iterEnd(obj, &tc); - Buffer_AppendIndentNewlineUnchecked(enc); - Buffer_AppendIndentUnchecked(enc, enc->level); - Buffer_AppendCharUnchecked(enc, '}'); - break; - } - - case JT_LONG: { - Buffer_AppendLongUnchecked(enc, enc->getLongValue(obj, &tc)); - break; - } - - case JT_INT: { - Buffer_AppendIntUnchecked(enc, enc->getIntValue(obj, &tc)); - break; - } - - case JT_TRUE: { - Buffer_AppendCharUnchecked(enc, 't'); - Buffer_AppendCharUnchecked(enc, 'r'); - Buffer_AppendCharUnchecked(enc, 'u'); - Buffer_AppendCharUnchecked(enc, 'e'); - break; - } - - case JT_FALSE: { - Buffer_AppendCharUnchecked(enc, 'f'); - Buffer_AppendCharUnchecked(enc, 'a'); - Buffer_AppendCharUnchecked(enc, 'l'); - Buffer_AppendCharUnchecked(enc, 's'); - Buffer_AppendCharUnchecked(enc, 'e'); - break; - } - - case JT_NULL: { - Buffer_AppendCharUnchecked(enc, 'n'); - Buffer_AppendCharUnchecked(enc, 'u'); - Buffer_AppendCharUnchecked(enc, 'l'); - Buffer_AppendCharUnchecked(enc, 'l'); - break; - } - - case JT_DOUBLE: { - if (!Buffer_AppendDoubleUnchecked(obj, enc, - enc->getDoubleValue(obj, &tc))) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - break; - } - - case JT_UTF8: { - value = enc->getStringValue(obj, &tc, &szlen); - if (enc->errorMsg) { - enc->endTypeContext(obj, &tc); - return; - } - Buffer_Reserve(enc, RESERVE_STRING(szlen)); - Buffer_AppendCharUnchecked(enc, '\"'); - - if (enc->forceASCII) { - if (!Buffer_EscapeStringValidated(obj, enc, value, - value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } else { - if (!Buffer_EscapeStringUnvalidated(enc, value, - value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } - - Buffer_AppendCharUnchecked(enc, '\"'); - break; - } - - case JT_BIGNUM: { - value = enc->getBigNumStringValue(obj, &tc, &szlen); - - Buffer_Reserve(enc, RESERVE_STRING(szlen)); - if (enc->errorMsg) { - enc->endTypeContext(obj, &tc); - return; - } - - if (enc->forceASCII) { - if (!Buffer_EscapeStringValidated(obj, enc, value, - value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } else { - if (!Buffer_EscapeStringUnvalidated(enc, value, - value + szlen)) { - enc->endTypeContext(obj, &tc); - enc->level--; - return; - } - } + enc->level++; + Buffer_AppendIndentUnchecked(enc, enc->level); + encode(iterObj, enc, objName, szlen); + count++; + } + + enc->iterEnd(obj, &tc); + Buffer_AppendIndentNewlineUnchecked(enc); + 
Buffer_AppendIndentUnchecked(enc, enc->level); + Buffer_AppendCharUnchecked(enc, '}'); + break; + } - break; - } - } + case JT_LONG: { + Buffer_AppendLongUnchecked(enc, enc->getLongValue(obj, &tc)); + break; + } - enc->endTypeContext(obj, &tc); - enc->level--; -} + case JT_INT: { + Buffer_AppendIntUnchecked(enc, enc->getIntValue(obj, &tc)); + break; + } -char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, - size_t _cbBuffer) { - char *locale; - enc->malloc = enc->malloc ? enc->malloc : malloc; - enc->free = enc->free ? enc->free : free; - enc->realloc = enc->realloc ? enc->realloc : realloc; - enc->errorMsg = NULL; - enc->errorObj = NULL; - enc->level = 0; - - if (enc->recursionMax < 1) { - enc->recursionMax = JSON_MAX_RECURSION_DEPTH; - } - - if (enc->doublePrecision < 0 || - enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) { - enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; - } - - if (_buffer == NULL) { - _cbBuffer = 32768; - enc->start = (char *)enc->malloc(_cbBuffer); - if (!enc->start) { - SetError(obj, enc, "Could not reserve memory block"); - return NULL; - } - enc->heap = 1; - } else { - enc->start = _buffer; - enc->heap = 0; + case JT_TRUE: { + Buffer_AppendCharUnchecked(enc, 't'); + Buffer_AppendCharUnchecked(enc, 'r'); + Buffer_AppendCharUnchecked(enc, 'u'); + Buffer_AppendCharUnchecked(enc, 'e'); + break; + } + + case JT_FALSE: { + Buffer_AppendCharUnchecked(enc, 'f'); + Buffer_AppendCharUnchecked(enc, 'a'); + Buffer_AppendCharUnchecked(enc, 'l'); + Buffer_AppendCharUnchecked(enc, 's'); + Buffer_AppendCharUnchecked(enc, 'e'); + break; + } + + case JT_NULL: { + Buffer_AppendCharUnchecked(enc, 'n'); + Buffer_AppendCharUnchecked(enc, 'u'); + Buffer_AppendCharUnchecked(enc, 'l'); + Buffer_AppendCharUnchecked(enc, 'l'); + break; + } + + case JT_DOUBLE: { + if (!Buffer_AppendDoubleUnchecked(obj, enc, + enc->getDoubleValue(obj, &tc))) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; } + break; + } - enc->end = enc->start + _cbBuffer; - enc->offset = enc->start; + case JT_UTF8: { + value = enc->getStringValue(obj, &tc, &szlen); + if (enc->errorMsg) { + enc->endTypeContext(obj, &tc); + return; + } + Buffer_Reserve(enc, RESERVE_STRING(szlen)); + Buffer_AppendCharUnchecked(enc, '\"'); - locale = setlocale(LC_NUMERIC, NULL); - if (!locale) { - SetError(NULL, enc, "setlocale call failed"); - return NULL; - } - - if (strcmp(locale, "C")) { - size_t len = strlen(locale) + 1; - char *saved_locale = malloc(len); - if (saved_locale == NULL) { - SetError(NULL, enc, "Could not reserve memory block"); - return NULL; - } - memcpy(saved_locale, locale, len); - setlocale(LC_NUMERIC, "C"); - encode(obj, enc, NULL, 0); - setlocale(LC_NUMERIC, saved_locale); - free(saved_locale); + if (enc->forceASCII) { + if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } } else { - encode(obj, enc, NULL, 0); + if (!Buffer_EscapeStringUnvalidated(enc, value, value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } } - Buffer_Reserve(enc, 1); + Buffer_AppendCharUnchecked(enc, '\"'); + break; + } + + case JT_BIGNUM: { + value = enc->getBigNumStringValue(obj, &tc, &szlen); + + Buffer_Reserve(enc, RESERVE_STRING(szlen)); if (enc->errorMsg) { - return NULL; + enc->endTypeContext(obj, &tc); + return; } - Buffer_AppendCharUnchecked(enc, '\0'); - return enc->start; + if (enc->forceASCII) { + if (!Buffer_EscapeStringValidated(obj, enc, value, value + szlen)) { + enc->endTypeContext(obj, 
&tc); + enc->level--; + return; + } + } else { + if (!Buffer_EscapeStringUnvalidated(enc, value, value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } + } + + break; + } + } + + enc->endTypeContext(obj, &tc); + enc->level--; +} + +char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, + size_t _cbBuffer) { + char *locale; + enc->malloc = enc->malloc ? enc->malloc : malloc; + enc->free = enc->free ? enc->free : free; + enc->realloc = enc->realloc ? enc->realloc : realloc; + enc->errorMsg = NULL; + enc->errorObj = NULL; + enc->level = 0; + + if (enc->recursionMax < 1) { + enc->recursionMax = JSON_MAX_RECURSION_DEPTH; + } + + if (enc->doublePrecision < 0 || + enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) { + enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS; + } + + if (_buffer == NULL) { + _cbBuffer = 32768; + enc->start = (char *)enc->malloc(_cbBuffer); + if (!enc->start) { + SetError(obj, enc, "Could not reserve memory block"); + return NULL; + } + enc->heap = 1; + } else { + enc->start = _buffer; + enc->heap = 0; + } + + enc->end = enc->start + _cbBuffer; + enc->offset = enc->start; + + locale = setlocale(LC_NUMERIC, NULL); + if (!locale) { + SetError(NULL, enc, "setlocale call failed"); + return NULL; + } + + if (strcmp(locale, "C")) { + size_t len = strlen(locale) + 1; + char *saved_locale = malloc(len); + if (saved_locale == NULL) { + SetError(NULL, enc, "Could not reserve memory block"); + return NULL; + } + memcpy(saved_locale, locale, len); + setlocale(LC_NUMERIC, "C"); + encode(obj, enc, NULL, 0); + setlocale(LC_NUMERIC, saved_locale); + free(saved_locale); + } else { + encode(obj, enc, NULL, 0); + } + + Buffer_Reserve(enc, 1); + if (enc->errorMsg) { + return NULL; + } + Buffer_AppendCharUnchecked(enc, '\0'); + + return enc->start; } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c pandas-2.2.2+dfsg/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c --- pandas-2.1.4+dfsg/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c 2024-04-10 17:42:52.000000000 +0000 @@ -16,18 +16,19 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms @@ -35,486 +36,135 @@ * Copyright (c) 1994 Sun Microsystems, Inc. */ -#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY -#define NO_IMPORT_ARRAY +// Licence at LICENSES/ULTRAJSON_LICENSE + +#include "pandas/vendored/ujson/lib/ultrajson.h" #define PY_SSIZE_T_CLEAN #include -#include -#include "pandas/vendored/ujson/lib/ultrajson.h" - -#define PRINTMARK() -typedef struct __PyObjectDecoder { - JSONObjectDecoder dec; - - void *npyarr; // Numpy context buffer - void *npyarr_addr; // Ref to npyarr ptr to track DECREF calls - npy_intp curdim; // Current array dimension - - PyArray_Descr *dtype; -} PyObjectDecoder; - -typedef struct __NpyArrContext { - PyObject *ret; - PyObject *labels[2]; - PyArray_Dims shape; - - PyObjectDecoder *dec; - - npy_intp i; - npy_intp elsize; - npy_intp elcount; -} NpyArrContext; - -// Numpy handling based on numpy internal code, specifically the function -// PyArray_FromIter. - -// numpy related functions are inter-dependent so declare them all here, -// to ensure the compiler catches any errors - -// standard numpy array handling -JSOBJ Object_npyNewArray(void *prv, void *decoder); -JSOBJ Object_npyEndArray(void *prv, JSOBJ obj); -int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value); - -// for more complex dtypes (object and string) fill a standard Python list -// and convert to a numpy array when done. -JSOBJ Object_npyNewArrayList(void *prv, void *decoder); -JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj); -int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value); - -// free the numpy context buffer -void Npy_releaseContext(NpyArrContext *npyarr) { - PRINTMARK(); - if (npyarr) { - if (npyarr->shape.ptr) { - PyObject_Free(npyarr->shape.ptr); - } - if (npyarr->dec) { - npyarr->dec->npyarr = NULL; - npyarr->dec->curdim = 0; - } - Py_XDECREF(npyarr->labels[0]); - Py_XDECREF(npyarr->labels[1]); - Py_XDECREF(npyarr->ret); - PyObject_Free(npyarr); - } +static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name, + JSOBJ value) { + int ret = PyDict_SetItem(obj, name, value); + Py_DECREF((PyObject *)name); + Py_DECREF((PyObject *)value); + return ret == 0 ? 
1 : 0; } -JSOBJ Object_npyNewArray(void *prv, void *_decoder) { - NpyArrContext *npyarr; - PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; - PRINTMARK(); - if (decoder->curdim <= 0) { - // start of array - initialise the context buffer - npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext)); - decoder->npyarr_addr = npyarr; - - if (!npyarr) { - PyErr_NoMemory(); - return NULL; - } - - npyarr->dec = decoder; - npyarr->labels[0] = npyarr->labels[1] = NULL; - - npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp) * NPY_MAXDIMS); - npyarr->shape.len = 1; - npyarr->ret = NULL; - - npyarr->elsize = 0; - npyarr->elcount = 4; - npyarr->i = 0; - } else { - // starting a new dimension continue the current array (and reshape - // after) - npyarr = (NpyArrContext *)decoder->npyarr; - if (decoder->curdim >= npyarr->shape.len) { - npyarr->shape.len++; - } - } - - npyarr->shape.ptr[decoder->curdim] = 0; - decoder->curdim++; - return npyarr; -} - -PyObject *Npy_returnLabelled(NpyArrContext *npyarr) { - PyObject *ret = npyarr->ret; - npy_intp i; - - if (npyarr->labels[0] || npyarr->labels[1]) { - // finished decoding, build tuple with values and labels - ret = PyTuple_New(npyarr->shape.len + 1); - for (i = 0; i < npyarr->shape.len; i++) { - if (npyarr->labels[i]) { - PyTuple_SET_ITEM(ret, i + 1, npyarr->labels[i]); - npyarr->labels[i] = NULL; - } else { - Py_INCREF(Py_None); - PyTuple_SET_ITEM(ret, i + 1, Py_None); - } - } - PyTuple_SET_ITEM(ret, 0, npyarr->ret); - } - - return ret; +static int Object_arrayAddItem(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ value) { + int ret = PyList_Append(obj, value); + Py_DECREF((PyObject *)value); + return ret == 0 ? 1 : 0; } -JSOBJ Object_npyEndArray(void *prv, JSOBJ obj) { - PyObject *ret; - char *new_data; - NpyArrContext *npyarr = (NpyArrContext *)obj; - int emptyType = NPY_DEFAULT_TYPE; - npy_intp i; - PRINTMARK(); - if (!npyarr) { - return NULL; - } - - ret = npyarr->ret; - i = npyarr->i; - - npyarr->dec->curdim--; - - if (i == 0 || !npyarr->ret) { - // empty array would not have been initialised so do it now. - if (npyarr->dec->dtype) { - emptyType = npyarr->dec->dtype->type_num; - } - npyarr->ret = ret = - PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0); - } else if (npyarr->dec->curdim <= 0) { - // realloc to final size - new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize); - if (new_data == NULL) { - PyErr_NoMemory(); - Npy_releaseContext(npyarr); - return NULL; - } - ((PyArrayObject *)ret)->data = (void *)new_data; - // PyArray_BYTES(ret) = new_data; - } - - if (npyarr->dec->curdim <= 0) { - // finished decoding array, reshape if necessary - if (npyarr->shape.len > 1) { - npyarr->ret = PyArray_Newshape((PyArrayObject *)ret, &npyarr->shape, - NPY_ANYORDER); - Py_DECREF(ret); - } - - ret = Npy_returnLabelled(npyarr); - - npyarr->ret = NULL; - Npy_releaseContext(npyarr); - } - - return ret; +static JSOBJ Object_newString(void *Py_UNUSED(prv), wchar_t *start, + wchar_t *end) { + return PyUnicode_FromWideChar(start, (end - start)); } -int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value) { - PyObject *type; - PyArray_Descr *dtype; - npy_intp i; - char *new_data, *item; - NpyArrContext *npyarr = (NpyArrContext *)obj; - PRINTMARK(); - if (!npyarr) { - return 0; - } - - i = npyarr->i; - - npyarr->shape.ptr[npyarr->dec->curdim - 1]++; - - if (PyArray_Check((PyObject *)value)) { - // multidimensional array, keep decoding values. - return 1; - } - - if (!npyarr->ret) { - // Array not initialised yet. 
- // We do it here so we can 'sniff' the data type if none was provided - if (!npyarr->dec->dtype) { - type = PyObject_Type(value); - if (!PyArray_DescrConverter(type, &dtype)) { - Py_DECREF(type); - goto fail; - } - Py_INCREF(dtype); - Py_DECREF(type); - } else { - dtype = PyArray_DescrNew(npyarr->dec->dtype); - } - - // If it's an object or string then fill a Python list and subsequently - // convert. Otherwise we would need to somehow mess about with - // reference counts when renewing memory. - npyarr->elsize = dtype->elsize; - if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) { - Py_XDECREF(dtype); - - if (npyarr->dec->curdim > 1) { - PyErr_SetString(PyExc_ValueError, - "Cannot decode multidimensional arrays with " - "variable length elements to numpy"); - goto fail; - } - npyarr->elcount = 0; - npyarr->ret = PyList_New(0); - if (!npyarr->ret) { - goto fail; - } - ((JSONObjectDecoder *)npyarr->dec)->newArray = - Object_npyNewArrayList; - ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem = - Object_npyArrayListAddItem; - ((JSONObjectDecoder *)npyarr->dec)->endArray = - Object_npyEndArrayList; - return Object_npyArrayListAddItem(prv, obj, value); - } - - npyarr->ret = PyArray_NewFromDescr( - &PyArray_Type, dtype, 1, &npyarr->elcount, NULL, NULL, 0, NULL); - - if (!npyarr->ret) { - goto fail; - } - } - - if (i >= npyarr->elcount) { - // Grow PyArray_DATA(ret): - // this is similar for the strategy for PyListObject, but we use - // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ... - if (npyarr->elsize == 0) { - PyErr_SetString(PyExc_ValueError, - "Cannot decode multidimensional arrays with " - "variable length elements to numpy"); - goto fail; - } - - npyarr->elcount = (i >> 1) + (i < 4 ? 4 : 2) + i; - if (npyarr->elcount <= NPY_MAX_INTP / npyarr->elsize) { - new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret), - npyarr->elcount * npyarr->elsize); - } else { - PyErr_NoMemory(); - goto fail; - } - ((PyArrayObject *)npyarr->ret)->data = (void *)new_data; - - // PyArray_BYTES(npyarr->ret) = new_data; - } +static JSOBJ Object_newTrue(void *Py_UNUSED(prv)) { Py_RETURN_TRUE; } - PyArray_DIMS(npyarr->ret)[0] = i + 1; +static JSOBJ Object_newFalse(void *Py_UNUSED(prv)) { Py_RETURN_FALSE; } - if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL || - PyArray_SETITEM(npyarr->ret, item, value) == -1) { - goto fail; - } +static JSOBJ Object_newNull(void *Py_UNUSED(prv)) { Py_RETURN_NONE; } - Py_DECREF((PyObject *)value); - npyarr->i++; - return 1; - -fail: - - Npy_releaseContext(npyarr); - return 0; -} - -JSOBJ Object_npyNewArrayList(void *prv, void *_decoder) { - PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; - PRINTMARK(); - PyErr_SetString( - PyExc_ValueError, - "nesting not supported for object or variable length dtypes"); - Npy_releaseContext(decoder->npyarr); - return NULL; +static JSOBJ Object_newPosInf(void *Py_UNUSED(prv)) { + return PyFloat_FromDouble(Py_HUGE_VAL); } -JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj) { - PyObject *list, *ret; - NpyArrContext *npyarr = (NpyArrContext *)obj; - PRINTMARK(); - if (!npyarr) { - return NULL; - } - - // convert decoded list to numpy array - list = (PyObject *)npyarr->ret; - npyarr->ret = PyArray_FROM_O(list); - - ret = Npy_returnLabelled(npyarr); - npyarr->ret = list; - - ((JSONObjectDecoder *)npyarr->dec)->newArray = Object_npyNewArray; - ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem; - ((JSONObjectDecoder *)npyarr->dec)->endArray = Object_npyEndArray; - Npy_releaseContext(npyarr); - return ret; -} - 
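[Editor's note] The removed numpy fast path above grows its backing buffer with roughly 50% overallocation; its own comment quotes the resulting capacity sequence "0, 4, 8, 14, 23, 36, 56, 86 ...". The following is a minimal standalone sketch of that growth rule for reference only; it is not part of the patch, and it assumes nothing beyond a hosted C compiler.

    /* Editorial sketch: reproduce the capacity sequence quoted in the
     * removed Object_npyArrayAddItem ("50% overallocation => 0, 4, 8, 14,
     * 23, 36, 56, 86 ...").  Each step applies the formula used there. */
    #include <stdio.h>

    int main(void) {
      long long cap = 0;
      for (int step = 0; step < 8; step++) {
        printf("%lld ", cap);                 /* prints: 0 4 8 14 23 36 56 86 */
        long long i = cap;                    /* grow when the array is full */
        cap = (i >> 1) + (i < 4 ? 4 : 2) + i; /* formula from the removed code */
      }
      printf("\n");
      return 0;
    }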
-int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value) { - NpyArrContext *npyarr = (NpyArrContext *)obj; - PRINTMARK(); - if (!npyarr) { - return 0; - } - PyList_Append((PyObject *)npyarr->ret, value); - Py_DECREF((PyObject *)value); - npyarr->elcount++; - return 1; +static JSOBJ Object_newNegInf(void *Py_UNUSED(prv)) { + return PyFloat_FromDouble(-Py_HUGE_VAL); } -int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) { - int ret = PyDict_SetItem(obj, name, value); - Py_DECREF((PyObject *)name); - Py_DECREF((PyObject *)value); - return ret == 0 ? 1 : 0; +static JSOBJ Object_newObject(void *Py_UNUSED(prv), void *Py_UNUSED(decoder)) { + return PyDict_New(); } -int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) { - int ret = PyList_Append(obj, value); - Py_DECREF((PyObject *)value); - return ret == 0 ? 1 : 0; -} +static JSOBJ Object_endObject(void *Py_UNUSED(prv), JSOBJ obj) { return obj; } -JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) { - return PyUnicode_FromWideChar(start, (end - start)); +static JSOBJ Object_newArray(void *Py_UNUSED(prv), void *Py_UNUSED(decoder)) { + return PyList_New(0); } -JSOBJ Object_newTrue(void *prv) { Py_RETURN_TRUE; } - -JSOBJ Object_newFalse(void *prv) { Py_RETURN_FALSE; } - -JSOBJ Object_newNull(void *prv) { Py_RETURN_NONE; } - -JSOBJ Object_newPosInf(void *prv) { return PyFloat_FromDouble(Py_HUGE_VAL); } - -JSOBJ Object_newNegInf(void *prv) { return PyFloat_FromDouble(-Py_HUGE_VAL); } - -JSOBJ Object_newObject(void *prv, void *decoder) { return PyDict_New(); } +static JSOBJ Object_endArray(void *Py_UNUSED(prv), JSOBJ obj) { return obj; } -JSOBJ Object_endObject(void *prv, JSOBJ obj) { return obj; } - -JSOBJ Object_newArray(void *prv, void *decoder) { return PyList_New(0); } - -JSOBJ Object_endArray(void *prv, JSOBJ obj) { return obj; } - -JSOBJ Object_newInteger(void *prv, JSINT32 value) { - return PyLong_FromLong((long)value); +static JSOBJ Object_newInteger(void *Py_UNUSED(prv), JSINT32 value) { + return PyLong_FromLong(value); } -JSOBJ Object_newLong(void *prv, JSINT64 value) { - return PyLong_FromLongLong(value); +static JSOBJ Object_newLong(void *Py_UNUSED(prv), JSINT64 value) { + return PyLong_FromLongLong(value); } -JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) { - return PyLong_FromUnsignedLongLong(value); +static JSOBJ Object_newUnsignedLong(void *Py_UNUSED(prv), JSUINT64 value) { + return PyLong_FromUnsignedLongLong(value); } -JSOBJ Object_newDouble(void *prv, double value) { - return PyFloat_FromDouble(value); +static JSOBJ Object_newDouble(void *Py_UNUSED(prv), double value) { + return PyFloat_FromDouble(value); } -static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) { - PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; - if (obj != decoder->npyarr_addr) { - Py_XDECREF(((PyObject *)obj)); - } +static void Object_releaseObject(void *Py_UNUSED(prv), JSOBJ obj, + void *Py_UNUSED(decoder)) { + Py_XDECREF(((PyObject *)obj)); } -static char *g_kwlist[] = {"obj", "precise_float", - "labelled", "dtype", NULL}; +PyObject *JSONToObj(PyObject *Py_UNUSED(self), PyObject *args, + PyObject *kwargs) { + JSONObjectDecoder dec = {.newString = Object_newString, + .objectAddKey = Object_objectAddKey, + .arrayAddItem = Object_arrayAddItem, + .newTrue = Object_newTrue, + .newFalse = Object_newFalse, + .newNull = Object_newNull, + .newPosInf = Object_newPosInf, + .newNegInf = Object_newNegInf, + .newObject = Object_newObject, + .endObject = Object_endObject, + .newArray = 
Object_newArray, + .endArray = Object_endArray, + .newInt = Object_newInteger, + .newLong = Object_newLong, + .newUnsignedLong = Object_newUnsignedLong, + .newDouble = Object_newDouble, + .releaseObject = Object_releaseObject, + .malloc = PyObject_Malloc, + .free = PyObject_Free, + .realloc = PyObject_Realloc, + .errorStr = NULL, + .errorOffset = NULL, + .preciseFloat = 0, + .prv = NULL}; -PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { - PyObject *ret; - PyObject *sarg; - PyObject *arg; - PyObject *opreciseFloat = NULL; - JSONObjectDecoder *decoder; - PyObjectDecoder pyDecoder; - PyArray_Descr *dtype = NULL; - int labelled = 0; - - JSONObjectDecoder dec = { - Object_newString, Object_objectAddKey, Object_arrayAddItem, - Object_newTrue, Object_newFalse, Object_newNull, - Object_newPosInf, Object_newNegInf, Object_newObject, - Object_endObject, Object_newArray, Object_endArray, - Object_newInteger, Object_newLong, Object_newUnsignedLong, - Object_newDouble, - Object_releaseObject, PyObject_Malloc, PyObject_Free, - PyObject_Realloc}; - - dec.preciseFloat = 0; - dec.prv = NULL; - - pyDecoder.dec = dec; - pyDecoder.curdim = 0; - pyDecoder.npyarr = NULL; - pyDecoder.npyarr_addr = NULL; - - decoder = (JSONObjectDecoder *)&pyDecoder; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiiO&", g_kwlist, &arg, - &opreciseFloat, &labelled, - PyArray_DescrConverter2, &dtype)) { - Npy_releaseContext(pyDecoder.npyarr); - return NULL; - } + char *kwlist[] = {"obj", "precise_float", NULL}; + char *buf; + Py_ssize_t len; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|b", kwlist, &buf, &len, + &dec.preciseFloat)) { + return NULL; + } - if (opreciseFloat && PyObject_IsTrue(opreciseFloat)) { - decoder->preciseFloat = 1; - } + PyObject *ret = JSON_DecodeObject(&dec, buf, len); - if (PyBytes_Check(arg)) { - sarg = arg; - } else if (PyUnicode_Check(arg)) { - sarg = PyUnicode_AsUTF8String(arg); - if (sarg == NULL) { - // Exception raised above us by codec according to docs - return NULL; - } - } else { - PyErr_Format(PyExc_TypeError, "Expected 'str' or 'bytes'"); - return NULL; + if (PyErr_Occurred()) { + if (ret) { + Py_DECREF((PyObject *)ret); } + return NULL; + } - decoder->errorStr = NULL; - decoder->errorOffset = NULL; - - ret = JSON_DecodeObject(decoder, PyBytes_AS_STRING(sarg), - PyBytes_GET_SIZE(sarg)); + if (dec.errorStr) { + /* + FIXME: It's possible to give a much nicer error message here with actual + failing element in input etc*/ - if (sarg != arg) { - Py_DECREF(sarg); - } + PyErr_Format(PyExc_ValueError, "%s", dec.errorStr); - if (PyErr_Occurred()) { - if (ret) { - Py_DECREF((PyObject *)ret); - } - Npy_releaseContext(pyDecoder.npyarr); - return NULL; + if (ret) { + Py_DECREF((PyObject *)ret); } - if (decoder->errorStr) { - /* - FIXME: It's possible to give a much nicer error message here with actual - failing element in input etc*/ - - PyErr_Format(PyExc_ValueError, "%s", decoder->errorStr); - - if (ret) { - Py_DECREF((PyObject *)ret); - } - Npy_releaseContext(pyDecoder.npyarr); - - return NULL; - } + return NULL; + } - return ret; + return ret; } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/src/vendored/ujson/python/objToJSON.c pandas-2.2.2+dfsg/pandas/_libs/src/vendored/ujson/python/objToJSON.c --- pandas-2.1.4+dfsg/pandas/_libs/src/vendored/ujson/python/objToJSON.c 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/src/vendored/ujson/python/objToJSON.c 2024-04-10 17:42:52.000000000 +0000 @@ -36,19 +36,20 @@ * Copyright (c) 1994 Sun Microsystems, 
Inc. */ +// Licence at LICENSES/ULTRAJSON_LICENSE + #define PY_SSIZE_T_CLEAN #include -#include #define NO_IMPORT_ARRAY #define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY +#include "datetime.h" +#include "pandas/datetime/pd_datetime.h" +#include "pandas/vendored/ujson/lib/ultrajson.h" #include #include #include #include -#include "pandas/vendored/ujson/lib/ultrajson.h" -#include "datetime.h" -#include "pandas/datetime/pd_datetime.h" npy_int64 get_nat(void) { return NPY_MIN_INT64; } @@ -63,344 +64,313 @@ int object_is_na_type(PyObject *obj); typedef struct __NpyArrContext { - PyObject *array; - char *dataptr; - int curdim; // current dimension in array's order - int stridedim; // dimension we are striding over - int inc; // stride dimension increment (+/- 1) - npy_intp dim; - npy_intp stride; - npy_intp ndim; - npy_intp index[NPY_MAXDIMS]; - int type_num; - PyArray_GetItemFunc *getitem; + PyObject *array; + char *dataptr; + npy_intp curdim; // current dimension in array's order + npy_intp stridedim; // dimension we are striding over + int inc; // stride dimension increment (+/- 1) + npy_intp dim; + npy_intp stride; + npy_intp ndim; + npy_intp index[NPY_MAXDIMS]; + int type_num; - char **rowLabels; - char **columnLabels; + char **rowLabels; + char **columnLabels; } NpyArrContext; typedef struct __PdBlockContext { - int colIdx; - int ncols; - int transpose; + Py_ssize_t colIdx; + Py_ssize_t ncols; + int transpose; - NpyArrContext **npyCtxts; // NpyArrContext for each column + NpyArrContext **npyCtxts; // NpyArrContext for each column } PdBlockContext; typedef struct __TypeContext { - JSPFN_ITERBEGIN iterBegin; - JSPFN_ITEREND iterEnd; - JSPFN_ITERNEXT iterNext; - JSPFN_ITERGETNAME iterGetName; - JSPFN_ITERGETVALUE iterGetValue; - PFN_PyTypeToUTF8 PyTypeToUTF8; - PyObject *newObj; - PyObject *dictObj; - Py_ssize_t index; - Py_ssize_t size; - PyObject *itemValue; - PyObject *itemName; - PyObject *attrList; - PyObject *iterator; - - double doubleValue; - JSINT64 longValue; - - char *cStr; - NpyArrContext *npyarr; - PdBlockContext *pdblock; - int transpose; - char **rowLabels; - char **columnLabels; - npy_intp rowLabelsLen; - npy_intp columnLabelsLen; + JSPFN_ITERBEGIN iterBegin; + JSPFN_ITEREND iterEnd; + JSPFN_ITERNEXT iterNext; + JSPFN_ITERGETNAME iterGetName; + JSPFN_ITERGETVALUE iterGetValue; + PFN_PyTypeToUTF8 PyTypeToUTF8; + PyObject *newObj; + PyObject *dictObj; + Py_ssize_t index; + Py_ssize_t size; + PyObject *itemValue; + PyObject *itemName; + PyObject *attrList; + PyObject *iterator; + + double doubleValue; + JSINT64 longValue; + + char *cStr; + NpyArrContext *npyarr; + PdBlockContext *pdblock; + int transpose; + char **rowLabels; + char **columnLabels; + npy_intp rowLabelsLen; + npy_intp columnLabelsLen; } TypeContext; typedef struct __PyObjectEncoder { - JSONObjectEncoder enc; + JSONObjectEncoder enc; - // pass through the NpyArrContext when encoding multi-dimensional arrays - NpyArrContext *npyCtxtPassthru; + // pass through the NpyArrContext when encoding multi-dimensional arrays + NpyArrContext *npyCtxtPassthru; - // pass through the PdBlockContext when encoding blocks - PdBlockContext *blkCtxtPassthru; + // pass through the PdBlockContext when encoding blocks + PdBlockContext *blkCtxtPassthru; - // pass-through to encode numpy data directly - int npyType; - void *npyValue; + // pass-through to encode numpy data directly + int npyType; + void *npyValue; - int datetimeIso; - NPY_DATETIMEUNIT datetimeUnit; - NPY_DATETIMEUNIT valueUnit; + int datetimeIso; + NPY_DATETIMEUNIT datetimeUnit; + 
NPY_DATETIMEUNIT valueUnit; - // output format style for pandas data types - int outputFormat; - int originalOutputFormat; + // output format style for pandas data types + int outputFormat; + int originalOutputFormat; - PyObject *defaultHandler; + PyObject *defaultHandler; } PyObjectEncoder; #define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv)) enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; -int PdBlock_iterNext(JSOBJ, JSONTypeContext *); +static int PdBlock_iterNext(JSOBJ, JSONTypeContext *); static TypeContext *createTypeContext(void) { - TypeContext *pc; - - pc = PyObject_Malloc(sizeof(TypeContext)); - if (!pc) { - PyErr_NoMemory(); - return NULL; - } - pc->newObj = NULL; - pc->dictObj = NULL; - pc->itemValue = NULL; - pc->itemName = NULL; - pc->attrList = NULL; - pc->index = 0; - pc->size = 0; - pc->longValue = 0; - pc->doubleValue = 0.0; - pc->cStr = NULL; - pc->npyarr = NULL; - pc->pdblock = NULL; - pc->rowLabels = NULL; - pc->columnLabels = NULL; - pc->transpose = 0; - pc->rowLabelsLen = 0; - pc->columnLabelsLen = 0; + TypeContext *pc = PyObject_Malloc(sizeof(TypeContext)); + if (!pc) { + PyErr_NoMemory(); + return NULL; + } + pc->newObj = NULL; + pc->dictObj = NULL; + pc->itemValue = NULL; + pc->itemName = NULL; + pc->attrList = NULL; + pc->index = 0; + pc->size = 0; + pc->longValue = 0; + pc->doubleValue = 0.0; + pc->cStr = NULL; + pc->npyarr = NULL; + pc->pdblock = NULL; + pc->rowLabels = NULL; + pc->columnLabels = NULL; + pc->transpose = 0; + pc->rowLabelsLen = 0; + pc->columnLabelsLen = 0; - return pc; + return pc; } static PyObject *get_values(PyObject *obj) { - PyObject *values = NULL; + PyObject *values = NULL; - if (object_is_index_type(obj) || object_is_series_type(obj)) { - // The special cases to worry about are dt64tz and category[dt64tz]. - // In both cases we want the UTC-localized datetime64 ndarray, - // without going through and object array of Timestamps. - if (PyObject_HasAttrString(obj, "tz")) { - PyObject *tz = PyObject_GetAttrString(obj, "tz"); - if (tz != Py_None) { - // Go through object array if we have dt64tz, since tz info will - // be lost if values is used directly. - Py_DECREF(tz); - values = PyObject_CallMethod(obj, "__array__", NULL); - return values; - } - Py_DECREF(tz); - } - values = PyObject_GetAttrString(obj, "values"); - if (values == NULL) { - // Clear so we can subsequently try another method - PyErr_Clear(); - } else if (PyObject_HasAttrString(values, "__array__")) { - // We may have gotten a Categorical or Sparse array so call np.array - PyObject *array_values = PyObject_CallMethod(values, "__array__", - NULL); - Py_DECREF(values); - values = array_values; - } else if (!PyArray_CheckExact(values)) { - // Didn't get a numpy array, so keep trying - Py_DECREF(values); - values = NULL; - } + if (object_is_index_type(obj) || object_is_series_type(obj)) { + // The special cases to worry about are dt64tz and category[dt64tz]. + // In both cases we want the UTC-localized datetime64 ndarray, + // without going through and object array of Timestamps. + if (PyObject_HasAttrString(obj, "tz")) { + PyObject *tz = PyObject_GetAttrString(obj, "tz"); + if (tz != Py_None) { + // Go through object array if we have dt64tz, since tz info will + // be lost if values is used directly. 
+ Py_DECREF(tz); + values = PyObject_CallMethod(obj, "__array__", NULL); + return values; + } + Py_DECREF(tz); } - + values = PyObject_GetAttrString(obj, "values"); if (values == NULL) { - PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); - PyObject *repr; - if (PyObject_HasAttrString(obj, "dtype")) { - PyObject *dtype = PyObject_GetAttrString(obj, "dtype"); - repr = PyObject_Repr(dtype); - Py_DECREF(dtype); - } else { - repr = PyUnicode_FromString(""); - } + // Clear so we can subsequently try another method + PyErr_Clear(); + } else if (PyObject_HasAttrString(values, "__array__")) { + // We may have gotten a Categorical or Sparse array so call np.array + PyObject *array_values = PyObject_CallMethod(values, "__array__", NULL); + Py_DECREF(values); + values = array_values; + } else if (!PyArray_CheckExact(values)) { + // Didn't get a numpy array, so keep trying + Py_DECREF(values); + values = NULL; + } + } + + if (values == NULL) { + PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); + PyObject *repr; + if (PyObject_HasAttrString(obj, "dtype")) { + PyObject *dtype = PyObject_GetAttrString(obj, "dtype"); + repr = PyObject_Repr(dtype); + Py_DECREF(dtype); + } else { + repr = PyUnicode_FromString(""); + } - PyErr_Format(PyExc_ValueError, "%R or %R are not JSON serializable yet", - repr, typeRepr); - Py_DECREF(repr); - Py_DECREF(typeRepr); + PyErr_Format(PyExc_ValueError, "%R or %R are not JSON serializable yet", + repr, typeRepr); + Py_DECREF(repr); + Py_DECREF(typeRepr); - return NULL; - } + return NULL; + } - return values; + return values; } static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) { - PyObject *tmp = PyObject_GetAttrString(obj, attr); - PyObject *ret; - - if (tmp == 0) { - return 0; - } - ret = PyObject_GetAttrString(tmp, subAttr); - Py_DECREF(tmp); + PyObject *tmp = PyObject_GetAttrString(obj, attr); + if (tmp == 0) { + return 0; + } + PyObject *ret = PyObject_GetAttrString(tmp, subAttr); + Py_DECREF(tmp); - return ret; + return ret; } static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { - PyObject *tmp = PyObject_GetAttrString(obj, attr); - Py_ssize_t ret; - - if (tmp == 0) { - return 0; - } - ret = PyObject_Length(tmp); - Py_DECREF(tmp); - - if (ret == -1) { - return 0; - } - - return ret; -} + PyObject *tmp = PyObject_GetAttrString(obj, attr); + if (tmp == 0) { + return 0; + } + Py_ssize_t ret = PyObject_Length(tmp); + Py_DECREF(tmp); -static int is_simple_frame(PyObject *obj) { - PyObject *mgr = PyObject_GetAttrString(obj, "_mgr"); - if (!mgr) { - return 0; - } - int ret; - if (PyObject_HasAttrString(mgr, "blocks")) { - ret = (get_attr_length(mgr, "blocks") <= 1); - } else { - ret = 0; - } + if (ret == -1) { + return 0; + } - Py_DECREF(mgr); - return ret; + return ret; } static npy_int64 get_long_attr(PyObject *o, const char *attr) { - // NB we are implicitly assuming that o is a Timedelta or Timestamp, or NaT + // NB we are implicitly assuming that o is a Timedelta or Timestamp, or NaT - npy_int64 long_val; - PyObject *value = PyObject_GetAttrString(o, attr); - long_val = - (PyLong_Check(value) ? PyLong_AsLongLong(value) : PyLong_AsLong(value)); + PyObject *value = PyObject_GetAttrString(o, attr); + const npy_int64 long_val = + (PyLong_Check(value) ? PyLong_AsLongLong(value) : PyLong_AsLong(value)); - Py_DECREF(value); - - if (object_is_nat_type(o)) { - // i.e. 
o is NaT, long_val will be NPY_MIN_INT64 - return long_val; - } + Py_DECREF(value); - // ensure we are in nanoseconds, similar to Timestamp._as_creso or _as_unit - PyObject* reso = PyObject_GetAttrString(o, "_creso"); - if (!PyLong_Check(reso)) { - // https://github.com/pandas-dev/pandas/pull/49034#discussion_r1023165139 - Py_DECREF(reso); - return -1; - } + if (object_is_nat_type(o)) { + // i.e. o is NaT, long_val will be NPY_MIN_INT64 + return long_val; + } - long cReso = PyLong_AsLong(reso); + // ensure we are in nanoseconds, similar to Timestamp._as_creso or _as_unit + PyObject *reso = PyObject_GetAttrString(o, "_creso"); + if (!PyLong_Check(reso)) { + // https://github.com/pandas-dev/pandas/pull/49034#discussion_r1023165139 Py_DECREF(reso); - if (cReso == -1 && PyErr_Occurred()) { - return -1; - } + return -1; + } - if (cReso == NPY_FR_us) { - long_val = long_val * 1000L; - } else if (cReso == NPY_FR_ms) { - long_val = long_val * 1000000L; - } else if (cReso == NPY_FR_s) { - long_val = long_val * 1000000000L; - } + long cReso = PyLong_AsLong(reso); + Py_DECREF(reso); + if (cReso == -1 && PyErr_Occurred()) { + return -1; + } + + if (cReso == NPY_FR_us) { + return long_val * 1000L; + } else if (cReso == NPY_FR_ms) { + return long_val * 1000000L; + } else if (cReso == NPY_FR_s) { + return long_val * 1000000000L; + } - return long_val; + return long_val; } static npy_float64 total_seconds(PyObject *td) { - npy_float64 double_val; - PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); - double_val = PyFloat_AS_DOUBLE(value); - Py_DECREF(value); - return double_val; + PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); + const npy_float64 double_val = PyFloat_AS_DOUBLE(value); + Py_DECREF(value); + return double_val; } static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), size_t *_outLen) { - PyObject *obj = (PyObject *)_obj; - *_outLen = PyBytes_GET_SIZE(obj); - return PyBytes_AS_STRING(obj); -} - -static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, - size_t *_outLen) { - char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj, - (Py_ssize_t *)_outLen); - if (encoded == NULL) { - /* Something went wrong. - Set errorMsg(to tell encoder to stop), - and let Python exception propagate. */ - JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; - enc->errorMsg = "Encoding failed."; - } - return encoded; + PyObject *obj = (PyObject *)_obj; + *_outLen = PyBytes_GET_SIZE(obj); + return PyBytes_AS_STRING(obj); +} + +static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, size_t *_outLen) { + char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); + if (encoded == NULL) { + /* Something went wrong. + Set errorMsg(to tell encoder to stop), + and let Python exception propagate. */ + JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; + enc->errorMsg = "Encoding failed."; + } + return encoded; } /* JSON callback. 
returns a char* and mutates the pointer to *len */ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), JSONTypeContext *tc, size_t *len) { - NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; - GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); - return GET_TC(tc)->cStr; + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; + GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); + return GET_TC(tc)->cStr; } /* JSON callback. returns a char* and mutates the pointer to *len */ static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), JSONTypeContext *tc, size_t *len) { - GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len); - return GET_TC(tc)->cStr; + GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len); + return GET_TC(tc)->cStr; } /* JSON callback */ static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, size_t *len) { - if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) { - PyErr_SetString(PyExc_TypeError, "Expected date or datetime object"); - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - return NULL; - } + if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) { + PyErr_SetString(PyExc_TypeError, "Expected date or datetime object"); + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + return NULL; + } - NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - return PyDateTimeToIso(obj, base, len); + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + return PyDateTimeToIso(obj, base, len); } static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { - PyObject *obj = (PyObject *)_obj; - PyObject *str; - PyObject *tmp; - - str = PyObject_CallMethod(obj, "isoformat", NULL); - if (str == NULL) { - *outLen = 0; - if (!PyErr_Occurred()) { - PyErr_SetString(PyExc_ValueError, "Failed to convert time"); - } - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - return NULL; - } - if (PyUnicode_Check(str)) { - tmp = str; - str = PyUnicode_AsUTF8String(str); - Py_DECREF(tmp); + PyObject *obj = (PyObject *)_obj; + PyObject *str = PyObject_CallMethod(obj, "isoformat", NULL); + if (str == NULL) { + *outLen = 0; + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ValueError, "Failed to convert time"); } + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + return NULL; + } + if (PyUnicode_Check(str)) { + PyObject *tmp = str; + str = PyUnicode_AsUTF8String(str); + Py_DECREF(tmp); + } - GET_TC(tc)->newObj = str; + GET_TC(tc)->newObj = str; - *outLen = PyBytes_GET_SIZE(str); - char *outValue = PyBytes_AS_STRING(str); - return outValue; + *outLen = PyBytes_GET_SIZE(str); + char *outValue = PyBytes_AS_STRING(str); + return outValue; } //============================================================================= @@ -408,167 +378,181 @@ //============================================================================= static void NpyArr_freeItemValue(JSOBJ Py_UNUSED(_obj), JSONTypeContext *tc) { - if (GET_TC(tc)->npyarr && - GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) { - Py_XDECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } + if (GET_TC(tc)->npyarr && + GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) { + Py_XDECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } } -int NpyArr_iterNextNone(JSOBJ Py_UNUSED(_obj), 
JSONTypeContext *Py_UNUSED(tc)) { - return 0; +static int NpyArr_iterNextNone(JSOBJ Py_UNUSED(_obj), + JSONTypeContext *Py_UNUSED(tc)) { + return 0; } -void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { - PyArrayObject *obj; - NpyArrContext *npyarr; +static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { + PyArrayObject *obj = + (PyArrayObject *)(GET_TC(tc)->newObj ? GET_TC(tc)->newObj : _obj); - if (GET_TC(tc)->newObj) { - obj = (PyArrayObject *)GET_TC(tc)->newObj; - } else { - obj = (PyArrayObject *)_obj; - } + NpyArrContext *npyarr = PyObject_Malloc(sizeof(NpyArrContext)); + GET_TC(tc)->npyarr = npyarr; - npyarr = PyObject_Malloc(sizeof(NpyArrContext)); - GET_TC(tc)->npyarr = npyarr; + if (!npyarr) { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } - if (!npyarr) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - - npyarr->array = (PyObject *)obj; - npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem; - npyarr->dataptr = PyArray_DATA(obj); - npyarr->ndim = PyArray_NDIM(obj) - 1; - npyarr->curdim = 0; - npyarr->type_num = PyArray_DESCR(obj)->type_num; - - if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); - npyarr->stridedim = npyarr->ndim; - npyarr->index[npyarr->ndim] = 0; - npyarr->inc = -1; - } else { - npyarr->dim = PyArray_DIM(obj, 0); - npyarr->stride = PyArray_STRIDE(obj, 0); - npyarr->stridedim = 0; - npyarr->index[0] = 0; - npyarr->inc = 1; - } + npyarr->array = (PyObject *)obj; + npyarr->dataptr = PyArray_DATA(obj); + npyarr->ndim = PyArray_NDIM(obj) - 1; + npyarr->curdim = 0; + npyarr->type_num = PyArray_DESCR(obj)->type_num; + + if (GET_TC(tc)->transpose) { + npyarr->dim = PyArray_DIM(obj, npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->stridedim = npyarr->ndim; + npyarr->index[npyarr->ndim] = 0; + npyarr->inc = -1; + } else { + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; + } - npyarr->columnLabels = GET_TC(tc)->columnLabels; - npyarr->rowLabels = GET_TC(tc)->rowLabels; + npyarr->columnLabels = GET_TC(tc)->columnLabels; + npyarr->rowLabels = GET_TC(tc)->rowLabels; } -void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; +static void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + NpyArrContext *npyarr = GET_TC(tc)->npyarr; - if (npyarr) { - NpyArr_freeItemValue(obj, tc); - PyObject_Free(npyarr); - } + if (npyarr) { + NpyArr_freeItemValue(obj, tc); + PyObject_Free(npyarr); + } } -void NpyArrPassThru_iterBegin(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc)) {} +static void NpyArrPassThru_iterBegin(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc)) {} -void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; - // finished this dimension, reset the data pointer - npyarr->curdim--; - npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; - npyarr->stridedim -= npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); - npyarr->dataptr += npyarr->stride; +static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + // finished this dimension, reset the data pointer + npyarr->curdim--; + npyarr->dataptr -= 
npyarr->stride * npyarr->index[npyarr->stridedim]; + npyarr->stridedim -= npyarr->inc; + + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArrayPassThru_iterEnd received a non-array object"); + return; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; + npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); + npyarr->dataptr += npyarr->stride; - NpyArr_freeItemValue(obj, tc); + NpyArr_freeItemValue(obj, tc); } -int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; +static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { + NpyArrContext *npyarr = GET_TC(tc)->npyarr; - if (PyErr_Occurred()) { - return 0; - } + if (PyErr_Occurred()) { + return 0; + } - if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { - return 0; - } + if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { + return 0; + } - NpyArr_freeItemValue(obj, tc); + NpyArr_freeItemValue(obj, tc); - if (PyArray_ISDATETIME(npyarr->array)) { - GET_TC(tc)->itemValue = obj; - Py_INCREF(obj); - ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array); - // Also write the resolution (unit) of the ndarray - PyArray_Descr *dtype = PyArray_DESCR(npyarr->array); - ((PyObjectEncoder *)tc->encoder)->valueUnit = - get_datetime_metadata_from_dtype(dtype).base; - ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; - } else { - GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); - } + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArr_iterNextItem received a non-array object"); + return 0; + } + PyArrayObject *arrayobj = (PyArrayObject *)npyarr->array; - npyarr->dataptr += npyarr->stride; - npyarr->index[npyarr->stridedim]++; - return 1; -} + if (PyArray_ISDATETIME(arrayobj)) { + GET_TC(tc)->itemValue = obj; + Py_INCREF(obj); + ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(arrayobj); + // Also write the resolution (unit) of the ndarray + PyArray_Descr *dtype = PyArray_DESCR(arrayobj); + ((PyObjectEncoder *)tc->encoder)->valueUnit = + get_datetime_metadata_from_dtype(dtype).base; + ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; + ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; + } else { + GET_TC(tc)->itemValue = PyArray_GETITEM(arrayobj, npyarr->dataptr); + } -int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; + npyarr->dataptr += npyarr->stride; + npyarr->index[npyarr->stridedim]++; + return 1; +} - if (PyErr_Occurred()) { - return 0; - } +static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { + NpyArrContext *npyarr = GET_TC(tc)->npyarr; - if (npyarr->curdim >= npyarr->ndim || - npyarr->index[npyarr->stridedim] >= npyarr->dim) { - // innermost dimension, start retrieving item values - GET_TC(tc)->iterNext = NpyArr_iterNextItem; - return NpyArr_iterNextItem(_obj, tc); - } + if (PyErr_Occurred()) { + return 0; + } - // dig a dimension deeper - npyarr->index[npyarr->stridedim]++; + if (npyarr->curdim >= npyarr->ndim || + npyarr->index[npyarr->stridedim] >= npyarr->dim) { + // innermost dimension, start retrieving item values + GET_TC(tc)->iterNext = NpyArr_iterNextItem; + return NpyArr_iterNextItem(_obj, tc); + } + + // dig a dimension deeper + npyarr->index[npyarr->stridedim]++; + + npyarr->curdim++; + npyarr->stridedim += npyarr->inc; + if 
(!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArr_iterNext received a non-array object"); + return 0; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; - npyarr->curdim++; - npyarr->stridedim += npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); - npyarr->index[npyarr->stridedim] = 0; + npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); + npyarr->index[npyarr->stridedim] = 0; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; - GET_TC(tc)->itemValue = npyarr->array; - return 1; + ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; + GET_TC(tc)->itemValue = npyarr->array; + return 1; } -JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; +static JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - NpyArrContext *npyarr = GET_TC(tc)->npyarr; - npy_intp idx; - char *cStr; +static char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + char *cStr; - if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { - idx = npyarr->index[npyarr->stridedim] - 1; - cStr = npyarr->columnLabels[idx]; - } else { - idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; - cStr = npyarr->rowLabels[idx]; - } + if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { + const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; + cStr = npyarr->columnLabels[idx]; + } else { + const npy_intp idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; + cStr = npyarr->rowLabels[idx]; + } - *outLen = strlen(cStr); + *outLen = strlen(cStr); - return cStr; + return cStr; } //============================================================================= @@ -580,301 +564,289 @@ // Uses a dedicated NpyArrContext for each column. //============================================================================= -void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; +static void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - if (blkCtxt->transpose) { - blkCtxt->colIdx++; - } else { - blkCtxt->colIdx = 0; - } + if (blkCtxt->transpose) { + blkCtxt->colIdx++; + } else { + blkCtxt->colIdx = 0; + } - NpyArr_freeItemValue(obj, tc); + NpyArr_freeItemValue(obj, tc); } -int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - - if (blkCtxt->colIdx >= blkCtxt->ncols) { - return 0; - } +static int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - blkCtxt->colIdx++; - return NpyArr_iterNextItem(obj, tc); -} - -char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; - npy_intp idx; - char *cStr; - - if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { - idx = blkCtxt->colIdx - 1; - cStr = npyarr->columnLabels[idx]; - } else { - idx = GET_TC(tc)->iterNext != PdBlock_iterNext - ? 
npyarr->index[npyarr->stridedim - npyarr->inc] - 1 - : npyarr->index[npyarr->stridedim]; + if (blkCtxt->colIdx >= blkCtxt->ncols) { + return 0; + } - cStr = npyarr->rowLabels[idx]; - } + GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; + blkCtxt->colIdx++; + return NpyArr_iterNextItem(obj, tc); +} + +static char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; + char *cStr; + + if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { + const npy_intp idx = blkCtxt->colIdx - 1; + cStr = npyarr->columnLabels[idx]; + } else { + const npy_intp idx = + GET_TC(tc)->iterNext != PdBlock_iterNext + ? npyarr->index[npyarr->stridedim - npyarr->inc] - 1 + : npyarr->index[npyarr->stridedim]; + + cStr = npyarr->rowLabels[idx]; + } + + *outLen = strlen(cStr); + return cStr; +} + +static char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc, + size_t *outLen) { + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; + NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; + char *cStr; + + if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { + const npy_intp idx = npyarr->index[npyarr->stridedim] - 1; + cStr = npyarr->columnLabels[idx]; + } else { + const npy_intp idx = blkCtxt->colIdx; + cStr = npyarr->rowLabels[idx]; + } - *outLen = strlen(cStr); - return cStr; + *outLen = strlen(cStr); + return cStr; } -char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - npy_intp idx; - char *cStr; +static int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { - idx = npyarr->index[npyarr->stridedim] - 1; - cStr = npyarr->columnLabels[idx]; - } else { - idx = blkCtxt->colIdx; - cStr = npyarr->rowLabels[idx]; - } - - *outLen = strlen(cStr); - return cStr; -} - -int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - NpyArrContext *npyarr; + if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { + return 0; + } - if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { - return 0; + if (blkCtxt->transpose) { + if (blkCtxt->colIdx >= blkCtxt->ncols) { + return 0; } - - if (blkCtxt->transpose) { - if (blkCtxt->colIdx >= blkCtxt->ncols) { - return 0; - } - } else { - npyarr = blkCtxt->npyCtxts[0]; - if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { - return 0; - } + } else { + const NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; + if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { + return 0; } + } - ((PyObjectEncoder *)tc->encoder)->blkCtxtPassthru = blkCtxt; - GET_TC(tc)->itemValue = obj; + ((PyObjectEncoder *)tc->encoder)->blkCtxtPassthru = blkCtxt; + GET_TC(tc)->itemValue = obj; - return 1; + return 1; } -void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; +static void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), + JSONTypeContext *tc) { + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - if (blkCtxt->transpose) { - // if transposed we exhaust each column before moving to the next - GET_TC(tc)->iterNext = NpyArr_iterNextItem; - GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose; - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; - } + 
if (blkCtxt->transpose) { + // if transposed we exhaust each column before moving to the next + GET_TC(tc)->iterNext = NpyArr_iterNextItem; + GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose; + GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; + } } -void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj, *values, *arrays, *array; - PdBlockContext *blkCtxt; - NpyArrContext *npyarr; - Py_ssize_t i; +static void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { + PyObject *obj = (PyObject *)_obj; - obj = (PyObject *)_obj; + GET_TC(tc)->iterGetName = GET_TC(tc)->transpose + ? PdBlock_iterGetName_Transpose + : PdBlock_iterGetName; - GET_TC(tc)->iterGetName = GET_TC(tc)->transpose - ? PdBlock_iterGetName_Transpose - : PdBlock_iterGetName; + PdBlockContext *blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); + if (!blkCtxt) { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } + GET_TC(tc)->pdblock = blkCtxt; - blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); - if (!blkCtxt) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } - GET_TC(tc)->pdblock = blkCtxt; + blkCtxt->colIdx = 0; + blkCtxt->transpose = GET_TC(tc)->transpose; + blkCtxt->ncols = get_attr_length(obj, "columns"); - blkCtxt->colIdx = 0; - blkCtxt->transpose = GET_TC(tc)->transpose; - blkCtxt->ncols = get_attr_length(obj, "columns"); + if (blkCtxt->ncols == 0) { + blkCtxt->npyCtxts = NULL; - if (blkCtxt->ncols == 0) { - blkCtxt->npyCtxts = NULL; + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } + blkCtxt->npyCtxts = PyObject_Malloc(sizeof(NpyArrContext *) * blkCtxt->ncols); + if (!blkCtxt->npyCtxts) { + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } - blkCtxt->npyCtxts = - PyObject_Malloc(sizeof(NpyArrContext *) * blkCtxt->ncols); - if (!blkCtxt->npyCtxts) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; - } + PyObject *arrays = get_sub_attr(obj, "_mgr", "column_arrays"); + if (!arrays) { + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; + } - arrays = get_sub_attr(obj, "_mgr", "column_arrays"); - if (!arrays) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; + for (Py_ssize_t i = 0; i < PyObject_Length(arrays); i++) { + PyObject *array = PyList_GET_ITEM(arrays, i); + if (!array) { + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto ARR_RET; } - for (i = 0; i < PyObject_Length(arrays); i++) { - array = PyList_GET_ITEM(arrays, i); - if (!array) { - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto ARR_RET; - } - - // ensure we have a numpy array (i.e. np.asarray) - values = PyObject_CallMethod(array, "__array__", NULL); - if ((!values) || (!PyArray_CheckExact(values))) { - // Didn't get a numpy array - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto ARR_RET; - } + // ensure we have a numpy array (i.e. 
np.asarray) + PyObject *values = PyObject_CallMethod(array, "__array__", NULL); + if ((!values) || (!PyArray_CheckExact(values))) { + // Didn't get a numpy array + ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + goto ARR_RET; + } - GET_TC(tc)->newObj = values; + GET_TC(tc)->newObj = values; - // init a dedicated context for this column - NpyArr_iterBegin(obj, tc); - npyarr = GET_TC(tc)->npyarr; + // init a dedicated context for this column + NpyArr_iterBegin(obj, tc); - GET_TC(tc)->itemValue = NULL; - ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; + GET_TC(tc)->itemValue = NULL; + ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL; - blkCtxt->npyCtxts[i] = npyarr; - GET_TC(tc)->newObj = NULL; - } - GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; - goto ARR_RET; + blkCtxt->npyCtxts[i] = GET_TC(tc)->npyarr; + GET_TC(tc)->newObj = NULL; + } + GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; + goto ARR_RET; ARR_RET: - Py_DECREF(arrays); + Py_DECREF(arrays); } -void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - PdBlockContext *blkCtxt; - NpyArrContext *npyarr; - int i; - - GET_TC(tc)->itemValue = NULL; - npyarr = GET_TC(tc)->npyarr; +static void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->itemValue = NULL; + NpyArrContext *npyarr = GET_TC(tc)->npyarr; + PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - blkCtxt = GET_TC(tc)->pdblock; - - if (blkCtxt) { - for (i = 0; i < blkCtxt->ncols; i++) { - npyarr = blkCtxt->npyCtxts[i]; - if (npyarr) { - if (npyarr->array) { - Py_DECREF(npyarr->array); - npyarr->array = NULL; - } + if (blkCtxt) { + for (int i = 0; i < blkCtxt->ncols; i++) { + npyarr = blkCtxt->npyCtxts[i]; + if (npyarr) { + if (npyarr->array) { + Py_DECREF(npyarr->array); + npyarr->array = NULL; + } - GET_TC(tc)->npyarr = npyarr; - NpyArr_iterEnd(obj, tc); + GET_TC(tc)->npyarr = npyarr; + NpyArr_iterEnd(obj, tc); - blkCtxt->npyCtxts[i] = NULL; - } - } + blkCtxt->npyCtxts[i] = NULL; + } + } - if (blkCtxt->npyCtxts) { - PyObject_Free(blkCtxt->npyCtxts); - } - PyObject_Free(blkCtxt); + if (blkCtxt->npyCtxts) { + PyObject_Free(blkCtxt->npyCtxts); } + PyObject_Free(blkCtxt); + } } //============================================================================= // Tuple iteration functions // itemValue is borrowed reference, no ref counting //============================================================================= -void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->index = 0; - GET_TC(tc)->size = PyTuple_GET_SIZE((PyObject *)obj); - GET_TC(tc)->itemValue = NULL; +static void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyTuple_GET_SIZE((PyObject *)obj); + GET_TC(tc)->itemValue = NULL; } -int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) { - PyObject *item; +static int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (GET_TC(tc)->index >= GET_TC(tc)->size) { - return 0; - } + if (GET_TC(tc)->index >= GET_TC(tc)->size) { + return 0; + } - item = PyTuple_GET_ITEM(obj, GET_TC(tc)->index); + PyObject *item = PyTuple_GET_ITEM(obj, GET_TC(tc)->index); - GET_TC(tc)->itemValue = item; - GET_TC(tc)->index++; - return 1; + GET_TC(tc)->itemValue = item; + GET_TC(tc)->index++; + return 1; } -void Tuple_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {} +static void Tuple_iterEnd(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc)) {} -JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; 
+static JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { - return NULL; +static char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { + return NULL; } //============================================================================= // Set iteration functions // itemValue is borrowed reference, no ref counting //============================================================================= -void Set_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->itemValue = NULL; - GET_TC(tc)->iterator = PyObject_GetIter(obj); +static void Set_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->itemValue = NULL; + GET_TC(tc)->iterator = PyObject_GetIter(obj); } -int Set_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObject *item; - - if (GET_TC(tc)->itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } +static int Set_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } - item = PyIter_Next(GET_TC(tc)->iterator); + PyObject *item = PyIter_Next(GET_TC(tc)->iterator); - if (item == NULL) { - return 0; - } + if (item == NULL) { + return 0; + } - GET_TC(tc)->itemValue = item; - return 1; + GET_TC(tc)->itemValue = item; + return 1; } -void Set_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (GET_TC(tc)->itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } +static void Set_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } - if (GET_TC(tc)->iterator) { - Py_DECREF(GET_TC(tc)->iterator); - GET_TC(tc)->iterator = NULL; - } + if (GET_TC(tc)->iterator) { + Py_DECREF(GET_TC(tc)->iterator); + GET_TC(tc)->iterator = NULL; + } } -JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; +static JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *Set_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { - return NULL; +static char *Set_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { + return NULL; } //============================================================================= @@ -882,294 +854,285 @@ // itemName ref is borrowed from PyObject_Dir (attrList). No refcount // itemValue ref is from PyObject_GetAttr. 
Ref counted //============================================================================= -void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->attrList = PyObject_Dir(obj); - GET_TC(tc)->index = 0; - GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList); +static void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->attrList = PyObject_Dir(obj); + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList); } -void Dir_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (GET_TC(tc)->itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = NULL; - } +static void Dir_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + if (GET_TC(tc)->itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = NULL; + } - if (GET_TC(tc)->itemName) { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = NULL; - } + if (GET_TC(tc)->itemName) { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } - Py_DECREF((PyObject *)GET_TC(tc)->attrList); + Py_DECREF((PyObject *)GET_TC(tc)->attrList); } -int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj = (PyObject *)_obj; - PyObject *itemValue = GET_TC(tc)->itemValue; - PyObject *itemName = GET_TC(tc)->itemName; - PyObject *attr; - PyObject *attrName; - char *attrStr; +static int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { + PyObject *obj = (PyObject *)_obj; + PyObject *itemValue = GET_TC(tc)->itemValue; + PyObject *itemName = GET_TC(tc)->itemName; - if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { - return 0; - } - - if (itemValue) { - Py_DECREF(GET_TC(tc)->itemValue); - GET_TC(tc)->itemValue = itemValue = NULL; - } + if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { + return 0; + } - if (itemName) { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = itemName = NULL; + if (itemValue) { + Py_DECREF(GET_TC(tc)->itemValue); + GET_TC(tc)->itemValue = itemValue = NULL; + } + + if (itemName) { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = itemName = NULL; + } + + for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index++) { + PyObject *attrName = + PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index); + PyObject *attr = PyUnicode_AsUTF8String(attrName); + const char *attrStr = PyBytes_AS_STRING(attr); + + if (attrStr[0] == '_') { + Py_DECREF(attr); + continue; + } + + itemValue = PyObject_GetAttr(obj, attrName); + if (itemValue == NULL) { + PyErr_Clear(); + Py_DECREF(attr); + continue; + } + + if (PyCallable_Check(itemValue)) { + Py_DECREF(itemValue); + Py_DECREF(attr); + continue; } - for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index++) { - attrName = PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index); - attr = PyUnicode_AsUTF8String(attrName); - attrStr = PyBytes_AS_STRING(attr); - - if (attrStr[0] == '_') { - Py_DECREF(attr); - continue; - } - - itemValue = PyObject_GetAttr(obj, attrName); - if (itemValue == NULL) { - PyErr_Clear(); - Py_DECREF(attr); - continue; - } - - if (PyCallable_Check(itemValue)) { - Py_DECREF(itemValue); - Py_DECREF(attr); - continue; - } - - GET_TC(tc)->itemName = itemName; - GET_TC(tc)->itemValue = itemValue; + GET_TC(tc)->itemName = itemName; + GET_TC(tc)->itemValue = itemValue; - itemName = attr; - break; - } + itemName = attr; + break; + } - if (itemName == NULL) { - GET_TC(tc)->index = GET_TC(tc)->size; - GET_TC(tc)->itemValue = NULL; - return 0; - } + if (itemName == NULL) { + GET_TC(tc)->index = 
GET_TC(tc)->size; + GET_TC(tc)->itemValue = NULL; + return 0; + } - GET_TC(tc)->itemName = itemName; - GET_TC(tc)->itemValue = itemValue; - GET_TC(tc)->index++; + GET_TC(tc)->itemName = itemName; + GET_TC(tc)->itemValue = itemValue; + GET_TC(tc)->index++; - return 1; + return 1; } -JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; +static JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); - return PyBytes_AS_STRING(GET_TC(tc)->itemName); +static char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { + *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); + return PyBytes_AS_STRING(GET_TC(tc)->itemName); } //============================================================================= // List iteration functions // itemValue is borrowed from object (which is list). No refcounting //============================================================================= -void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->index = 0; - GET_TC(tc)->size = PyList_GET_SIZE((PyObject *)obj); +static void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->index = 0; + GET_TC(tc)->size = PyList_GET_SIZE((PyObject *)obj); } -int List_iterNext(JSOBJ obj, JSONTypeContext *tc) { - if (GET_TC(tc)->index >= GET_TC(tc)->size) { - return 0; - } +static int List_iterNext(JSOBJ obj, JSONTypeContext *tc) { + if (GET_TC(tc)->index >= GET_TC(tc)->size) { + return 0; + } - GET_TC(tc)->itemValue = PyList_GET_ITEM(obj, GET_TC(tc)->index); - GET_TC(tc)->index++; - return 1; + GET_TC(tc)->itemValue = PyList_GET_ITEM(obj, GET_TC(tc)->index); + GET_TC(tc)->index++; + return 1; } -void List_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {} +static void List_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) { +} -JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; +static JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *List_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { - return NULL; +static char *List_iterGetName(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { + return NULL; } //============================================================================= // pandas Index iteration functions //============================================================================= -void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } +static void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + GET_TC(tc)->index = 0; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + if (!GET_TC(tc)->cStr) { + PyErr_NoMemory(); + } } -int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { - Py_ssize_t index; - if (!GET_TC(tc)->cStr) { - return 0; - } +static int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { + if (!GET_TC(tc)->cStr) { + return 0; + } - index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); - } else if (index == 1) { - 
memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); - GET_TC(tc)->itemValue = get_values(obj); - if (!GET_TC(tc)->itemValue) { - return 0; - } - } else { - return 0; + const Py_ssize_t index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) { + memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } else if (index == 1) { + memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->itemValue = get_values(obj); + if (!GET_TC(tc)->itemValue) { + return 0; } + } else { + return 0; + } - GET_TC(tc)->index++; - return 1; + GET_TC(tc)->index++; + return 1; } -void Index_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {} +static void Index_iterEnd(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc)) {} -JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; +static JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; +static char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= // pandas Series iteration functions //============================================================================= -void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } +static void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + GET_TC(tc)->index = 0; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series + if (!GET_TC(tc)->cStr) { + PyErr_NoMemory(); + } } -int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { - Py_ssize_t index; - if (!GET_TC(tc)->cStr) { - return 0; - } +static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { + if (!GET_TC(tc)->cStr) { + return 0; + } - index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) { - memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); - } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); - } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); - GET_TC(tc)->itemValue = get_values(obj); - if (!GET_TC(tc)->itemValue) { - return 0; - } - } else { - return 0; + const Py_ssize_t index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) { + memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); + } else if (index == 1) { + memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } else if (index == 2) { + memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + GET_TC(tc)->itemValue = get_values(obj); + if (!GET_TC(tc)->itemValue) { + return 0; } + } else { + return 0; + } - GET_TC(tc)->index++; - return 1; + 
GET_TC(tc)->index++; + return 1; } -void Series_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - enc->outputFormat = enc->originalOutputFormat; +static void Series_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + enc->outputFormat = enc->originalOutputFormat; } -JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; +static JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; +static char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= // pandas DataFrame iteration functions //============================================================================= -void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - GET_TC(tc)->index = 0; - GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series & index - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - } +static void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; + GET_TC(tc)->index = 0; + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + enc->outputFormat = VALUES; // for contained series & index + if (!GET_TC(tc)->cStr) { + PyErr_NoMemory(); + } } -int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { - Py_ssize_t index; - if (!GET_TC(tc)->cStr) { - return 0; - } +static int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { + if (!GET_TC(tc)->cStr) { + return 0; + } - index = GET_TC(tc)->index; - Py_XDECREF(GET_TC(tc)->itemValue); - if (index == 0) { - memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); - } else if (index == 1) { - memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); - } else if (index == 2) { - memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); - if (is_simple_frame(obj)) { - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); - if (!GET_TC(tc)->itemValue) { - return 0; - } - } else { - Py_INCREF(obj); - GET_TC(tc)->itemValue = obj; - } - } else { - return 0; - } + const Py_ssize_t index = GET_TC(tc)->index; + Py_XDECREF(GET_TC(tc)->itemValue); + if (index == 0) { + memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); + } else if (index == 1) { + memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6); + GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); + } else if (index == 2) { + memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5); + Py_INCREF(obj); + GET_TC(tc)->itemValue = obj; + } else { + return 0; + } - GET_TC(tc)->index++; - return 1; + GET_TC(tc)->index++; + return 1; } -void DataFrame_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - enc->outputFormat = enc->originalOutputFormat; +static void DataFrame_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + PyObjectEncoder *enc = 
(PyObjectEncoder *)tc->encoder; + enc->outputFormat = enc->originalOutputFormat; } -JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; +static JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; } -char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - *outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; +static char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= @@ -1177,63 +1140,59 @@ // itemName might converted to string (Python_Str). Do refCounting // itemValue is borrowed from object (which is dict). No refCounting //============================================================================= -void Dict_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - GET_TC(tc)->index = 0; +static void Dict_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + GET_TC(tc)->index = 0; } -int Dict_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PyObject *itemNameTmp; - - if (GET_TC(tc)->itemName) { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = NULL; - } +static int Dict_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + if (GET_TC(tc)->itemName) { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } - if (!PyDict_Next((PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index, - &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) { - return 0; - } - - if (PyUnicode_Check(GET_TC(tc)->itemName)) { - GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName); - } else if (!PyBytes_Check(GET_TC(tc)->itemName)) { - GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName); - itemNameTmp = GET_TC(tc)->itemName; - GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName); - Py_DECREF(itemNameTmp); - } else { - Py_INCREF(GET_TC(tc)->itemName); - } - return 1; -} - -void Dict_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (GET_TC(tc)->itemName) { - Py_DECREF(GET_TC(tc)->itemName); - GET_TC(tc)->itemName = NULL; - } - Py_DECREF(GET_TC(tc)->dictObj); -} - -JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->itemValue; -} - -char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, - size_t *outLen) { - *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); - return PyBytes_AS_STRING(GET_TC(tc)->itemName); -} - -void NpyArr_freeLabels(char **labels, npy_intp len) { - npy_intp i; + if (!PyDict_Next((PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index, + &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) { + return 0; + } - if (labels) { - for (i = 0; i < len; i++) { - PyObject_Free(labels[i]); - } - PyObject_Free(labels); + if (PyUnicode_Check(GET_TC(tc)->itemName)) { + GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName); + } else if (!PyBytes_Check(GET_TC(tc)->itemName)) { + GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName); + PyObject *itemNameTmp = GET_TC(tc)->itemName; + GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName); + Py_DECREF(itemNameTmp); + } else { + Py_INCREF(GET_TC(tc)->itemName); + } + return 1; +} + +static void Dict_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + if (GET_TC(tc)->itemName) { + Py_DECREF(GET_TC(tc)->itemName); + GET_TC(tc)->itemName = NULL; + } + Py_DECREF(GET_TC(tc)->dictObj); +} + 
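A side note on the hunks above: the Index_/Series_/DataFrame_/Dict_ iterators all follow the same shape — a small per-object state (an index counter, a scratch name buffer, the current item) that the generic encoder drives by calling iterNext/iterGetName/iterGetValue until iterNext returns 0. The following is a minimal, self-contained sketch of that callback shape only; it is not part of the patch, uses invented names (IterContext, iter_next, the "name"/"data" fields), and leaves out the real JSONTypeContext/GET_TC machinery.

#include <stdio.h>
#include <string.h>

/* Stand-in for the per-object state the pandas encoder keeps in TypeContext. */
typedef struct {
  int index;         /* which field we are on, like GET_TC(tc)->index        */
  char name[8];      /* current field name, like the cStr scratch buffer     */
  const char *value; /* current field value, stands in for itemValue         */
} IterContext;

static void iter_begin(IterContext *ctx) { ctx->index = 0; }

/* Returns 1 while another (name, value) pair remains, 0 once exhausted. */
static int iter_next(IterContext *ctx) {
  if (ctx->index == 0) {
    memcpy(ctx->name, "name", 5);
    ctx->value = "\"example\"";
  } else if (ctx->index == 1) {
    memcpy(ctx->name, "data", 5);
    ctx->value = "[1, 2, 3]";
  } else {
    return 0; /* done: the caller closes the JSON object */
  }
  ctx->index++;
  return 1;
}

static const char *iter_get_name(const IterContext *ctx) { return ctx->name; }
static const char *iter_get_value(const IterContext *ctx) { return ctx->value; }

int main(void) {
  IterContext ctx;
  iter_begin(&ctx);
  /* The encoder runs the same next/getName/getValue loop for every container. */
  while (iter_next(&ctx))
    printf("\"%s\": %s\n", iter_get_name(&ctx), iter_get_value(&ctx));
  return 0;
}

Splitting serialization into begin/next/getName/getValue callbacks is what lets one generic encoder loop handle dicts, Series, Index and DataFrame objects alike; the hunks above only tighten that existing pattern (static linkage, two-space indentation, scoped locals) without changing its behaviour.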
+static JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->itemValue; +} + +static char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { + *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); + return PyBytes_AS_STRING(GET_TC(tc)->itemName); +} + +static void NpyArr_freeLabels(char **labels, npy_intp len) { + if (labels) { + for (npy_intp i = 0; i < len; i++) { + PyObject_Free(labels[i]); } + PyObject_Free(labels); + } } /* @@ -1253,895 +1212,881 @@ * this has instead just stringified any input save for datetime values, * which may need to be represented in various formats. */ -char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, - npy_intp num) { - // NOTE this function steals a reference to labels. - PyObject *item = NULL; - size_t len; - npy_intp i, stride; - char **ret; - char *dataptr, *cLabel; - int type_num; - PyArray_Descr *dtype; - NPY_DATETIMEUNIT base = enc->datetimeUnit; - - if (!labels) { - return 0; - } - - if (PyArray_SIZE(labels) < num) { - PyErr_SetString( - PyExc_ValueError, - "Label array sizes do not match corresponding data shape"); - Py_DECREF(labels); - return 0; - } +static char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, + npy_intp num) { + // NOTE this function steals a reference to labels. + PyObject *item = NULL; + const NPY_DATETIMEUNIT base = enc->datetimeUnit; - ret = PyObject_Malloc(sizeof(char *) * num); - if (!ret) { - PyErr_NoMemory(); - Py_DECREF(labels); - return 0; - } - - for (i = 0; i < num; i++) { - ret[i] = NULL; - } + if (!labels) { + return 0; + } - stride = PyArray_STRIDE(labels, 0); - dataptr = PyArray_DATA(labels); - type_num = PyArray_TYPE(labels); - dtype = PyArray_DESCR(labels); + if (PyArray_SIZE(labels) < num) { + PyErr_SetString(PyExc_ValueError, + "Label array sizes do not match corresponding data shape"); + Py_DECREF(labels); + return 0; + } - for (i = 0; i < num; i++) { - item = PyArray_GETITEM(labels, dataptr); - if (!item) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } + char **ret = PyObject_Malloc(sizeof(char *) * num); + if (!ret) { + PyErr_NoMemory(); + Py_DECREF(labels); + return 0; + } - int is_datetimelike = 0; - npy_int64 i8date; - NPY_DATETIMEUNIT dateUnit = NPY_FR_ns; - if (PyTypeNum_ISDATETIME(type_num)) { - is_datetimelike = 1; - i8date = *(npy_int64 *)dataptr; - dateUnit = get_datetime_metadata_from_dtype(dtype).base; - } else if (PyDate_Check(item) || PyDelta_Check(item)) { - is_datetimelike = 1; - if (PyObject_HasAttrString(item, "_value")) { - // see test_date_index_and_values for case with non-nano - i8date = get_long_attr(item, "_value"); - } else { - if (PyDelta_Check(item)) { - i8date = total_seconds(item) * - 1000000000LL; // nanoseconds per second - } else { - // datetime.* objects don't follow above rules - i8date = PyDateTimeToEpoch(item, NPY_FR_ns); - } - } + for (npy_intp i = 0; i < num; i++) { + ret[i] = NULL; + } + + const npy_intp stride = PyArray_STRIDE(labels, 0); + char *dataptr = PyArray_DATA(labels); + const int type_num = PyArray_TYPE(labels); + PyArray_Descr *dtype = PyArray_DESCR(labels); + + for (npy_intp i = 0; i < num; i++) { + item = PyArray_GETITEM(labels, dataptr); + if (!item) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + int is_datetimelike = 0; + int64_t i8date; + NPY_DATETIMEUNIT dateUnit = NPY_FR_ns; + if (PyTypeNum_ISDATETIME(type_num)) { + is_datetimelike = 1; + i8date = *(int64_t *)dataptr; + dateUnit = get_datetime_metadata_from_dtype(dtype).base; + 
} else if (PyDate_Check(item) || PyDelta_Check(item)) { + is_datetimelike = 1; + if (PyObject_HasAttrString(item, "_value")) { + // pd.Timestamp object or pd.NaT + // see test_date_index_and_values for case with non-nano + i8date = get_long_attr(item, "_value"); + } else { + if (PyDelta_Check(item)) { + // TODO(anyone): cast below loses precision if total_seconds return + // value exceeds number of bits that significand can hold + // also liable to overflow + i8date = (int64_t)(total_seconds(item) * + 1000000000LL); // nanoseconds per second + } else { + // datetime.* objects don't follow above rules + i8date = PyDateTimeToEpoch(item, NPY_FR_ns); } + } + } - if (is_datetimelike) { - if (i8date == get_nat()) { - len = 4; - cLabel = PyObject_Malloc(len + 1); - strncpy(cLabel, "null", len + 1); + size_t len; + char *cLabel; + if (is_datetimelike) { + if (i8date == get_nat()) { + len = 4; + cLabel = PyObject_Malloc(len + 1); + strncpy(cLabel, "null", len + 1); + } else { + if (enc->datetimeIso) { + if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { + // TODO(username): non-nano timedelta support? + cLabel = int64ToIsoDuration(i8date, &len); + } else { + if (type_num == NPY_DATETIME) { + cLabel = int64ToIso(i8date, dateUnit, base, &len); } else { - if (enc->datetimeIso) { - if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { - // TODO(username): non-nano timedelta support? - cLabel = int64ToIsoDuration(i8date, &len); - } else { - if (type_num == NPY_DATETIME) { - cLabel = int64ToIso(i8date, dateUnit, base, &len); - } else { - cLabel = PyDateTimeToIso(item, base, &len); - } - } - if (cLabel == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - } else { - int size_of_cLabel = 21; // 21 chars for int 64 - cLabel = PyObject_Malloc(size_of_cLabel); - snprintf(cLabel, size_of_cLabel, "%" NPY_DATETIME_FMT, - NpyDateTimeToEpoch(i8date, base)); - len = strlen(cLabel); - } + cLabel = PyDateTimeToIso(item, base, &len); } - } else { // Fallback to string representation - // Replace item with the string to keep it alive. - Py_SETREF(item, PyObject_Str(item)); - if (item == NULL) { - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - cLabel = (char *)PyUnicode_AsUTF8(item); - len = strlen(cLabel); - } - - // Add 1 to include NULL terminator - ret[i] = PyObject_Malloc(len + 1); - memcpy(ret[i], cLabel, len + 1); - Py_DECREF(item); - - if (is_datetimelike) { - PyObject_Free(cLabel); - } - - if (PyErr_Occurred()) { + } + if (cLabel == NULL) { + Py_DECREF(item); NpyArr_freeLabels(ret, num); ret = 0; break; - } - - if (!ret[i]) { - PyErr_NoMemory(); + } + } else { + int size_of_cLabel = 21; // 21 chars for int 64 + cLabel = PyObject_Malloc(size_of_cLabel); + if (scaleNanosecToUnit(&i8date, base) == -1) { + NpyArr_freeLabels(ret, num); ret = 0; break; - } - - dataptr += stride; - } - - Py_DECREF(labels); - return ret; -} + } + snprintf(cLabel, size_of_cLabel, "%" PRId64, i8date); + len = strlen(cLabel); + } + } + } else { // Fallback to string representation + // Replace item with the string to keep it alive. 
+ Py_SETREF(item, PyObject_Str(item)); + if (item == NULL) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } -void Object_invokeDefaultHandler(PyObject *obj, PyObjectEncoder *enc) { - PyObject *tmpObj = NULL; - tmpObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL); - if (!PyErr_Occurred()) { - if (tmpObj == NULL) { - PyErr_SetString(PyExc_TypeError, - "Failed to execute default handler"); - } else { - encode(tmpObj, (JSONObjectEncoder *)enc, NULL, 0); - } + cLabel = (char *)PyUnicode_AsUTF8(item); + len = strlen(cLabel); } - Py_XDECREF(tmpObj); - return; -} -void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj, *exc, *toDictFunc, *tmpObj, *values; - TypeContext *pc; - PyObjectEncoder *enc; - double val; - npy_int64 value; - int unit; + // Add 1 to include NULL terminator + ret[i] = PyObject_Malloc(len + 1); + memcpy(ret[i], cLabel, len + 1); + Py_DECREF(item); - tc->prv = NULL; - - if (!_obj) { - tc->type = JT_INVALID; - return; + if (is_datetimelike) { + PyObject_Free(cLabel); } - obj = (PyObject *)_obj; - enc = (PyObjectEncoder *)tc->encoder; - - if (PyBool_Check(obj)) { - tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE; - return; - } else if (obj == Py_None) { - tc->type = JT_NULL; - return; + if (PyErr_Occurred()) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; } - pc = createTypeContext(); - if (!pc) { - tc->type = JT_INVALID; - return; + if (!ret[i]) { + PyErr_NoMemory(); + ret = 0; + break; } - tc->prv = pc; - if (PyTypeNum_ISDATETIME(enc->npyType)) { - int64_t longVal; + dataptr += stride; + } - longVal = *(npy_int64 *)enc->npyValue; - if (longVal == get_nat()) { - tc->type = JT_NULL; - } else { - if (enc->datetimeIso) { - if (enc->npyType == NPY_TIMEDELTA) { - pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; - } else { - pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; - } - // Currently no way to pass longVal to iso function, so use - // state management - GET_TC(tc)->longValue = longVal; - tc->type = JT_UTF8; - } else { - NPY_DATETIMEUNIT base = - ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = NpyDateTimeToEpoch(longVal, base); - tc->type = JT_LONG; - } - } + Py_DECREF(labels); + return ret; +} - // TODO(username): this prevents infinite loop with - // mixed-type DataFrames; - // refactor - enc->npyCtxtPassthru = NULL; - enc->npyType = -1; - return; +static void Object_invokeDefaultHandler(PyObject *obj, PyObjectEncoder *enc) { + PyObject *tmpObj = NULL; + tmpObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL); + if (!PyErr_Occurred()) { + if (tmpObj == NULL) { + PyErr_SetString(PyExc_TypeError, "Failed to execute default handler"); + } else { + encode(tmpObj, (JSONObjectEncoder *)enc, NULL, 0); } + } + Py_XDECREF(tmpObj); + return; +} - if (PyIter_Check(obj) || - (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) { - goto ISITERABLE; - } +static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { + tc->prv = NULL; - if (PyLong_Check(obj)) { - tc->type = JT_LONG; - int overflow = 0; - GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow); - int err; - err = (GET_TC(tc)->longValue == -1) && PyErr_Occurred(); - - if (overflow) { - tc->type = JT_BIGNUM; - } else if (err) { - goto INVALID; - } + if (!_obj) { + tc->type = JT_INVALID; + return; + } - return; - } else if (PyFloat_Check(obj)) { - val = PyFloat_AS_DOUBLE(obj); - if (npy_isnan(val) || npy_isinf(val)) { - tc->type = JT_NULL; - } else { - GET_TC(tc)->doubleValue = val; - tc->type = JT_DOUBLE; - } - return; - 
} else if (PyBytes_Check(obj)) { - pc->PyTypeToUTF8 = PyBytesToUTF8; - tc->type = JT_UTF8; - return; - } else if (PyUnicode_Check(obj)) { - pc->PyTypeToUTF8 = PyUnicodeToUTF8; - tc->type = JT_UTF8; - return; - } else if (object_is_decimal_type(obj)) { - GET_TC(tc)->doubleValue = PyFloat_AsDouble(obj); - tc->type = JT_DOUBLE; - return; - } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { - if (object_is_nat_type(obj)) { - tc->type = JT_NULL; - return; - } + PyObject *obj = (PyObject *)_obj; + PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; - if (enc->datetimeIso) { - pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; - tc->type = JT_UTF8; - } else { - NPY_DATETIMEUNIT base = - ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base); - tc->type = JT_LONG; - } - return; - } else if (PyTime_Check(obj)) { - pc->PyTypeToUTF8 = PyTimeToJSON; - tc->type = JT_UTF8; - return; - } else if (PyArray_IsScalar(obj, Datetime)) { - npy_int64 longVal; - if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) { - tc->type = JT_NULL; - return; - } - PyArray_Descr *dtype = PyArray_DescrFromScalar(obj); - if (!PyTypeNum_ISDATETIME(dtype->type_num)) { - PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime"); - return; - } + if (PyBool_Check(obj)) { + tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE; + return; + } else if (obj == Py_None) { + tc->type = JT_NULL; + return; + } - PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64); - PyArray_CastScalarToCtype(obj, &longVal, outcode); - Py_DECREF(outcode); + TypeContext *pc = createTypeContext(); + if (!pc) { + tc->type = JT_INVALID; + return; + } + tc->prv = pc; - if (enc->datetimeIso) { - GET_TC(tc)->longValue = longVal; - pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; - enc->valueUnit = get_datetime_metadata_from_dtype(dtype).base; - tc->type = JT_UTF8; - } else { - NPY_DATETIMEUNIT base = - ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base); - tc->type = JT_LONG; - } - return; - } else if (PyDelta_Check(obj)) { - if (PyObject_HasAttrString(obj, "_value")) { - value = get_long_attr(obj, "_value"); + if (PyTypeNum_ISDATETIME(enc->npyType)) { + int64_t longVal = *(npy_int64 *)enc->npyValue; + if (longVal == get_nat()) { + tc->type = JT_NULL; + } else { + if (enc->datetimeIso) { + if (enc->npyType == NPY_TIMEDELTA) { + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; } else { - value = total_seconds(obj) * 1000000000LL; // nanoseconds per sec + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; } - - if (value == get_nat()) { - tc->type = JT_NULL; - return; - } else if (enc->datetimeIso) { - pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; - tc->type = JT_UTF8; - } else { - unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - if (scaleNanosecToUnit(&value, unit) != 0) { - // TODO(username): Add some kind of error handling here - } - - exc = PyErr_Occurred(); - - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { - goto INVALID; - } - - tc->type = JT_LONG; + // Currently no way to pass longVal to iso function, so use + // state management + pc->longValue = longVal; + tc->type = JT_UTF8; + } else { + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + if (scaleNanosecToUnit(&longVal, base) == -1) { + goto INVALID; } - GET_TC(tc)->longValue = value; - return; - } else if (PyArray_IsScalar(obj, Integer)) { + pc->longValue = longVal; tc->type = JT_LONG; - PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), - 
PyArray_DescrFromType(NPY_INT64)); - - exc = PyErr_Occurred(); - - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { - goto INVALID; - } - - return; - } else if (PyArray_IsScalar(obj, Bool)) { - PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), - PyArray_DescrFromType(NPY_BOOL)); - tc->type = (GET_TC(tc)->longValue) ? JT_TRUE : JT_FALSE; - return; - } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) { - PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->doubleValue), - PyArray_DescrFromType(NPY_DOUBLE)); - tc->type = JT_DOUBLE; - return; - } else if (PyArray_Check(obj) && PyArray_CheckScalar(obj)) { - PyErr_Format(PyExc_TypeError, - "%R (0d array) is not JSON serializable at the moment", - obj); - goto INVALID; - } else if (object_is_na_type(obj)) { - tc->type = JT_NULL; - return; + } } -ISITERABLE: - - if (object_is_index_type(obj)) { - if (enc->outputFormat == SPLIT) { - tc->type = JT_OBJECT; - pc->iterBegin = Index_iterBegin; - pc->iterEnd = Index_iterEnd; - pc->iterNext = Index_iterNext; - pc->iterGetValue = Index_iterGetValue; - pc->iterGetName = Index_iterGetName; - return; - } - - pc->newObj = get_values(obj); - if (pc->newObj) { - tc->type = JT_ARRAY; - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - } else { - goto INVALID; - } + // TODO(username): this prevents infinite loop with + // mixed-type DataFrames; + // refactor + enc->npyCtxtPassthru = NULL; + enc->npyType = -1; + return; + } - return; - } else if (object_is_series_type(obj)) { - if (enc->outputFormat == SPLIT) { - tc->type = JT_OBJECT; - pc->iterBegin = Series_iterBegin; - pc->iterEnd = Series_iterEnd; - pc->iterNext = Series_iterNext; - pc->iterGetValue = Series_iterGetValue; - pc->iterGetName = Series_iterGetName; - return; - } + if (PyIter_Check(obj) || (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) { + goto ISITERABLE; + } + + if (PyLong_Check(obj)) { + tc->type = JT_LONG; + int overflow = 0; + pc->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow); + int err; + err = (pc->longValue == -1) && PyErr_Occurred(); + + if (overflow) { + tc->type = JT_BIGNUM; + } else if (err) { + goto INVALID; + } - pc->newObj = get_values(obj); - if (!pc->newObj) { - goto INVALID; - } + return; + } else if (PyFloat_Check(obj)) { + const double val = PyFloat_AS_DOUBLE(obj); + if (npy_isnan(val) || npy_isinf(val)) { + tc->type = JT_NULL; + } else { + pc->doubleValue = val; + tc->type = JT_DOUBLE; + } + return; + } else if (PyBytes_Check(obj)) { + pc->PyTypeToUTF8 = PyBytesToUTF8; + tc->type = JT_UTF8; + return; + } else if (PyUnicode_Check(obj)) { + pc->PyTypeToUTF8 = PyUnicodeToUTF8; + tc->type = JT_UTF8; + return; + } else if (object_is_decimal_type(obj)) { + pc->doubleValue = PyFloat_AsDouble(obj); + tc->type = JT_DOUBLE; + return; + } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { + if (object_is_nat_type(obj)) { + tc->type = JT_NULL; + return; + } - if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { - tc->type = JT_OBJECT; - tmpObj = PyObject_GetAttrString(obj, "index"); - if (!tmpObj) { - goto INVALID; - } - values = get_values(tmpObj); - Py_DECREF(tmpObj); - if (!values) { - goto INVALID; - } - pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); - if (!pc->columnLabels) { - goto INVALID; - } - } else { - tc->type = JT_ARRAY; - } - 
pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - return; - } else if (PyArray_Check(obj)) { - if (enc->npyCtxtPassthru) { - pc->npyarr = enc->npyCtxtPassthru; - tc->type = (pc->npyarr->columnLabels ? JT_OBJECT : JT_ARRAY); - - pc->iterBegin = NpyArrPassThru_iterBegin; - pc->iterNext = NpyArr_iterNext; - pc->iterEnd = NpyArrPassThru_iterEnd; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; + if (enc->datetimeIso) { + pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; + tc->type = JT_UTF8; + } else { + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + pc->longValue = PyDateTimeToEpoch(obj, base); + tc->type = JT_LONG; + } + return; + } else if (PyTime_Check(obj)) { + pc->PyTypeToUTF8 = PyTimeToJSON; + tc->type = JT_UTF8; + return; + } else if (PyArray_IsScalar(obj, Datetime)) { + npy_int64 longVal; + if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) { + tc->type = JT_NULL; + return; + } + PyArray_Descr *dtype = PyArray_DescrFromScalar(obj); + if (!PyTypeNum_ISDATETIME(dtype->type_num)) { + PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime"); + return; + } + + PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64); + PyArray_CastScalarToCtype(obj, &longVal, outcode); + Py_DECREF(outcode); + + if (enc->datetimeIso) { + GET_TC(tc)->longValue = longVal; + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + enc->valueUnit = get_datetime_metadata_from_dtype(dtype).base; + tc->type = JT_UTF8; + } else { + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + pc->longValue = PyDateTimeToEpoch(obj, base); + tc->type = JT_LONG; + } + return; + } else if (PyDelta_Check(obj)) { + // pd.Timedelta object or pd.NaT should evaluate true here + // fallback to nanoseconds per sec for other objects + // TODO(anyone): cast below loses precision if total_seconds return + // value exceeds number of bits that significand can hold + // also liable to overflow + int64_t value = PyObject_HasAttrString(obj, "_value") + ? get_long_attr(obj, "_value") + : (int64_t)(total_seconds(obj) * 1000000000LL); + + if (value == get_nat()) { + tc->type = JT_NULL; + return; + } else if (enc->datetimeIso) { + pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; + tc->type = JT_UTF8; + } else { + const int unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + if (scaleNanosecToUnit(&value, unit) != 0) { + // TODO(username): Add some kind of error handling here + } - enc->npyCtxtPassthru = NULL; - return; - } + if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_OverflowError)) { + goto INVALID; + } - tc->type = JT_ARRAY; - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetValue = NpyArr_iterGetValue; - pc->iterGetName = NpyArr_iterGetName; - return; - } else if (object_is_dataframe_type(obj)) { - if (enc->blkCtxtPassthru) { - pc->pdblock = enc->blkCtxtPassthru; - tc->type = - (pc->pdblock->npyCtxts[0]->columnLabels ? 
JT_OBJECT : JT_ARRAY); - - pc->iterBegin = PdBlockPassThru_iterBegin; - pc->iterEnd = PdBlockPassThru_iterEnd; - pc->iterNext = PdBlock_iterNextItem; - pc->iterGetName = PdBlock_iterGetName; - pc->iterGetValue = NpyArr_iterGetValue; + tc->type = JT_LONG; + } + pc->longValue = value; + return; + } else if (PyArray_IsScalar(obj, Integer)) { + tc->type = JT_LONG; + PyArray_CastScalarToCtype(obj, &(pc->longValue), + PyArray_DescrFromType(NPY_INT64)); - enc->blkCtxtPassthru = NULL; - return; - } + if (PyErr_Occurred() && PyErr_ExceptionMatches(PyExc_OverflowError)) { + goto INVALID; + } - if (enc->outputFormat == SPLIT) { - tc->type = JT_OBJECT; - pc->iterBegin = DataFrame_iterBegin; - pc->iterEnd = DataFrame_iterEnd; - pc->iterNext = DataFrame_iterNext; - pc->iterGetValue = DataFrame_iterGetValue; - pc->iterGetName = DataFrame_iterGetName; - return; - } + return; + } else if (PyArray_IsScalar(obj, Bool)) { + PyArray_CastScalarToCtype(obj, &(pc->longValue), + PyArray_DescrFromType(NPY_BOOL)); + tc->type = (pc->longValue) ? JT_TRUE : JT_FALSE; + return; + } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) { + PyArray_CastScalarToCtype(obj, &(pc->doubleValue), + PyArray_DescrFromType(NPY_DOUBLE)); + tc->type = JT_DOUBLE; + return; + } else if (PyArray_CheckScalar(obj)) { + PyErr_Format(PyExc_TypeError, + "%R (numpy-scalar) is not JSON serializable at the moment", + obj); + goto INVALID; + } else if (object_is_na_type(obj)) { + tc->type = JT_NULL; + return; + } - if (is_simple_frame(obj)) { - pc->iterBegin = NpyArr_iterBegin; - pc->iterEnd = NpyArr_iterEnd; - pc->iterNext = NpyArr_iterNext; - pc->iterGetName = NpyArr_iterGetName; - - pc->newObj = PyObject_GetAttrString(obj, "values"); - if (!pc->newObj) { - goto INVALID; - } - } else { - pc->iterBegin = PdBlock_iterBegin; - pc->iterEnd = PdBlock_iterEnd; - pc->iterNext = PdBlock_iterNext; - pc->iterGetName = PdBlock_iterGetName; - } - pc->iterGetValue = NpyArr_iterGetValue; +ISITERABLE: - if (enc->outputFormat == VALUES) { - tc->type = JT_ARRAY; - } else if (enc->outputFormat == RECORDS) { - tc->type = JT_ARRAY; - tmpObj = PyObject_GetAttrString(obj, "columns"); - if (!tmpObj) { - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - goto INVALID; - } - pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); - Py_DECREF(tmpObj); - if (!pc->columnLabels) { - goto INVALID; - } - } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { - tc->type = JT_OBJECT; - tmpObj = (enc->outputFormat == INDEX - ? PyObject_GetAttrString(obj, "index") - : PyObject_GetAttrString(obj, "columns")); - if (!tmpObj) { - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - goto INVALID; - } - pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->rowLabelsLen); - Py_DECREF(tmpObj); - tmpObj = (enc->outputFormat == INDEX - ? 
PyObject_GetAttrString(obj, "columns") - : PyObject_GetAttrString(obj, "index")); - if (!tmpObj) { - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - values = get_values(tmpObj); - if (!values) { - Py_DECREF(tmpObj); - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } - pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); - Py_DECREF(tmpObj); - if (!pc->columnLabels) { - NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); - pc->rowLabels = NULL; - goto INVALID; - } + if (object_is_index_type(obj)) { + if (enc->outputFormat == SPLIT) { + tc->type = JT_OBJECT; + pc->iterBegin = Index_iterBegin; + pc->iterEnd = Index_iterEnd; + pc->iterNext = Index_iterNext; + pc->iterGetValue = Index_iterGetValue; + pc->iterGetName = Index_iterGetName; + return; + } + + pc->newObj = get_values(obj); + if (pc->newObj) { + tc->type = JT_ARRAY; + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + } else { + goto INVALID; + } - if (enc->outputFormat == COLUMNS) { - pc->transpose = 1; - } - } else { - goto INVALID; - } - return; - } else if (PyDict_Check(obj)) { - tc->type = JT_OBJECT; - pc->iterBegin = Dict_iterBegin; - pc->iterEnd = Dict_iterEnd; - pc->iterNext = Dict_iterNext; - pc->iterGetValue = Dict_iterGetValue; - pc->iterGetName = Dict_iterGetName; - pc->dictObj = obj; - Py_INCREF(obj); - - return; - } else if (PyList_Check(obj)) { - tc->type = JT_ARRAY; - pc->iterBegin = List_iterBegin; - pc->iterEnd = List_iterEnd; - pc->iterNext = List_iterNext; - pc->iterGetValue = List_iterGetValue; - pc->iterGetName = List_iterGetName; - return; - } else if (PyTuple_Check(obj)) { - tc->type = JT_ARRAY; - pc->iterBegin = Tuple_iterBegin; - pc->iterEnd = Tuple_iterEnd; - pc->iterNext = Tuple_iterNext; - pc->iterGetValue = Tuple_iterGetValue; - pc->iterGetName = Tuple_iterGetName; - return; - } else if (PyAnySet_Check(obj)) { - tc->type = JT_ARRAY; - pc->iterBegin = Set_iterBegin; - pc->iterEnd = Set_iterEnd; - pc->iterNext = Set_iterNext; - pc->iterGetValue = Set_iterGetValue; - pc->iterGetName = Set_iterGetName; - return; - } - - toDictFunc = PyObject_GetAttrString(obj, "toDict"); - - if (toDictFunc) { - PyObject *tuple = PyTuple_New(0); - PyObject *toDictResult = PyObject_Call(toDictFunc, tuple, NULL); - Py_DECREF(tuple); - Py_DECREF(toDictFunc); - - if (toDictResult == NULL) { - PyErr_Clear(); - tc->type = JT_NULL; - return; - } + return; + } else if (object_is_series_type(obj)) { + if (enc->outputFormat == SPLIT) { + tc->type = JT_OBJECT; + pc->iterBegin = Series_iterBegin; + pc->iterEnd = Series_iterEnd; + pc->iterNext = Series_iterNext; + pc->iterGetValue = Series_iterGetValue; + pc->iterGetName = Series_iterGetName; + return; + } + + pc->newObj = get_values(obj); + if (!pc->newObj) { + goto INVALID; + } + + if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { + tc->type = JT_OBJECT; + PyObject *tmpObj = PyObject_GetAttrString(obj, "index"); + if (!tmpObj) { + goto INVALID; + } + PyObject *values = get_values(tmpObj); + Py_DECREF(tmpObj); + if (!values) { + goto INVALID; + } - if (!PyDict_Check(toDictResult)) { - Py_DECREF(toDictResult); - tc->type = JT_NULL; - return; - } + if (!PyArray_Check(pc->newObj)) { + PyErr_SetString(PyExc_TypeError, + "Object_beginTypeContext received a non-array 
object"); + goto INVALID; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)pc->newObj; + pc->columnLabelsLen = PyArray_DIM(arrayobj, 0); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, + pc->columnLabelsLen); + if (!pc->columnLabels) { + goto INVALID; + } + } else { + tc->type = JT_ARRAY; + } + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } else if (PyArray_Check(obj)) { + if (enc->npyCtxtPassthru) { + pc->npyarr = enc->npyCtxtPassthru; + tc->type = (pc->npyarr->columnLabels ? JT_OBJECT : JT_ARRAY); + + pc->iterBegin = NpyArrPassThru_iterBegin; + pc->iterNext = NpyArr_iterNext; + pc->iterEnd = NpyArrPassThru_iterEnd; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + + enc->npyCtxtPassthru = NULL; + return; + } + + tc->type = JT_ARRAY; + pc->iterBegin = NpyArr_iterBegin; + pc->iterEnd = NpyArr_iterEnd; + pc->iterNext = NpyArr_iterNext; + pc->iterGetValue = NpyArr_iterGetValue; + pc->iterGetName = NpyArr_iterGetName; + return; + } else if (object_is_dataframe_type(obj)) { + if (enc->blkCtxtPassthru) { + pc->pdblock = enc->blkCtxtPassthru; + tc->type = + (pc->pdblock->npyCtxts[0]->columnLabels ? JT_OBJECT : JT_ARRAY); + + pc->iterBegin = PdBlockPassThru_iterBegin; + pc->iterEnd = PdBlockPassThru_iterEnd; + pc->iterNext = PdBlock_iterNextItem; + pc->iterGetName = PdBlock_iterGetName; + pc->iterGetValue = NpyArr_iterGetValue; + + enc->blkCtxtPassthru = NULL; + return; + } + + if (enc->outputFormat == SPLIT) { + tc->type = JT_OBJECT; + pc->iterBegin = DataFrame_iterBegin; + pc->iterEnd = DataFrame_iterEnd; + pc->iterNext = DataFrame_iterNext; + pc->iterGetValue = DataFrame_iterGetValue; + pc->iterGetName = DataFrame_iterGetName; + return; + } + + pc->iterBegin = PdBlock_iterBegin; + pc->iterEnd = PdBlock_iterEnd; + pc->iterNext = PdBlock_iterNext; + pc->iterGetName = PdBlock_iterGetName; + pc->iterGetValue = NpyArr_iterGetValue; + + if (enc->outputFormat == VALUES) { + tc->type = JT_ARRAY; + } else if (enc->outputFormat == RECORDS) { + tc->type = JT_ARRAY; + PyObject *tmpObj = PyObject_GetAttrString(obj, "columns"); + if (!tmpObj) { + goto INVALID; + } + PyObject *values = get_values(tmpObj); + if (!values) { + Py_DECREF(tmpObj); + goto INVALID; + } + pc->columnLabelsLen = PyObject_Size(tmpObj); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, + pc->columnLabelsLen); + Py_DECREF(tmpObj); + if (!pc->columnLabels) { + goto INVALID; + } + } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { + tc->type = JT_OBJECT; + PyObject *tmpObj = + (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "index") + : PyObject_GetAttrString(obj, "columns")); + if (!tmpObj) { + goto INVALID; + } + PyObject *values = get_values(tmpObj); + if (!values) { + Py_DECREF(tmpObj); + goto INVALID; + } + pc->rowLabelsLen = PyObject_Size(tmpObj); + pc->rowLabels = + NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->rowLabelsLen); + Py_DECREF(tmpObj); + tmpObj = + (enc->outputFormat == INDEX ? 
PyObject_GetAttrString(obj, "columns") + : PyObject_GetAttrString(obj, "index")); + if (!tmpObj) { + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + values = get_values(tmpObj); + if (!values) { + Py_DECREF(tmpObj); + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } + pc->columnLabelsLen = PyObject_Size(tmpObj); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, + pc->columnLabelsLen); + Py_DECREF(tmpObj); + if (!pc->columnLabels) { + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } - tc->type = JT_OBJECT; - pc->iterBegin = Dict_iterBegin; - pc->iterEnd = Dict_iterEnd; - pc->iterNext = Dict_iterNext; - pc->iterGetValue = Dict_iterGetValue; - pc->iterGetName = Dict_iterGetName; - pc->dictObj = toDictResult; - return; + if (enc->outputFormat == COLUMNS) { + pc->transpose = 1; + } + } else { + goto INVALID; } + return; + } else if (PyDict_Check(obj)) { + tc->type = JT_OBJECT; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = obj; + Py_INCREF(obj); - PyErr_Clear(); + return; + } else if (PyList_Check(obj)) { + tc->type = JT_ARRAY; + pc->iterBegin = List_iterBegin; + pc->iterEnd = List_iterEnd; + pc->iterNext = List_iterNext; + pc->iterGetValue = List_iterGetValue; + pc->iterGetName = List_iterGetName; + return; + } else if (PyTuple_Check(obj)) { + tc->type = JT_ARRAY; + pc->iterBegin = Tuple_iterBegin; + pc->iterEnd = Tuple_iterEnd; + pc->iterNext = Tuple_iterNext; + pc->iterGetValue = Tuple_iterGetValue; + pc->iterGetName = Tuple_iterGetName; + return; + } else if (PyAnySet_Check(obj)) { + tc->type = JT_ARRAY; + pc->iterBegin = Set_iterBegin; + pc->iterEnd = Set_iterEnd; + pc->iterNext = Set_iterNext; + pc->iterGetValue = Set_iterGetValue; + pc->iterGetName = Set_iterGetName; + return; + } - if (enc->defaultHandler) { - Object_invokeDefaultHandler(obj, enc); - goto INVALID; + PyObject *toDictFunc = PyObject_GetAttrString(obj, "toDict"); + + if (toDictFunc) { + PyObject *tuple = PyTuple_New(0); + PyObject *toDictResult = PyObject_Call(toDictFunc, tuple, NULL); + Py_DECREF(tuple); + Py_DECREF(toDictFunc); + + if (toDictResult == NULL) { + PyErr_Clear(); + tc->type = JT_NULL; + return; + } + + if (!PyDict_Check(toDictResult)) { + Py_DECREF(toDictResult); + tc->type = JT_NULL; + return; } tc->type = JT_OBJECT; - pc->iterBegin = Dir_iterBegin; - pc->iterEnd = Dir_iterEnd; - pc->iterNext = Dir_iterNext; - pc->iterGetValue = Dir_iterGetValue; - pc->iterGetName = Dir_iterGetName; + pc->iterBegin = Dict_iterBegin; + pc->iterEnd = Dict_iterEnd; + pc->iterNext = Dict_iterNext; + pc->iterGetValue = Dict_iterGetValue; + pc->iterGetName = Dict_iterGetName; + pc->dictObj = toDictResult; return; + } + + PyErr_Clear(); + + if (enc->defaultHandler) { + Object_invokeDefaultHandler(obj, enc); + goto INVALID; + } + + tc->type = JT_OBJECT; + pc->iterBegin = Dir_iterBegin; + pc->iterEnd = Dir_iterEnd; + pc->iterNext = Dir_iterNext; + pc->iterGetValue = Dir_iterGetValue; + pc->iterGetName = Dir_iterGetName; + return; INVALID: - tc->type = JT_INVALID; + tc->type = JT_INVALID; + PyObject_Free(tc->prv); + tc->prv = NULL; + return; +} + +static void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + if (tc->prv) { + Py_XDECREF(GET_TC(tc)->newObj); + GET_TC(tc)->newObj = NULL; + 
NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); + GET_TC(tc)->rowLabels = NULL; + NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); + GET_TC(tc)->columnLabels = NULL; + PyObject_Free(GET_TC(tc)->cStr); + GET_TC(tc)->cStr = NULL; PyObject_Free(tc->prv); tc->prv = NULL; - return; + } } -void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - if (tc->prv) { - Py_XDECREF(GET_TC(tc)->newObj); - GET_TC(tc)->newObj = NULL; - NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); - GET_TC(tc)->rowLabels = NULL; - NpyArr_freeLabels(GET_TC(tc)->columnLabels, - GET_TC(tc)->columnLabelsLen); - GET_TC(tc)->columnLabels = NULL; - PyObject_Free(GET_TC(tc)->cStr); - GET_TC(tc)->cStr = NULL; - PyObject_Free(tc->prv); - tc->prv = NULL; - } +static const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, + size_t *_outLen) { + return GET_TC(tc)->PyTypeToUTF8(obj, tc, _outLen); } -const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, - size_t *_outLen) { - return GET_TC(tc)->PyTypeToUTF8(obj, tc, _outLen); -} - -JSINT64 Object_getLongValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->longValue; -} - -double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - return GET_TC(tc)->doubleValue; -} - -const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, - size_t *_outLen) { - PyObject *repr = PyObject_Str(obj); - const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *)_outLen); - char *bytes = PyObject_Malloc(*_outLen + 1); - memcpy(bytes, str, *_outLen + 1); - GET_TC(tc)->cStr = bytes; +static JSINT64 Object_getLongValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->longValue; +} - Py_DECREF(repr); +static double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->doubleValue; +} - return GET_TC(tc)->cStr; +static const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, + size_t *_outLen) { + PyObject *repr = PyObject_Str(obj); + const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *)_outLen); + char *bytes = PyObject_Malloc(*_outLen + 1); + memcpy(bytes, str, *_outLen + 1); + GET_TC(tc)->cStr = bytes; + + Py_DECREF(repr); + + return GET_TC(tc)->cStr; } static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); } -void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->iterBegin(obj, tc); +static void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->iterBegin(obj, tc); } -int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) { - return GET_TC(tc)->iterNext(obj, tc); +static int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) { + return GET_TC(tc)->iterNext(obj, tc); } -void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - GET_TC(tc)->iterEnd(obj, tc); +static void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) { + GET_TC(tc)->iterEnd(obj, tc); } -JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { - return GET_TC(tc)->iterGetValue(obj, tc); +static JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { + return GET_TC(tc)->iterGetValue(obj, tc); } -char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { - return GET_TC(tc)->iterGetName(obj, tc, outLen); +static char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, + size_t *outLen) { + return GET_TC(tc)->iterGetName(obj, tc, outLen); } PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, PyObject *kwargs) { - PyDateTime_IMPORT; - if 
(PyDateTimeAPI == NULL) { - return NULL; - } - - PandasDateTime_IMPORT; - if (PandasDateTimeAPI == NULL) { - return NULL; - } - - static char *kwlist[] = {"obj", - "ensure_ascii", - "double_precision", - "encode_html_chars", - "orient", - "date_unit", - "iso_dates", - "default_handler", - "indent", - NULL}; - - char buffer[65536]; - char *ret; - PyObject *newobj; - PyObject *oinput = NULL; - PyObject *oensureAscii = NULL; - int idoublePrecision = 10; // default double precision setting - PyObject *oencodeHTMLChars = NULL; - char *sOrient = NULL; - char *sdateFormat = NULL; - PyObject *oisoDates = 0; - PyObject *odefHandler = 0; - int indent = 0; - - PyObjectEncoder pyEncoder = {{ - Object_beginTypeContext, - Object_endTypeContext, - Object_getStringValue, - Object_getLongValue, - NULL, // getIntValue is unused - Object_getDoubleValue, - Object_getBigNumStringValue, - Object_iterBegin, - Object_iterNext, - Object_iterEnd, - Object_iterGetValue, - Object_iterGetName, - Object_releaseObject, - PyObject_Malloc, - PyObject_Realloc, - PyObject_Free, - -1, // recursionMax - idoublePrecision, - 1, // forceAscii - 0, // encodeHTMLChars - 0, // indent - }}; - JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; - - pyEncoder.npyCtxtPassthru = NULL; - pyEncoder.blkCtxtPassthru = NULL; - pyEncoder.npyType = -1; - pyEncoder.npyValue = NULL; - pyEncoder.datetimeIso = 0; - pyEncoder.datetimeUnit = NPY_FR_ms; - pyEncoder.outputFormat = COLUMNS; - pyEncoder.defaultHandler = 0; - - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOOi", kwlist, - &oinput, &oensureAscii, &idoublePrecision, - &oencodeHTMLChars, &sOrient, &sdateFormat, - &oisoDates, &odefHandler, &indent)) { - return NULL; - } - - if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) { - encoder->forceASCII = 0; - } - - if (oencodeHTMLChars != NULL && PyObject_IsTrue(oencodeHTMLChars)) { - encoder->encodeHTMLChars = 1; - } - - if (idoublePrecision > JSON_DOUBLE_MAX_DECIMALS || idoublePrecision < 0) { - PyErr_Format( - PyExc_ValueError, - "Invalid value '%d' for option 'double_precision', max is '%u'", - idoublePrecision, JSON_DOUBLE_MAX_DECIMALS); - return NULL; - } - encoder->doublePrecision = idoublePrecision; - - if (sOrient != NULL) { - if (strcmp(sOrient, "records") == 0) { - pyEncoder.outputFormat = RECORDS; - } else if (strcmp(sOrient, "index") == 0) { - pyEncoder.outputFormat = INDEX; - } else if (strcmp(sOrient, "split") == 0) { - pyEncoder.outputFormat = SPLIT; - } else if (strcmp(sOrient, "values") == 0) { - pyEncoder.outputFormat = VALUES; - } else if (strcmp(sOrient, "columns") != 0) { - PyErr_Format(PyExc_ValueError, - "Invalid value '%s' for option 'orient'", sOrient); - return NULL; - } - } + PyDateTime_IMPORT; + if (PyDateTimeAPI == NULL) { + return NULL; + } - if (sdateFormat != NULL) { - if (strcmp(sdateFormat, "s") == 0) { - pyEncoder.datetimeUnit = NPY_FR_s; - } else if (strcmp(sdateFormat, "ms") == 0) { - pyEncoder.datetimeUnit = NPY_FR_ms; - } else if (strcmp(sdateFormat, "us") == 0) { - pyEncoder.datetimeUnit = NPY_FR_us; - } else if (strcmp(sdateFormat, "ns") == 0) { - pyEncoder.datetimeUnit = NPY_FR_ns; - } else { - PyErr_Format(PyExc_ValueError, - "Invalid value '%s' for option 'date_unit'", - sdateFormat); - return NULL; - } - } + PandasDateTime_IMPORT; + if (PandasDateTimeAPI == NULL) { + return NULL; + } - if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) { - pyEncoder.datetimeIso = 1; - } + static char *kwlist[] = {"obj", + "ensure_ascii", + "double_precision", + "encode_html_chars", + 
"orient", + "date_unit", + "iso_dates", + "default_handler", + "indent", + NULL}; + + PyObject *oinput = NULL; + PyObject *oensureAscii = NULL; + int idoublePrecision = 10; // default double precision setting + PyObject *oencodeHTMLChars = NULL; + char *sOrient = NULL; + char *sdateFormat = NULL; + PyObject *oisoDates = 0; + PyObject *odefHandler = 0; + int indent = 0; + + PyObjectEncoder pyEncoder = { + { + .beginTypeContext = Object_beginTypeContext, + .endTypeContext = Object_endTypeContext, + .getStringValue = Object_getStringValue, + .getLongValue = Object_getLongValue, + .getIntValue = NULL, + .getDoubleValue = Object_getDoubleValue, + .getBigNumStringValue = Object_getBigNumStringValue, + .iterBegin = Object_iterBegin, + .iterNext = Object_iterNext, + .iterEnd = Object_iterEnd, + .iterGetValue = Object_iterGetValue, + .iterGetName = Object_iterGetName, + .releaseObject = Object_releaseObject, + .malloc = PyObject_Malloc, + .realloc = PyObject_Realloc, + .free = PyObject_Free, + .recursionMax = -1, + .doublePrecision = idoublePrecision, + .forceASCII = 1, + .encodeHTMLChars = 0, + .indent = indent, + .errorMsg = NULL, + }, + .npyCtxtPassthru = NULL, + .blkCtxtPassthru = NULL, + .npyType = -1, + .npyValue = NULL, + .datetimeIso = 0, + .datetimeUnit = NPY_FR_ms, + .outputFormat = COLUMNS, + .defaultHandler = NULL, + }; + JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOOi", kwlist, &oinput, + &oensureAscii, &idoublePrecision, + &oencodeHTMLChars, &sOrient, &sdateFormat, + &oisoDates, &odefHandler, &indent)) { + return NULL; + } - if (odefHandler != NULL && odefHandler != Py_None) { - if (!PyCallable_Check(odefHandler)) { - PyErr_SetString(PyExc_TypeError, "Default handler is not callable"); - return NULL; - } - pyEncoder.defaultHandler = odefHandler; + if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) { + encoder->forceASCII = 0; + } + + if (oencodeHTMLChars != NULL && PyObject_IsTrue(oencodeHTMLChars)) { + encoder->encodeHTMLChars = 1; + } + + if (idoublePrecision > JSON_DOUBLE_MAX_DECIMALS || idoublePrecision < 0) { + PyErr_Format( + PyExc_ValueError, + "Invalid value '%d' for option 'double_precision', max is '%u'", + idoublePrecision, JSON_DOUBLE_MAX_DECIMALS); + return NULL; + } + encoder->doublePrecision = idoublePrecision; + + if (sOrient != NULL) { + if (strcmp(sOrient, "records") == 0) { + pyEncoder.outputFormat = RECORDS; + } else if (strcmp(sOrient, "index") == 0) { + pyEncoder.outputFormat = INDEX; + } else if (strcmp(sOrient, "split") == 0) { + pyEncoder.outputFormat = SPLIT; + } else if (strcmp(sOrient, "values") == 0) { + pyEncoder.outputFormat = VALUES; + } else if (strcmp(sOrient, "columns") != 0) { + PyErr_Format(PyExc_ValueError, "Invalid value '%s' for option 'orient'", + sOrient); + return NULL; + } + } + + if (sdateFormat != NULL) { + if (strcmp(sdateFormat, "s") == 0) { + pyEncoder.datetimeUnit = NPY_FR_s; + } else if (strcmp(sdateFormat, "ms") == 0) { + pyEncoder.datetimeUnit = NPY_FR_ms; + } else if (strcmp(sdateFormat, "us") == 0) { + pyEncoder.datetimeUnit = NPY_FR_us; + } else if (strcmp(sdateFormat, "ns") == 0) { + pyEncoder.datetimeUnit = NPY_FR_ns; + } else { + PyErr_Format(PyExc_ValueError, + "Invalid value '%s' for option 'date_unit'", sdateFormat); + return NULL; } + } - encoder->indent = indent; + if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) { + pyEncoder.datetimeIso = 1; + } - pyEncoder.originalOutputFormat = pyEncoder.outputFormat; - ret = 
JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer)); - if (PyErr_Occurred()) { - return NULL; + if (odefHandler != NULL && odefHandler != Py_None) { + if (!PyCallable_Check(odefHandler)) { + PyErr_SetString(PyExc_TypeError, "Default handler is not callable"); + return NULL; } + pyEncoder.defaultHandler = odefHandler; + } - if (encoder->errorMsg) { - if (ret != buffer) { - encoder->free(ret); - } - PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg); - return NULL; - } + encoder->indent = indent; + + pyEncoder.originalOutputFormat = pyEncoder.outputFormat; - newobj = PyUnicode_FromString(ret); + char buffer[65536]; + char *ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer)); + if (PyErr_Occurred()) { + return NULL; + } + if (encoder->errorMsg) { if (ret != buffer) { - encoder->free(ret); + encoder->free(ret); } + PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg); + return NULL; + } + + PyObject *newobj = PyUnicode_FromString(ret); + + if (ret != buffer) { + encoder->free(ret); + } - return newobj; + return newobj; } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/src/vendored/ujson/python/ujson.c pandas-2.2.2+dfsg/pandas/_libs/src/vendored/ujson/python/ujson.c --- pandas-2.1.4+dfsg/pandas/_libs/src/vendored/ujson/python/ujson.c 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/src/vendored/ujson/python/ujson.c 2024-04-10 17:42:52.000000000 +0000 @@ -16,18 +16,19 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. Numeric decoder derived from TCL library https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms @@ -35,7 +36,8 @@ * Copyright (c) 1994 Sun Microsystems, Inc. 
*/ -#include "pandas/vendored/ujson/python/version.h" +// Licence at LICENSES/ULTRAJSON_LICENSE + #define PY_SSIZE_T_CLEAN #include #define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY @@ -48,27 +50,29 @@ /* JSONToObj */ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs); -#define ENCODER_HELP_TEXT \ - "Use ensure_ascii=false to output UTF-8. Pass in double_precision to " \ - "alter the maximum digit precision of doubles. Set " \ - "encode_html_chars=True to encode < > & as unicode escape sequences." +#define ENCODER_HELP_TEXT \ + "Use ensure_ascii=false to output UTF-8. Pass in double_precision to " \ + "alter the maximum digit precision of doubles. Set " \ + "encode_html_chars=True to encode < > & as unicode escape sequences." static PyMethodDef ujsonMethods[] = { - {"ujson_dumps", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS, + {"ujson_dumps", (PyCFunction)(void (*)(void))objToJSON, + METH_VARARGS | METH_KEYWORDS, "Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT}, - {"ujson_loads", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS, + {"ujson_loads", (PyCFunction)(void (*)(void))JSONToObj, + METH_VARARGS | METH_KEYWORDS, "Converts JSON as string to dict object structure. Use precise_float=True " "to use high precision float decoder."}, {NULL, NULL, 0, NULL} /* Sentinel */ }; typedef struct { - PyObject *type_decimal; - PyObject *type_dataframe; - PyObject *type_series; - PyObject *type_index; - PyObject *type_nat; - PyObject *type_na; + PyObject *type_decimal; + PyObject *type_dataframe; + PyObject *type_series; + PyObject *type_index; + PyObject *type_nat; + PyObject *type_na; } modulestate; #define modulestate(o) ((modulestate *)PyModule_GetState(o)) @@ -88,359 +92,356 @@ #ifndef PYPY_VERSION /* Used in objToJSON.c */ int object_is_decimal_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_decimal = state->type_decimal; - if (type_decimal == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_decimal); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_decimal = state->type_decimal; + if (type_decimal == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_decimal); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; } int object_is_dataframe_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_dataframe = state->type_dataframe; - if (type_dataframe == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_dataframe); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_dataframe = state->type_dataframe; + if (type_dataframe == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_dataframe); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; } int object_is_series_type(PyObject *obj) { 
- PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_series = state->type_series; - if (type_series == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_series); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_series = state->type_series; + if (type_series == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_series); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; } int object_is_index_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_index = state->type_index; - if (type_index == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_index); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_index = state->type_index; + if (type_index == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_index); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; } int object_is_nat_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_nat = state->type_nat; - if (type_nat == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_nat); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_nat = state->type_nat; + if (type_nat == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_nat); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; } int object_is_na_type(PyObject *obj) { - PyObject *module = PyState_FindModule(&moduledef); - if (module == NULL) - return 0; - modulestate *state = modulestate(module); - if (state == NULL) - return 0; - PyObject *type_na = state->type_na; - if (type_na == NULL) { - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_na); - if (result == -1) { - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyState_FindModule(&moduledef); + if (module == NULL) + return 0; + modulestate *state = modulestate(module); + if (state == NULL) + return 0; + PyObject *type_na = state->type_na; + if (type_na == NULL) { + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_na); + if (result == -1) { + PyErr_Clear(); + return 0; + } + return result; } #else - /* Used in objToJSON.c */ +/* Used in objToJSON.c */ int object_is_decimal_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("decimal"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_decimal = PyObject_GetAttrString(module, "Decimal"); - if (type_decimal 
== NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_decimal); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_decimal); - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyImport_ImportModule("decimal"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_decimal = PyObject_GetAttrString(module, "Decimal"); + if (type_decimal == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_decimal); + if (result == -1) { + Py_DECREF(module); + Py_DECREF(type_decimal); + PyErr_Clear(); + return 0; + } + return result; } int object_is_dataframe_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_dataframe = PyObject_GetAttrString(module, "DataFrame"); - if (type_dataframe == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_dataframe); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_dataframe); - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyImport_ImportModule("pandas"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_dataframe = PyObject_GetAttrString(module, "DataFrame"); + if (type_dataframe == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_dataframe); + if (result == -1) { + Py_DECREF(module); + Py_DECREF(type_dataframe); + PyErr_Clear(); + return 0; + } + return result; } int object_is_series_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_series = PyObject_GetAttrString(module, "Series"); - if (type_series == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_series); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_series); - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyImport_ImportModule("pandas"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_series = PyObject_GetAttrString(module, "Series"); + if (type_series == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_series); + if (result == -1) { + Py_DECREF(module); + Py_DECREF(type_series); + PyErr_Clear(); + return 0; + } + return result; } int object_is_index_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_index = PyObject_GetAttrString(module, "Index"); - if (type_index == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_index); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_index); - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyImport_ImportModule("pandas"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_index = PyObject_GetAttrString(module, "Index"); + if (type_index == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_index); + if (result == -1) { + Py_DECREF(module); + Py_DECREF(type_index); + PyErr_Clear(); + return 0; + } + return result; } int object_is_nat_type(PyObject *obj) { - PyObject *module = 
PyImport_ImportModule("pandas._libs.tslibs.nattype"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_nat = PyObject_GetAttrString(module, "NaTType"); - if (type_nat == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_nat); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_nat); - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyImport_ImportModule("pandas._libs.tslibs.nattype"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_nat = PyObject_GetAttrString(module, "NaTType"); + if (type_nat == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_nat); + if (result == -1) { + Py_DECREF(module); + Py_DECREF(type_nat); + PyErr_Clear(); + return 0; + } + return result; } int object_is_na_type(PyObject *obj) { - PyObject *module = PyImport_ImportModule("pandas._libs.missing"); - if (module == NULL) { - PyErr_Clear(); - return 0; - } - PyObject *type_na = PyObject_GetAttrString(module, "NAType"); - if (type_na == NULL) { - Py_DECREF(module); - PyErr_Clear(); - return 0; - } - int result = PyObject_IsInstance(obj, type_na); - if (result == -1) { - Py_DECREF(module); - Py_DECREF(type_na); - PyErr_Clear(); - return 0; - } - return result; + PyObject *module = PyImport_ImportModule("pandas._libs.missing"); + if (module == NULL) { + PyErr_Clear(); + return 0; + } + PyObject *type_na = PyObject_GetAttrString(module, "NAType"); + if (type_na == NULL) { + Py_DECREF(module); + PyErr_Clear(); + return 0; + } + int result = PyObject_IsInstance(obj, type_na); + if (result == -1) { + Py_DECREF(module); + Py_DECREF(type_na); + PyErr_Clear(); + return 0; + } + return result; } #endif static int module_traverse(PyObject *m, visitproc visit, void *arg) { - Py_VISIT(modulestate(m)->type_decimal); - Py_VISIT(modulestate(m)->type_dataframe); - Py_VISIT(modulestate(m)->type_series); - Py_VISIT(modulestate(m)->type_index); - Py_VISIT(modulestate(m)->type_nat); - Py_VISIT(modulestate(m)->type_na); - return 0; + Py_VISIT(modulestate(m)->type_decimal); + Py_VISIT(modulestate(m)->type_dataframe); + Py_VISIT(modulestate(m)->type_series); + Py_VISIT(modulestate(m)->type_index); + Py_VISIT(modulestate(m)->type_nat); + Py_VISIT(modulestate(m)->type_na); + return 0; } static int module_clear(PyObject *m) { - Py_CLEAR(modulestate(m)->type_decimal); - Py_CLEAR(modulestate(m)->type_dataframe); - Py_CLEAR(modulestate(m)->type_series); - Py_CLEAR(modulestate(m)->type_index); - Py_CLEAR(modulestate(m)->type_nat); - Py_CLEAR(modulestate(m)->type_na); - return 0; + Py_CLEAR(modulestate(m)->type_decimal); + Py_CLEAR(modulestate(m)->type_dataframe); + Py_CLEAR(modulestate(m)->type_series); + Py_CLEAR(modulestate(m)->type_index); + Py_CLEAR(modulestate(m)->type_nat); + Py_CLEAR(modulestate(m)->type_na); + return 0; } static void module_free(void *module) { module_clear((PyObject *)module); } PyMODINIT_FUNC PyInit_json(void) { - import_array() - PyObject *module; + import_array() PyObject *module; #ifndef PYPY_VERSION - // This function is not supported in PyPy. - if ((module = PyState_FindModule(&moduledef)) != NULL) { - Py_INCREF(module); - return module; - } + // This function is not supported in PyPy. 
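The object_is_*_type helpers above (both the module-state variants and the import-per-call fallbacks used on PyPy) share one pattern: look the type up lazily, run an isinstance check, and treat any failure as "not that type". A rough Python rendering of that control flow, shown only as a sketch and not as pandas API, using the NAType case:

def object_is_na_type(obj) -> bool:
    # Mirror of the C fallback: import lazily and swallow failures,
    # so a missing module simply means "this is not an NA value".
    try:
        from pandas._libs.missing import NAType  # same module the C code imports
    except ImportError:
        return False
    return isinstance(obj, NAType)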
+ if ((module = PyState_FindModule(&moduledef)) != NULL) { + Py_INCREF(module); + return module; + } #endif - module = PyModule_Create(&moduledef); - if (module == NULL) { - return NULL; - } + module = PyModule_Create(&moduledef); + if (module == NULL) { + return NULL; + } #ifndef PYPY_VERSION - PyObject *mod_decimal = PyImport_ImportModule("decimal"); - if (mod_decimal) { - PyObject *type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal"); - assert(type_decimal != NULL); - modulestate(module)->type_decimal = type_decimal; - Py_DECREF(mod_decimal); - } - - PyObject *mod_pandas = PyImport_ImportModule("pandas"); - if (mod_pandas) { - PyObject *type_dataframe = - PyObject_GetAttrString(mod_pandas, "DataFrame"); - assert(type_dataframe != NULL); - modulestate(module)->type_dataframe = type_dataframe; - - PyObject *type_series = PyObject_GetAttrString(mod_pandas, "Series"); - assert(type_series != NULL); - modulestate(module)->type_series = type_series; - - PyObject *type_index = PyObject_GetAttrString(mod_pandas, "Index"); - assert(type_index != NULL); - modulestate(module)->type_index = type_index; - - Py_DECREF(mod_pandas); - } - - PyObject *mod_nattype = - PyImport_ImportModule("pandas._libs.tslibs.nattype"); - if (mod_nattype) { - PyObject *type_nat = PyObject_GetAttrString(mod_nattype, "NaTType"); - assert(type_nat != NULL); - modulestate(module)->type_nat = type_nat; - - Py_DECREF(mod_nattype); - } - - PyObject *mod_natype = PyImport_ImportModule("pandas._libs.missing"); - if (mod_natype) { - PyObject *type_na = PyObject_GetAttrString(mod_natype, "NAType"); - assert(type_na != NULL); - modulestate(module)->type_na = type_na; - - Py_DECREF(mod_natype); - } else { - PyErr_Clear(); - } + PyObject *mod_decimal = PyImport_ImportModule("decimal"); + if (mod_decimal) { + PyObject *type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal"); + assert(type_decimal != NULL); + modulestate(module)->type_decimal = type_decimal; + Py_DECREF(mod_decimal); + } + + PyObject *mod_pandas = PyImport_ImportModule("pandas"); + if (mod_pandas) { + PyObject *type_dataframe = PyObject_GetAttrString(mod_pandas, "DataFrame"); + assert(type_dataframe != NULL); + modulestate(module)->type_dataframe = type_dataframe; + + PyObject *type_series = PyObject_GetAttrString(mod_pandas, "Series"); + assert(type_series != NULL); + modulestate(module)->type_series = type_series; + + PyObject *type_index = PyObject_GetAttrString(mod_pandas, "Index"); + assert(type_index != NULL); + modulestate(module)->type_index = type_index; + + Py_DECREF(mod_pandas); + } + + PyObject *mod_nattype = PyImport_ImportModule("pandas._libs.tslibs.nattype"); + if (mod_nattype) { + PyObject *type_nat = PyObject_GetAttrString(mod_nattype, "NaTType"); + assert(type_nat != NULL); + modulestate(module)->type_nat = type_nat; + + Py_DECREF(mod_nattype); + } + + PyObject *mod_natype = PyImport_ImportModule("pandas._libs.missing"); + if (mod_natype) { + PyObject *type_na = PyObject_GetAttrString(mod_natype, "NAType"); + assert(type_na != NULL); + modulestate(module)->type_na = type_na; + + Py_DECREF(mod_natype); + } else { + PyErr_Clear(); + } #endif - /* Not vendored for now - JSONDecodeError = PyErr_NewException("ujson.JSONDecodeError", - PyExc_ValueError, NULL); Py_XINCREF(JSONDecodeError); if - (PyModule_AddObject(module, "JSONDecodeError", JSONDecodeError) < 0) - { - Py_XDECREF(JSONDecodeError); - Py_CLEAR(JSONDecodeError); - Py_DECREF(module); - return NULL; - } - */ + /* Not vendored for now + JSONDecodeError = 
PyErr_NewException("ujson.JSONDecodeError", + PyExc_ValueError, NULL); Py_XINCREF(JSONDecodeError); if + (PyModule_AddObject(module, "JSONDecodeError", JSONDecodeError) < 0) + { + Py_XDECREF(JSONDecodeError); + Py_CLEAR(JSONDecodeError); + Py_DECREF(module); + return NULL; + } + */ - return module; + return module; } diff -Nru pandas-2.1.4+dfsg/pandas/_libs/testing.pyx pandas-2.2.2+dfsg/pandas/_libs/testing.pyx --- pandas-2.1.4+dfsg/pandas/_libs/testing.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/testing.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -78,7 +78,7 @@ robj : str, default None Specify right object name being compared, internally used to show appropriate assertion message. - index_values : ndarray, default None + index_values : Index | ndarray, default None Specify shared index values of objects being compared, internally used to show appropriate assertion message. diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslib.pyi pandas-2.2.2+dfsg/pandas/_libs/tslib.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslib.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslib.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -23,10 +23,15 @@ dayfirst: bool = ..., yearfirst: bool = ..., utc: bool = ..., + creso: int = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... # returned ndarray may be object dtype or datetime64[ns] def array_to_datetime_with_tz( - values: npt.NDArray[np.object_], tz: tzinfo + values: npt.NDArray[np.object_], + tz: tzinfo, + dayfirst: bool, + yearfirst: bool, + creso: int, ) -> npt.NDArray[np.int64]: ... diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslib.pyx pandas-2.2.2+dfsg/pandas/_libs/tslib.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslib.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslib.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -30,10 +30,14 @@ cnp.import_array() +from pandas._libs.tslibs.dtypes cimport ( + get_supported_reso, + npy_unit_to_abbrev, +) from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, - check_dts_bounds, + get_datetime64_unit, import_pandas_datetime, npy_datetimestruct, npy_datetimestruct_to_datetime, @@ -45,9 +49,11 @@ import_pandas_datetime() -from pandas._libs.tslibs.strptime cimport parse_today_now +from pandas._libs.tslibs.strptime cimport ( + DatetimeParseState, + parse_today_now, +) from pandas._libs.util cimport ( - is_datetime64_object, is_float_object, is_integer_object, ) @@ -58,16 +64,18 @@ _TSObject, cast_from_unit, convert_str_to_tsobject, - convert_timezone, + convert_to_tsobject, get_datetime64_nanos, parse_pydatetime, ) +from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_NaT as NaT, c_nat_strings as nat_strings, ) from pandas._libs.tslibs.timestamps cimport _Timestamp +from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.tslibs import ( Resolution, @@ -94,8 +102,10 @@ obj = _TSObject() string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True) - obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts) - check_dts_bounds(&obj.dts) + try: + obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts) + except OverflowError as err: + raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp: {ts}") from err if out_local == 1: obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) obj.value = tz_localize_to_utc_single(obj.value, obj.tzinfo) @@ -267,7 +277,7 @@ bint is_raise = errors == "raise" ndarray[int64_t] iresult tzinfo tz = None - float 
fval + double fval assert is_ignore or is_coerce or is_raise @@ -275,6 +285,7 @@ result, tz = array_to_datetime( values.astype(object, copy=False), errors=errors, + creso=NPY_FR_ns, ) return result, tz @@ -327,7 +338,7 @@ f"unit='{unit}' not valid with non-numerical val='{val}'" ) - except (ValueError, OutOfBoundsDatetime, TypeError) as err: + except (ValueError, TypeError) as err: if is_raise: err.args = (f"{err}, at position {i}",) raise @@ -406,6 +417,7 @@ bint dayfirst=False, bint yearfirst=False, bint utc=False, + NPY_DATETIMEUNIT creso=NPY_FR_ns, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -423,25 +435,27 @@ Parameters ---------- values : ndarray of object - date-like objects to convert + date-like objects to convert errors : str, default 'raise' - error behavior when parsing + error behavior when parsing dayfirst : bool, default False - dayfirst parsing behavior when encountering datetime strings + dayfirst parsing behavior when encountering datetime strings yearfirst : bool, default False - yearfirst parsing behavior when encountering datetime strings + yearfirst parsing behavior when encountering datetime strings utc : bool, default False - indicator whether the dates should be UTC + indicator whether the dates should be UTC + creso : NPY_DATETIMEUNIT, default NPY_FR_ns + Set to NPY_FR_GENERIC to infer a resolution. Returns ------- np.ndarray - May be datetime64[ns] or object dtype + May be datetime64[creso_unit] or object dtype tzinfo or None """ cdef: Py_ssize_t i, n = values.size - object val, tz + object val ndarray[int64_t] iresult npy_datetimestruct dts bint utc_convert = bool(utc) @@ -450,48 +464,61 @@ bint is_ignore = errors == "ignore" bint is_coerce = errors == "coerce" bint is_same_offsets - _TSObject _ts + _TSObject tsobj float tz_offset set out_tzoffset_vals = set() - tzinfo tz_out = None - bint found_tz = False, found_naive = False - cnp.broadcast mi + tzinfo tz, tz_out = None + cnp.flatiter it = cnp.PyArray_IterNew(values) + NPY_DATETIMEUNIT item_reso + bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC + DatetimeParseState state = DatetimeParseState(creso) + str abbrev # specify error conditions assert is_raise or is_ignore or is_coerce - result = np.empty((values).shape, dtype="M8[ns]") - mi = cnp.PyArray_MultiIterNew2(result, values) + if infer_reso: + abbrev = "ns" + else: + abbrev = npy_unit_to_abbrev(creso) + result = np.empty((values).shape, dtype=f"M8[{abbrev}]") iresult = result.view("i8").ravel() for i in range(n): # Analogous to `val = values[i]` - val = (cnp.PyArray_MultiIter_DATA(mi, 1))[0] + val = cnp.PyArray_GETITEM(values, cnp.PyArray_ITER_DATA(it)) + cnp.PyArray_ITER_NEXT(it) try: if checknull_with_nat_and_na(val): iresult[i] = NPY_NAT elif PyDateTime_Check(val): - if val.tzinfo is not None: - found_tz = True + if isinstance(val, _Timestamp): + item_reso = val._creso else: - found_naive = True - tz_out = convert_timezone( - val.tzinfo, - tz_out, - found_naive, - found_tz, - utc_convert, - ) - iresult[i] = parse_pydatetime(val, &dts, utc_convert) + item_reso = NPY_DATETIMEUNIT.NPY_FR_us + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + tz_out = state.process_datetime(val, tz_out, utc_convert) + iresult[i] = parse_pydatetime(val, &dts, creso=creso) elif PyDate_Check(val): - iresult[i] = pydate_to_dt64(val, &dts) - check_dts_bounds(&dts) - - elif is_datetime64_object(val): - iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) + item_reso = NPY_DATETIMEUNIT.NPY_FR_s + 
state.update_creso(item_reso) + if infer_reso: + creso = state.creso + iresult[i] = pydate_to_dt64(val, &dts, reso=creso) + state.found_other = True + + elif cnp.is_datetime64_object(val): + item_reso = get_supported_reso(get_datetime64_unit(val)) + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + iresult[i] = get_datetime64_nanos(val, creso) + state.found_other = True elif is_integer_object(val) or is_float_object(val): # these must be ns unit by-definition @@ -499,8 +526,14 @@ if val != val or val == NPY_NAT: iresult[i] = NPY_NAT else: - # we now need to parse this as if unit='ns' - iresult[i] = cast_from_unit(val, "ns") + item_reso = NPY_FR_ns + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + + # we now need to parse this as if unit=abbrev + iresult[i] = cast_from_unit(val, abbrev, out_reso=creso) + state.found_other = True elif isinstance(val, str): # string @@ -508,45 +541,56 @@ # GH#32264 np.str_ object val = str(val) - if parse_today_now(val, &iresult[i], utc): + if parse_today_now(val, &iresult[i], utc, creso): # We can't _quite_ dispatch this to convert_str_to_tsobject # bc there isn't a nice way to pass "utc" - cnp.PyArray_MultiIter_NEXT(mi) + item_reso = NPY_DATETIMEUNIT.NPY_FR_us + state.update_creso(item_reso) + if infer_reso: + creso = state.creso continue - _ts = convert_str_to_tsobject( - val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst + tsobj = convert_str_to_tsobject( + val, None, dayfirst=dayfirst, yearfirst=yearfirst ) - _ts.ensure_reso(NPY_FR_ns, val) - iresult[i] = _ts.value + if tsobj.value == NPY_NAT: + # e.g. "NaT" string or empty string, we do not consider + # this as either tzaware or tznaive. See + # test_to_datetime_with_empty_str_utc_false_format_mixed + # We also do not update resolution inference based on this, + # see test_infer_with_nat_int_float_str + iresult[i] = tsobj.value + continue + + item_reso = tsobj.creso + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + + tsobj.ensure_reso(creso, val) + iresult[i] = tsobj.value - tz = _ts.tzinfo + tz = tsobj.tzinfo if tz is not None: # dateutil timezone objects cannot be hashed, so # store the UTC offsets in seconds instead nsecs = tz.utcoffset(None).total_seconds() out_tzoffset_vals.add(nsecs) - # need to set seen_datetime_offset *after* the - # potentially-raising timezone(timedelta(...)) call, - # otherwise we can go down the is_same_offsets path - # bc len(out_tzoffset_vals) == 0 seen_datetime_offset = True else: # Add a marker for naive string, to track if we are # parsing mixed naive and aware strings out_tzoffset_vals.add("naive") + state.found_naive_str = True else: raise TypeError(f"{type(val)} is not convertible to datetime") - cnp.PyArray_MultiIter_NEXT(mi) - except (TypeError, OverflowError, ValueError) as ex: ex.args = (f"{ex}, at position {i}",) if is_coerce: iresult[i] = NPY_NAT - cnp.PyArray_MultiIter_NEXT(mi) continue elif is_raise: raise @@ -562,9 +606,51 @@ is_same_offsets = len(out_tzoffset_vals) == 1 if not is_same_offsets: return _array_to_datetime_object(values, errors, dayfirst, yearfirst) + elif state.found_naive or state.found_other: + # e.g. test_to_datetime_mixed_awareness_mixed_types + raise ValueError("Cannot mix tz-aware with tz-naive values") + elif tz_out is not None: + # GH#55693 + tz_offset = out_tzoffset_vals.pop() + tz_out2 = timezone(timedelta(seconds=tz_offset)) + if not tz_compare(tz_out, tz_out2): + # e.g. test_to_datetime_mixed_tzs_mixed_types + raise ValueError( + "Mixed timezones detected. 
pass utc=True in to_datetime " + "or tz='UTC' in DatetimeIndex to convert to a common timezone." + ) + # e.g. test_to_datetime_mixed_types_matching_tzs else: tz_offset = out_tzoffset_vals.pop() tz_out = timezone(timedelta(seconds=tz_offset)) + elif not utc_convert: + if tz_out and (state.found_other or state.found_naive_str): + # found_other indicates a tz-naive int, float, dt64, or date + # e.g. test_to_datetime_mixed_awareness_mixed_types + raise ValueError("Cannot mix tz-aware with tz-naive values") + + if infer_reso: + if state.creso_ever_changed: + # We encountered mismatched resolutions, need to re-parse with + # the correct one. + return array_to_datetime( + values, + errors=errors, + yearfirst=yearfirst, + dayfirst=dayfirst, + utc=utc, + creso=state.creso, + ) + elif state.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + # i.e. we never encountered anything non-NaT, default to "s". This + # ensures that insert and concat-like operations with NaT + # do not upcast units + result = iresult.view("M8[s]").reshape(result.shape) + else: + # Otherwise we can use the single reso that we encountered and avoid + # a second pass. + abbrev = npy_unit_to_abbrev(state.creso) + result = iresult.view(f"M8[{abbrev}]").reshape(result.shape) return result, tz_out @@ -620,7 +706,6 @@ # 1) NaT or NaT-like values # 2) datetime strings, which we return as datetime.datetime # 3) special strings - "now" & "today" - unique_timezones = set() for i in range(n): # Analogous to: val = values[i] val = (cnp.PyArray_MultiIter_DATA(mi, 1))[0] @@ -640,7 +725,7 @@ try: tsobj = convert_str_to_tsobject( - val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst + val, None, dayfirst=dayfirst, yearfirst=yearfirst ) tsobj.ensure_reso(NPY_FR_ns, val) @@ -650,7 +735,6 @@ tzinfo=tsobj.tzinfo, fold=tsobj.fold, ) - unique_timezones.add(tsobj.tzinfo) except (ValueError, OverflowError) as ex: ex.args = (f"{ex}, at position {i}", ) @@ -668,20 +752,21 @@ cnp.PyArray_MultiIter_NEXT(mi) - if len(unique_timezones) > 1: - warnings.warn( - "In a future version of pandas, parsing datetimes with mixed time " - "zones will raise an error unless `utc=True`. " - "Please specify `utc=True` to opt in to the new behaviour " - "and silence this warning. To create a `Series` with mixed offsets and " - "`object` dtype, please use `apply` and `datetime.datetime.strptime`", - FutureWarning, - stacklevel=find_stack_level(), - ) + warnings.warn( + "In a future version of pandas, parsing datetimes with mixed time " + "zones will raise an error unless `utc=True`. " + "Please specify `utc=True` to opt in to the new behaviour " + "and silence this warning. 
To create a `Series` with mixed offsets and " + "`object` dtype, please use `apply` and `datetime.datetime.strptime`", + FutureWarning, + stacklevel=find_stack_level(), + ) return oresult_nd, None -def array_to_datetime_with_tz(ndarray values, tzinfo tz): +def array_to_datetime_with_tz( + ndarray values, tzinfo tz, bint dayfirst, bint yearfirst, NPY_DATETIMEUNIT creso +): """ Vectorized analogue to pd.Timestamp(value, tz=tz) @@ -697,7 +782,16 @@ Py_ssize_t i, n = values.size object item int64_t ival - datetime ts + _TSObject tsobj + bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC + DatetimeParseState state = DatetimeParseState(creso) + str abbrev + + if infer_reso: + # We treat ints/floats as nanoseconds + abbrev = "ns" + else: + abbrev = npy_unit_to_abbrev(creso) for i in range(n): # Analogous to `item = values[i]` @@ -708,21 +802,42 @@ ival = NPY_NAT else: - ts = Timestamp(item) - if ts is NaT: - ival = NPY_NAT - else: - if ts.tzinfo is not None: - ts = ts.tz_convert(tz) - else: - # datetime64, tznaive pydatetime, int, float - ts = ts.tz_localize(tz) - ts = ts.as_unit("ns") - ival = ts._value + tsobj = convert_to_tsobject( + item, + tz=tz, + unit=abbrev, + dayfirst=dayfirst, + yearfirst=yearfirst, + nanos=0, + ) + if tsobj.value != NPY_NAT: + state.update_creso(tsobj.creso) + if infer_reso: + creso = state.creso + tsobj.ensure_reso(creso, item, round_ok=True) + ival = tsobj.value # Analogous to: result[i] = ival (cnp.PyArray_MultiIter_DATA(mi, 0))[0] = ival cnp.PyArray_MultiIter_NEXT(mi) + if infer_reso: + if state.creso_ever_changed: + # We encountered mismatched resolutions, need to re-parse with + # the correct one. + return array_to_datetime_with_tz(values, tz=tz, creso=creso) + elif creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + # i.e. we never encountered anything non-NaT, default to "s". This + # ensures that insert and concat-like operations with NaT + # do not upcast units + result = result.view("M8[s]") + else: + # Otherwise we can use the single reso that we encountered and avoid + # a second pass. 
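Both the ValueError for mixed tz-aware/tz-naive input and the FutureWarning about mixed offsets above point at the same opt-in: utc=True. A short illustration of the recommended path (assuming pandas 2.x; the exact behaviour without utc=True differs between 2.1 and 2.2):

import pandas as pd

# Mixed fixed offsets: utc=True converts everything to one UTC-based index
# instead of warning or raising.
idx = pd.to_datetime(
    ["2020-01-01 00:00:00+01:00", "2020-01-01 00:00:00+02:00"],
    utc=True,
)
# DatetimeIndex(['2019-12-31 23:00:00+00:00', '2019-12-31 22:00:00+00:00'],
#               dtype='datetime64[ns, UTC]', freq=None)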
+ abbrev = npy_unit_to_abbrev(creso) + result = result.view(f"M8[{abbrev}]") + else: + abbrev = npy_unit_to_abbrev(creso) + result = result.view(f"M8[{abbrev}]") return result diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/__init__.py pandas-2.2.2+dfsg/pandas/_libs/tslibs/__init__.py --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -30,18 +30,16 @@ "get_unit_from_dtype", "periods_per_day", "periods_per_second", - "is_supported_unit", - "npy_unit_to_abbrev", - "get_supported_reso", + "guess_datetime_format", + "add_overflowsafe", + "get_supported_dtype", + "is_supported_dtype", ] from pandas._libs.tslibs import dtypes # pylint: disable=import-self from pandas._libs.tslibs.conversion import localize_pydatetime from pandas._libs.tslibs.dtypes import ( Resolution, - get_supported_reso, - is_supported_unit, - npy_unit_to_abbrev, periods_per_day, periods_per_second, ) @@ -54,7 +52,10 @@ from pandas._libs.tslibs.np_datetime import ( OutOfBoundsDatetime, OutOfBoundsTimedelta, + add_overflowsafe, astype_overflowsafe, + get_supported_dtype, + is_supported_dtype, is_unitless, py_get_unit_from_dtype as get_unit_from_dtype, ) @@ -63,6 +64,7 @@ Tick, to_offset, ) +from pandas._libs.tslibs.parsing import guess_datetime_format from pandas._libs.tslibs.period import ( IncompatibleFrequency, Period, diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/ccalendar.pxd pandas-2.2.2+dfsg/pandas/_libs/tslibs/ccalendar.pxd --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/ccalendar.pxd 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/ccalendar.pxd 2024-04-10 17:42:52.000000000 +0000 @@ -15,6 +15,6 @@ cpdef int get_lastbday(int year, int month) noexcept nogil cpdef int get_firstbday(int year, int month) noexcept nogil -cdef dict c_MONTH_NUMBERS +cdef dict c_MONTH_NUMBERS, MONTH_TO_CAL_NUM cdef int32_t* month_offset diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/ccalendar.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/ccalendar.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/ccalendar.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/ccalendar.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -38,7 +38,7 @@ MONTH_NUMBERS = {name: num for num, name in enumerate(MONTHS)} cdef dict c_MONTH_NUMBERS = MONTH_NUMBERS MONTH_ALIASES = {(num + 1): name for num, name in enumerate(MONTHS)} -MONTH_TO_CAL_NUM = {name: num + 1 for num, name in enumerate(MONTHS)} +cdef dict MONTH_TO_CAL_NUM = {name: num + 1 for num, name in enumerate(MONTHS)} DAYS = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] DAYS_FULL = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/conversion.pxd pandas-2.2.2+dfsg/pandas/_libs/tslibs/conversion.pxd --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/conversion.pxd 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/conversion.pxd 2024-04-10 17:42:52.000000000 +0000 @@ -24,7 +24,9 @@ bint fold NPY_DATETIMEUNIT creso - cdef int64_t ensure_reso(self, NPY_DATETIMEUNIT creso, str val=*) except? -1 + cdef int64_t ensure_reso( + self, NPY_DATETIMEUNIT creso, val=*, bint round_ok=* + ) except? 
-1 cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit, @@ -35,7 +37,7 @@ int32_t nanos=*, NPY_DATETIMEUNIT reso=*) -cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, +cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, bint dayfirst=*, bint yearfirst=*) @@ -43,20 +45,15 @@ cpdef datetime localize_pydatetime(datetime dt, tzinfo tz) cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1 -cpdef (int64_t, int) precision_from_unit(str unit, NPY_DATETIMEUNIT out_reso=*) +cdef (int64_t, int) precision_from_unit( + NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=* +) cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso) -cdef tzinfo convert_timezone( - tzinfo tz_in, - tzinfo tz_out, - bint found_naive, - bint found_tz, - bint utc_convert, -) cdef int64_t parse_pydatetime( datetime val, npy_datetimestruct *dts, - bint utc_convert, + NPY_DATETIMEUNIT creso, ) except? -1 diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/conversion.pyi pandas-2.2.2+dfsg/pandas/_libs/tslibs/conversion.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/conversion.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/conversion.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -8,7 +8,7 @@ DT64NS_DTYPE: np.dtype TD64NS_DTYPE: np.dtype -def precision_from_unit( - unit: str, -) -> tuple[int, int]: ... # (int64_t, _) def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... +def cast_from_unit_vectorized( + values: np.ndarray, unit: str, out_unit: str = ... +) -> np.ndarray: ... diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/conversion.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/conversion.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/conversion.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/conversion.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -1,8 +1,11 @@ +cimport cython + import numpy as np cimport numpy as cnp from libc.math cimport log10 from numpy cimport ( + float64_t, int32_t, int64_t, ) @@ -26,21 +29,22 @@ import_datetime() from pandas._libs.missing cimport checknull_with_nat_and_na -from pandas._libs.tslibs.base cimport ABCTimestamp from pandas._libs.tslibs.dtypes cimport ( abbrev_to_npy_unit, get_supported_reso, + npy_unit_to_attrname, periods_per_second, ) from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, NPY_FR_us, + astype_overflowsafe, check_dts_bounds, convert_reso, + dts_to_iso_string, get_conversion_factor, get_datetime64_unit, - get_datetime64_value, get_implementation_bounds, import_pandas_datetime, npy_datetime, @@ -61,6 +65,7 @@ c_nat_strings as nat_strings, ) from pandas._libs.tslibs.parsing cimport parse_datetime_string +from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.tslibs.timezones cimport ( get_utcoffset, is_utc, @@ -70,9 +75,9 @@ tz_localize_to_utc_single, ) from pandas._libs.tslibs.util cimport ( - is_datetime64_object, is_float_object, is_integer_object, + is_nan, ) # ---------------------------------------------------------------------- @@ -85,6 +90,79 @@ # ---------------------------------------------------------------------- # Unit Conversion Helpers +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.overflowcheck(True) +def cast_from_unit_vectorized( + ndarray values, + str unit, + str out_unit="ns", +): + """ + Vectorized analogue to cast_from_unit. 
+ """ + cdef: + int64_t m + int p + NPY_DATETIMEUNIT in_reso, out_reso + Py_ssize_t i + + assert values.dtype.kind == "f" + + if unit in "YM": + if not (((values % 1) == 0) | np.isnan(values)).all(): + # GH#47267 it is clear that 2 "M" corresponds to 1970-02-01, + # but not clear what 2.5 "M" corresponds to, so we will + # disallow that case. + raise ValueError( + f"Conversion of non-round float with unit={unit} " + "is ambiguous" + ) + + # GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y" + # and 150 we'd get 2120-01-01 09:00:00 + values = values.astype(f"M8[{unit}]") + dtype = np.dtype(f"M8[{out_unit}]") + return astype_overflowsafe(values, dtype=dtype, copy=False).view("i8") + + in_reso = abbrev_to_npy_unit(unit) + out_reso = abbrev_to_npy_unit(out_unit) + m, p = precision_from_unit(in_reso, out_reso) + + cdef: + ndarray[int64_t] base, out + ndarray[float64_t] frac + tuple shape = (values).shape + + out = np.empty(shape, dtype="i8") + base = np.empty(shape, dtype="i8") + frac = np.empty(shape, dtype="f8") + + for i in range(len(values)): + if is_nan(values[i]): + base[i] = NPY_NAT + else: + base[i] = values[i] + frac[i] = values[i] - base[i] + + if p: + frac = np.round(frac, p) + + try: + for i in range(len(values)): + if base[i] == NPY_NAT: + out[i] = NPY_NAT + else: + out[i] = (base[i] * m) + (frac[i] * m) + except (OverflowError, FloatingPointError) as err: + # FloatingPointError can be issued if we have float dtype and have + # set np.errstate(over="raise") + raise OutOfBoundsDatetime( + f"cannot convert input {values[i]} with the unit '{unit}'" + ) from err + return out + + cdef int64_t cast_from_unit( object ts, str unit, @@ -106,6 +184,7 @@ cdef: int64_t m int p + NPY_DATETIMEUNIT in_reso if unit in ["Y", "M"]: if is_float_object(ts) and not ts.is_integer(): @@ -123,7 +202,14 @@ dt64obj = np.datetime64(ts, unit) return get_datetime64_nanos(dt64obj, out_reso) - m, p = precision_from_unit(unit, out_reso) + in_reso = abbrev_to_npy_unit(unit) + if out_reso < in_reso and in_reso != NPY_DATETIMEUNIT.NPY_FR_GENERIC: + # We will end up rounding (always *down*), so don't need the fractional + # part of `ts`. + m, _ = precision_from_unit(out_reso, in_reso) + return (ts) // m + + m, p = precision_from_unit(in_reso, out_reso) # cast the unit, multiply base/frac separately # to avoid precision issues from float -> int @@ -146,8 +232,8 @@ ) from err -cpdef inline (int64_t, int) precision_from_unit( - str unit, +cdef (int64_t, int) precision_from_unit( + NPY_DATETIMEUNIT in_reso, NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns, ): """ @@ -163,17 +249,16 @@ int64_t m int64_t multiplier int p - NPY_DATETIMEUNIT reso = abbrev_to_npy_unit(unit) - if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: - reso = NPY_DATETIMEUNIT.NPY_FR_ns - if reso == NPY_DATETIMEUNIT.NPY_FR_Y: + if in_reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + in_reso = NPY_DATETIMEUNIT.NPY_FR_ns + if in_reso == NPY_DATETIMEUNIT.NPY_FR_Y: # each 400 years we have 97 leap years, for an average of 97/400=.2425 # extra days each year. We get 31556952 by writing # 3600*24*365.2425=31556952 multiplier = periods_per_second(out_reso) m = multiplier * 31556952 - elif reso == NPY_DATETIMEUNIT.NPY_FR_M: + elif in_reso == NPY_DATETIMEUNIT.NPY_FR_M: # 2629746 comes from dividing the "Y" case by 12. multiplier = periods_per_second(out_reso) m = multiplier * 2629746 @@ -181,7 +266,7 @@ # Careful: if get_conversion_factor raises, the exception does # not propagate, instead we get a warning about an ignored exception. 
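The "Y" and "M" multipliers used by precision_from_unit above follow directly from the Gregorian-average comment (97 leap years every 400 years). A quick arithmetic check in plain Python, independent of pandas:

days_per_year = 365 + 97 / 400  # 365.2425
assert round(days_per_year * 24 * 3600) == 31_556_952  # seconds in an average year ("Y")
assert 31_556_952 // 12 == 2_629_746                    # seconds in an average month ("M")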
# https://github.com/pandas-dev/pandas/pull/51483#discussion_r1115198951 - m = get_conversion_factor(reso, out_reso) + m = get_conversion_factor(in_reso, out_reso) p = log10(m) # number of digits in 'm' minus 1 return m, p @@ -197,7 +282,7 @@ NPY_DATETIMEUNIT unit npy_datetime ival - ival = get_datetime64_value(val) + ival = cnp.get_datetime64_value(val) if ival == NPY_NAT: return NPY_NAT @@ -205,8 +290,13 @@ if unit != reso: pandas_datetime_to_datetimestruct(ival, unit, &dts) - check_dts_bounds(&dts, reso) - ival = npy_datetimestruct_to_datetime(reso, &dts) + try: + ival = npy_datetimestruct_to_datetime(reso, &dts) + except OverflowError as err: + attrname = npy_unit_to_attrname[reso] + raise OutOfBoundsDatetime( + f"Out of bounds {attrname} timestamp: {val}" + ) from err return ival @@ -228,14 +318,19 @@ self.fold = 0 self.creso = NPY_FR_ns # default value - cdef int64_t ensure_reso(self, NPY_DATETIMEUNIT creso, str val=None) except? -1: + cdef int64_t ensure_reso( + self, NPY_DATETIMEUNIT creso, val=None, bint round_ok=False + ) except? -1: if self.creso != creso: try: - self.value = convert_reso(self.value, self.creso, creso, False) + self.value = convert_reso( + self.value, self.creso, creso, round_ok=round_ok + ) except OverflowError as err: if val is not None: + attrname = npy_unit_to_attrname[creso] raise OutOfBoundsDatetime( - f"Out of bounds nanosecond timestamp: {val}" + f"Out of bounds {attrname} timestamp: {val}" ) from err raise OutOfBoundsDatetime from err @@ -266,16 +361,21 @@ obj = _TSObject() if isinstance(ts, str): - return convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst) + return convert_str_to_tsobject(ts, tz, dayfirst, yearfirst) if checknull_with_nat_and_na(ts): obj.value = NPY_NAT - elif is_datetime64_object(ts): + elif cnp.is_datetime64_object(ts): reso = get_supported_reso(get_datetime64_unit(ts)) obj.creso = reso obj.value = get_datetime64_nanos(ts, reso) if obj.value != NPY_NAT: pandas_datetime_to_datetimestruct(obj.value, reso, &obj.dts) + if tz is not None: + # GH#24559, GH#42288 We treat np.datetime64 objects as *wall* times + obj.value = tz_localize_to_utc_single( + obj.value, tz, ambiguous="raise", nonexistent=None, creso=reso + ) elif is_integer_object(ts): try: ts = ts @@ -302,8 +402,8 @@ pandas_datetime_to_datetimestruct(ts, NPY_FR_ns, &obj.dts) elif PyDateTime_Check(ts): if nanos == 0: - if isinstance(ts, ABCTimestamp): - reso = abbrev_to_npy_unit(ts.unit) # TODO: faster way to do this? + if isinstance(ts, _Timestamp): + reso = (<_Timestamp>ts)._creso else: # TODO: what if user explicitly passes nanos=0? 
reso = NPY_FR_us @@ -391,62 +491,43 @@ pydatetime_to_dtstruct(ts, &obj.dts) obj.tzinfo = ts.tzinfo - if isinstance(ts, ABCTimestamp): + if isinstance(ts, _Timestamp): obj.dts.ps = ts.nanosecond * 1000 if nanos: obj.dts.ps = nanos * 1000 - obj.value = npy_datetimestruct_to_datetime(reso, &obj.dts) + try: + obj.value = npy_datetimestruct_to_datetime(reso, &obj.dts) + except OverflowError as err: + attrname = npy_unit_to_attrname[reso] + raise OutOfBoundsDatetime(f"Out of bounds {attrname} timestamp") from err if obj.tzinfo is not None and not is_utc(obj.tzinfo): offset = get_utcoffset(obj.tzinfo, ts) pps = periods_per_second(reso) obj.value -= int(offset.total_seconds() * pps) - check_dts_bounds(&obj.dts, reso) check_overflows(obj, reso) return obj -cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts, - int tzoffset, tzinfo tz=None, - NPY_DATETIMEUNIT reso=NPY_FR_ns): +cdef _adjust_tsobject_tz_using_offset(_TSObject obj, tzinfo tz): """ - Convert a datetimestruct `dts`, along with initial timezone offset - `tzoffset` to a _TSObject (with timezone object `tz` - optional). + Convert a datetimestruct `obj.dts`, with an attached tzinfo to a new + user-provided tz. Parameters ---------- - dts : npy_datetimestruct - tzoffset : int - tz : tzinfo or None - timezone for the timezone-aware output. - reso : NPY_DATETIMEUNIT, default NPY_FR_ns - - Returns - ------- obj : _TSObject + tz : tzinfo + timezone for the timezone-aware output. """ cdef: - _TSObject obj = _TSObject() - int64_t value # numpy dt64 datetime dt Py_ssize_t pos - - value = npy_datetimestruct_to_datetime(reso, &dts) - obj.dts = dts - obj.tzinfo = timezone(timedelta(minutes=tzoffset)) - obj.value = tz_localize_to_utc_single( - value, obj.tzinfo, ambiguous=None, nonexistent=None, creso=reso - ) - obj.creso = reso - if tz is None: - check_overflows(obj, reso) - return obj - - cdef: - Localizer info = Localizer(tz, reso) + int64_t ps = obj.dts.ps + Localizer info = Localizer(tz, obj.creso) # Infer fold from offset-adjusted obj.value # see PEP 495 https://www.python.org/dev/peps/pep-0495/#the-fold-attribute @@ -462,13 +543,18 @@ dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, obj.dts.hour, obj.dts.min, obj.dts.sec, obj.dts.us, obj.tzinfo, fold=obj.fold) - obj = convert_datetime_to_tsobject( - dt, tz, nanos=obj.dts.ps // 1000) - obj.ensure_reso(reso) # TODO: more performant to get reso right up front? - return obj + + # The rest here is similar to the 2-tz path in convert_datetime_to_tsobject + # but avoids re-calculating obj.value + dt = dt.astimezone(tz) + pydatetime_to_dtstruct(dt, &obj.dts) + obj.tzinfo = dt.tzinfo + obj.dts.ps = ps + check_dts_bounds(&obj.dts, obj.creso) + check_overflows(obj, obj.creso) -cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit, +cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, bint dayfirst=False, bint yearfirst=False): """ @@ -484,7 +570,6 @@ Value to be converted to _TSObject tz : tzinfo or None timezone for the timezone-aware output - unit : str or None dayfirst : bool, default False When parsing an ambiguous date string, interpret e.g. "3/4/1975" as April 3, as opposed to the standard US interpretation March 4. 
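The dayfirst/yearfirst flags documented above decide how ambiguous date strings are resolved; the docstring's own example, written out against the public API:

import pandas as pd

assert pd.to_datetime("3/4/1975") == pd.Timestamp("1975-03-04")                 # default: month first
assert pd.to_datetime("3/4/1975", dayfirst=True) == pd.Timestamp("1975-04-03")  # day first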
@@ -500,8 +585,9 @@ npy_datetimestruct dts int out_local = 0, out_tzoffset = 0, string_to_dts_failed datetime dt - int64_t ival + int64_t ival, nanos = 0 NPY_DATETIMEUNIT out_bestunit, reso + _TSObject obj if len(ts) == 0 or ts in nat_strings: obj = _TSObject() @@ -512,11 +598,13 @@ # Issue 9000, we short-circuit rather than going # into np_datetime_strings which returns utc dt = datetime.now(tz) + return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us) elif ts == "today": # Issue 9000, we short-circuit rather than going # into np_datetime_strings which returns a normalized datetime dt = datetime.now(tz) # equiv: datetime.today().replace(tzinfo=tz) + return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us) else: string_to_dts_failed = string_to_dts( ts, &dts, &out_bestunit, &out_local, @@ -525,31 +613,40 @@ if not string_to_dts_failed: reso = get_supported_reso(out_bestunit) check_dts_bounds(&dts, reso) + obj = _TSObject() + obj.dts = dts + obj.creso = reso + ival = npy_datetimestruct_to_datetime(reso, &dts) + if out_local == 1: - return _create_tsobject_tz_using_offset( - dts, out_tzoffset, tz, reso + obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) + obj.value = tz_localize_to_utc_single( + ival, obj.tzinfo, ambiguous="raise", nonexistent=None, creso=reso ) + if tz is None: + check_overflows(obj, reso) + return obj + _adjust_tsobject_tz_using_offset(obj, tz) + return obj else: - ival = npy_datetimestruct_to_datetime(reso, &dts) if tz is not None: # shift for _localize_tso ival = tz_localize_to_utc_single( ival, tz, ambiguous="raise", nonexistent=None, creso=reso ) - obj = _TSObject() - obj.dts = dts obj.value = ival - obj.creso = reso maybe_localize_tso(obj, tz, obj.creso) return obj dt = parse_datetime_string( - ts, dayfirst=dayfirst, yearfirst=yearfirst, out_bestunit=&out_bestunit + ts, + dayfirst=dayfirst, + yearfirst=yearfirst, + out_bestunit=&out_bestunit, + nanos=&nanos, ) reso = get_supported_reso(out_bestunit) - return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=reso) - - return convert_datetime_to_tsobject(dt, tz) + return convert_datetime_to_tsobject(dt, tz, nanos=nanos, reso=reso) cdef check_overflows(_TSObject obj, NPY_DATETIMEUNIT reso=NPY_FR_ns): @@ -578,18 +675,18 @@ if obj.dts.year == lb.year: if not (obj.value < 0): from pandas._libs.tslibs.timestamps import Timestamp - fmt = (f"{obj.dts.year}-{obj.dts.month:02d}-{obj.dts.day:02d} " - f"{obj.dts.hour:02d}:{obj.dts.min:02d}:{obj.dts.sec:02d}") + fmt = dts_to_iso_string(&obj.dts) + min_ts = (<_Timestamp>Timestamp(0))._as_creso(reso).min raise OutOfBoundsDatetime( - f"Converting {fmt} underflows past {Timestamp.min}" + f"Converting {fmt} underflows past {min_ts}" ) elif obj.dts.year == ub.year: if not (obj.value > 0): from pandas._libs.tslibs.timestamps import Timestamp - fmt = (f"{obj.dts.year}-{obj.dts.month:02d}-{obj.dts.day:02d} " - f"{obj.dts.hour:02d}:{obj.dts.min:02d}:{obj.dts.sec:02d}") + fmt = dts_to_iso_string(&obj.dts) + max_ts = (<_Timestamp>Timestamp(0))._as_creso(reso).max raise OutOfBoundsDatetime( - f"Converting {fmt} overflows past {Timestamp.max}" + f"Converting {fmt} overflows past {max_ts}" ) # ---------------------------------------------------------------------- @@ -647,7 +744,8 @@ """ try: # datetime.replace with pytz may be incorrect result - return tz.localize(dt) + # TODO: try to respect `fold` attribute + return tz.localize(dt, is_dst=None) except AttributeError: return dt.replace(tzinfo=tz) @@ -667,68 +765,15 @@ """ if tz is None: return dt - elif 
isinstance(dt, ABCTimestamp): + elif isinstance(dt, _Timestamp): return dt.tz_localize(tz) return _localize_pydatetime(dt, tz) -cdef tzinfo convert_timezone( - tzinfo tz_in, - tzinfo tz_out, - bint found_naive, - bint found_tz, - bint utc_convert, -): - """ - Validate that ``tz_in`` can be converted/localized to ``tz_out``. - - Parameters - ---------- - tz_in : tzinfo or None - Timezone info of element being processed. - tz_out : tzinfo or None - Timezone info of output. - found_naive : bool - Whether a timezone-naive element has been found so far. - found_tz : bool - Whether a timezone-aware element has been found so far. - utc_convert : bool - Whether to convert/localize to UTC. - - Returns - ------- - tz_info - Timezone info of output. - - Raises - ------ - ValueError - If ``tz_in`` can't be converted/localized to ``tz_out``. - """ - if tz_in is not None: - if utc_convert: - pass - elif found_naive: - raise ValueError("Tz-aware datetime.datetime " - "cannot be converted to " - "datetime64 unless utc=True") - elif tz_out is not None and not tz_compare(tz_out, tz_in): - raise ValueError("Tz-aware datetime.datetime " - "cannot be converted to " - "datetime64 unless utc=True") - else: - tz_out = tz_in - else: - if found_tz and not utc_convert: - raise ValueError("Cannot mix tz-aware with " - "tz-naive values") - return tz_out - - cdef int64_t parse_pydatetime( datetime val, npy_datetimestruct *dts, - bint utc_convert, + NPY_DATETIMEUNIT creso, ) except? -1: """ Convert pydatetime to datetime64. @@ -739,8 +784,8 @@ Element being processed. dts : *npy_datetimestruct Needed to use in pydatetime_to_dt64, which writes to it. - utc_convert : bool - Whether to convert/localize to UTC. + creso : NPY_DATETIMEUNIT + Resolution to store the the result. Raises ------ @@ -751,18 +796,11 @@ int64_t result if val.tzinfo is not None: - if utc_convert: - _ts = convert_datetime_to_tsobject(val, None) - _ts.ensure_reso(NPY_FR_ns) - result = _ts.value - else: - _ts = convert_datetime_to_tsobject(val, None) - _ts.ensure_reso(NPY_FR_ns) - result = _ts.value + _ts = convert_datetime_to_tsobject(val, None, nanos=0, reso=creso) + result = _ts.value else: - if isinstance(val, ABCTimestamp): - result = val.as_unit("ns")._value + if isinstance(val, _Timestamp): + result = (<_Timestamp>val)._as_creso(creso, round_ok=True)._value else: - result = pydatetime_to_dt64(val, dts) - check_dts_bounds(dts) + result = pydatetime_to_dt64(val, dts, reso=creso) return result diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/dtypes.pxd pandas-2.2.2+dfsg/pandas/_libs/tslibs/dtypes.pxd --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/dtypes.pxd 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/dtypes.pxd 2024-04-10 17:42:52.000000000 +0000 @@ -3,14 +3,19 @@ from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT -cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit) +cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit) cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev) cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) noexcept nogil cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1 cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? 
-1 -cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) -cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso) +cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) +cdef bint is_supported_unit(NPY_DATETIMEUNIT reso) +cpdef freq_to_period_freqstr(freq_n, freq_name) +cdef dict c_OFFSET_TO_PERIOD_FREQSTR +cdef dict c_OFFSET_DEPR_FREQSTR +cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR +cdef dict c_DEPR_ABBREVS cdef dict attrname_to_abbrevs cdef dict npy_unit_to_attrname cdef dict attrname_to_npy_unit diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/dtypes.pyi pandas-2.2.2+dfsg/pandas/_libs/tslibs/dtypes.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/dtypes.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/dtypes.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -1,16 +1,11 @@ from enum import Enum -# These are not public API, but are exposed in the .pyi file because they -# are imported in tests. -_attrname_to_abbrevs: dict[str, str] -_period_code_map: dict[str, int] +OFFSET_TO_PERIOD_FREQSTR: dict[str, str] -def periods_per_day(reso: int) -> int: ... +def periods_per_day(reso: int = ...) -> int: ... def periods_per_second(reso: int) -> int: ... -def is_supported_unit(reso: int) -> bool: ... -def npy_unit_to_abbrev(reso: int) -> str: ... -def get_supported_reso(reso: int) -> int: ... -def abbrev_to_npy_unit(abbrev: str) -> int: ... +def abbrev_to_npy_unit(abbrev: str | None) -> int: ... +def freq_to_period_freqstr(freq_n: int, freq_name: str) -> str: ... class PeriodDtypeBase: _dtype_code: int # PeriodDtypeCode diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/dtypes.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/dtypes.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/dtypes.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/dtypes.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,11 @@ # period frequency constants corresponding to scikits timeseries # originals from enum import Enum +import warnings +from pandas.util._exceptions import find_stack_level + +from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, get_conversion_factor, @@ -10,7 +14,6 @@ import_pandas_datetime() - cdef class PeriodDtypeBase: """ Similar to an actual dtype, this contains all of the information @@ -44,7 +47,7 @@ def _resolution_obj(self) -> "Resolution": fgc = self._freq_group_code freq_group = FreqGroup(fgc) - abbrev = _reverse_period_code_map[freq_group.value].split("-")[0] + abbrev = _period_code_to_abbrev[freq_group.value].split("-")[0] if abbrev == "B": return Resolution.RESO_DAY attrname = _abbrev_to_attrnames[abbrev] @@ -53,7 +56,7 @@ @property def _freqstr(self) -> str: # Will be passed to to_offset in Period._maybe_convert_freq - out = _reverse_period_code_map.get(self._dtype_code) + out = _period_code_to_abbrev.get(self._dtype_code) if self._n == 1: return out return str(self._n) + out @@ -99,19 +102,19 @@ _period_code_map = { # Annual freqs with various fiscal year ends. 
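The renamed period codes that follow ("A-*" becomes "Y-*"), together with the offset and deprecation maps further below, surface to users as new frequency aliases; the old spellings still work in 2.2 but emit FutureWarning. A brief sketch of the new-style aliases (assuming pandas 2.2):

import pandas as pd

pd.period_range("2020", periods=3, freq="Y-DEC")    # yearly period, December year end (formerly "A-DEC")
pd.date_range("2024-01-31", periods=3, freq="ME")   # month-end offset (formerly "M")
pd.date_range("2024-01-01", periods=3, freq="h")    # hourly (formerly "H")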
- # eg, 2005 for A-FEB runs Mar 1, 2004 to Feb 28, 2005 - "A-DEC": PeriodDtypeCode.A_DEC, # Annual - December year end - "A-JAN": PeriodDtypeCode.A_JAN, # Annual - January year end - "A-FEB": PeriodDtypeCode.A_FEB, # Annual - February year end - "A-MAR": PeriodDtypeCode.A_MAR, # Annual - March year end - "A-APR": PeriodDtypeCode.A_APR, # Annual - April year end - "A-MAY": PeriodDtypeCode.A_MAY, # Annual - May year end - "A-JUN": PeriodDtypeCode.A_JUN, # Annual - June year end - "A-JUL": PeriodDtypeCode.A_JUL, # Annual - July year end - "A-AUG": PeriodDtypeCode.A_AUG, # Annual - August year end - "A-SEP": PeriodDtypeCode.A_SEP, # Annual - September year end - "A-OCT": PeriodDtypeCode.A_OCT, # Annual - October year end - "A-NOV": PeriodDtypeCode.A_NOV, # Annual - November year end + # eg, 2005 for Y-FEB runs Mar 1, 2004 to Feb 28, 2005 + "Y-DEC": PeriodDtypeCode.A_DEC, # Annual - December year end + "Y-JAN": PeriodDtypeCode.A_JAN, # Annual - January year end + "Y-FEB": PeriodDtypeCode.A_FEB, # Annual - February year end + "Y-MAR": PeriodDtypeCode.A_MAR, # Annual - March year end + "Y-APR": PeriodDtypeCode.A_APR, # Annual - April year end + "Y-MAY": PeriodDtypeCode.A_MAY, # Annual - May year end + "Y-JUN": PeriodDtypeCode.A_JUN, # Annual - June year end + "Y-JUL": PeriodDtypeCode.A_JUL, # Annual - July year end + "Y-AUG": PeriodDtypeCode.A_AUG, # Annual - August year end + "Y-SEP": PeriodDtypeCode.A_SEP, # Annual - September year end + "Y-OCT": PeriodDtypeCode.A_OCT, # Annual - October year end + "Y-NOV": PeriodDtypeCode.A_NOV, # Annual - November year end # Quarterly frequencies with various fiscal year ends. # eg, Q42005 for Q-OCT runs Aug 1, 2005 to Oct 31, 2005 @@ -140,49 +143,275 @@ "B": PeriodDtypeCode.B, # Business days "D": PeriodDtypeCode.D, # Daily - "H": PeriodDtypeCode.H, # Hourly - "T": PeriodDtypeCode.T, # Minutely - "S": PeriodDtypeCode.S, # Secondly - "L": PeriodDtypeCode.L, # Millisecondly - "U": PeriodDtypeCode.U, # Microsecondly - "N": PeriodDtypeCode.N, # Nanosecondly + "h": PeriodDtypeCode.H, # Hourly + "min": PeriodDtypeCode.T, # Minutely + "s": PeriodDtypeCode.S, # Secondly + "ms": PeriodDtypeCode.L, # Millisecondly + "us": PeriodDtypeCode.U, # Microsecondly + "ns": PeriodDtypeCode.N, # Nanosecondly } -_reverse_period_code_map = { +cdef dict _period_code_to_abbrev = { _period_code_map[key]: key for key in _period_code_map} -# Yearly aliases; careful not to put these in _reverse_period_code_map -_period_code_map.update({"Y" + key[1:]: _period_code_map[key] - for key in _period_code_map - if key.startswith("A-")}) - -_period_code_map.update({ - "Q": 2000, # Quarterly - December year end (default quarterly) - "A": PeriodDtypeCode.A, # Annual - "W": 4000, # Weekly - "C": 5000, # Custom Business Day -}) - -cdef set _month_names = { - x.split("-")[-1] for x in _period_code_map.keys() if x.startswith("A-") -} +cdef set _month_names = set(c_MONTH_NUMBERS.keys()) # Map attribute-name resolutions to resolution abbreviations -_attrname_to_abbrevs = { - "year": "A", +cdef dict attrname_to_abbrevs = { + "year": "Y", "quarter": "Q", "month": "M", "day": "D", - "hour": "H", - "minute": "T", - "second": "S", - "millisecond": "L", - "microsecond": "U", - "nanosecond": "N", + "hour": "h", + "minute": "min", + "second": "s", + "millisecond": "ms", + "microsecond": "us", + "nanosecond": "ns", } -cdef dict attrname_to_abbrevs = _attrname_to_abbrevs cdef dict _abbrev_to_attrnames = {v: k for k, v in attrname_to_abbrevs.items()} +OFFSET_TO_PERIOD_FREQSTR: dict = { + "WEEKDAY": "D", + "EOM": 
"M", + "BME": "M", + "SME": "M", + "BQS": "Q", + "QS": "Q", + "BQE": "Q", + "BQE-DEC": "Q-DEC", + "BQE-JAN": "Q-JAN", + "BQE-FEB": "Q-FEB", + "BQE-MAR": "Q-MAR", + "BQE-APR": "Q-APR", + "BQE-MAY": "Q-MAY", + "BQE-JUN": "Q-JUN", + "BQE-JUL": "Q-JUL", + "BQE-AUG": "Q-AUG", + "BQE-SEP": "Q-SEP", + "BQE-OCT": "Q-OCT", + "BQE-NOV": "Q-NOV", + "MS": "M", + "D": "D", + "B": "B", + "min": "min", + "s": "s", + "ms": "ms", + "us": "us", + "ns": "ns", + "h": "h", + "QE": "Q", + "QE-DEC": "Q-DEC", + "QE-JAN": "Q-JAN", + "QE-FEB": "Q-FEB", + "QE-MAR": "Q-MAR", + "QE-APR": "Q-APR", + "QE-MAY": "Q-MAY", + "QE-JUN": "Q-JUN", + "QE-JUL": "Q-JUL", + "QE-AUG": "Q-AUG", + "QE-SEP": "Q-SEP", + "QE-OCT": "Q-OCT", + "QE-NOV": "Q-NOV", + "YE": "Y", + "YE-DEC": "Y-DEC", + "YE-JAN": "Y-JAN", + "YE-FEB": "Y-FEB", + "YE-MAR": "Y-MAR", + "YE-APR": "Y-APR", + "YE-MAY": "Y-MAY", + "YE-JUN": "Y-JUN", + "YE-JUL": "Y-JUL", + "YE-AUG": "Y-AUG", + "YE-SEP": "Y-SEP", + "YE-OCT": "Y-OCT", + "YE-NOV": "Y-NOV", + "W": "W", + "ME": "M", + "Y": "Y", + "BYE": "Y", + "BYE-DEC": "Y-DEC", + "BYE-JAN": "Y-JAN", + "BYE-FEB": "Y-FEB", + "BYE-MAR": "Y-MAR", + "BYE-APR": "Y-APR", + "BYE-MAY": "Y-MAY", + "BYE-JUN": "Y-JUN", + "BYE-JUL": "Y-JUL", + "BYE-AUG": "Y-AUG", + "BYE-SEP": "Y-SEP", + "BYE-OCT": "Y-OCT", + "BYE-NOV": "Y-NOV", + "YS": "Y", + "BYS": "Y", +} +cdef dict c_OFFSET_DEPR_FREQSTR = { + "M": "ME", + "Q": "QE", + "Q-DEC": "QE-DEC", + "Q-JAN": "QE-JAN", + "Q-FEB": "QE-FEB", + "Q-MAR": "QE-MAR", + "Q-APR": "QE-APR", + "Q-MAY": "QE-MAY", + "Q-JUN": "QE-JUN", + "Q-JUL": "QE-JUL", + "Q-AUG": "QE-AUG", + "Q-SEP": "QE-SEP", + "Q-OCT": "QE-OCT", + "Q-NOV": "QE-NOV", + "Y": "YE", + "Y-DEC": "YE-DEC", + "Y-JAN": "YE-JAN", + "Y-FEB": "YE-FEB", + "Y-MAR": "YE-MAR", + "Y-APR": "YE-APR", + "Y-MAY": "YE-MAY", + "Y-JUN": "YE-JUN", + "Y-JUL": "YE-JUL", + "Y-AUG": "YE-AUG", + "Y-SEP": "YE-SEP", + "Y-OCT": "YE-OCT", + "Y-NOV": "YE-NOV", + "A": "YE", + "A-DEC": "YE-DEC", + "A-JAN": "YE-JAN", + "A-FEB": "YE-FEB", + "A-MAR": "YE-MAR", + "A-APR": "YE-APR", + "A-MAY": "YE-MAY", + "A-JUN": "YE-JUN", + "A-JUL": "YE-JUL", + "A-AUG": "YE-AUG", + "A-SEP": "YE-SEP", + "A-OCT": "YE-OCT", + "A-NOV": "YE-NOV", + "BY": "BYE", + "BY-DEC": "BYE-DEC", + "BY-JAN": "BYE-JAN", + "BY-FEB": "BYE-FEB", + "BY-MAR": "BYE-MAR", + "BY-APR": "BYE-APR", + "BY-MAY": "BYE-MAY", + "BY-JUN": "BYE-JUN", + "BY-JUL": "BYE-JUL", + "BY-AUG": "BYE-AUG", + "BY-SEP": "BYE-SEP", + "BY-OCT": "BYE-OCT", + "BY-NOV": "BYE-NOV", + "BA": "BYE", + "BA-DEC": "BYE-DEC", + "BA-JAN": "BYE-JAN", + "BA-FEB": "BYE-FEB", + "BA-MAR": "BYE-MAR", + "BA-APR": "BYE-APR", + "BA-MAY": "BYE-MAY", + "BA-JUN": "BYE-JUN", + "BA-JUL": "BYE-JUL", + "BA-AUG": "BYE-AUG", + "BA-SEP": "BYE-SEP", + "BA-OCT": "BYE-OCT", + "BA-NOV": "BYE-NOV", + "BM": "BME", + "CBM": "CBME", + "SM": "SME", + "BQ": "BQE", + "BQ-DEC": "BQE-DEC", + "BQ-JAN": "BQE-JAN", + "BQ-FEB": "BQE-FEB", + "BQ-MAR": "BQE-MAR", + "BQ-APR": "BQE-APR", + "BQ-MAY": "BQE-MAY", + "BQ-JUN": "BQE-JUN", + "BQ-JUL": "BQE-JUL", + "BQ-AUG": "BQE-AUG", + "BQ-SEP": "BQE-SEP", + "BQ-OCT": "BQE-OCT", + "BQ-NOV": "BQE-NOV", +} +cdef dict c_OFFSET_TO_PERIOD_FREQSTR = OFFSET_TO_PERIOD_FREQSTR +cdef dict c_REVERSE_OFFSET_DEPR_FREQSTR = { + v: k for k, v in c_OFFSET_DEPR_FREQSTR.items() +} + +cpdef freq_to_period_freqstr(freq_n, freq_name): + if freq_n == 1: + freqstr = f"""{c_OFFSET_TO_PERIOD_FREQSTR.get( + freq_name, freq_name)}""" + else: + freqstr = f"""{freq_n}{c_OFFSET_TO_PERIOD_FREQSTR.get( + freq_name, freq_name)}""" + return freqstr + +# Map deprecated resolution 
abbreviations to correct resolution abbreviations +cdef dict c_DEPR_ABBREVS = { + "A": "Y", + "a": "Y", + "A-DEC": "Y-DEC", + "A-JAN": "Y-JAN", + "A-FEB": "Y-FEB", + "A-MAR": "Y-MAR", + "A-APR": "Y-APR", + "A-MAY": "Y-MAY", + "A-JUN": "Y-JUN", + "A-JUL": "Y-JUL", + "A-AUG": "Y-AUG", + "A-SEP": "Y-SEP", + "A-OCT": "Y-OCT", + "A-NOV": "Y-NOV", + "BA": "BY", + "BA-DEC": "BY-DEC", + "BA-JAN": "BY-JAN", + "BA-FEB": "BY-FEB", + "BA-MAR": "BY-MAR", + "BA-APR": "BY-APR", + "BA-MAY": "BY-MAY", + "BA-JUN": "BY-JUN", + "BA-JUL": "BY-JUL", + "BA-AUG": "BY-AUG", + "BA-SEP": "BY-SEP", + "BA-OCT": "BY-OCT", + "BA-NOV": "BY-NOV", + "AS": "YS", + "AS-DEC": "YS-DEC", + "AS-JAN": "YS-JAN", + "AS-FEB": "YS-FEB", + "AS-MAR": "YS-MAR", + "AS-APR": "YS-APR", + "AS-MAY": "YS-MAY", + "AS-JUN": "YS-JUN", + "AS-JUL": "YS-JUL", + "AS-AUG": "YS-AUG", + "AS-SEP": "YS-SEP", + "AS-OCT": "YS-OCT", + "AS-NOV": "YS-NOV", + "BAS": "BYS", + "BAS-DEC": "BYS-DEC", + "BAS-JAN": "BYS-JAN", + "BAS-FEB": "BYS-FEB", + "BAS-MAR": "BYS-MAR", + "BAS-APR": "BYS-APR", + "BAS-MAY": "BYS-MAY", + "BAS-JUN": "BYS-JUN", + "BAS-JUL": "BYS-JUL", + "BAS-AUG": "BYS-AUG", + "BAS-SEP": "BYS-SEP", + "BAS-OCT": "BYS-OCT", + "BAS-NOV": "BYS-NOV", + "H": "h", + "BH": "bh", + "CBH": "cbh", + "T": "min", + "t": "min", + "S": "s", + "L": "ms", + "l": "ms", + "U": "us", + "u": "us", + "N": "ns", + "n": "ns", +} + class FreqGroup(Enum): # Mirrors c_FreqGroup in the .pxd file @@ -228,7 +457,7 @@ @property def attr_abbrev(self) -> str: # string that we can pass to to_offset - return _attrname_to_abbrevs[self.attrname] + return attrname_to_abbrevs[self.attrname] @property def attrname(self) -> str: @@ -266,13 +495,25 @@ Examples -------- - >>> Resolution.get_reso_from_freqstr('H') + >>> Resolution.get_reso_from_freqstr('h') - >>> Resolution.get_reso_from_freqstr('H') == Resolution.RESO_HR + >>> Resolution.get_reso_from_freqstr('h') == Resolution.RESO_HR True """ + cdef: + str abbrev try: + if freq in c_DEPR_ABBREVS: + abbrev = c_DEPR_ABBREVS[freq] + warnings.warn( + f"\'{freq}\' is deprecated and will be removed in a future " + f"version. Please use \'{abbrev}\' " + "instead of \'{freq}\'.", + FutureWarning, + stacklevel=find_stack_level(), + ) + freq = abbrev attr_name = _abbrev_to_attrnames[freq] except KeyError: # For quarterly and yearly resolutions, we need to chop off @@ -283,6 +524,16 @@ if split_freq[1] not in _month_names: # i.e. we want e.g. "Q-DEC", not "Q-INVALID" raise + if split_freq[0] in c_DEPR_ABBREVS: + abbrev = c_DEPR_ABBREVS[split_freq[0]] + warnings.warn( + f"\'{split_freq[0]}\' is deprecated and will be removed in a " + f"future version. Please use \'{abbrev}\' " + f"instead of \'{split_freq[0]}\'.", + FutureWarning, + stacklevel=find_stack_level(), + ) + split_freq[0] = abbrev attr_name = _abbrev_to_attrnames[split_freq[0]] return cls.from_attrname(attr_name) @@ -308,7 +559,7 @@ NPY_FR_GENERIC = NPY_DATETIMEUNIT.NPY_FR_GENERIC -cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso): +cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso): # If we have an unsupported reso, return the nearest supported reso. if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: # TODO: or raise ValueError? 
trying this gives unraisable errors, but @@ -321,7 +572,7 @@ return reso -cpdef bint is_supported_unit(NPY_DATETIMEUNIT reso): +cdef bint is_supported_unit(NPY_DATETIMEUNIT reso): return ( reso == NPY_DATETIMEUNIT.NPY_FR_ns or reso == NPY_DATETIMEUNIT.NPY_FR_us @@ -330,7 +581,7 @@ ) -cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit): +cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit): if unit == NPY_DATETIMEUNIT.NPY_FR_ns or unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC: # generic -> default to nanoseconds return "ns" @@ -425,7 +676,6 @@ return NPY_DATETIMEUNIT.NPY_FR_D -# TODO: use in _matplotlib.converter? cpdef int64_t periods_per_day( NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_ns ) except? -1: diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/fields.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/fields.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/fields.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/fields.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -253,8 +253,8 @@ # month of year. Other offsets use month, startingMonth as ending # month of year. - if (freqstr[0:2] in ["MS", "QS", "AS"]) or ( - freqstr[1:3] in ["MS", "QS", "AS"]): + if (freqstr[0:2] in ["MS", "QS", "YS"]) or ( + freqstr[1:3] in ["MS", "QS", "YS"]): end_month = 12 if month_kw == 1 else month_kw - 1 start_month = month_kw else: @@ -746,7 +746,31 @@ cdef ndarray[int64_t] _rounddown_int64(values, int64_t unit): - return _ceil_int64(values - unit // 2, unit) + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] result = np.empty(n, dtype="i8") + int64_t res, value, remainder, half + + half = unit // 2 + + with cython.overflowcheck(True): + for i in range(n): + value = values[i] + + if value == NPY_NAT: + res = NPY_NAT + else: + # This adjustment is the only difference between rounddown_int64 + # and _ceil_int64 + value = value - half + remainder = value % unit + if remainder == 0: + res = value + else: + res = value + (unit - remainder) + + result[i] = res + return result cdef ndarray[int64_t] _roundup_int64(values, int64_t unit): diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/meson.build pandas-2.2.2+dfsg/pandas/_libs/tslibs/meson.build --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/meson.build 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/meson.build 2024-04-10 17:42:52.000000000 +0000 @@ -19,11 +19,20 @@ 'vectorized': {'sources': ['vectorized.pyx']}, } +cython_args = [ + '--include-dir', + meson.current_build_dir(), + '-X always_allow_keywords=true' +] +if get_option('buildtype') == 'debug' + cython_args += ['--gdb'] +endif + foreach ext_name, ext_dict : tslibs_sources py.extension_module( ext_name, ext_dict.get('sources'), - cython_args: ['--include-dir', meson.current_build_dir(), '-X always_allow_keywords=true'], + cython_args: cython_args, include_directories: [inc_np, inc_pd], dependencies: ext_dict.get('deps', ''), subdir: 'pandas/_libs/tslibs', diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/nattype.pyi pandas-2.2.2+dfsg/pandas/_libs/tslibs/nattype.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/nattype.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/nattype.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,7 @@ import numpy as np from pandas._libs.tslibs.period import Period +from pandas._typing import Self NaT: NaTType iNaT: int @@ -132,4 +133,9 @@ __le__: _NatComparison __gt__: _NatComparison __ge__: _NatComparison + def __sub__(self, other: Self | timedelta | datetime) -> Self: ... 
+ def __rsub__(self, other: Self | timedelta | datetime) -> Self: ... + def __add__(self, other: Self | timedelta | datetime) -> Self: ... + def __radd__(self, other: Self | timedelta | datetime) -> Self: ... + def __hash__(self) -> int: ... def as_unit(self, unit: str, round_ok: bool = ...) -> NaTType: ... diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/nattype.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/nattype.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/nattype.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/nattype.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -21,10 +21,6 @@ cnp.import_array() cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.np_datetime cimport ( - get_datetime64_value, - get_timedelta64_value, -) # ---------------------------------------------------------------------- # Constants @@ -67,7 +63,7 @@ cdef _nat_divide_op(self, other): - if PyDelta_Check(other) or util.is_timedelta64_object(other) or other is c_NaT: + if PyDelta_Check(other) or cnp.is_timedelta64_object(other) or other is c_NaT: return np.nan if util.is_integer_object(other) or util.is_float_object(other): return c_NaT @@ -95,11 +91,11 @@ __array_priority__ = 100 def __richcmp__(_NaT self, object other, int op): - if util.is_datetime64_object(other) or PyDateTime_Check(other): + if cnp.is_datetime64_object(other) or PyDateTime_Check(other): # We treat NaT as datetime-like for this comparison return op == Py_NE - elif util.is_timedelta64_object(other) or PyDelta_Check(other): + elif cnp.is_timedelta64_object(other) or PyDelta_Check(other): # We treat NaT as timedelta-like for this comparison return op == Py_NE @@ -128,16 +124,11 @@ return NotImplemented def __add__(self, other): - if self is not c_NaT: - # TODO(cython3): remove this it moved to __radd__ - # cython __radd__ semantics - self, other = other, self - if PyDateTime_Check(other): return c_NaT elif PyDelta_Check(other): return c_NaT - elif util.is_datetime64_object(other) or util.is_timedelta64_object(other): + elif cnp.is_datetime64_object(other) or cnp.is_timedelta64_object(other): return c_NaT elif util.is_integer_object(other): @@ -162,20 +153,12 @@ def __sub__(self, other): # Duplicate some logic from _Timestamp.__sub__ to avoid needing # to subclass; allows us to @final(_Timestamp.__sub__) - cdef: - bint is_rsub = False - - if self is not c_NaT: - # cython __rsub__ semantics - # TODO(cython3): remove __rsub__ logic from here - self, other = other, self - is_rsub = True if PyDateTime_Check(other): return c_NaT elif PyDelta_Check(other): return c_NaT - elif util.is_datetime64_object(other) or util.is_timedelta64_object(other): + elif cnp.is_datetime64_object(other) or cnp.is_timedelta64_object(other): return c_NaT elif util.is_integer_object(other): @@ -184,19 +167,9 @@ elif util.is_array(other): if other.dtype.kind == "m": - if not is_rsub: - # NaT - timedelta64 we treat NaT as datetime64, so result - # is datetime64 - result = np.empty(other.shape, dtype="datetime64[ns]") - result.fill("NaT") - return result - - # __rsub__ logic here - # TODO(cython3): remove this, move above code out of - # ``if not is_rsub`` block - # timedelta64 - NaT we have to treat NaT as timedelta64 - # for this to be meaningful, and the result is timedelta64 - result = np.empty(other.shape, dtype="timedelta64[ns]") + # NaT - timedelta64 we treat NaT as datetime64, so result + # is datetime64 + result = np.empty(other.shape, dtype="datetime64[ns]") result.fill("NaT") return result @@ -1000,26 +973,26 @@ A timestamp can 
be rounded using multiple frequency units: - >>> ts.round(freq='H') # hour + >>> ts.round(freq='h') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.round(freq='T') # minute + >>> ts.round(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.round(freq='S') # seconds + >>> ts.round(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.round(freq='L') # milliseconds + >>> ts.round(freq='ms') # milliseconds Timestamp('2020-03-14 15:32:52.193000') - ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): - >>> ts.round(freq='5T') + >>> ts.round(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 1 hour and 30 minutes): - >>> ts.round(freq='1H30T') + >>> ts.round(freq='1h30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -1032,10 +1005,10 @@ >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.round("H", ambiguous=False) + >>> ts_tz.round("h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.round("H", ambiguous=True) + >>> ts_tz.round("h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """, ) @@ -1089,26 +1062,26 @@ A timestamp can be floored using multiple frequency units: - >>> ts.floor(freq='H') # hour + >>> ts.floor(freq='h') # hour Timestamp('2020-03-14 15:00:00') - >>> ts.floor(freq='T') # minute + >>> ts.floor(freq='min') # minute Timestamp('2020-03-14 15:32:00') - >>> ts.floor(freq='S') # seconds + >>> ts.floor(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.floor(freq='N') # nanoseconds + >>> ts.floor(freq='ns') # nanoseconds Timestamp('2020-03-14 15:32:52.192548651') - ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): - >>> ts.floor(freq='5T') + >>> ts.floor(freq='5min') Timestamp('2020-03-14 15:30:00') - or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 1 hour and 30 minutes): - >>> ts.floor(freq='1H30T') + >>> ts.floor(freq='1h30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -1121,10 +1094,10 @@ >>> ts_tz = pd.Timestamp("2021-10-31 03:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.floor("2H", ambiguous=False) + >>> ts_tz.floor("2h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.floor("2H", ambiguous=True) + >>> ts_tz.floor("2h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """, ) @@ -1178,26 +1151,26 @@ A timestamp can be ceiled using multiple frequency units: - >>> ts.ceil(freq='H') # hour + >>> ts.ceil(freq='h') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.ceil(freq='T') # minute + >>> ts.ceil(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.ceil(freq='S') # seconds + >>> ts.ceil(freq='s') # seconds Timestamp('2020-03-14 15:32:53') - >>> ts.ceil(freq='U') # microseconds + >>> ts.ceil(freq='us') # microseconds Timestamp('2020-03-14 15:32:52.192549') - ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 
5 minutes): - >>> ts.ceil(freq='5T') + >>> ts.ceil(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 1 hour and 30 minutes): - >>> ts.ceil(freq='1H30T') + >>> ts.ceil(freq='1h30min') Timestamp('2020-03-14 16:30:00') Analogous for ``pd.NaT``: @@ -1210,10 +1183,10 @@ >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.ceil("H", ambiguous=False) + >>> ts_tz.ceil("h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.ceil("H", ambiguous=True) + >>> ts_tz.ceil("h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """, ) @@ -1438,8 +1411,8 @@ """ Is this a np.datetime64 object np.datetime64("NaT"). """ - if util.is_datetime64_object(val): - return get_datetime64_value(val) == NPY_NAT + if cnp.is_datetime64_object(val): + return cnp.get_datetime64_value(val) == NPY_NAT return False @@ -1447,6 +1420,6 @@ """ Is this a np.timedelta64 object np.timedelta64("NaT"). """ - if util.is_timedelta64_object(val): - return get_timedelta64_value(val) == NPY_NAT + if cnp.is_timedelta64_object(val): + return cnp.get_timedelta64_value(val) == NPY_NAT return False diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/np_datetime.pxd pandas-2.2.2+dfsg/pandas/_libs/tslibs/np_datetime.pxd --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/np_datetime.pxd 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/np_datetime.pxd 2024-04-10 17:42:52.000000000 +0000 @@ -6,30 +6,12 @@ from numpy cimport ( int32_t, int64_t, + npy_datetime, + npy_timedelta, ) # TODO(cython3): most of these can be cimported directly from numpy -cdef extern from "numpy/ndarrayobject.h": - ctypedef int64_t npy_timedelta - ctypedef int64_t npy_datetime - -cdef extern from "numpy/ndarraytypes.h": - ctypedef struct PyArray_DatetimeMetaData: - NPY_DATETIMEUNIT base - int64_t num - -cdef extern from "numpy/arrayscalars.h": - ctypedef struct PyDatetimeScalarObject: - # PyObject_HEAD - npy_datetime obval - PyArray_DatetimeMetaData obmeta - - ctypedef struct PyTimedeltaScalarObject: - # PyObject_HEAD - npy_timedelta obval - PyArray_DatetimeMetaData obmeta - cdef extern from "numpy/ndarraytypes.h": ctypedef struct npy_datetimestruct: int64_t year @@ -65,7 +47,7 @@ npy_datetimestruct *result) nogil npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr, - npy_datetimestruct *d) nogil + npy_datetimestruct *d) except? -1 nogil void pandas_timedelta_to_timedeltastruct(npy_timedelta val, NPY_DATETIMEUNIT fr, @@ -85,19 +67,19 @@ cdef bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1 +cdef str dts_to_iso_string(npy_datetimestruct *dts) + cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=?) cdef int64_t pydatetime_to_dt64( datetime val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=? -) +) except? -1 cdef void pydatetime_to_dtstruct(datetime dt, npy_datetimestruct *dts) noexcept cdef int64_t pydate_to_dt64( date val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=? -) +) except? -1 cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts) noexcept -cdef npy_datetime get_datetime64_value(object obj) noexcept nogil -cdef npy_timedelta get_timedelta64_value(object obj) noexcept nogil cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) noexcept nogil cdef int string_to_dts( @@ -136,3 +118,5 @@ NPY_DATETIMEUNIT to_reso, bint round_ok, ) except? 
-1 + +cpdef cnp.ndarray add_overflowsafe(cnp.ndarray left, cnp.ndarray right) diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/np_datetime.pyi pandas-2.2.2+dfsg/pandas/_libs/tslibs/np_datetime.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/np_datetime.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/np_datetime.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -9,7 +9,7 @@ def py_get_unit_from_dtype(dtype: np.dtype): ... def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ... def astype_overflowsafe( - arr: np.ndarray, + values: np.ndarray, dtype: np.dtype, copy: bool = ..., round_ok: bool = ..., @@ -19,3 +19,9 @@ def compare_mismatched_resolutions( left: np.ndarray, right: np.ndarray, op ) -> npt.NDArray[np.bool_]: ... +def add_overflowsafe( + left: npt.NDArray[np.int64], + right: npt.NDArray[np.int64], +) -> npt.NDArray[np.int64]: ... +def get_supported_dtype(dtype: np.dtype) -> np.dtype: ... +def is_supported_dtype(dtype: np.dtype) -> bool: ... diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/np_datetime.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/np_datetime.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/np_datetime.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/np_datetime.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -18,21 +18,32 @@ Py_LT, Py_NE, ) +from libc.stdint cimport INT64_MAX import_datetime() PandasDateTime_IMPORT +import operator + import numpy as np cimport numpy as cnp cnp.import_array() from numpy cimport ( + PyArray_DatetimeMetaData, + PyDatetimeScalarObject, int64_t, ndarray, uint8_t, ) +from pandas._libs.tslibs.dtypes cimport ( + get_supported_reso, + is_supported_unit, + npy_unit_to_abbrev, + npy_unit_to_attrname, +) from pandas._libs.tslibs.util cimport get_c_string_buf_and_size @@ -59,22 +70,6 @@ # ---------------------------------------------------------------------- # numpy object inspection -cdef npy_datetime get_datetime64_value(object obj) noexcept nogil: - """ - returns the int64 value underlying scalar numpy datetime64 object - - Note that to interpret this as a datetime, the corresponding unit is - also needed. That can be found using `get_datetime64_unit`. - """ - return (obj).obval - - -cdef npy_timedelta get_timedelta64_value(object obj) noexcept nogil: - """ - returns the int64 value underlying scalar numpy timedelta64 object - """ - return (obj).obval - cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) noexcept nogil: """ @@ -98,6 +93,28 @@ return get_unit_from_dtype(dtype) +def get_supported_dtype(dtype: cnp.dtype) -> cnp.dtype: + reso = get_unit_from_dtype(dtype) + new_reso = get_supported_reso(reso) + new_unit = npy_unit_to_abbrev(new_reso) + + # Accessing dtype.kind here incorrectly(?) gives "" instead of "m"/"M", + # so we check type_num instead + if dtype.type_num == cnp.NPY_DATETIME: + new_dtype = np.dtype(f"M8[{new_unit}]") + else: + new_dtype = np.dtype(f"m8[{new_unit}]") + return new_dtype + + +def is_supported_dtype(dtype: cnp.dtype) -> bool: + if dtype.type_num not in [cnp.NPY_DATETIME, cnp.NPY_TIMEDELTA]: + raise ValueError("is_unitless dtype must be datetime64 or timedelta64") + cdef: + NPY_DATETIMEUNIT unit = get_unit_from_dtype(dtype) + return is_supported_unit(unit) + + def is_unitless(dtype: cnp.dtype) -> bool: """ Check if a datetime64 or timedelta64 dtype has no attached unit. 
@@ -210,6 +227,11 @@ raise NotImplementedError(reso) +cdef str dts_to_iso_string(npy_datetimestruct *dts): + return (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " + f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}") + + cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=NPY_FR_ns): """Raises OutOfBoundsDatetime if the given date is outside the range that can be represented by nanosecond-resolution 64-bit integers.""" @@ -225,10 +247,9 @@ error = True if error: - fmt = (f"{dts.year}-{dts.month:02d}-{dts.day:02d} " - f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}") - # TODO: "nanosecond" in the message assumes NPY_FR_ns - raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp: {fmt}") + fmt = dts_to_iso_string(dts) + attrname = npy_unit_to_attrname[unit] + raise OutOfBoundsDatetime(f"Out of bounds {attrname} timestamp: {fmt}") # ---------------------------------------------------------------------- @@ -262,12 +283,21 @@ cdef int64_t pydatetime_to_dt64(datetime val, npy_datetimestruct *dts, - NPY_DATETIMEUNIT reso=NPY_FR_ns): + NPY_DATETIMEUNIT reso=NPY_FR_ns) except? -1: """ Note we are assuming that the datetime object is timezone-naive. """ + cdef int64_t result pydatetime_to_dtstruct(val, dts) - return npy_datetimestruct_to_datetime(reso, dts) + try: + result = npy_datetimestruct_to_datetime(reso, dts) + except OverflowError as err: + attrname = npy_unit_to_attrname[reso] + raise OutOfBoundsDatetime( + f"Out of bounds {attrname} timestamp: {val}" + ) from err + + return result cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts) noexcept: @@ -278,11 +308,20 @@ dts.ps = dts.as = 0 return + cdef int64_t pydate_to_dt64( date val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=NPY_FR_ns -): +) except? -1: + cdef int64_t result pydate_to_dtstruct(val, dts) - return npy_datetimestruct_to_datetime(reso, dts) + + try: + result = npy_datetimestruct_to_datetime(reso, dts) + except OverflowError as err: + attrname = npy_unit_to_attrname[reso] + raise OutOfBoundsDatetime(f"Out of bounds {attrname} timestamp: {val}") from err + + return result cdef int string_to_dts( @@ -361,13 +400,10 @@ return values elif from_unit > to_unit: - if round_ok: - # e.g. ns -> us, so there is no risk of overflow, so we can use - # numpy's astype safely. Note there _is_ risk of truncation. - return values.astype(dtype) - else: - iresult2 = astype_round_check(values.view("i8"), from_unit, to_unit) - return iresult2.view(dtype) + iresult2 = _astype_overflowsafe_to_smaller_unit( + values.view("i8"), from_unit, to_unit, round_ok=round_ok + ) + return iresult2.view(dtype) if (values).dtype.byteorder == ">": # GH#29684 we incorrectly get OutOfBoundsDatetime if we dont swap @@ -422,7 +458,7 @@ return iresult.view(dtype) -# TODO: try to upstream this fix to numpy +# TODO(numpy#16352): try to upstream this fix to numpy def compare_mismatched_resolutions(ndarray left, ndarray right, op): """ Overflow-safe comparison of timedelta64/datetime64 with mismatched resolutions. @@ -480,11 +516,7 @@ return result -import operator - - cdef int op_to_op_code(op): - # TODO: should exist somewhere? if op is operator.eq: return Py_EQ if op is operator.ne: @@ -499,13 +531,20 @@ return Py_GT -cdef ndarray astype_round_check( +cdef ndarray _astype_overflowsafe_to_smaller_unit( ndarray i8values, NPY_DATETIMEUNIT from_unit, - NPY_DATETIMEUNIT to_unit + NPY_DATETIMEUNIT to_unit, + bint round_ok, ): - # cases with from_unit > to_unit, e.g. ns->us, raise if the conversion - # involves truncation, e.g. 
1500ns->1us + """ + Overflow-safe conversion for cases with from_unit > to_unit, e.g. ns->us. + In addition for checking for overflows (which can occur near the lower + implementation bound, see numpy#22346), this checks for truncation, + e.g. 1500ns->1us. + """ + # e.g. test_astype_ns_to_ms_near_bounds is a case with round_ok=True where + # just using numpy's astype silently fails cdef: Py_ssize_t i, N = i8values.size @@ -528,9 +567,7 @@ new_value = NPY_DATETIME_NAT else: new_value, mod = divmod(value, mult) - if mod != 0: - # TODO: avoid runtime import - from pandas._libs.tslibs.dtypes import npy_unit_to_abbrev + if not round_ok and mod != 0: from_abbrev = npy_unit_to_abbrev(from_unit) to_abbrev = npy_unit_to_abbrev(to_unit) raise ValueError( @@ -545,7 +582,6 @@ return iresult -@cython.overflowcheck(True) cdef int64_t get_conversion_factor( NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit @@ -553,40 +589,57 @@ """ Find the factor by which we need to multiply to convert from from_unit to to_unit. """ + cdef int64_t value, overflow_limit, factor if ( from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC ): raise ValueError("unit-less resolutions are not supported") if from_unit > to_unit: - raise ValueError + raise ValueError("from_unit must be <= to_unit") if from_unit == to_unit: return 1 if from_unit == NPY_DATETIMEUNIT.NPY_FR_W: - return 7 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) + factor = 7 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D: - return 24 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) + factor = 24 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h: - return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) + factor = 60 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m: - return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) + factor = 60 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) + factor = 1000 else: raise ValueError("Converting from M or Y units is not supported.") + overflow_limit = INT64_MAX // factor + if value > overflow_limit or value < -overflow_limit: + 
raise OverflowError("result would overflow") + + return factor * value + cdef int64_t convert_reso( int64_t value, @@ -595,7 +648,7 @@ bint round_ok, ) except? -1: cdef: - int64_t res_value, mult, div, mod + int64_t res_value, mult, div, mod, overflow_limit if from_reso == to_reso: return value @@ -624,9 +677,12 @@ else: # e.g. ns -> us, risk of overflow, but no risk of lossy rounding mult = get_conversion_factor(from_reso, to_reso) - with cython.overflowcheck(True): + overflow_limit = INT64_MAX // mult + if value > overflow_limit or value < -overflow_limit: # Note: caller is responsible for re-raising as OutOfBoundsTimedelta - res_value = value * mult + raise OverflowError("result would overflow") + + res_value = value * mult return res_value @@ -638,7 +694,52 @@ ) except? -1: cdef: npy_datetimestruct dts + int64_t result pandas_datetime_to_datetimestruct(value, from_unit, &dts) - check_dts_bounds(&dts, to_unit) - return npy_datetimestruct_to_datetime(to_unit, &dts) + try: + result = npy_datetimestruct_to_datetime(to_unit, &dts) + except OverflowError as err: + raise OutOfBoundsDatetime from err + + return result + + +@cython.overflowcheck(True) +cpdef cnp.ndarray add_overflowsafe(cnp.ndarray left, cnp.ndarray right): + """ + Overflow-safe addition for datetime64/timedelta64 dtypes. + + `right` may either be zero-dim or of the same shape as `left`. + """ + cdef: + Py_ssize_t N = left.size + int64_t lval, rval, res_value + ndarray iresult = cnp.PyArray_EMPTY( + left.ndim, left.shape, cnp.NPY_INT64, 0 + ) + cnp.broadcast mi = cnp.PyArray_MultiIterNew3(iresult, left, right) + + # Note: doing this try/except outside the loop improves performance over + # doing it inside the loop. + try: + for i in range(N): + # Analogous to: lval = lvalues[i] + lval = (cnp.PyArray_MultiIter_DATA(mi, 1))[0] + + # Analogous to: rval = rvalues[i] + rval = (cnp.PyArray_MultiIter_DATA(mi, 2))[0] + + if lval == NPY_DATETIME_NAT or rval == NPY_DATETIME_NAT: + res_value = NPY_DATETIME_NAT + else: + res_value = lval + rval + + # Analogous to: result[i] = res_value + (cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_value + + cnp.PyArray_MultiIter_NEXT(mi) + except OverflowError as err: + raise OverflowError("Overflow in int64 addition") from err + + return iresult diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/offsets.pxd pandas-2.2.2+dfsg/pandas/_libs/tslibs/offsets.pxd --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/offsets.pxd 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/offsets.pxd 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,7 @@ from numpy cimport int64_t -cpdef to_offset(object obj) +cpdef to_offset(object obj, bint is_period=*) cdef bint is_offset_object(object obj) cdef bint is_tick_object(object obj) diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/offsets.pyi pandas-2.2.2+dfsg/pandas/_libs/tslibs/offsets.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/offsets.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/offsets.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -33,6 +33,7 @@ class BaseOffset: n: int + normalize: bool def __init__(self, n: int = ..., normalize: bool = ...) -> None: ... def __eq__(self, other) -> bool: ... def __ne__(self, other) -> bool: ... @@ -85,7 +86,7 @@ @property def freqstr(self) -> str: ... def _apply(self, other): ... - def _apply_array(self, dtarr) -> None: ... + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: ... def rollback(self, dt: datetime) -> datetime: ... def rollforward(self, dt: datetime) -> datetime: ... 
def is_on_offset(self, dt: datetime) -> bool: ... @@ -103,11 +104,11 @@ def __reduce__(self): ... @overload -def to_offset(freq: None) -> None: ... +def to_offset(freq: None, is_period: bool = ...) -> None: ... @overload -def to_offset(freq: _BaseOffsetT) -> _BaseOffsetT: ... +def to_offset(freq: _BaseOffsetT, is_period: bool = ...) -> _BaseOffsetT: ... @overload -def to_offset(freq: timedelta | str) -> BaseOffset: ... +def to_offset(freq: timedelta | str, is_period: bool = ...) -> BaseOffset: ... class Tick(SingleConstructorOffset): _creso: int @@ -277,7 +278,10 @@ INVALID_FREQ_ERR_MSG: Literal["Invalid frequency: {0}"] def shift_months( - dtindex: npt.NDArray[np.int64], months: int, day_opt: str | None = ... + dtindex: npt.NDArray[np.int64], + months: int, + day_opt: str | None = ..., + reso: int = ..., ) -> npt.NDArray[np.int64]: ... _offset_map: dict[str, BaseOffset] diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/offsets.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/offsets.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/offsets.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/offsets.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -1,5 +1,8 @@ import re import time +import warnings + +from pandas.util._exceptions import find_stack_level cimport cython from cpython.datetime cimport ( @@ -13,6 +16,8 @@ timedelta, ) +import warnings + import_datetime() import numpy as np @@ -31,26 +36,31 @@ from pandas._libs.tslibs cimport util from pandas._libs.tslibs.util cimport ( - is_datetime64_object, is_float_object, is_integer_object, ) from pandas._libs.tslibs.ccalendar import ( MONTH_ALIASES, - MONTH_TO_CAL_NUM, int_to_weekday, weekday_to_int, ) +from pandas.util._exceptions import find_stack_level from pandas._libs.tslibs.ccalendar cimport ( + MONTH_TO_CAL_NUM, dayofweek, get_days_in_month, get_firstbday, get_lastbday, ) from pandas._libs.tslibs.conversion cimport localize_pydatetime -from pandas._libs.tslibs.dtypes cimport periods_per_day +from pandas._libs.tslibs.dtypes cimport ( + c_DEPR_ABBREVS, + c_OFFSET_DEPR_FREQSTR, + c_REVERSE_OFFSET_DEPR_FREQSTR, + periods_per_day, +) from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_NaT as NaT, @@ -91,12 +101,6 @@ return isinstance(obj, Tick) -cdef datetime _as_datetime(datetime obj): - if isinstance(obj, _Timestamp): - return obj.to_pydatetime() - return obj - - cdef bint _is_normalized(datetime dt): if dt.hour != 0 or dt.minute != 0 or dt.second != 0 or dt.microsecond != 0: # Regardless of whether dt is datetime vs Timestamp @@ -106,33 +110,6 @@ return True -def apply_wrapper_core(func, self, other) -> ndarray: - result = func(self, other) - result = np.asarray(result) - - if self.normalize: - # TODO: Avoid circular/runtime import - from .vectorized import normalize_i8_timestamps - reso = get_unit_from_dtype(other.dtype) - result = normalize_i8_timestamps(result.view("i8"), None, reso=reso) - - return result - - -def apply_array_wraps(func): - # Note: normally we would use `@functools.wraps(func)`, but this does - # not play nicely with cython class methods - def wrapper(self, other) -> np.ndarray: - # other is a DatetimeArray - result = apply_wrapper_core(func, self, other) - return result - - # do @functools.wraps(func) manually since it doesn't work on cdef funcs - wrapper.__name__ = func.__name__ - wrapper.__doc__ = func.__doc__ - return wrapper - - def apply_wraps(func): # Note: normally we would use `@functools.wraps(func)`, but this does # not play nicely with cython class methods @@ -144,11 +121,11 @@ elif ( 
isinstance(other, BaseOffset) or PyDelta_Check(other) - or util.is_timedelta64_object(other) + or cnp.is_timedelta64_object(other) ): # timedelta path return func(self, other) - elif is_datetime64_object(other) or PyDate_Check(other): + elif cnp.is_datetime64_object(other) or PyDate_Check(other): # PyDate_Check includes date, datetime other = Timestamp(other) else: @@ -479,12 +456,7 @@ return type(self)(n=1, normalize=self.normalize, **self.kwds) def __add__(self, other): - if not isinstance(self, BaseOffset): - # cython semantics; this is __radd__ - # TODO(cython3): remove this, this moved to __radd__ - return other.__add__(self) - - elif util.is_array(other) and other.dtype == object: + if util.is_array(other) and other.dtype == object: return np.array([self + x for x in other]) try: @@ -501,10 +473,6 @@ elif type(other) is type(self): return type(self)(self.n - other.n, normalize=self.normalize, **self.kwds) - elif not isinstance(self, BaseOffset): - # TODO(cython3): remove, this moved to __rsub__ - # cython semantics, this is __rsub__ - return (-other).__add__(self) else: # e.g. PeriodIndex return NotImplemented @@ -518,10 +486,6 @@ elif is_integer_object(other): return type(self)(n=other * self.n, normalize=self.normalize, **self.kwds) - elif not isinstance(self, BaseOffset): - # TODO(cython3): remove this, this moved to __rmul__ - # cython semantics, this is __rmul__ - return other.__mul__(self) return NotImplemented def __rmul__(self, other): @@ -592,10 +556,10 @@ Examples -------- >>> pd.offsets.Hour().name - 'H' + 'h' >>> pd.offsets.Hour(5).name - 'H' + 'h' """ return self.rule_code @@ -618,13 +582,13 @@ '<5 * DateOffsets>' >>> pd.offsets.BusinessHour(2).freqstr - '2BH' + '2bh' >>> pd.offsets.Nano().freqstr - 'N' + 'ns' >>> pd.offsets.Nano(-3).freqstr - '-3N' + '-3ns' """ try: code = self.rule_code @@ -653,8 +617,9 @@ def _apply(self, other): raise NotImplementedError("implemented by subclasses") - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: + # NB: _apply_array does not handle respecting `self.normalize`, the + # caller (DatetimeArray) handles that in post-processing. raise NotImplementedError( f"DateOffset subclass {type(self).__name__} " "does not have a vectorized implementation" @@ -751,7 +716,7 @@ TypeError if `int(n)` raises ValueError if n != int(n) """ - if util.is_timedelta64_object(n): + if cnp.is_timedelta64_object(n): raise TypeError(f"`n` argument must be an integer, got {type(n)}") try: nint = int(n) @@ -791,11 +756,14 @@ raise ValueError(f"{self} is a non-fixed frequency") def is_anchored(self) -> bool: - # TODO: Does this make sense for the general case? It would help - # if there were a canonical docstring for what is_anchored means. + # GH#55388 """ Return boolean whether the frequency is a unit frequency (n=1). + .. deprecated:: 2.2.0 + is_anchored is deprecated and will be removed in a future version. + Use ``obj.n == 1`` instead. 
+ Examples -------- >>> pd.DateOffset().is_anchored() @@ -803,6 +771,12 @@ >>> pd.DateOffset(2).is_anchored() False """ + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self.n == 1 # ------------------------------------------------------------------ @@ -948,9 +922,25 @@ # Since cdef classes have no __dict__, we need to override return "" + @cache_readonly + def _as_pd_timedelta(self): + return Timedelta(self) + @property def delta(self): - return self.n * Timedelta(self._nanos_inc) + warnings.warn( + # GH#55498 + f"{type(self).__name__}.delta is deprecated and will be removed in " + "a future version. Use pd.Timedelta(obj) instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + try: + return self.n * Timedelta(self._nanos_inc) + except OverflowError as err: + # GH#55503 as_unit will raise a more useful OutOfBoundsTimedelta + Timedelta(self).as_unit("ns") + raise AssertionError("This should not be reached.") @property def nanos(self) -> int64_t: @@ -973,6 +963,27 @@ return True def is_anchored(self) -> bool: + # GH#55388 + """ + Return False. + + .. deprecated:: 2.2.0 + is_anchored is deprecated and will be removed in a future version. + Use ``False`` instead. + + Examples + -------- + >>> pd.offsets.Hour().is_anchored() + False + >>> pd.offsets.Hour(2).is_anchored() + False + """ + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use False instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return False # This is identical to BaseOffset.__hash__, but has to be redefined here @@ -992,28 +1003,24 @@ except ValueError: # e.g. 
"infer" return False - return self.delta == other + return self._as_pd_timedelta == other def __ne__(self, other): return not (self == other) def __le__(self, other): - return self.delta.__le__(other) + return self._as_pd_timedelta.__le__(other) def __lt__(self, other): - return self.delta.__lt__(other) + return self._as_pd_timedelta.__lt__(other) def __ge__(self, other): - return self.delta.__ge__(other) + return self._as_pd_timedelta.__ge__(other) def __gt__(self, other): - return self.delta.__gt__(other) + return self._as_pd_timedelta.__gt__(other) def __mul__(self, other): - if not isinstance(self, Tick): - # TODO(cython3), remove this, this moved to __rmul__ - # cython semantics, this is __rmul__ - return other.__mul__(self) if is_float_object(other): n = other * self.n # If the new `n` is an integer, we can represent it using the @@ -1031,26 +1038,21 @@ def __truediv__(self, other): if not isinstance(self, Tick): # cython semantics mean the args are sometimes swapped - result = other.delta.__rtruediv__(self) + result = other._as_pd_timedelta.__rtruediv__(self) else: - result = self.delta.__truediv__(other) + result = self._as_pd_timedelta.__truediv__(other) return _wrap_timedelta_result(result) def __rtruediv__(self, other): - result = self.delta.__rtruediv__(other) + result = self._as_pd_timedelta.__rtruediv__(other) return _wrap_timedelta_result(result) def __add__(self, other): - if not isinstance(self, Tick): - # cython semantics; this is __radd__ - # TODO(cython3): remove this, this moved to __radd__ - return other.__add__(self) - if isinstance(other, Tick): if type(self) is type(other): return type(self)(self.n + other.n) else: - return delta_to_tick(self.delta + other.delta) + return delta_to_tick(self._as_pd_timedelta + other._as_pd_timedelta) try: return self._apply(other) except ApplyTypeError: @@ -1068,15 +1070,15 @@ # Timestamp can handle tz and nano sec, thus no need to use apply_wraps if isinstance(other, _Timestamp): # GH#15126 - return other + self.delta + return other + self._as_pd_timedelta elif other is NaT: return NaT - elif is_datetime64_object(other) or PyDate_Check(other): + elif cnp.is_datetime64_object(other) or PyDate_Check(other): # PyDate_Check includes date, datetime return Timestamp(other) + self - if util.is_timedelta64_object(other) or PyDelta_Check(other): - return other + self.delta + if cnp.is_timedelta64_object(other) or PyDelta_Check(other): + return other + self._as_pd_timedelta raise ApplyTypeError(f"Unhandled type: {type(other).__name__}") @@ -1155,7 +1157,7 @@ Timestamp('2022-12-09 11:00:00') """ _nanos_inc = 3600 * 1_000_000_000 - _prefix = "H" + _prefix = "h" _period_dtype_code = PeriodDtypeCode.H _creso = NPY_DATETIMEUNIT.NPY_FR_h @@ -1191,7 +1193,7 @@ Timestamp('2022-12-09 14:50:00') """ _nanos_inc = 60 * 1_000_000_000 - _prefix = "T" + _prefix = "min" _period_dtype_code = PeriodDtypeCode.T _creso = NPY_DATETIMEUNIT.NPY_FR_m @@ -1227,28 +1229,118 @@ Timestamp('2022-12-09 14:59:50') """ _nanos_inc = 1_000_000_000 - _prefix = "S" + _prefix = "s" _period_dtype_code = PeriodDtypeCode.S _creso = NPY_DATETIMEUNIT.NPY_FR_s cdef class Milli(Tick): + """ + Offset ``n`` milliseconds. + + Parameters + ---------- + n : int, default 1 + The number of milliseconds represented. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + You can use the parameter ``n`` to represent a shift of n milliseconds. 
+ + >>> from pandas.tseries.offsets import Milli + >>> ts = pd.Timestamp(2022, 12, 9, 15) + >>> ts + Timestamp('2022-12-09 15:00:00') + + >>> ts + Milli(n=10) + Timestamp('2022-12-09 15:00:00.010000') + + >>> ts - Milli(n=10) + Timestamp('2022-12-09 14:59:59.990000') + + >>> ts + Milli(n=-10) + Timestamp('2022-12-09 14:59:59.990000') + """ _nanos_inc = 1_000_000 - _prefix = "L" + _prefix = "ms" _period_dtype_code = PeriodDtypeCode.L _creso = NPY_DATETIMEUNIT.NPY_FR_ms cdef class Micro(Tick): + """ + Offset ``n`` microseconds. + + Parameters + ---------- + n : int, default 1 + The number of microseconds represented. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + You can use the parameter ``n`` to represent a shift of n microseconds. + + >>> from pandas.tseries.offsets import Micro + >>> ts = pd.Timestamp(2022, 12, 9, 15) + >>> ts + Timestamp('2022-12-09 15:00:00') + + >>> ts + Micro(n=1000) + Timestamp('2022-12-09 15:00:00.001000') + + >>> ts - Micro(n=1000) + Timestamp('2022-12-09 14:59:59.999000') + + >>> ts + Micro(n=-1000) + Timestamp('2022-12-09 14:59:59.999000') + """ _nanos_inc = 1000 - _prefix = "U" + _prefix = "us" _period_dtype_code = PeriodDtypeCode.U _creso = NPY_DATETIMEUNIT.NPY_FR_us cdef class Nano(Tick): + """ + Offset ``n`` nanoseconds. + + Parameters + ---------- + n : int, default 1 + The number of nanoseconds represented. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + You can use the parameter ``n`` to represent a shift of n nanoseconds. + + >>> from pandas.tseries.offsets import Nano + >>> ts = pd.Timestamp(2022, 12, 9, 15) + >>> ts + Timestamp('2022-12-09 15:00:00') + + >>> ts + Nano(n=1000) + Timestamp('2022-12-09 15:00:00.000001') + + >>> ts - Nano(n=1000) + Timestamp('2022-12-09 14:59:59.999999') + + >>> ts + Nano(n=-1000) + Timestamp('2022-12-09 14:59:59.999999') + """ _nanos_inc = 1 - _prefix = "N" + _prefix = "ns" _period_dtype_code = PeriodDtypeCode.N _creso = NPY_DATETIMEUNIT.NPY_FR_ns @@ -1329,7 +1421,7 @@ if self._use_relativedelta: if isinstance(other, _Timestamp): other_nanos = other.nanosecond - other = _as_datetime(other) + other = other.to_pydatetime(warn=False) if len(self.kwds) > 0: tzinfo = getattr(other, "tzinfo", None) @@ -1352,10 +1444,10 @@ else: return other + timedelta(self.n) - @apply_array_wraps - def _apply_array(self, dtarr): - reso = get_unit_from_dtype(dtarr.dtype) - dt64other = np.asarray(dtarr) + @cache_readonly + def _pd_timedelta(self) -> Timedelta: + # components of _offset that can be cast to pd.Timedelta + kwds = self.kwds relativedelta_fast = { "years", @@ -1366,31 +1458,40 @@ "minutes", "seconds", "microseconds", + "milliseconds", } # relativedelta/_offset path only valid for base DateOffset if self._use_relativedelta and set(kwds).issubset(relativedelta_fast): - - months = (kwds.get("years", 0) * 12 + kwds.get("months", 0)) * self.n - if months: - shifted = shift_months(dt64other.view("i8"), months, reso=reso) - dt64other = shifted.view(dtarr.dtype) - - weeks = kwds.get("weeks", 0) * self.n - if weeks: - delta = Timedelta(days=7 * weeks) - td = (<_Timedelta>delta)._as_creso(reso) - dt64other = dt64other + td - - timedelta_kwds = { - k: v - for k, v in kwds.items() - if k in ["days", "hours", "minutes", "seconds", "microseconds"] + td_args = { + "days", + "hours", + "minutes", + "seconds", + "microseconds", + "milliseconds" + } + td_kwds = { + key: val + for key, 
val in kwds.items() + if key in td_args } - if timedelta_kwds: - delta = Timedelta(**timedelta_kwds) - td = (<_Timedelta>delta)._as_creso(reso) - dt64other = dt64other + (self.n * td) - return dt64other + if "weeks" in kwds: + days = td_kwds.get("days", 0) + td_kwds["days"] = days + 7 * kwds["weeks"] + + if td_kwds: + delta = Timedelta(**td_kwds) + if "microseconds" in kwds: + delta = delta.as_unit("us") + elif "milliseconds" in kwds: + delta = delta.as_unit("ms") + else: + delta = delta.as_unit("s") + else: + delta = Timedelta(0).as_unit("s") + + return delta * self.n + elif not self._use_relativedelta and hasattr(self, "_offset"): # timedelta num_nano = getattr(self, "nanoseconds", 0) @@ -1399,8 +1500,14 @@ delta = Timedelta((self._offset + rem_nano) * self.n) else: delta = Timedelta(self._offset * self.n) - td = (<_Timedelta>delta)._as_creso(reso) - return dt64other + td + if "microseconds" in kwds: + delta = delta.as_unit("us") + elif "milliseconds" in kwds: + delta = delta.as_unit("ms") + else: + delta = delta.as_unit("s") + return delta + else: # relativedelta with other keywords kwd = set(kwds) - relativedelta_fast @@ -1410,6 +1517,19 @@ "applied vectorized" ) + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: + reso = get_unit_from_dtype(dtarr.dtype) + dt64other = np.asarray(dtarr) + + delta = self._pd_timedelta # may raise NotImplementedError + + kwds = self.kwds + months = (kwds.get("years", 0) * 12 + kwds.get("months", 0)) * self.n + if months: + shifted = shift_months(dt64other.view("i8"), months, reso=reso) + dt64other = shifted.view(dtarr.dtype) + return dt64other + delta + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False @@ -1487,6 +1607,28 @@ normalize : bool, default False Whether to round the result of a DateOffset addition down to the previous midnight. + weekday : int {0, 1, ..., 6}, default 0 + + A specific integer for the day of the week. + + - 0 is Monday + - 1 is Tuesday + - 2 is Wednesday + - 3 is Thursday + - 4 is Friday + - 5 is Saturday + - 6 is Sunday + + Instead Weekday type from dateutil.relativedelta can be used. + + - MO is Monday + - TU is Tuesday + - WE is Wednesday + - TH is Thursday + - FR is Friday + - SA is Saturday + - SU is Sunday. + **kwds Temporal parameter that add to or replace the offset value. @@ -1597,7 +1739,7 @@ # Older (<0.22.0) versions have offset attribute instead of _offset self._offset = state.pop("offset") - if self._prefix.startswith("C"): + if self._prefix.startswith(("C", "c")): # i.e. this is a Custom class weekmask = state.pop("weekmask") holidays = state.pop("holidays") @@ -1621,6 +1763,8 @@ The number of days represented. normalize : bool, default False Normalize start/end dates to midnight. + offset : timedelta, default timedelta(0) + Time offset to apply. 
Examples -------- @@ -1661,7 +1805,7 @@ s = td.seconds hrs = int(s / 3600) if hrs != 0: - off_str += str(hrs) + "H" + off_str += str(hrs) + "h" s -= hrs * 3600 mts = int(s / 60) if mts != 0: @@ -1787,14 +1931,12 @@ days = n + 2 return days - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: i8other = dtarr.view("i8") reso = get_unit_from_dtype(dtarr.dtype) res = self._shift_bdays(i8other, reso=reso) if self.offset: res = res.view(dtarr.dtype) + Timedelta(self.offset) - res = res.view("i8") return res def is_on_offset(self, dt: datetime) -> bool: @@ -1858,10 +2000,10 @@ '2022-12-12 06:00:00', '2022-12-12 07:00:00', '2022-12-12 10:00:00', '2022-12-12 11:00:00', '2022-12-12 15:00:00', '2022-12-12 16:00:00'], - dtype='datetime64[ns]', freq='BH') + dtype='datetime64[ns]', freq='bh') """ - _prefix = "BH" + _prefix = "bh" _anchor = 0 _attributes = tuple(["n", "normalize", "start", "end", "offset"]) _adjust_dst = False @@ -1981,7 +2123,7 @@ nb_offset = 1 else: nb_offset = -1 - if self._prefix.startswith("C"): + if self._prefix.startswith(("c")): # CustomBusinessHour return CustomBusinessDay( n=nb_offset, @@ -2141,7 +2283,7 @@ # adjust by business days first if bd != 0: - if self._prefix.startswith("C"): + if self._prefix.startswith("c"): # GH#30593 this is a Custom offset skip_bd = CustomBusinessDay( n=bd, @@ -2207,7 +2349,7 @@ dt = datetime( dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.microsecond ) - # Valid BH can be on the different BusinessDay during midnight + # Valid bh can be on the different BusinessDay during midnight # Distinguish by the time spent from previous opening time return self._is_on_offset(dt) @@ -2217,7 +2359,7 @@ """ # if self.normalize and not _is_normalized(dt): # return False - # Valid BH can be on the different BusinessDay during midnight + # Valid bh can be on the different BusinessDay during midnight # Distinguish by the time spent from previous opening time if self.n >= 0: op = self._prev_opening_time(dt) @@ -2335,8 +2477,7 @@ months = years * 12 + (self.month - other.month) return shift_month(other, months, self._day_opt) - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: reso = get_unit_from_dtype(dtarr.dtype) shifted = shift_quarters( dtarr.view("i8"), self.n, self.month, self._day_opt, modby=12, reso=reso @@ -2379,7 +2520,7 @@ _outputName = "BusinessYearEnd" _default_month = 12 - _prefix = "BA" + _prefix = "BYE" _day_opt = "business_end" @@ -2418,7 +2559,7 @@ _outputName = "BusinessYearBegin" _default_month = 1 - _prefix = "BAS" + _prefix = "BYS" _day_opt = "business_start" @@ -2463,7 +2604,7 @@ """ _default_month = 12 - _prefix = "A" + _prefix = "YE" _day_opt = "end" cdef readonly: @@ -2517,7 +2658,7 @@ """ _default_month = 1 - _prefix = "AS" + _prefix = "YS" _day_opt = "start" @@ -2565,6 +2706,13 @@ return f"{self._prefix}-{month}" def is_anchored(self) -> bool: + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1 " + f"and obj.startingMonth is not None\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self.n == 1 and self.startingMonth is not None def is_on_offset(self, dt: datetime) -> bool: @@ -2587,8 +2735,7 @@ months = qtrs * 3 - months_since return shift_month(other, months, self._day_opt) - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: reso = 
get_unit_from_dtype(dtarr.dtype) shifted = shift_quarters( dtarr.view("i8"), @@ -2638,7 +2785,7 @@ _output_name = "BusinessQuarterEnd" _default_starting_month = 3 _from_name_starting_month = 12 - _prefix = "BQ" + _prefix = "BQE" _day_opt = "business_end" @@ -2711,7 +2858,7 @@ Timestamp('2022-03-31 00:00:00') """ _default_starting_month = 3 - _prefix = "Q" + _prefix = "QE" _day_opt = "end" cdef readonly: @@ -2772,8 +2919,7 @@ n = roll_convention(other.day, self.n, compare_day) return shift_month(other, n, self._day_opt) - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: reso = get_unit_from_dtype(dtarr.dtype) shifted = shift_months(dtarr.view("i8"), self.n, self._day_opt, reso=reso) return shifted @@ -2821,7 +2967,7 @@ Timestamp('2022-01-31 00:00:00') """ _period_dtype_code = PeriodDtypeCode.M - _prefix = "M" + _prefix = "ME" _day_opt = "end" @@ -2895,7 +3041,7 @@ >>> pd.offsets.BMonthEnd().rollforward(ts) Timestamp('2022-11-30 00:00:00') """ - _prefix = "BM" + _prefix = "BME" _day_opt = "business_end" @@ -3003,10 +3149,9 @@ return shift_month(other, months, to_day) - @apply_array_wraps @cython.wraparound(False) @cython.boundscheck(False) - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: cdef: ndarray i8other = dtarr.view("i8") Py_ssize_t i, count = dtarr.size @@ -3079,9 +3224,12 @@ Parameters ---------- - n : int + n : int, default 1 + The number of months represented. normalize : bool, default False + Normalize start/end dates to midnight before generating date range. day_of_month : int, {1, 3,...,27}, default 15 + A specific integer for the day of the month. Examples -------- @@ -3103,7 +3251,7 @@ >>> pd.offsets.SemiMonthEnd().rollforward(ts) Timestamp('2022-01-15 00:00:00') """ - _prefix = "SM" + _prefix = "SME" _min_day_of_month = 1 def is_on_offset(self, dt: datetime) -> bool: @@ -3119,9 +3267,12 @@ Parameters ---------- - n : int + n : int, default 1 + The number of months represented. normalize : bool, default False - day_of_month : int, {2, 3,...,27}, default 15 + Normalize start/end dates to midnight before generating date range. + day_of_month : int, {1, 3,...,27}, default 15 + A specific integer for the day of the month. Examples -------- @@ -3148,6 +3299,10 @@ Parameters ---------- + n : int, default 1 + The number of weeks represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. weekday : int or None, default None Always generate specific day of week. 0 for Monday and 6 for Sunday. @@ -3203,6 +3358,13 @@ self._cache = state.pop("_cache", {}) def is_anchored(self) -> bool: + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1 " + f"and obj.weekday is not None\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return self.n == 1 and self.weekday is not None @apply_wraps @@ -3224,11 +3386,11 @@ return other + timedelta(weeks=k) - @apply_array_wraps - def _apply_array(self, dtarr): + def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: if self.weekday is None: td = timedelta(days=7 * self.n) - td64 = np.timedelta64(td, "ns") + unit = np.datetime_data(dtarr.dtype)[0] + td64 = np.timedelta64(td, unit) return dtarr + td64 else: reso = get_unit_from_dtype(dtarr.dtype) @@ -3398,6 +3560,9 @@ Parameters ---------- n : int, default 1 + The number of months represented. 
+ normalize : bool, default False + Normalize start/end dates to midnight before generating date range. weekday : int {0, 1, ..., 6}, default 0 A specific integer for the day of the week. @@ -3489,6 +3654,12 @@ self.variation = state.pop("variation") def is_anchored(self) -> bool: + warnings.warn( + f"{type(self).__name__}.is_anchored is deprecated and will be removed " + f"in a future version, please use \'obj.n == 1\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return ( self.n == 1 and self.startingMonth is not None and self.weekday is not None ) @@ -3540,6 +3711,9 @@ Parameters ---------- n : int + The number of fiscal years represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. weekday : int {0, 1, ..., 6}, default 0 A specific integer for the day of the week. @@ -3562,11 +3736,31 @@ - "nearest" means year end is **weekday** closest to last day of month in year. - "last" means year end is final **weekday** of the final month in fiscal year. + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + Examples -------- + In the example below the default parameters give the next 52-53 week fiscal year. + >>> ts = pd.Timestamp(2022, 1, 1) >>> ts + pd.offsets.FY5253() Timestamp('2022-01-31 00:00:00') + + By the parameter ``startingMonth`` we can specify + the month in which fiscal years end. + + >>> ts = pd.Timestamp(2022, 1, 1) + >>> ts + pd.offsets.FY5253(startingMonth=3) + Timestamp('2022-03-28 00:00:00') + + 52-53 week fiscal year can be specified by + ``weekday`` and ``variation`` parameters. + + >>> ts = pd.Timestamp(2022, 1, 1) + >>> ts + pd.offsets.FY5253(weekday=5, startingMonth=12, variation="last") + Timestamp('2022-12-31 00:00:00') """ _prefix = "RE" @@ -3720,6 +3914,9 @@ Parameters ---------- n : int + The number of business quarters represented. + normalize : bool, default False + Normalize start/end dates to midnight before generating date range. weekday : int {0, 1, ..., 6}, default 0 A specific integer for the day of the week. @@ -3745,11 +3942,32 @@ - "nearest" means year end is **weekday** closest to last day of month in year. - "last" means year end is final **weekday** of the final month in fiscal year. + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + Examples -------- + In the example below the default parameters give + the next business quarter for 52-53 week fiscal year. + >>> ts = pd.Timestamp(2022, 1, 1) >>> ts + pd.offsets.FY5253Quarter() Timestamp('2022-01-31 00:00:00') + + By the parameter ``startingMonth`` we can specify + the month in which fiscal years end. + + >>> ts = pd.Timestamp(2022, 1, 1) + >>> ts + pd.offsets.FY5253Quarter(startingMonth=3) + Timestamp('2022-03-28 00:00:00') + + Business quarters for 52-53 week fiscal year can be specified by + ``weekday`` and ``variation`` parameters. + + >>> ts = pd.Timestamp(2022, 1, 1) + >>> ts + pd.offsets.FY5253Quarter(weekday=5, startingMonth=12, variation="last") + Timestamp('2022-04-02 00:00:00') """ _prefix = "REQ" @@ -4066,9 +4284,7 @@ @property def _period_dtype_code(self): # GH#52534 - raise TypeError( - "CustomBusinessDay cannot be used with Period or PeriodDtype" - ) + raise ValueError(f"{self.base} is not supported as period frequency") _apply_array = BaseOffset._apply_array @@ -4150,6 +4366,8 @@ Start time of your custom business hour in 24h format. 
end : str, time, or list of str/time, default: "17:00" End time of your custom business hour in 24h format. + offset : timedelta, default timedelta(0) + Time offset to apply. Examples -------- @@ -4186,7 +4404,7 @@ '2022-12-12 06:00:00', '2022-12-12 07:00:00', '2022-12-12 10:00:00', '2022-12-12 11:00:00', '2022-12-12 15:00:00', '2022-12-12 16:00:00'], - dtype='datetime64[ns]', freq='CBH') + dtype='datetime64[ns]', freq='cbh') Business days can be specified by ``weekmask`` parameter. To convert the returned datetime object to its string representation @@ -4215,10 +4433,10 @@ '2022-12-15 11:00:00', '2022-12-15 12:00:00', '2022-12-16 10:00:00', '2022-12-16 11:00:00', '2022-12-16 12:00:00'], - dtype='datetime64[ns]', freq='CBH') + dtype='datetime64[ns]', freq='cbh') """ - _prefix = "CBH" + _prefix = "cbh" _anchor = 0 _attributes = tuple( ["n", "normalize", "weekmask", "holidays", "calendar", "start", "end", "offset"] @@ -4240,28 +4458,6 @@ cdef class _CustomBusinessMonth(BusinessMixin): - """ - DateOffset subclass representing custom business month(s). - - Increments between beginning/end of month dates. - - Parameters - ---------- - n : int, default 1 - The number of months represented. - normalize : bool, default False - Normalize start/end dates to midnight before generating date range. - weekmask : str, Default 'Mon Tue Wed Thu Fri' - Weekmask of valid business days, passed to ``numpy.busdaycalendar``. - holidays : list - List/array of dates to exclude from the set of valid business days, - passed to ``numpy.busdaycalendar``. - calendar : np.busdaycalendar - Calendar to integrate. - offset : timedelta, default timedelta(0) - Time offset to apply. - """ - _attributes = tuple( ["n", "normalize", "weekmask", "holidays", "calendar", "offset"] ) @@ -4337,10 +4533,124 @@ cdef class CustomBusinessMonthEnd(_CustomBusinessMonth): - _prefix = "CBM" + """ + DateOffset subclass representing custom business month(s). + + Increments between end of month dates. + + Parameters + ---------- + n : int, default 1 + The number of months represented. + normalize : bool, default False + Normalize end dates to midnight before generating date range. + weekmask : str, Default 'Mon Tue Wed Thu Fri' + Weekmask of valid business days, passed to ``numpy.busdaycalendar``. + holidays : list + List/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar``. + calendar : np.busdaycalendar + Calendar to integrate. + offset : timedelta, default timedelta(0) + Time offset to apply. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + In the example below we use the default parameters. + + >>> ts = pd.Timestamp(2022, 8, 5) + >>> ts + pd.offsets.CustomBusinessMonthEnd() + Timestamp('2022-08-31 00:00:00') + + Custom business month end can be specified by ``weekmask`` parameter. + To convert the returned datetime object to its string representation + the function strftime() is used in the next example. + + >>> import datetime as dt + >>> freq = pd.offsets.CustomBusinessMonthEnd(weekmask="Wed Thu") + >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 12, 18), + ... freq=freq).strftime('%a %d %b %Y %H:%M') + Index(['Thu 28 Jul 2022 00:00', 'Wed 31 Aug 2022 00:00', + 'Thu 29 Sep 2022 00:00', 'Thu 27 Oct 2022 00:00', + 'Wed 30 Nov 2022 00:00'], + dtype='object') + + Using NumPy business day calendar you can define custom holidays. 
+ + >>> import datetime as dt + >>> bdc = np.busdaycalendar(holidays=['2022-08-01', '2022-09-30', + ... '2022-10-31', '2022-11-01']) + >>> freq = pd.offsets.CustomBusinessMonthEnd(calendar=bdc) + >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 11, 10), freq=freq) + DatetimeIndex(['2022-07-29', '2022-08-31', '2022-09-29', '2022-10-28'], + dtype='datetime64[ns]', freq='CBME') + """ + + _prefix = "CBME" cdef class CustomBusinessMonthBegin(_CustomBusinessMonth): + """ + DateOffset subclass representing custom business month(s). + + Increments between beginning of month dates. + + Parameters + ---------- + n : int, default 1 + The number of months represented. + normalize : bool, default False + Normalize start dates to midnight before generating date range. + weekmask : str, Default 'Mon Tue Wed Thu Fri' + Weekmask of valid business days, passed to ``numpy.busdaycalendar``. + holidays : list + List/array of dates to exclude from the set of valid business days, + passed to ``numpy.busdaycalendar``. + calendar : np.busdaycalendar + Calendar to integrate. + offset : timedelta, default timedelta(0) + Time offset to apply. + + See Also + -------- + :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment. + + Examples + -------- + In the example below we use the default parameters. + + >>> ts = pd.Timestamp(2022, 8, 5) + >>> ts + pd.offsets.CustomBusinessMonthBegin() + Timestamp('2022-09-01 00:00:00') + + Custom business month start can be specified by ``weekmask`` parameter. + To convert the returned datetime object to its string representation + the function strftime() is used in the next example. + + >>> import datetime as dt + >>> freq = pd.offsets.CustomBusinessMonthBegin(weekmask="Wed Thu") + >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 12, 18), + ... freq=freq).strftime('%a %d %b %Y %H:%M') + Index(['Wed 03 Aug 2022 00:00', 'Thu 01 Sep 2022 00:00', + 'Wed 05 Oct 2022 00:00', 'Wed 02 Nov 2022 00:00', + 'Thu 01 Dec 2022 00:00'], + dtype='object') + + Using NumPy business day calendar you can define custom holidays. + + >>> import datetime as dt + >>> bdc = np.busdaycalendar(holidays=['2022-08-01', '2022-09-30', + ... 
'2022-10-31', '2022-11-01']) + >>> freq = pd.offsets.CustomBusinessMonthBegin(calendar=bdc) + >>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 11, 10), freq=freq) + DatetimeIndex(['2022-08-02', '2022-09-01', '2022-10-03', '2022-11-02'], + dtype='datetime64[ns]', freq='CBMS') + """ + _prefix = "CBMS" @@ -4357,33 +4667,33 @@ prefix_mapping = { offset._prefix: offset for offset in [ - YearBegin, # 'AS' - YearEnd, # 'A' - BYearBegin, # 'BAS' - BYearEnd, # 'BA' + YearBegin, # 'YS' + YearEnd, # 'YE' + BYearBegin, # 'BYS' + BYearEnd, # 'BYE' BusinessDay, # 'B' BusinessMonthBegin, # 'BMS' - BusinessMonthEnd, # 'BM' - BQuarterEnd, # 'BQ' + BusinessMonthEnd, # 'BME' + BQuarterEnd, # 'BQE' BQuarterBegin, # 'BQS' - BusinessHour, # 'BH' + BusinessHour, # 'bh' CustomBusinessDay, # 'C' - CustomBusinessMonthEnd, # 'CBM' + CustomBusinessMonthEnd, # 'CBME' CustomBusinessMonthBegin, # 'CBMS' - CustomBusinessHour, # 'CBH' - MonthEnd, # 'M' + CustomBusinessHour, # 'cbh' + MonthEnd, # 'ME' MonthBegin, # 'MS' - Nano, # 'N' - SemiMonthEnd, # 'SM' + Nano, # 'ns' + SemiMonthEnd, # 'SME' SemiMonthBegin, # 'SMS' Week, # 'W' - Second, # 'S' - Minute, # 'T' - Micro, # 'U' - QuarterEnd, # 'Q' + Second, # 's' + Minute, # 'min' + Micro, # 'us' + QuarterEnd, # 'QE' QuarterBegin, # 'QS' - Milli, # 'L' - Hour, # 'H' + Milli, # 'ms' + Hour, # 'h' Day, # 'D' WeekOfMonth, # 'WOM' FY5253, @@ -4398,25 +4708,22 @@ _lite_rule_alias = { "W": "W-SUN", - "Q": "Q-DEC", + "QE": "QE-DEC", - "A": "A-DEC", # YearEnd(month=12), - "Y": "A-DEC", - "AS": "AS-JAN", # YearBegin(month=1), - "YS": "AS-JAN", - "BA": "BA-DEC", # BYearEnd(month=12), - "BY": "BA-DEC", - "BAS": "BAS-JAN", # BYearBegin(month=1), - "BYS": "BAS-JAN", - - "Min": "T", - "min": "T", - "ms": "L", - "us": "U", - "ns": "N", + "YE": "YE-DEC", # YearEnd(month=12), + "YS": "YS-JAN", # YearBegin(month=1), + "BYE": "BYE-DEC", # BYearEnd(month=12), + "BYS": "BYS-JAN", # BYearBegin(month=1), + + "Min": "min", + "min": "min", + "ms": "ms", + "us": "us", + "ns": "ns", } -_dont_uppercase = {"MS", "ms"} +_dont_uppercase = _dont_uppercase = {"h", "bh", "cbh", "MS", "ms", "s"} + INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" @@ -4434,6 +4741,28 @@ -------- _get_offset('EOM') --> BMonthEnd(1) """ + if ( + name not in _lite_rule_alias + and (name.upper() in _lite_rule_alias) + and name != "ms" + ): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use \'{name.upper()}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif ( + name not in _lite_rule_alias + and (name.lower() in _lite_rule_alias) + and name != "MS" + ): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use \'{name.lower()}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) if name not in _dont_uppercase: name = name.upper() name = _lite_rule_alias.get(name, name) @@ -4450,14 +4779,16 @@ offset = klass._from_name(*split[1:]) except (ValueError, TypeError, KeyError) as err: # bad prefix or suffix - raise ValueError(INVALID_FREQ_ERR_MSG.format(name)) from err + raise ValueError(INVALID_FREQ_ERR_MSG.format( + f"{name}, failed to parse with error message: {repr(err)}") + ) # cache _offset_map[name] = offset return _offset_map[name] -cpdef to_offset(freq): +cpdef to_offset(freq, bint is_period=False): """ Return DateOffset object from string or datetime.timedelta object. 
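The alias tables above replace the old offset strings with end-anchored and lowercase-tick spellings; a hedged usage sketch (assuming pandas >= 2.2, where the previous aliases still resolve but emit FutureWarning):

import pandas as pd
from pandas.tseries.frequencies import to_offset

# New canonical spellings for end-anchored offsets and for tick units.
pd.date_range("2024-01-31", periods=3, freq="ME")   # month end, formerly "M"
pd.date_range("2024-03-31", periods=2, freq="QE")   # quarter end, formerly "Q"
print(to_offset("2h"))                               # <2 * Hours>, formerly "2H"
print(to_offset("5min"))                             # <5 * Minutes>, formerly "5T"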
@@ -4484,7 +4815,7 @@ >>> to_offset("5min") <5 * Minutes> - >>> to_offset("1D1H") + >>> to_offset("1D1h") <25 * Hours> >>> to_offset("2W") @@ -4502,19 +4833,19 @@ if freq is None: return None - if isinstance(freq, BaseOffset): - return freq - if isinstance(freq, tuple): raise TypeError( f"to_offset does not support tuples {freq}, pass as a string instead" ) + if isinstance(freq, BaseOffset): + result = freq + elif PyDelta_Check(freq): - return delta_to_tick(freq) + result = delta_to_tick(freq) elif isinstance(freq, str): - delta = None + result = None stride_sign = None try: @@ -4525,6 +4856,62 @@ tups = zip(split[0::4], split[1::4], split[2::4]) for n, (sep, stride, name) in enumerate(tups): + if not is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{c_OFFSET_DEPR_FREQSTR.get(name.upper())}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + name = c_OFFSET_DEPR_FREQSTR[name.upper()] + if (not is_period and + name != name.upper() and + name.lower() not in {"s", "ms", "us", "ns"} and + name.upper().split("-")[0].endswith(("S", "E"))): + warnings.warn( + f"\'{name}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{name.upper()}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + name = name.upper() + if is_period and name.upper() in c_REVERSE_OFFSET_DEPR_FREQSTR: + if name.upper().startswith("Y"): + raise ValueError( + f"for Period, please use \'Y{name.upper()[2:]}\' " + f"instead of \'{name}\'" + ) + if (name.upper().startswith("B") or + name.upper().startswith("S") or + name.upper().startswith("C")): + raise ValueError(INVALID_FREQ_ERR_MSG.format(name)) + else: + raise ValueError( + f"for Period, please use " + f"\'{c_REVERSE_OFFSET_DEPR_FREQSTR.get(name.upper())}\' " + f"instead of \'{name}\'" + ) + elif is_period and name.upper() in c_OFFSET_DEPR_FREQSTR: + if name.upper().startswith("A"): + warnings.warn( + f"\'{name}\' is deprecated and will be removed in a future " + f"version, please use " + f"\'{c_DEPR_ABBREVS.get(name.upper())}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if name.upper() != name: + warnings.warn( + f"\'{name}\' is deprecated and will be removed in " + f"a future version, please use \'{name.upper()}\' " + f"instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + name = c_OFFSET_DEPR_FREQSTR.get(name.upper()) + if sep != "" and not sep.isspace(): raise ValueError("separator must be spaces") prefix = _lite_rule_alias.get(name) or name @@ -4533,9 +4920,19 @@ if not stride: stride = 1 - if prefix in {"D", "H", "T", "S", "L", "U", "N"}: - # For these prefixes, we have something like "3H" or - # "2.5T", so we can construct a Timedelta with the + if prefix in c_DEPR_ABBREVS: + warnings.warn( + f"\'{prefix}\' is deprecated and will be removed " + f"in a future version, please use " + f"\'{c_DEPR_ABBREVS.get(prefix)}\' instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + prefix = c_DEPR_ABBREVS[prefix] + + if prefix in {"D", "h", "min", "s", "ms", "us", "ns"}: + # For these prefixes, we have something like "3h" or + # "2.5min", so we can construct a Timedelta with the # matching unit and get our offset from delta_to_tick td = Timedelta(1, unit=prefix) off = delta_to_tick(td) @@ -4546,22 +4943,30 @@ offset *= stride_sign else: stride = int(stride) - offset = _get_offset(name) + offset = _get_offset(prefix) offset = offset * int(np.fabs(stride) * 
stride_sign) - if delta is None: - delta = offset + if result is None: + result = offset else: - delta = delta + offset + result = result + offset except (ValueError, TypeError) as err: - raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) from err + raise ValueError(INVALID_FREQ_ERR_MSG.format( + f"{freq}, failed to parse with error message: {repr(err)}") + ) else: - delta = None + result = None - if delta is None: + if result is None: raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) - return delta + if is_period and not hasattr(result, "_period_dtype_code"): + if isinstance(freq, str): + raise ValueError(f"{result.name} is not supported as period frequency") + else: + raise ValueError(f"{freq} is not supported as period frequency") + + return result # ---------------------------------------------------------------------- diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/parsing.pxd pandas-2.2.2+dfsg/pandas/_libs/tslibs/parsing.pxd --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/parsing.pxd 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/parsing.pxd 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,5 @@ from cpython.datetime cimport datetime +from numpy cimport int64_t from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT @@ -10,5 +11,6 @@ str date_string, bint dayfirst, bint yearfirst, - NPY_DATETIMEUNIT* out_bestunit + NPY_DATETIMEUNIT* out_bestunit, + int64_t* nanos, ) diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/parsing.pyi pandas-2.2.2+dfsg/pandas/_libs/tslibs/parsing.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/parsing.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/parsing.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -23,13 +23,8 @@ values: npt.NDArray[np.object_], # object[:] parser, ) -> npt.NDArray[np.object_]: ... -def try_parse_year_month_day( - years: npt.NDArray[np.object_], # object[:] - months: npt.NDArray[np.object_], # object[:] - days: npt.NDArray[np.object_], # object[:] -) -> npt.NDArray[np.object_]: ... def guess_datetime_format( - dt_str, + dt_str: str, dayfirst: bool | None = ..., ) -> str | None: ... def concat_date_cols( diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/parsing.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/parsing.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/parsing.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/parsing.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -34,6 +34,7 @@ PyArray_IterNew, flatiter, float64_t, + int64_t, ) cnp.import_array() @@ -51,7 +52,7 @@ from pandas._config import get_option -from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS +from pandas._libs.tslibs.ccalendar cimport MONTH_TO_CAL_NUM from pandas._libs.tslibs.dtypes cimport ( attrname_to_npy_unit, npy_unit_to_attrname, @@ -272,8 +273,11 @@ # parse_datetime_string cpdef bc it has a pointer argument) cdef: NPY_DATETIMEUNIT out_bestunit + int64_t nanos - return parse_datetime_string(date_string, dayfirst, yearfirst, &out_bestunit) + return parse_datetime_string( + date_string, dayfirst, yearfirst, &out_bestunit, &nanos + ) cdef datetime parse_datetime_string( @@ -283,7 +287,8 @@ str date_string, bint dayfirst, bint yearfirst, - NPY_DATETIMEUNIT* out_bestunit + NPY_DATETIMEUNIT* out_bestunit, + int64_t* nanos, ): """ Parse datetime string, only returns datetime. 
@@ -311,7 +316,7 @@ default = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) dt = dateutil_parse(date_string, default=default, dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False, out_bestunit=out_bestunit) + ignoretz=False, out_bestunit=out_bestunit, nanos=nanos) return dt dt = _parse_delimited_date(date_string, dayfirst, out_bestunit) @@ -330,7 +335,7 @@ dt = dateutil_parse(date_string, default=_DEFAULT_DATETIME, dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False, out_bestunit=out_bestunit) + ignoretz=False, out_bestunit=out_bestunit, nanos=nanos) return dt @@ -436,7 +441,7 @@ parsed = dateutil_parse(date_string, _DEFAULT_DATETIME, dayfirst=dayfirst, yearfirst=yearfirst, - ignoretz=False, out_bestunit=&out_bestunit) + ignoretz=False, out_bestunit=&out_bestunit, nanos=NULL) reso = npy_unit_to_attrname[out_bestunit] return parsed, reso @@ -576,7 +581,7 @@ # e.g. if "Q" is not in date_string and .index raised pass - if date_len == 6 and freq == "M": + if date_len == 6 and freq == "ME": year = int(date_string[:4]) month = int(date_string[4:6]) try: @@ -623,7 +628,7 @@ raise ValueError("Quarter must be 1 <= q <= 4") if freq is not None: - mnum = c_MONTH_NUMBERS[get_rule_month(freq)] + 1 + mnum = MONTH_TO_CAL_NUM[get_rule_month(freq)] month = (mnum + (quarter - 1) * 3) % 12 + 1 if month > mnum: year -= 1 @@ -639,7 +644,8 @@ bint ignoretz, bint dayfirst, bint yearfirst, - NPY_DATETIMEUNIT* out_bestunit + NPY_DATETIMEUNIT* out_bestunit, + int64_t* nanos, ): """ lifted from dateutil to get resolution""" @@ -671,11 +677,8 @@ if reso is None: raise DateParseError(f"Unable to parse datetime string: {timestr}") - if reso == "microsecond": - if repl["microsecond"] == 0: - reso = "second" - elif repl["microsecond"] % 1000 == 0: - reso = "millisecond" + if reso == "microsecond" and repl["microsecond"] % 1000 == 0: + reso = _find_subsecond_reso(timestr, nanos=nanos) try: ret = default.replace(**repl) @@ -716,7 +719,7 @@ elif res.tzoffset: ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset)) - # dateutil can return a datetime with a tzoffset outside of (-24H, 24H) + # dateutil can return a datetime with a tzoffset outside of (-24h, 24h) # bounds, which is invalid (can be constructed, but raises if we call # str(ret)). Check that and raise here if necessary. try: @@ -745,6 +748,38 @@ return ret +cdef object _reso_pattern = re.compile(r"\d:\d{2}:\d{2}\.(?P\d+)") + +cdef _find_subsecond_reso(str timestr, int64_t* nanos): + # GH#55737 + # Check for trailing zeros in a H:M:S.f pattern + match = _reso_pattern.search(timestr) + if not match: + reso = "second" + else: + frac = match.groupdict()["frac"] + if len(frac) <= 3: + reso = "millisecond" + elif len(frac) > 6: + if frac[6:] == "0" * len(frac[6:]): + # corner case where we haven't lost any data + reso = "nanosecond" + elif len(frac) <= 9: + reso = "nanosecond" + if nanos is not NULL: + if len(frac) < 9: + frac = frac + "0" * (9 - len(frac)) + nanos[0] = int(frac[6:]) + else: + # TODO: should we warn/raise in higher-than-nano cases? 
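# A minimal standalone sketch of the sub-second probe above, assuming the
# regex's named group is "frac" (as the later groupdict()["frac"] lookup
# implies); digits past the sixth decide between microsecond and nanosecond.
import re
_demo_pattern = re.compile(r"\d:\d{2}:\d{2}\.(?P<frac>\d+)")
m = _demo_pattern.search("2023-01-01 00:00:00.123456789")
print(m.groupdict()["frac"])       # "123456789": nine digits -> nanosecond reso
print(len(m.groupdict()["frac"]))  # 9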
+ reso = "nanosecond" + if nanos is not NULL: + nanos[0] = int(frac[6:9]) + else: + reso = "microsecond" + return reso + + # ---------------------------------------------------------------------- # Parsing for type-inference @@ -766,25 +801,6 @@ return result.base # .base to access underlying ndarray -def try_parse_year_month_day( - object[:] years, object[:] months, object[:] days -) -> np.ndarray: - cdef: - Py_ssize_t i, n - object[::1] result - - n = len(years) - # TODO(cython3): Use len instead of `shape[0]` - if months.shape[0] != n or days.shape[0] != n: - raise ValueError("Length of years/months/days must all be equal") - result = np.empty(n, dtype="O") - - for i in range(n): - result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) - - return result.base # .base to access underlying ndarray - - # ---------------------------------------------------------------------- # Miscellaneous @@ -796,7 +812,7 @@ # is not practical. In fact, using this class issues warnings (xref gh-21322). # Thus, we port the class over so that both issues are resolved. # -# Copyright (c) 2017 - dateutil contributors +# Licence at LICENSES/DATEUTIL_LICENSE class _timelex: def __init__(self, instream): if getattr(instream, "decode", None) is not None: @@ -874,14 +890,24 @@ Datetime string to guess the format of. dayfirst : bool, default False If True parses dates with the day first, eg 20/01/2005 - Warning: dayfirst=True is not strict, but will prefer to parse - with day first (this is a known bug). + + .. warning:: + dayfirst=True is not strict, but will prefer to parse + with day first (this is a known bug). Returns ------- str or None : ret datetime format string (for `strftime` or `strptime`), or None if it can't be guessed. + + Examples + -------- + >>> from pandas.tseries.api import guess_datetime_format + >>> guess_datetime_format('09/13/2023') + '%m/%d/%Y' + + >>> guess_datetime_format('2023|September|13') """ cdef: NPY_DATETIMEUNIT out_bestunit @@ -925,6 +951,7 @@ yearfirst=False, ignoretz=False, out_bestunit=&out_bestunit, + nanos=NULL, ) except (ValueError, OverflowError, InvalidOperation): # In case the datetime can't be parsed, its format cannot be guessed @@ -959,7 +986,7 @@ # the offset is separated into two tokens, ex. ['+', '0900’]. # This separation will prevent subsequent processing # from correctly parsing the time zone format. - # So in addition to the format nomalization, we rejoin them here. + # So in addition to the format normalization, we rejoin them here. try: tokens[offset_index] = parsed_datetime.strftime("%z") except ValueError: diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/period.pyi pandas-2.2.2+dfsg/pandas/_libs/tslibs/period.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/period.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/period.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -63,7 +63,7 @@ def end_time(self) -> Timestamp: ... @property def start_time(self) -> Timestamp: ... - def _require_matching_freq(self, other, base: bool = ...) -> None: ... + def _require_matching_freq(self, other: BaseOffset, base: bool = ...) -> None: ... class Period(PeriodMixin): ordinal: int # int64_t @@ -87,10 +87,10 @@ @classmethod def _maybe_convert_freq(cls, freq) -> BaseOffset: ... @classmethod - def _from_ordinal(cls, ordinal: int, freq) -> Period: ... + def _from_ordinal(cls, ordinal: int, freq: BaseOffset) -> Period: ... @classmethod - def now(cls, freq: BaseOffset = ...) -> Period: ... - def strftime(self, fmt: str) -> str: ... 
+ def now(cls, freq: Frequency) -> Period: ... + def strftime(self, fmt: str | None) -> str: ... def to_timestamp( self, freq: str | BaseOffset | None = ..., diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/period.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/period.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/period.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/period.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -38,6 +38,13 @@ tm, ) +from pandas._libs.tslibs.dtypes cimport ( + c_OFFSET_TO_PERIOD_FREQSTR, + freq_to_period_freqstr, +) + +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime + # import datetime C API import_datetime() @@ -47,8 +54,7 @@ NPY_DATETIMEUNIT, NPY_FR_D, astype_overflowsafe, - check_dts_bounds, - get_timedelta64_value, + dts_to_iso_string, import_pandas_datetime, npy_datetimestruct, npy_datetimestruct_to_datetime, @@ -91,6 +97,9 @@ attrname_to_abbrevs, freq_group_code_to_npy_unit, ) + +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr + from pandas._libs.tslibs.parsing cimport quarter_to_myear from pandas._libs.tslibs.parsing import parse_datetime_string_with_reso @@ -1149,14 +1158,20 @@ cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1: cdef: npy_datetimestruct dts + int64_t result if ordinal == NPY_NAT: return NPY_NAT get_date_info(ordinal, freq, &dts) - check_dts_bounds(&dts) - return npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT.NPY_FR_ns, &dts) + try: + result = npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT.NPY_FR_ns, &dts) + except OverflowError as err: + fmt = dts_to_iso_string(&dts) + raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp: {fmt}") from err + + return result cdef str period_format(int64_t value, int freq, object fmt=None): @@ -1507,7 +1522,7 @@ int64_t[::1] result = np.empty(len(values), dtype="i8") int64_t val - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) if not isinstance(freq, BaseOffset): raise ValueError("freq not specified and cannot be inferred") @@ -1539,7 +1554,7 @@ # if we don't raise here, we'll segfault later! raise TypeError("extract_ordinals values must be object-dtype") - freqstr = Period._maybe_convert_freq(freq).freqstr + freqstr = freq_to_period_freqstr(freq.n, freq.name) for i in range(n): # Analogous to: p = values[i] @@ -1699,23 +1714,20 @@ """ return self.to_timestamp(how="end") - def _require_matching_freq(self, other, base=False): + def _require_matching_freq(self, other: BaseOffset, bint base=False): # See also arrays.period.raise_on_incompatible - if is_offset_object(other): - other_freq = other - else: - other_freq = other.freq - if base: - condition = self.freq.base != other_freq.base + condition = self.freq.base != other.base else: - condition = self.freq != other_freq + condition = self.freq != other if condition: + freqstr = freq_to_period_freqstr(self.freq.n, self.freq.name) + other_freqstr = freq_to_period_freqstr(other.n, other.name) msg = DIFFERENT_FREQ.format( cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=other_freq.freqstr, + own_freq=freqstr, + other_freq=other_freqstr, ) raise IncompatibleFrequency(msg) @@ -1744,22 +1756,13 @@ @classmethod def _maybe_convert_freq(cls, object freq) -> BaseOffset: """ - Internally we allow integer and tuple representations (for now) that - are not recognized by to_offset, so we convert them here. Also, a - Period's freq attribute must have `freq.n > 0`, which we check for here. 
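A short hedged example of the period frequency strings the helpers above produce (assuming pandas >= 2.2, where Period keeps the plain spellings while DateOffset aliases gain the "E" suffix):

import pandas as pd

# Periods still use "M", "Q", "Y"; only offset aliases were renamed.
p = pd.Period("2024-01", freq="M")
print(p.freqstr)                            # 'M'
pd.Timestamp("2024-01-31").to_period("M")   # Period('2024-01', 'M')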
+ A Period's freq attribute must have `freq.n > 0`, which we check for here. Returns ------- DateOffset """ - if isinstance(freq, int): - # We already have a dtype code - dtype = PeriodDtypeBase(freq, 1) - freq = dtype._freqstr - elif isinstance(freq, PeriodDtypeBase): - freq = freq._freqstr - - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) if freq.n <= 0: raise ValueError("Frequency must be positive, because it " @@ -1768,7 +1771,7 @@ return freq @classmethod - def _from_ordinal(cls, ordinal: int64_t, freq) -> "Period": + def _from_ordinal(cls, ordinal: int64_t, freq: BaseOffset) -> "Period": """ Fast creation from an ordinal and freq that are already validated! """ @@ -1786,7 +1789,7 @@ return False elif op == Py_NE: return True - self._require_matching_freq(other) + self._require_matching_freq(other.freq) return PyObject_RichCompareBool(self.ordinal, other.ordinal, op) elif other is NaT: return op == Py_NE @@ -1804,15 +1807,15 @@ def _add_timedeltalike_scalar(self, other) -> "Period": cdef: - int64_t inc + int64_t inc, ordinal if not self._dtype._is_tick_like(): raise IncompatibleFrequency("Input cannot be converted to " f"Period(freq={self.freqstr})") if ( - util.is_timedelta64_object(other) and - get_timedelta64_value(other) == NPY_NAT + cnp.is_timedelta64_object(other) and + cnp.get_timedelta64_value(other) == NPY_NAT ): # i.e. np.timedelta64("nat") return NaT @@ -1822,8 +1825,8 @@ except ValueError as err: raise IncompatibleFrequency("Input cannot be converted to " f"Period(freq={self.freqstr})") from err - # TODO: overflow-check here - ordinal = self.ordinal + inc + with cython.overflowcheck(True): + ordinal = self.ordinal + inc return Period(ordinal=ordinal, freq=self.freq) def _add_offset(self, other) -> "Period": @@ -1836,14 +1839,8 @@ ordinal = self.ordinal + other.n return Period(ordinal=ordinal, freq=self.freq) + @cython.overflowcheck(True) def __add__(self, other): - if not is_period_object(self): - # cython semantics; this is analogous to a call to __radd__ - # TODO(cython3): remove this - if self is NaT: - return NaT - return other.__add__(self) - if is_any_td_scalar(other): return self._add_timedeltalike_scalar(other) elif is_offset_object(other): @@ -1875,21 +1872,14 @@ return self.__add__(other) def __sub__(self, other): - if not is_period_object(self): - # cython semantics; this is like a call to __rsub__ - # TODO(cython3): remove this - if self is NaT: - return NaT - return NotImplemented - - elif ( + if ( is_any_td_scalar(other) or is_offset_object(other) or util.is_integer_object(other) ): return self + (-other) elif is_period_object(other): - self._require_matching_freq(other) + self._require_matching_freq(other.freq) # GH 23915 - mul by base freq since __add__ is agnostic of n return (self.ordinal - other.ordinal) * self.freq.base elif other is NaT: @@ -1932,8 +1922,8 @@ Examples -------- >>> period = pd.Period('2023-1-1', freq='D') - >>> period.asfreq('H') - Period('2023-01-01 23:00', 'H') + >>> period.asfreq('h') + Period('2023-01-01 23:00', 'h') """ freq = self._maybe_convert_freq(freq) how = validate_end_alias(how) @@ -1989,8 +1979,10 @@ return endpoint - np.timedelta64(1, "ns") if freq is None: - freq = self._dtype._get_to_timestamp_base() - base = freq + freq_code = self._dtype._get_to_timestamp_base() + dtype = PeriodDtypeBase(freq_code, 1) + freq = dtype._freqstr + base = freq_code else: freq = self._maybe_convert_freq(freq) base = freq._period_dtype_code @@ -2044,7 +2036,7 @@ Examples -------- - >>> p = pd.Period("2018-03-11", freq='H') 
+ >>> p = pd.Period("2018-03-11", freq='h') >>> p.day 11 """ @@ -2145,7 +2137,7 @@ Examples -------- - >>> p = pd.Period("2018-03-11", "H") + >>> p = pd.Period("2018-03-11", "h") >>> p.weekofyear 10 @@ -2176,7 +2168,7 @@ Examples -------- - >>> p = pd.Period("2018-03-11", "H") + >>> p = pd.Period("2018-03-11", "h") >>> p.week 10 @@ -2216,14 +2208,14 @@ Examples -------- - >>> per = pd.Period('2017-12-31 22:00', 'H') + >>> per = pd.Period('2017-12-31 22:00', 'h') >>> per.day_of_week 6 For periods that span over multiple days, the day at the beginning of the period is returned. - >>> per = pd.Period('2017-12-31 22:00', '4H') + >>> per = pd.Period('2017-12-31 22:00', '4h') >>> per.day_of_week 6 >>> per.start_time.day_of_week @@ -2267,14 +2259,14 @@ Examples -------- - >>> per = pd.Period('2017-12-31 22:00', 'H') + >>> per = pd.Period('2017-12-31 22:00', 'h') >>> per.dayofweek 6 For periods that span over multiple days, the day at the beginning of the period is returned. - >>> per = pd.Period('2017-12-31 22:00', '4H') + >>> per = pd.Period('2017-12-31 22:00', '4h') >>> per.dayofweek 6 >>> per.start_time.dayofweek @@ -2316,7 +2308,7 @@ Examples -------- - >>> period = pd.Period("2015-10-23", freq='H') + >>> period = pd.Period("2015-10-23", freq='h') >>> period.day_of_year 296 >>> period = pd.Period("2012-12-31", freq='D') @@ -2437,7 +2429,7 @@ Examples -------- - >>> p = pd.Period("2018-03-11", freq='H') + >>> p = pd.Period("2018-03-11", freq='h') >>> p.daysinmonth 31 """ @@ -2472,8 +2464,8 @@ Examples -------- - >>> pd.Period.now('H') # doctest: +SKIP - Period('2023-06-12 11:00', 'H') + >>> pd.Period.now('h') # doctest: +SKIP + Period('2023-06-12 11:00', 'h') """ return Period(datetime.now(), freq=freq) @@ -2487,7 +2479,8 @@ >>> pd.Period('2020-01', 'D').freqstr 'D' """ - return self._dtype._freqstr + freqstr = freq_to_period_freqstr(self.freq.n, self.freq.name) + return freqstr def __repr__(self) -> str: base = self._dtype._dtype_code @@ -2511,11 +2504,12 @@ object_state = None, self.freq, self.ordinal return (Period, object_state) - def strftime(self, fmt: str) -> str: + def strftime(self, fmt: str | None) -> str: r""" Returns a formatted string representation of the :class:`Period`. - ``fmt`` must be a string containing one or several directives. + ``fmt`` must be ``None`` or a string containing one or several directives. + When ``None``, the format will be determined from the frequency of the Period. The method recognizes the same directives as the :func:`time.strftime` function of the standard Python distribution, as well as the specific additional directives ``%f``, ``%F``, ``%q``, ``%l``, ``%u``, ``%n``. @@ -2707,13 +2701,21 @@ year=None, month=None, quarter=None, day=None, hour=None, minute=None, second=None): # freq points to a tuple (base, mult); base is one of the defined - # periods such as A, Q, etc. Every five minutes would be, e.g., - # ('T', 5) but may be passed in as a string like '5T' + # periods such as Y, Q, etc. 
Every five minutes would be, e.g., + # ('min', 5) but may be passed in as a string like '5min' # ordinal is the period offset from the gregorian proleptic epoch if freq is not None: freq = cls._maybe_convert_freq(freq) + try: + period_dtype_code = freq._period_dtype_code + except (AttributeError, TypeError): + # AttributeError: _period_dtype_code might not exist + # TypeError: _period_dtype_code might intentionally raise + raise TypeError( + f"{(type(freq).__name__)} is not supported as period frequency" + ) nanosecond = 0 if ordinal is not None and value is not None: @@ -2746,7 +2748,7 @@ elif is_period_object(value): other = value - if freq is None or freq._period_dtype_code == other._dtype._dtype_code: + if freq is None or period_dtype_code == other._dtype._dtype_code: ordinal = other.ordinal freq = other.freq else: @@ -2788,7 +2790,8 @@ if freq is None and ordinal != NPY_NAT: # Skip NaT, since it doesn't have a resolution freq = attrname_to_abbrevs[reso] - freq = to_offset(freq) + freq = c_OFFSET_TO_PERIOD_FREQSTR.get(freq, freq) + freq = to_offset(freq, is_period=True) elif PyDateTime_Check(value): dt = value @@ -2796,7 +2799,7 @@ raise ValueError("Must supply freq for datetime value") if isinstance(dt, Timestamp): nanosecond = dt.nanosecond - elif util.is_datetime64_object(value): + elif cnp.is_datetime64_object(value): dt = Timestamp(value) if freq is None: raise ValueError("Must supply freq for datetime value") @@ -2826,7 +2829,8 @@ FutureWarning, stacklevel=find_stack_level(), ) - + if ordinal == NPY_NAT: + return NaT return cls._from_ordinal(ordinal, freq) @@ -2881,7 +2885,7 @@ if freq is None: day_name = end.day_name()[:3].upper() freqstr = f"W-{day_name}" - freq = to_offset(freqstr) + freq = to_offset(freqstr, is_period=True) # We _should_ have freq.is_on_offset(end) return end, freq diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/strptime.pxd pandas-2.2.2+dfsg/pandas/_libs/tslibs/strptime.pxd --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/strptime.pxd 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/strptime.pxd 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,26 @@ +from cpython.datetime cimport ( + datetime, + tzinfo, +) from numpy cimport int64_t +from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT -cdef bint parse_today_now(str val, int64_t* iresult, bint utc) + +cdef bint parse_today_now( + str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso, bint infer_reso=* +) + + +cdef class DatetimeParseState: + cdef: + # See comments describing these attributes in the __cinit__ method + bint found_tz + bint found_naive + bint found_naive_str + bint found_other + bint creso_ever_changed + NPY_DATETIMEUNIT creso + + cdef tzinfo process_datetime(self, datetime dt, tzinfo tz, bint utc_convert) + cdef bint update_creso(self, NPY_DATETIMEUNIT item_reso) noexcept diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/strptime.pyi pandas-2.2.2+dfsg/pandas/_libs/tslibs/strptime.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/strptime.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/strptime.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,7 @@ exact: bool = ..., errors: str = ..., utc: bool = ..., + creso: int = ..., # NPY_DATETIMEUNIT ) -> tuple[np.ndarray, np.ndarray]: ... 
# first ndarray is M8[ns], second is object ndarray of tzinfo | None diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/strptime.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/strptime.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/strptime.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/strptime.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,7 @@ TimeRE, _calc_julian_from_U_or_W are vendored from the standard library, see https://github.com/python/cpython/blob/main/Lib/_strptime.py +Licence at LICENSES/PSF_LICENSE The original module-level docstring follows. Strptime-related classes and functions. @@ -24,6 +25,7 @@ timedelta, tzinfo, ) + from _strptime import ( TimeRE as _TimeRE, _getlang, @@ -46,39 +48,48 @@ from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.conversion cimport ( - convert_timezone, get_datetime64_nanos, + parse_pydatetime, +) +from pandas._libs.tslibs.dtypes cimport ( + get_supported_reso, + npy_unit_to_abbrev, + npy_unit_to_attrname, ) from pandas._libs.tslibs.nattype cimport ( NPY_NAT, + c_NaT as NaT, c_nat_strings as nat_strings, ) from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, - check_dts_bounds, + get_datetime64_unit, import_pandas_datetime, npy_datetimestruct, npy_datetimestruct_to_datetime, pydate_to_dt64, - pydatetime_to_dt64, string_to_dts, ) import_pandas_datetime() from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime + from pandas._libs.tslibs.timestamps cimport _Timestamp +from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.util cimport ( - is_datetime64_object, is_float_object, is_integer_object, ) from pandas._libs.tslibs.timestamps import Timestamp +from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single + cnp.import_array() + cdef bint format_is_iso(f: str): """ Does format match the iso8601 set that can be handled by the C parser? 
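Since the hunk above concerns the ISO-format fast path, a hedged usage sketch (assuming pandas >= 2.0, where ``format="ISO8601"`` accepts mixed ISO 8601 spellings in one call):

import pandas as pd

# All inputs are ISO 8601, just with different precision; one format handles them.
idx = pd.to_datetime(
    ["2020-01-01", "2020-01-01 03:00", "2020-01-01T04:30:00"],
    format="ISO8601",
)
print(idx)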
@@ -109,22 +120,31 @@ return format_is_iso(f) -cdef bint parse_today_now(str val, int64_t* iresult, bint utc): +cdef bint parse_today_now( + str val, int64_t* iresult, bint utc, NPY_DATETIMEUNIT creso, bint infer_reso = False +): # We delay this check for as long as possible # because it catches relatively rare cases + cdef: + _Timestamp ts - # Multiply by 1000 to convert to nanos, since these methods naturally have - # microsecond resolution if val == "now": + if infer_reso: + creso = NPY_DATETIMEUNIT.NPY_FR_us if utc: - iresult[0] = Timestamp.utcnow()._value * 1000 + ts = <_Timestamp>Timestamp.utcnow() + iresult[0] = ts._as_creso(creso)._value else: # GH#18705 make sure to_datetime("now") matches Timestamp("now") # Note using Timestamp.now() is faster than Timestamp("now") - iresult[0] = Timestamp.now()._value * 1000 + ts = <_Timestamp>Timestamp.now() + iresult[0] = ts._as_creso(creso)._value return True elif val == "today": - iresult[0] = Timestamp.today()._value * 1000 + if infer_reso: + creso = NPY_DATETIMEUNIT.NPY_FR_us + ts = <_Timestamp>Timestamp.today() + iresult[0] = ts._as_creso(creso)._value return True return False @@ -153,47 +173,7 @@ "u": 22} -def array_strptime( - ndarray[object] values, - str fmt, - bint exact=True, - errors="raise", - bint utc=False, -): - """ - Calculates the datetime structs represented by the passed array of strings - - Parameters - ---------- - values : ndarray of string-like objects - fmt : string-like regex - exact : matches must be exact if True, search if False - errors : string specifying error handling, {'raise', 'ignore', 'coerce'} - """ - - cdef: - Py_ssize_t i, n = len(values) - npy_datetimestruct dts - int64_t[::1] iresult - object[::1] result_timezone - int year, month, day, minute, hour, second, weekday, julian - int week_of_year, week_of_year_start, parse_code, ordinal - int iso_week, iso_year - int64_t us, ns - object val, group_key, ampm, found, tz - bint is_raise = errors=="raise" - bint is_ignore = errors=="ignore" - bint is_coerce = errors=="coerce" - bint found_naive = False - bint found_tz = False - tzinfo tz_out = None - bint iso_format = format_is_iso(fmt) - NPY_DATETIMEUNIT out_bestunit - int out_local = 0, out_tzoffset = 0 - bint string_to_dts_succeeded = 0 - - assert is_raise or is_ignore or is_coerce - +cdef _validate_fmt(str fmt): if "%W" in fmt or "%U" in fmt: if "%Y" not in fmt and "%y" not in fmt: raise ValueError("Cannot use '%W' or '%U' without day and year") @@ -234,6 +214,8 @@ "the ISO year directive '%G' and a weekday " "directive '%A', '%a', '%w', or '%u'.") + +cdef _get_format_regex(str fmt): global _TimeRE_cache, _regex_cache with _cache_lock: if _getlang() != _TimeRE_cache.locale_time.lang: @@ -246,23 +228,123 @@ if not format_regex: try: format_regex = _TimeRE_cache.compile(fmt) - # KeyError raised when a bad format is found; can be specified as - # \\, in which case it was a stray % but with a space after it except KeyError, err: + # KeyError raised when a bad format is found; can be specified as + # \\, in which case it was a stray % but with a space after it bad_directive = err.args[0] if bad_directive == "\\": bad_directive = "%" del err raise ValueError(f"'{bad_directive}' is a bad directive " f"in format '{fmt}'") - # IndexError only occurs when the format string is "%" except IndexError: + # IndexError only occurs when the format string is "%" raise ValueError(f"stray % in format '{fmt}'") _regex_cache[fmt] = format_regex + return format_regex, locale_time + + +cdef class DatetimeParseState: + def 
__cinit__(self, NPY_DATETIMEUNIT creso): + # found_tz and found_naive are specifically about datetime/Timestamp + # objects with and without tzinfos attached. + self.found_tz = False + self.found_naive = False + # found_naive_str refers to a string that was parsed to a timezone-naive + # datetime. + self.found_naive_str = False + self.found_other = False + + self.creso = creso + self.creso_ever_changed = False + + cdef bint update_creso(self, NPY_DATETIMEUNIT item_reso) noexcept: + # Return a bool indicating whether we bumped to a higher resolution + if self.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + self.creso = item_reso + elif item_reso > self.creso: + self.creso = item_reso + self.creso_ever_changed = True + return True + return False + + cdef tzinfo process_datetime(self, datetime dt, tzinfo tz, bint utc_convert): + if dt.tzinfo is not None: + self.found_tz = True + else: + self.found_naive = True + + if dt.tzinfo is not None: + if utc_convert: + pass + elif self.found_naive: + raise ValueError("Tz-aware datetime.datetime " + "cannot be converted to " + "datetime64 unless utc=True") + elif tz is not None and not tz_compare(tz, dt.tzinfo): + raise ValueError("Tz-aware datetime.datetime " + "cannot be converted to " + "datetime64 unless utc=True") + else: + tz = dt.tzinfo + else: + if self.found_tz and not utc_convert: + raise ValueError("Cannot mix tz-aware with " + "tz-naive values") + return tz - result = np.empty(n, dtype="M8[ns]") + +def array_strptime( + ndarray[object] values, + str fmt, + bint exact=True, + errors="raise", + bint utc=False, + NPY_DATETIMEUNIT creso=NPY_FR_ns, +): + """ + Calculates the datetime structs represented by the passed array of strings + + Parameters + ---------- + values : ndarray of string-like objects + fmt : string-like regex + exact : matches must be exact if True, search if False + errors : string specifying error handling, {'raise', 'ignore', 'coerce'} + creso : NPY_DATETIMEUNIT, default NPY_FR_ns + Set to NPY_FR_GENERIC to infer a resolution. 
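A hedged user-level illustration of what resolution inference means here (assuming pandas >= 2.0 non-nanosecond support, where the parsed unit follows the most precise component of the input string):

import pandas as pd

# The inferred unit grows with the precision of the string.
print(pd.Timestamp("2023-01-01").unit)                     # 's'
print(pd.Timestamp("2023-01-01 00:00:00.123").unit)        # 'ms'
print(pd.Timestamp("2023-01-01 00:00:00.123456789").unit)  # 'ns'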
+ """ + + cdef: + Py_ssize_t i, n = len(values) + npy_datetimestruct dts + int64_t[::1] iresult + object val + bint seen_datetime_offset = False + bint is_raise = errors=="raise" + bint is_ignore = errors=="ignore" + bint is_coerce = errors=="coerce" + bint is_same_offsets + set out_tzoffset_vals = set() + tzinfo tz, tz_out = None + bint iso_format = format_is_iso(fmt) + NPY_DATETIMEUNIT out_bestunit, item_reso + int out_local = 0, out_tzoffset = 0 + bint string_to_dts_succeeded = 0 + bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC + DatetimeParseState state = DatetimeParseState(creso) + + assert is_raise or is_ignore or is_coerce + + _validate_fmt(fmt) + format_regex, locale_time = _get_format_regex(fmt) + + if infer_reso: + abbrev = "ns" + else: + abbrev = npy_unit_to_abbrev(creso) + result = np.empty(n, dtype=f"M8[{abbrev}]") iresult = result.view("i8") - result_timezone = np.empty(n, dtype="object") dts.us = dts.ps = dts.as = 0 @@ -277,30 +359,31 @@ iresult[i] = NPY_NAT continue elif PyDateTime_Check(val): - if val.tzinfo is not None: - found_tz = True - else: - found_naive = True - tz_out = convert_timezone( - val.tzinfo, - tz_out, - found_naive, - found_tz, - utc, - ) if isinstance(val, _Timestamp): - iresult[i] = val.tz_localize(None).as_unit("ns")._value + item_reso = val._creso else: - iresult[i] = pydatetime_to_dt64(val.replace(tzinfo=None), &dts) - check_dts_bounds(&dts) - result_timezone[i] = val.tzinfo + item_reso = NPY_DATETIMEUNIT.NPY_FR_us + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + tz_out = state.process_datetime(val, tz_out, utc) + iresult[i] = parse_pydatetime(val, &dts, state.creso) continue elif PyDate_Check(val): - iresult[i] = pydate_to_dt64(val, &dts) - check_dts_bounds(&dts) + state.found_other = True + item_reso = NPY_DATETIMEUNIT.NPY_FR_s + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + iresult[i] = pydate_to_dt64(val, &dts, reso=creso) continue - elif is_datetime64_object(val): - iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) + elif cnp.is_datetime64_object(val): + state.found_other = True + item_reso = get_supported_reso(get_datetime64_unit(val)) + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + iresult[i] = get_datetime64_nanos(val, creso) continue elif ( (is_integer_object(val) or is_float_object(val)) @@ -324,20 +407,37 @@ if string_to_dts_succeeded: # No error reported by string_to_dts, pick back up # where we left off - value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) + item_reso = get_supported_reso(out_bestunit) + state.update_creso(item_reso) + if infer_reso: + creso = state.creso + try: + value = npy_datetimestruct_to_datetime(creso, &dts) + except OverflowError as err: + attrname = npy_unit_to_attrname[creso] + raise OutOfBoundsDatetime( + f"Out of bounds {attrname} timestamp: {val}" + ) from err if out_local == 1: - # Store the out_tzoffset in seconds - # since we store the total_seconds of - # dateutil.tz.tzoffset objects + nsecs = out_tzoffset * 60 + out_tzoffset_vals.add(nsecs) + seen_datetime_offset = True tz = timezone(timedelta(minutes=out_tzoffset)) - result_timezone[i] = tz - out_local = 0 - out_tzoffset = 0 + value = tz_localize_to_utc_single( + value, tz, ambiguous="raise", nonexistent=None, creso=creso + ) + else: + tz = None + out_tzoffset_vals.add("naive") + state.found_naive_str = True iresult[i] = value - check_dts_bounds(&dts) continue - if parse_today_now(val, &iresult[i], utc): + if parse_today_now(val, &iresult[i], utc, creso, 
infer_reso=infer_reso): + item_reso = NPY_DATETIMEUNIT.NPY_FR_us + state.update_creso(item_reso) + if infer_reso: + creso = state.creso continue # Some ISO formats can't be parsed by string_to_dts @@ -348,174 +448,466 @@ if not string_to_dts_succeeded and fmt == "ISO8601": raise ValueError(f"Time data {val} is not ISO8601 format") - # exact matching - if exact: - found = format_regex.match(val) - if not found: - raise ValueError( - f"time data \"{val}\" doesn't match format \"{fmt}\"" - ) - if len(val) != found.end(): - raise ValueError( - "unconverted data remains when parsing with " - f"format \"{fmt}\": \"{val[found.end():]}\"" - ) + tz = _parse_with_format( + val, fmt, exact, format_regex, locale_time, &dts, &item_reso + ) + + state.update_creso(item_reso) + if infer_reso: + creso = state.creso - # search + try: + iresult[i] = npy_datetimestruct_to_datetime(creso, &dts) + except OverflowError as err: + attrname = npy_unit_to_attrname[creso] + raise OutOfBoundsDatetime( + f"Out of bounds {attrname} timestamp: {val}" + ) from err + + if tz is not None: + ival = iresult[i] + iresult[i] = tz_localize_to_utc_single( + ival, tz, ambiguous="raise", nonexistent=None, creso=creso + ) + nsecs = (ival - iresult[i]) + if creso == NPY_FR_ns: + nsecs = nsecs // 10**9 + elif creso == NPY_DATETIMEUNIT.NPY_FR_us: + nsecs = nsecs // 10**6 + elif creso == NPY_DATETIMEUNIT.NPY_FR_ms: + nsecs = nsecs // 10**3 + + out_tzoffset_vals.add(nsecs) + seen_datetime_offset = True else: - found = format_regex.search(val) - if not found: - raise ValueError( - f"time data \"{val}\" doesn't match format \"{fmt}\"" - ) + state.found_naive_str = True + tz = None + out_tzoffset_vals.add("naive") - iso_year = -1 - year = 1900 - month = day = 1 - hour = minute = second = ns = us = 0 - tz = None - # Default to -1 to signify that values not known; not critical to have, - # though - iso_week = week_of_year = -1 - week_of_year_start = -1 - # weekday and julian defaulted to -1 so as to signal need to calculate - # values - weekday = julian = -1 - found_dict = found.groupdict() - for group_key in found_dict.iterkeys(): - # Directives not explicitly handled below: - # c, x, X - # handled by making out of other directives - # U, W - # worthless without day of the week - parse_code = _parse_code_table[group_key] - - if parse_code == 0: - year = int(found_dict["y"]) - # Open Group specification for strptime() states that a %y - # value in the range of [00, 68] is in the century 2000, while - # [69,99] is in the century 1900 - if year <= 68: - year += 2000 - else: - year += 1900 - elif parse_code == 1: - year = int(found_dict["Y"]) - elif parse_code == 2: - month = int(found_dict["m"]) - # elif group_key == 'B': - elif parse_code == 3: - month = locale_time.f_month.index(found_dict["B"].lower()) - # elif group_key == 'b': - elif parse_code == 4: - month = locale_time.a_month.index(found_dict["b"].lower()) - # elif group_key == 'd': - elif parse_code == 5: - day = int(found_dict["d"]) - # elif group_key == 'H': - elif parse_code == 6: - hour = int(found_dict["H"]) - elif parse_code == 7: - hour = int(found_dict["I"]) - ampm = found_dict.get("p", "").lower() - # If there was no AM/PM indicator, we'll treat this like AM - if ampm in ("", locale_time.am_pm[0]): - # We're in AM so the hour is correct unless we're - # looking at 12 midnight. - # 12 midnight == 12 AM == hour 0 - if hour == 12: - hour = 0 - elif ampm == locale_time.am_pm[1]: - # We're in PM so we need to add 12 to the hour unless - # we're looking at 12 noon. 
- # 12 noon == 12 PM == hour 12 - if hour != 12: - hour += 12 - elif parse_code == 8: - minute = int(found_dict["M"]) - elif parse_code == 9: - second = int(found_dict["S"]) - elif parse_code == 10: - s = found_dict["f"] - # Pad to always return nanoseconds - s += "0" * (9 - len(s)) - us = long(s) - ns = us % 1000 - us = us // 1000 - elif parse_code == 11: - weekday = locale_time.f_weekday.index(found_dict["A"].lower()) - elif parse_code == 12: - weekday = locale_time.a_weekday.index(found_dict["a"].lower()) - elif parse_code == 13: - weekday = int(found_dict["w"]) - if weekday == 0: - weekday = 6 - else: - weekday -= 1 - elif parse_code == 14: - julian = int(found_dict["j"]) - elif parse_code == 15 or parse_code == 16: - week_of_year = int(found_dict[group_key]) - if group_key == "U": - # U starts week on Sunday. - week_of_year_start = 6 - else: - # W starts week on Monday. - week_of_year_start = 0 - elif parse_code == 17: - tz = pytz.timezone(found_dict["Z"]) - elif parse_code == 19: - tz = parse_timezone_directive(found_dict["z"]) - elif parse_code == 20: - iso_year = int(found_dict["G"]) - elif parse_code == 21: - iso_week = int(found_dict["V"]) - elif parse_code == 22: - weekday = int(found_dict["u"]) - weekday -= 1 - - # If we know the wk of the year and what day of that wk, we can figure - # out the Julian day of the year. - if julian == -1 and weekday != -1: - if week_of_year != -1: - week_starts_Mon = week_of_year_start == 0 - julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, - week_starts_Mon) - elif iso_year != -1 and iso_week != -1: - year, julian = _calc_julian_from_V(iso_year, iso_week, - weekday + 1) - # Cannot pre-calculate date() since can change in Julian - # calculation and thus could have different value for the day of the wk - # calculation. - if julian == -1: - # Need to add 1 to result since first day of the year is 1, not - # 0. - ordinal = date(year, month, day).toordinal() - julian = ordinal - date(year, 1, 1).toordinal() + 1 + except ValueError as ex: + ex.args = ( + f"{str(ex)}, at position {i}. You might want to try:\n" + " - passing `format` if your strings have a consistent format;\n" + " - passing `format='ISO8601'` if your strings are " + "all ISO8601 but not necessarily in exactly the same format;\n" + " - passing `format='mixed'`, and the format will be " + "inferred for each element individually. " + "You might want to use `dayfirst` alongside this.", + ) + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise + return values, None + + if seen_datetime_offset and not utc: + is_same_offsets = len(out_tzoffset_vals) == 1 + if not is_same_offsets or (state.found_naive or state.found_other): + result2 = _array_strptime_object_fallback( + values, fmt=fmt, exact=exact, errors=errors, utc=utc + ) + return result2, None + elif tz_out is not None: + # GH#55693 + tz_offset = out_tzoffset_vals.pop() + tz_out2 = timezone(timedelta(seconds=tz_offset)) + if not tz_compare(tz_out, tz_out2): + # e.g. test_to_datetime_mixed_offsets_with_utc_false_deprecated + result2 = _array_strptime_object_fallback( + values, fmt=fmt, exact=exact, errors=errors, utc=utc + ) + return result2, None + # e.g. test_guess_datetime_format_with_parseable_formats + else: + # e.g. 
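A small stdlib illustration of the two-digit-year rule annotated in this parser (the Open Group rule: 00-68 map to the 2000s, 69-99 to the 1900s):

from datetime import datetime

# Two-digit years 00-68 are interpreted as 20xx, 69-99 as 19xx.
print(datetime.strptime("May 04", "%b %y").year)  # 2004
print(datetime.strptime("May 69", "%b %y").year)  # 1969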
test_to_datetime_iso8601_with_timezone_valid + tz_offset = out_tzoffset_vals.pop() + tz_out = timezone(timedelta(seconds=tz_offset)) + elif not utc: + if tz_out and (state.found_other or state.found_naive_str): + # found_other indicates a tz-naive int, float, dt64, or date + result2 = _array_strptime_object_fallback( + values, fmt=fmt, exact=exact, errors=errors, utc=utc + ) + return result2, None + + if infer_reso: + if state.creso_ever_changed: + # We encountered mismatched resolutions, need to re-parse with + # the correct one. + return array_strptime( + values, + fmt=fmt, + exact=exact, + errors=errors, + utc=utc, + creso=state.creso, + ) + elif state.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + # i.e. we never encountered anything non-NaT, default to "s". This + # ensures that insert and concat-like operations with NaT + # do not upcast units + result = iresult.base.view("M8[s]") + else: + # Otherwise we can use the single reso that we encountered and avoid + # a second pass. + abbrev = npy_unit_to_abbrev(state.creso) + result = iresult.base.view(f"M8[{abbrev}]") + return result, tz_out + + +cdef tzinfo _parse_with_format( + str val, + str fmt, + bint exact, + format_regex, + locale_time, + npy_datetimestruct* dts, + NPY_DATETIMEUNIT* item_reso, +): + # Based on https://github.com/python/cpython/blob/main/Lib/_strptime.py#L293 + cdef: + int year, month, day, minute, hour, second, weekday, julian + int week_of_year, week_of_year_start, parse_code, ordinal + int iso_week, iso_year + int64_t us, ns + object found + tzinfo tz + dict found_dict + str group_key, ampm + + if exact: + # exact matching + found = format_regex.match(val) + if not found: + raise ValueError( + f"time data \"{val}\" doesn't match format \"{fmt}\"" + ) + if len(val) != found.end(): + raise ValueError( + "unconverted data remains when parsing with " + f"format \"{fmt}\": \"{val[found.end():]}\"" + ) + + else: + # search + found = format_regex.search(val) + if not found: + raise ValueError( + f"time data \"{val}\" doesn't match format \"{fmt}\"" + ) + + item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_s + + iso_year = -1 + year = 1900 + month = day = 1 + hour = minute = second = ns = us = 0 + tz = None + # Default to -1 to signify that values not known; not critical to have, + # though + iso_week = week_of_year = -1 + week_of_year_start = -1 + # weekday and julian defaulted to -1 so as to signal need to calculate + # values + weekday = julian = -1 + found_dict = found.groupdict() + for group_key in found_dict.iterkeys(): + # Directives not explicitly handled below: + # c, x, X + # handled by making out of other directives + # U, W + # worthless without day of the week + parse_code = _parse_code_table[group_key] + + if parse_code == 0: + year = int(found_dict["y"]) + # Open Group specification for strptime() states that a %y + # value in the range of [00, 68] is in the century 2000, while + # [69,99] is in the century 1900 + if year <= 68: + # e.g. val='May 04'; fmt='%b %y' + year += 2000 + else: + year += 1900 + # TODO: not reached in tests 2023-10-28 + elif parse_code == 1: + # e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S' + year = int(found_dict["Y"]) + elif parse_code == 2: + # e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S' + month = int(found_dict["m"]) + # elif group_key == 'B': + elif parse_code == 3: + # e.g. val='30/December/2011'; fmt='%d/%B/%Y' + month = locale_time.f_month.index(found_dict["B"].lower()) + # elif group_key == 'b': + elif parse_code == 4: + # e.g. 
val='30/Dec/2011 00:00:00'; fmt='%d/%b/%Y %H:%M:%S' + month = locale_time.a_month.index(found_dict["b"].lower()) + # elif group_key == 'd': + elif parse_code == 5: + # e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S' + day = int(found_dict["d"]) + # elif group_key == 'H': + elif parse_code == 6: + # e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S' + hour = int(found_dict["H"]) + elif parse_code == 7: + hour = int(found_dict["I"]) + ampm = found_dict.get("p", "").lower() + # If there was no AM/PM indicator, we'll treat this like AM + if ampm in ("", locale_time.am_pm[0]): + # We're in AM so the hour is correct unless we're + # looking at 12 midnight. + # 12 midnight == 12 AM == hour 0 + if hour == 12: + hour = 0 + # TODO: not reached in tests 2023-10-28; the implicit `else` + # branch is tested with e.g. + # val='Tuesday 24 Aug 2021 01:30:48 AM' + # fmt='%A %d %b %Y %I:%M:%S %p' + elif ampm == locale_time.am_pm[1]: + # We're in PM so we need to add 12 to the hour unless + # we're looking at 12 noon. + # 12 noon == 12 PM == hour 12 + if hour != 12: + # e.g. val='01/10/2010 08:14 PM'; fmt='%m/%d/%Y %I:%M %p' + hour += 12 + # TODO: the implicit `else` branch is not tested 2023-10-28 + # TODO: the implicit `else` branch is not reached 2023-10-28; possible? + elif parse_code == 8: + # e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S' + minute = int(found_dict["M"]) + elif parse_code == 9: + # e.g. val='17-10-2010 07:15:30'; fmt='%d-%m-%Y %H:%M:%S' + second = int(found_dict["S"]) + elif parse_code == 10: + # e.g. val='10:10:10.100'; fmt='%H:%M:%S.%f' + s = found_dict["f"] + if len(s) <= 3: + item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_ms + elif len(s) <= 6: + item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_us + else: + item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_ns + # Pad to always return nanoseconds + s += "0" * (9 - len(s)) + us = int(s) + ns = us % 1000 + us = us // 1000 + elif parse_code == 11: + # e.g val='Tuesday 24 Aug 2021 01:30:48 AM'; fmt='%A %d %b %Y %I:%M:%S %p' + weekday = locale_time.f_weekday.index(found_dict["A"].lower()) + elif parse_code == 12: + # e.g. val='Tue 24 Aug 2021 01:30:48 AM'; fmt='%a %d %b %Y %I:%M:%S %p' + weekday = locale_time.a_weekday.index(found_dict["a"].lower()) + elif parse_code == 13: + weekday = int(found_dict["w"]) + if weekday == 0: + # e.g. val='2013020'; fmt='%Y%U%w' + weekday = 6 + else: + # e.g. val='2009324'; fmt='%Y%W%w' + weekday -= 1 + elif parse_code == 14: + # e.g. val='2009164202000'; fmt='%Y%j%H%M%S' + julian = int(found_dict["j"]) + elif parse_code == 15 or parse_code == 16: + week_of_year = int(found_dict[group_key]) + if group_key == "U": + # e.g. val='2013020'; fmt='%Y%U%w' + # U starts week on Sunday. + week_of_year_start = 6 + else: + # e.g. val='2009324'; fmt='%Y%W%w' + # W starts week on Monday. + week_of_year_start = 0 + elif parse_code == 17: + # e.g. val='2011-12-30T00:00:00.000000UTC'; fmt='%Y-%m-%dT%H:%M:%S.%f%Z' + tz = pytz.timezone(found_dict["Z"]) + elif parse_code == 19: + # e.g. val='March 1, 2018 12:00:00+0400'; fmt='%B %d, %Y %H:%M:%S%z' + tz = parse_timezone_directive(found_dict["z"]) + elif parse_code == 20: + # e.g. val='2015-1-7'; fmt='%G-%V-%u' + iso_year = int(found_dict["G"]) + elif parse_code == 21: + # e.g. val='2015-1-7'; fmt='%G-%V-%u' + iso_week = int(found_dict["V"]) + elif parse_code == 22: + # e.g. val='2015-1-7'; fmt='%G-%V-%u' + weekday = int(found_dict["u"]) + weekday -= 1 + + # If we know the wk of the year and what day of that wk, we can figure + # out the Julian day of the year. 
+ if julian == -1 and weekday != -1: + if week_of_year != -1: + # e.g. val='2013020'; fmt='%Y%U%w' + week_starts_Mon = week_of_year_start == 0 + julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, + week_starts_Mon) + elif iso_year != -1 and iso_week != -1: + # e.g. val='2015-1-7'; fmt='%G-%V-%u' + year, julian = _calc_julian_from_V(iso_year, iso_week, + weekday + 1) + # else: + # # e.g. val='Thu Sep 25 2003'; fmt='%a %b %d %Y' + # pass + + # Cannot pre-calculate date() since can change in Julian + # calculation and thus could have different value for the day of the wk + # calculation. + if julian == -1: + # Need to add 1 to result since first day of the year is 1, not + # 0. + # We don't actually need ordinal/julian here, but need to raise + # on e.g. val='2015-04-31'; fmt='%Y-%m-%d' + ordinal = date(year, month, day).toordinal() + julian = ordinal - date(year, 1, 1).toordinal() + 1 + else: + # Assume that if they bothered to include Julian day it will + # be accurate. + datetime_result = date.fromordinal( + (julian - 1) + date(year, 1, 1).toordinal()) + year = datetime_result.year + month = datetime_result.month + day = datetime_result.day + if weekday == -1: + # We don't actually use weekday here, but need to do this in order to + # raise on y/m/d combinations + # TODO: not reached in tests 2023-10-28; necessary? + weekday = date(year, month, day).weekday() + + dts.year = year + dts.month = month + dts.day = day + dts.hour = hour + dts.min = minute + dts.sec = second + dts.us = us + dts.ps = ns * 1000 + return tz + + +def _array_strptime_object_fallback( + ndarray[object] values, + str fmt, + bint exact=True, + errors="raise", + bint utc=False, +): + + cdef: + Py_ssize_t i, n = len(values) + npy_datetimestruct dts + int64_t iresult + object val + tzinfo tz + bint is_raise = errors=="raise" + bint is_ignore = errors=="ignore" + bint is_coerce = errors=="coerce" + bint iso_format = format_is_iso(fmt) + NPY_DATETIMEUNIT creso, out_bestunit, item_reso + int out_local = 0, out_tzoffset = 0 + bint string_to_dts_succeeded = 0 + + assert is_raise or is_ignore or is_coerce + + item_reso = NPY_DATETIMEUNIT.NPY_FR_GENERIC + format_regex, locale_time = _get_format_regex(fmt) + + result = np.empty(n, dtype=object) + + dts.us = dts.ps = dts.as = 0 + + for i in range(n): + val = values[i] + try: + if isinstance(val, str): + if len(val) == 0 or val in nat_strings: + result[i] = NaT + continue + elif checknull_with_nat_and_na(val): + result[i] = NaT + continue + elif PyDateTime_Check(val): + result[i] = Timestamp(val) + continue + elif PyDate_Check(val): + result[i] = Timestamp(val) + continue + elif cnp.is_datetime64_object(val): + result[i] = Timestamp(val) + continue + elif ( + (is_integer_object(val) or is_float_object(val)) + and (val != val or val == NPY_NAT) + ): + result[i] = NaT + continue else: - # Assume that if they bothered to include Julian day it will - # be accurate. 
- datetime_result = date.fromordinal( - (julian - 1) + date(year, 1, 1).toordinal()) - year = datetime_result.year - month = datetime_result.month - day = datetime_result.day - if weekday == -1: - weekday = date(year, month, day).weekday() - - dts.year = year - dts.month = month - dts.day = day - dts.hour = hour - dts.min = minute - dts.sec = second - dts.us = us - dts.ps = ns * 1000 + val = str(val) + + if fmt == "ISO8601": + string_to_dts_succeeded = not string_to_dts( + val, &dts, &out_bestunit, &out_local, + &out_tzoffset, False, None, False + ) + elif iso_format: + string_to_dts_succeeded = not string_to_dts( + val, &dts, &out_bestunit, &out_local, + &out_tzoffset, False, fmt, exact + ) + if string_to_dts_succeeded: + # No error reported by string_to_dts, pick back up + # where we left off + creso = get_supported_reso(out_bestunit) + try: + value = npy_datetimestruct_to_datetime(creso, &dts) + except OverflowError as err: + raise OutOfBoundsDatetime( + f"Out of bounds nanosecond timestamp: {val}" + ) from err + if out_local == 1: + tz = timezone(timedelta(minutes=out_tzoffset)) + value = tz_localize_to_utc_single( + value, tz, ambiguous="raise", nonexistent=None, creso=creso + ) + else: + tz = None + ts = Timestamp._from_value_and_reso(value, creso, tz) + result[i] = ts + continue - iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) - check_dts_bounds(&dts) + if parse_today_now(val, &iresult, utc, NPY_FR_ns): + result[i] = Timestamp(val) + continue - result_timezone[i] = tz + # Some ISO formats can't be parsed by string_to_dts + # For example, 6-digit YYYYMD. So, if there's an error, and a format + # was specified, then try the string-matching code below. If the format + # specified was 'ISO8601', then we need to error, because + # only string_to_dts handles mixed ISO8601 formats. + if not string_to_dts_succeeded and fmt == "ISO8601": + raise ValueError(f"Time data {val} is not ISO8601 format") + + tz = _parse_with_format( + val, fmt, exact, format_regex, locale_time, &dts, &item_reso + ) + try: + iresult = npy_datetimestruct_to_datetime(item_reso, &dts) + except OverflowError as err: + raise OutOfBoundsDatetime( + f"Out of bounds nanosecond timestamp: {val}" + ) from err + if tz is not None: + iresult = tz_localize_to_utc_single( + iresult, tz, ambiguous="raise", nonexistent=None, creso=item_reso + ) + ts = Timestamp._from_value_and_reso(iresult, item_reso, tz) + result[i] = ts except (ValueError, OutOfBoundsDatetime) as ex: ex.args = ( @@ -528,13 +920,26 @@ "You might want to use `dayfirst` alongside this.", ) if is_coerce: - iresult[i] = NPY_NAT + result[i] = NaT continue elif is_raise: raise - return values, [] + return values + + import warnings + + from pandas.util._exceptions import find_stack_level + warnings.warn( + "In a future version of pandas, parsing datetimes with mixed time " + "zones will raise an error unless `utc=True`. Please specify `utc=True` " + "to opt in to the new behaviour and silence this warning. 
" + "To create a `Series` with mixed offsets and `object` dtype, " + "please use `apply` and `datetime.datetime.strptime`", + FutureWarning, + stacklevel=find_stack_level(), + ) - return result, result_timezone.base + return result class TimeRE(_TimeRE): @@ -672,13 +1077,14 @@ Notes ----- This is essentially similar to the cpython implementation - https://github.com/python/cpython/blob/master/Lib/_strptime.py#L457-L479 + https://github.com/python/cpython/blob/546cab84448b892c92e68d9c1a3d3b58c13b3463/Lib/_strptime.py#L437-L454 + Licence at LICENSES/PSF_LICENSE """ cdef: int hours, minutes, seconds, pad_number, microseconds int total_minutes - object gmtoff_remainder, gmtoff_remainder_padding + str gmtoff_remainder, gmtoff_remainder_padding if z == "Z": return timezone(timedelta(0)) diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/timedeltas.pxd pandas-2.2.2+dfsg/pandas/_libs/tslibs/timedeltas.pxd --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/timedeltas.pxd 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/timedeltas.pxd 2024-04-10 17:42:52.000000000 +0000 @@ -4,6 +4,7 @@ from .np_datetime cimport NPY_DATETIMEUNIT +cpdef int64_t get_unit_for_round(freq, NPY_DATETIMEUNIT creso) except? -1 # Exposed for tslib, not intended for outside use. cpdef int64_t delta_to_nanoseconds( delta, NPY_DATETIMEUNIT reso=*, bint round_ok=* diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/timedeltas.pyi pandas-2.2.2+dfsg/pandas/_libs/tslibs/timedeltas.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/timedeltas.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/timedeltas.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -14,6 +14,7 @@ Tick, ) from pandas._typing import ( + Frequency, Self, npt, ) @@ -67,8 +68,10 @@ ] _S = TypeVar("_S", bound=timedelta) +def get_unit_for_round(freq, creso: int) -> int: ... +def disallow_ambiguous_unit(unit: str | None) -> None: ... def ints_to_pytimedelta( - arr: npt.NDArray[np.timedelta64], + m8values: npt.NDArray[np.timedelta64], box: bool = ..., ) -> npt.NDArray[np.object_]: ... def array_to_timedelta64( @@ -117,9 +120,9 @@ @property def asm8(self) -> np.timedelta64: ... # TODO: round/floor/ceil could return NaT? - def round(self, freq: str) -> Self: ... - def floor(self, freq: str) -> Self: ... - def ceil(self, freq: str) -> Self: ... + def round(self, freq: Frequency) -> Self: ... + def floor(self, freq: Frequency) -> Self: ... + def ceil(self, freq: Frequency) -> Self: ... @property def resolution_string(self) -> str: ... def __add__(self, other: timedelta) -> Timedelta: ... @@ -162,8 +165,10 @@ def __gt__(self, other: timedelta) -> bool: ... def __hash__(self) -> int: ... def isoformat(self) -> str: ... - def to_numpy(self) -> np.timedelta64: ... - def view(self, dtype: npt.DTypeLike = ...) -> object: ... + def to_numpy( + self, dtype: npt.DTypeLike = ..., copy: bool = False + ) -> np.timedelta64: ... + def view(self, dtype: npt.DTypeLike) -> object: ... @property def unit(self) -> str: ... def as_unit(self, unit: str, round_ok: bool = ...) -> Timedelta: ... 
diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/timedeltas.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/timedeltas.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/timedeltas.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/timedeltas.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,8 @@ import collections import warnings +from pandas.util._exceptions import find_stack_level + cimport cython from cpython.object cimport ( Py_EQ, @@ -41,6 +43,7 @@ precision_from_unit, ) from pandas._libs.tslibs.dtypes cimport ( + c_DEPR_ABBREVS, get_supported_reso, is_supported_unit, npy_unit_to_abbrev, @@ -58,7 +61,6 @@ cmp_scalar, convert_reso, get_datetime64_unit, - get_timedelta64_value, get_unit_from_dtype, import_pandas_datetime, npy_datetimestruct, @@ -77,10 +79,8 @@ from pandas._libs.tslibs.offsets cimport is_tick_object from pandas._libs.tslibs.util cimport ( is_array, - is_datetime64_object, is_float_object, is_integer_object, - is_timedelta64_object, ) from pandas._libs.tslibs.fields import ( @@ -124,7 +124,6 @@ "minute": "m", "min": "m", "minutes": "m", - "t": "m", "s": "s", "seconds": "s", "sec": "s", @@ -134,20 +133,17 @@ "millisecond": "ms", "milli": "ms", "millis": "ms", - "l": "ms", "us": "us", "microseconds": "us", "microsecond": "us", "µs": "us", "micro": "us", "micros": "us", - "u": "us", "ns": "ns", "nanoseconds": "ns", "nano": "ns", "nanos": "ns", "nanosecond": "ns", - "n": "ns", } _no_input = object() @@ -255,14 +251,14 @@ n = delta._value in_reso = delta._creso - elif is_timedelta64_object(delta): + elif cnp.is_timedelta64_object(delta): in_reso = get_datetime64_unit(delta) if in_reso == NPY_DATETIMEUNIT.NPY_FR_Y or in_reso == NPY_DATETIMEUNIT.NPY_FR_M: raise ValueError( "delta_to_nanoseconds does not support Y or M units, " "as their duration in nanoseconds is ambiguous." ) - n = get_timedelta64_value(delta) + n = cnp.get_timedelta64_value(delta) elif PyDelta_Check(delta): in_reso = NPY_DATETIMEUNIT.NPY_FR_us @@ -305,18 +301,16 @@ cdef: NPY_DATETIMEUNIT td64_unit int64_t td64_value, mult - str unitstr td64_unit = get_datetime64_unit(ts) if ( td64_unit != NPY_DATETIMEUNIT.NPY_FR_ns and td64_unit != NPY_DATETIMEUNIT.NPY_FR_GENERIC ): - unitstr = npy_unit_to_abbrev(td64_unit) - td64_value = get_timedelta64_value(ts) + td64_value = cnp.get_timedelta64_value(ts) - mult = precision_from_unit(unitstr)[0] + mult = precision_from_unit(td64_unit)[0] try: # NB: cython#1381 this cannot be *= td64_value = td64_value * mult @@ -351,7 +345,7 @@ ts = ts.as_unit("ns").asm8 else: ts = np.timedelta64(ts._value, "ns") - elif is_timedelta64_object(ts): + elif cnp.is_timedelta64_object(ts): ts = ensure_td64ns(ts) elif is_integer_object(ts): if ts == NPY_NAT: @@ -371,7 +365,7 @@ if PyDelta_Check(ts): ts = np.timedelta64(delta_to_nanoseconds(ts), "ns") - elif not is_timedelta64_object(ts): + elif not cnp.is_timedelta64_object(ts): raise TypeError(f"Invalid type for timedelta scalar: {type(ts)}") return ts.astype("timedelta64[ns]") @@ -485,7 +479,7 @@ See array_to_timedelta64. 
""" try: - return get_timedelta64_value(convert_to_timedelta64(item, parsed_unit)) + return cnp.get_timedelta64_value(convert_to_timedelta64(item, parsed_unit)) except ValueError as err: if errors == "coerce": return NPY_NAT @@ -505,9 +499,9 @@ """ cdef: - unicode c + str c bint neg = 0, have_dot = 0, have_value = 0, have_hhmmss = 0 - object current_unit = None + str current_unit = None int64_t result = 0, m = 0, r list number = [], frac = [], unit = [] @@ -725,6 +719,15 @@ return "ns" elif unit == "M": return unit + elif unit in c_DEPR_ABBREVS: + warnings.warn( + f"\'{unit}\' is deprecated and will be removed in a " + f"future version. Please use \'{c_DEPR_ABBREVS.get(unit)}\' " + f"instead of \'{unit}\'.", + FutureWarning, + stacklevel=find_stack_level(), + ) + unit = c_DEPR_ABBREVS[unit] try: return timedelta_abbrevs[unit.lower()] except KeyError: @@ -759,7 +762,7 @@ if other is NaT: return NaT - elif is_datetime64_object(other) or ( + elif cnp.is_datetime64_object(other) or ( PyDateTime_Check(other) and not isinstance(other, ABCTimestamp) ): # this case is for a datetime object that is specifically @@ -819,6 +822,14 @@ # ---------------------------------------------------------------------- # Timedelta Construction +cpdef disallow_ambiguous_unit(unit): + if unit in {"Y", "y", "M"}: + raise ValueError( + "Units 'M', 'Y', and 'y' are no longer supported, as they do not " + "represent unambiguous timedelta values durations." + ) + + cdef int64_t parse_iso_format_string(str ts) except? -1: """ Extracts and cleanses the appropriate values from a match object with @@ -890,8 +901,8 @@ elif c in ["W", "D", "H", "M"]: if c in ["H", "M"] and len(number) > 2: raise ValueError(err_msg) - if c == "M": - c = "min" + if c in ["M", "H"]: + c = c.replace("M", "min").replace("H", "h") unit.append(c) r = timedelta_from_spec(number, "0", unit) result += timedelta_as_neg(r, neg) @@ -901,7 +912,7 @@ elif c == ".": # append any seconds if len(number): - r = timedelta_from_spec(number, "0", "S") + r = timedelta_from_spec(number, "0", "s") result += timedelta_as_neg(r, neg) unit, number = [], [] have_dot = 1 @@ -918,7 +929,7 @@ r = timedelta_from_spec(number, "0", dec_unit) result += timedelta_as_neg(r, neg) else: # seconds - r = timedelta_from_spec(number, "0", "S") + r = timedelta_from_spec(number, "0", "s") result += timedelta_as_neg(r, neg) else: raise ValueError(err_msg) @@ -1216,7 +1227,7 @@ return cmp_scalar(self._value, ots._value, op) return self._compare_mismatched_resos(ots, op) - # TODO: re-use/share with Timestamp + # TODO: reuse/share with Timestamp cdef bint _compare_mismatched_resos(self, _Timedelta other, op): # Can't just dispatch to numpy as they silently overflow and get it wrong cdef: @@ -1434,12 +1445,12 @@ Resolution: Return value * Days: 'D' - * Hours: 'H' - * Minutes: 'T' - * Seconds: 'S' - * Milliseconds: 'L' - * Microseconds: 'U' - * Nanoseconds: 'N' + * Hours: 'h' + * Minutes: 'min' + * Seconds: 's' + * Milliseconds: 'ms' + * Microseconds: 'us' + * Nanoseconds: 'ns' Returns ------- @@ -1450,33 +1461,33 @@ -------- >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns') >>> td.resolution_string - 'N' + 'ns' >>> td = pd.Timedelta('1 days 2 min 3 us') >>> td.resolution_string - 'U' + 'us' >>> td = pd.Timedelta('2 min 3 s') >>> td.resolution_string - 'S' + 's' >>> td = pd.Timedelta(36, unit='us') >>> td.resolution_string - 'U' + 'us' """ self._ensure_components() if self._ns: - return "N" + return "ns" elif self._us: - return "U" + return "us" elif self._ms: - return "L" + return "ms" elif 
self._s: - return "S" + return "s" elif self._m: - return "T" + return "min" elif self._h: - return "H" + return "h" else: return "D" @@ -1706,15 +1717,20 @@ Possible values: - * 'W', 'D', 'T', 'S', 'L', 'U', or 'N' - * 'days' or 'day' + * 'W', or 'D' + * 'days', or 'day' * 'hours', 'hour', 'hr', or 'h' * 'minutes', 'minute', 'min', or 'm' - * 'seconds', 'second', or 'sec' - * 'milliseconds', 'millisecond', 'millis', or 'milli' - * 'microseconds', 'microsecond', 'micros', or 'micro' + * 'seconds', 'second', 'sec', or 's' + * 'milliseconds', 'millisecond', 'millis', 'milli', or 'ms' + * 'microseconds', 'microsecond', 'micros', 'micro', or 'us' * 'nanoseconds', 'nanosecond', 'nanos', 'nano', or 'ns'. + .. deprecated:: 2.2.0 + + Values `H`, `T`, `S`, `L`, `U`, and `N` are deprecated in favour + of the values `h`, `min`, `s`, `ms`, `us`, and `ns`. + **kwargs Available kwargs: {days, seconds, microseconds, milliseconds, minutes, hours, weeks}. @@ -1771,7 +1787,7 @@ ) # GH43764, convert any input to nanoseconds first and then - # create the timestamp. This ensures that any potential + # create the timedelta. This ensures that any potential # nanosecond contributions from kwargs parsed as floats # are taken into consideration. seconds = int(( @@ -1784,17 +1800,25 @@ ) * 1_000_000_000 ) - value = np.timedelta64( - int(kwargs.get("nanoseconds", 0)) - + int(kwargs.get("microseconds", 0) * 1_000) - + int(kwargs.get("milliseconds", 0) * 1_000_000) - + seconds - ) - if unit in {"Y", "y", "M"}: - raise ValueError( - "Units 'M', 'Y', and 'y' are no longer supported, as they do not " - "represent unambiguous timedelta values durations." - ) + ns = kwargs.get("nanoseconds", 0) + us = kwargs.get("microseconds", 0) + ms = kwargs.get("milliseconds", 0) + try: + value = np.timedelta64( + int(ns) + + int(us * 1_000) + + int(ms * 1_000_000) + + seconds + ) + except OverflowError as err: + # GH#55503 + msg = ( + f"seconds={seconds}, milliseconds={ms}, " + f"microseconds={us}, nanoseconds={ns}" + ) + raise OutOfBoundsTimedelta(msg) from err + + disallow_ambiguous_unit(unit) # GH 30543 if pd.Timedelta already passed, return it # check that only value is passed @@ -1827,10 +1851,10 @@ return cls._from_value_and_reso( new_value, reso=NPY_DATETIMEUNIT.NPY_FR_us ) - elif is_timedelta64_object(value): + elif cnp.is_timedelta64_object(value): # Retain the resolution if possible, otherwise cast to the nearest # supported resolution. - new_value = get_timedelta64_value(value) + new_value = cnp.get_timedelta64_value(value) if new_value == NPY_NAT: # i.e. np.timedelta64("NaT") return NaT @@ -1878,7 +1902,7 @@ f"float, timedelta or convertible, not {type(value).__name__}" ) - if is_timedelta64_object(value): + if cnp.is_timedelta64_object(value): value = value.view("i8") # nat @@ -1907,10 +1931,7 @@ int64_t result, unit ndarray[int64_t] arr - from pandas._libs.tslibs.offsets import to_offset - - to_offset(freq).nanos # raises on non-fixed freq - unit = delta_to_nanoseconds(to_offset(freq), self._creso) + unit = get_unit_for_round(freq, self._creso) arr = np.array([self._value], dtype="i8") try: @@ -1929,6 +1950,7 @@ ---------- freq : str Frequency string indicating the rounding resolution. + It uses the same units as class constructor :class:`~pandas.Timedelta`. Returns ------- @@ -1956,6 +1978,7 @@ ---------- freq : str Frequency string indicating the flooring resolution. + It uses the same units as class constructor :class:`~pandas.Timedelta`. 
Examples -------- @@ -1975,6 +1998,7 @@ ---------- freq : str Frequency string indicating the ceiling resolution. + It uses the same units as class constructor :class:`~pandas.Timedelta`. Examples -------- @@ -2036,6 +2060,12 @@ # integers or floats if util.is_nan(other): return NaT + # We want NumPy numeric scalars to behave like Python scalars + # post NEP 50 + if isinstance(other, cnp.integer): + other = int(other) + if isinstance(other, cnp.floating): + other = float(other) return Timedelta._from_value_and_reso( (self._value/ other), self._creso ) @@ -2090,6 +2120,12 @@ elif is_integer_object(other) or is_float_object(other): if util.is_nan(other): return NaT + # We want NumPy numeric scalars to behave like Python scalars + # post NEP 50 + if isinstance(other, cnp.integer): + other = int(other) + if isinstance(other, cnp.floating): + other = float(other) return type(self)._from_value_and_reso(self._value// other, self._creso) elif is_array(other): @@ -2194,7 +2230,7 @@ td64 = left[i] obj = right[i] - if get_timedelta64_value(td64) == NPY_NAT: + if cnp.get_timedelta64_value(td64) == NPY_NAT: # td here should be interpreted as a td64 NaT if _should_cast_to_timedelta(obj): res_value = np.nan @@ -2223,7 +2259,7 @@ td64 = left[i] obj = right[i] - if get_timedelta64_value(td64) == NPY_NAT: + if cnp.get_timedelta64_value(td64) == NPY_NAT: # td here should be interpreted as a td64 NaT if _should_cast_to_timedelta(obj): res_value = np.nan @@ -2253,7 +2289,7 @@ bool """ return ( - PyDelta_Check(obj) or is_timedelta64_object(obj) or is_tick_object(obj) + PyDelta_Check(obj) or cnp.is_timedelta64_object(obj) or is_tick_object(obj) ) @@ -2264,3 +2300,11 @@ return ( is_any_td_scalar(obj) or obj is None or obj is NaT or isinstance(obj, str) ) + + +cpdef int64_t get_unit_for_round(freq, NPY_DATETIMEUNIT creso) except? -1: + from pandas._libs.tslibs.offsets import to_offset + + freq = to_offset(freq) + freq.nanos # raises on non-fixed freq + return delta_to_nanoseconds(freq, creso) diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/timestamps.pxd pandas-2.2.2+dfsg/pandas/_libs/tslibs/timestamps.pxd --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/timestamps.pxd 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/timestamps.pxd 2024-04-10 17:42:52.000000000 +0000 @@ -26,7 +26,7 @@ cdef bint _get_start_end_field(self, str field, freq) cdef _get_date_name_field(self, str field, object locale) - cdef int64_t _maybe_convert_value_to_local(self) + cdef int64_t _maybe_convert_value_to_local(self) except? -1 cdef bint _can_compare(self, datetime other) cpdef to_datetime64(self) cpdef datetime to_pydatetime(_Timestamp self, bint warn=*) diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/timestamps.pyi pandas-2.2.2+dfsg/pandas/_libs/tslibs/timestamps.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/timestamps.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/timestamps.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -8,7 +8,8 @@ from time import struct_time from typing import ( ClassVar, - TypeVar, + Literal, + TypeAlias, overload, ) @@ -26,7 +27,7 @@ TimestampNonexistent, ) -_DatetimeT = TypeVar("_DatetimeT", bound=datetime) +_TimeZones: TypeAlias = str | _tzinfo | None | int def integer_op_not_supported(obj: object) -> TypeError: ... 
@@ -39,7 +40,7 @@ _value: int # np.int64 # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]") def __new__( # type: ignore[misc] - cls: type[_DatetimeT], + cls: type[Self], ts_input: np.integer | float | str | _date | datetime | np.datetime64 = ..., year: int | None = ..., month: int | None = ..., @@ -51,13 +52,13 @@ tzinfo: _tzinfo | None = ..., *, nanosecond: int | None = ..., - tz: str | _tzinfo | None | int = ..., + tz: _TimeZones = ..., unit: str | int | None = ..., fold: int | None = ..., - ) -> _DatetimeT | NaTType: ... + ) -> Self | NaTType: ... @classmethod def _from_value_and_reso( - cls, value: int, reso: int, tz: _tzinfo | None + cls, value: int, reso: int, tz: _TimeZones ) -> Timestamp: ... @property def value(self) -> int: ... # np.int64 @@ -84,19 +85,19 @@ @property def fold(self) -> int: ... @classmethod - def fromtimestamp(cls, ts: float, tz: _tzinfo | None = ...) -> Self: ... + def fromtimestamp(cls, ts: float, tz: _TimeZones = ...) -> Self: ... @classmethod def utcfromtimestamp(cls, ts: float) -> Self: ... @classmethod - def today(cls, tz: _tzinfo | str | None = ...) -> Self: ... + def today(cls, tz: _TimeZones = ...) -> Self: ... @classmethod def fromordinal( cls, ordinal: int, - tz: _tzinfo | str | None = ..., + tz: _TimeZones = ..., ) -> Self: ... @classmethod - def now(cls, tz: _tzinfo | str | None = ...) -> Self: ... + def now(cls, tz: _TimeZones = ...) -> Self: ... @classmethod def utcnow(cls) -> Self: ... # error: Signature of "combine" incompatible with supertype "datetime" @@ -131,7 +132,7 @@ fold: int | None = ..., ) -> Self: ... # LSP violation: datetime.datetime.astimezone has a default value for tz - def astimezone(self, tz: _tzinfo | None) -> Self: ... # type: ignore[override] + def astimezone(self, tz: _TimeZones) -> Self: ... # type: ignore[override] def ctime(self) -> str: ... def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ... @classmethod @@ -180,16 +181,16 @@ def is_year_end(self) -> bool: ... def to_pydatetime(self, warn: bool = ...) -> datetime: ... def to_datetime64(self) -> np.datetime64: ... - def to_period(self, freq: BaseOffset | str = ...) -> Period: ... + def to_period(self, freq: BaseOffset | str | None = None) -> Period: ... def to_julian_date(self) -> np.float64: ... @property def asm8(self) -> np.datetime64: ... - def tz_convert(self, tz: _tzinfo | str | None) -> Self: ... + def tz_convert(self, tz: _TimeZones) -> Self: ... # TODO: could return NaT? def tz_localize( self, - tz: _tzinfo | str | None, - ambiguous: str = ..., + tz: _TimeZones, + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def normalize(self) -> Self: ... @@ -197,19 +198,19 @@ def round( self, freq: str, - ambiguous: bool | str = ..., + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def floor( self, freq: str, - ambiguous: bool | str = ..., + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def ceil( self, freq: str, - ambiguous: bool | str = ..., + ambiguous: bool | Literal["raise", "NaT"] = ..., nonexistent: TimestampNonexistent = ..., ) -> Self: ... def day_name(self, locale: str | None = ...) -> str: ... 
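The timestamps.pyi diff above consolidates the timezone parameter annotations behind a shared _TimeZones alias and narrows `ambiguous` to a Literal; the timestamps.pyx diff that follows rewrites the docstring examples for the lowercase frequency aliases. A doctest-style sketch of that behaviour, with the expected values lifted from the updated docstrings (assumes pandas 2.2):

>>> import pandas as pd
>>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
>>> ts.round(freq='h')            # 'H' is deprecated in favour of 'h'
Timestamp('2020-03-14 16:00:00')
>>> ts.floor(freq='5min')         # multiples of a unit, '5T' becomes '5min'
Timestamp('2020-03-14 15:30:00')
>>> ts.ceil(freq='1h30min')       # combined units, '1H30T' becomes '1h30min'
Timestamp('2020-03-14 16:30:00')
>>> ts.to_period(freq='Y')        # yearly periods now print as 'Y-DEC'
Period('2020', 'Y-DEC')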
diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/timestamps.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/timestamps.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/timestamps.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/timestamps.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -60,12 +60,12 @@ ) from pandas._libs.tslibs.dtypes cimport ( npy_unit_to_abbrev, + npy_unit_to_attrname, periods_per_day, periods_per_second, ) from pandas._libs.tslibs.util cimport ( is_array, - is_datetime64_object, is_integer_object, ) @@ -83,12 +83,11 @@ from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, NPY_FR_ns, - check_dts_bounds, cmp_dtstructs, cmp_scalar, convert_reso, + dts_to_iso_string, get_datetime64_unit, - get_datetime64_value, get_unit_from_dtype, import_pandas_datetime, npy_datetimestruct, @@ -107,7 +106,7 @@ from pandas._libs.tslibs.offsets cimport to_offset from pandas._libs.tslibs.timedeltas cimport ( _Timedelta, - delta_to_nanoseconds, + get_unit_for_round, is_any_td_scalar, ) @@ -307,7 +306,7 @@ NPY_DATETIMEUNIT reso reso = get_datetime64_unit(dt64) - value = get_datetime64_value(dt64) + value = cnp.get_datetime64_value(dt64) return cls._from_value_and_reso(value, reso, None) # ----------------------------------------------------------------- @@ -330,7 +329,7 @@ ots = other elif other is NaT: return op == Py_NE - elif is_datetime64_object(other): + elif cnp.is_datetime64_object(other): ots = Timestamp(other) elif PyDateTime_Check(other): if self.nanosecond == 0: @@ -449,15 +448,15 @@ nanos = other._value try: - new_value = self._value+ nanos + new_value = self._value + nanos result = type(self)._from_value_and_reso( new_value, reso=self._creso, tz=self.tzinfo ) except OverflowError as err: - # TODO: don't hard-code nanosecond here new_value = int(self._value) + int(nanos) + attrname = npy_unit_to_attrname[self._creso] raise OutOfBoundsDatetime( - f"Out of bounds nanosecond timestamp: {new_value}" + f"Out of bounds {attrname} timestamp: {new_value}" ) from err return result @@ -476,11 +475,6 @@ dtype=object, ) - elif not isinstance(self, _Timestamp): - # cython semantics, args have been switched and this is __radd__ - # TODO(cython3): remove this it moved to __radd__ - return other.__add__(self) - return NotImplemented def __radd__(self, other): @@ -510,17 +504,11 @@ return NotImplemented # coerce if necessary if we are a Timestamp-like - if (PyDateTime_Check(self) - and (PyDateTime_Check(other) or is_datetime64_object(other))): + if PyDateTime_Check(other) or cnp.is_datetime64_object(other): # both_timestamps is to determine whether Timedelta(self - other) # should raise the OOB error, or fall back returning a timedelta. - # TODO(cython3): clean out the bits that moved to __rsub__ - both_timestamps = (isinstance(other, _Timestamp) and - isinstance(self, _Timestamp)) - if isinstance(self, _Timestamp): - other = type(self)(other) - else: - self = type(other)(self) + both_timestamps = isinstance(other, _Timestamp) + other = type(self)(other) if (self.tzinfo is None) ^ (other.tzinfo is None): raise TypeError( @@ -537,24 +525,18 @@ # scalar Timestamp/datetime - Timestamp/datetime -> yields a # Timedelta try: - res_value = self._value- other._value + res_value = self._value - other._value return Timedelta._from_value_and_reso(res_value, self._creso) except (OverflowError, OutOfBoundsDatetime, OutOfBoundsTimedelta) as err: - if isinstance(other, _Timestamp): - if both_timestamps: - raise OutOfBoundsDatetime( - "Result is too large for pandas.Timedelta. 
Convert inputs " - "to datetime.datetime with 'Timestamp.to_pydatetime()' " - "before subtracting." - ) from err + if both_timestamps: + raise OutOfBoundsDatetime( + "Result is too large for pandas.Timedelta. Convert inputs " + "to datetime.datetime with 'Timestamp.to_pydatetime()' " + "before subtracting." + ) from err # We get here in stata tests, fall back to stdlib datetime # method and return stdlib timedelta object pass - elif is_datetime64_object(self): - # GH#28286 cython semantics for __rsub__, `other` is actually - # the Timestamp - # TODO(cython3): remove this, this moved to __rsub__ - return type(other)(self) - other return NotImplemented @@ -566,13 +548,13 @@ # We get here in stata tests, fall back to stdlib datetime # method and return stdlib timedelta object pass - elif is_datetime64_object(other): + elif cnp.is_datetime64_object(other): return type(self)(other) - self return NotImplemented # ----------------------------------------------------------------- - cdef int64_t _maybe_convert_value_to_local(self): + cdef int64_t _maybe_convert_value_to_local(self) except? -1: """Convert UTC i8 value to local i8 value if tz exists""" cdef: int64_t val @@ -1078,18 +1060,6 @@ return result - @property - def _short_repr(self) -> str: - # format a Timestamp with only _date_repr if possible - # otherwise _repr_base - if (self.hour == 0 and - self.minute == 0 and - self.second == 0 and - self.microsecond == 0 and - self.nanosecond == 0): - return self._date_repr - return self._repr_base - # ----------------------------------------------------------------- # Conversion Methods @@ -1261,7 +1231,7 @@ >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') >>> # Year end frequency >>> ts.to_period(freq='Y') - Period('2020', 'A-DEC') + Period('2020', 'Y-DEC') >>> # Month end frequency >>> ts.to_period(freq='M') @@ -1886,10 +1856,6 @@ "the tz parameter. Use tz_convert instead.") tzobj = maybe_get_tz(tz) - if tzobj is not None and is_datetime64_object(ts_input): - # GH#24559, GH#42288 As of 2.0 we treat datetime64 as - # wall-time (consistent with DatetimeIndex) - return cls(ts_input).tz_localize(tzobj) if nanosecond is None: nanosecond = 0 @@ -1907,9 +1873,8 @@ cdef: int64_t nanos - freq = to_offset(freq) - freq.nanos # raises on non-fixed freq - nanos = delta_to_nanoseconds(freq, self._creso) + freq = to_offset(freq, is_period=False) + nanos = get_unit_for_round(freq, self._creso) if nanos == 0: if freq.nanos == 0: raise ValueError("Division by zero in rounding") @@ -1917,8 +1882,6 @@ # e.g. self.unit == "s" and sub-second freq return self - # TODO: problem if nanos==0 - if self.tz is not None: value = self.tz_localize(None)._value else: @@ -1994,26 +1957,26 @@ A timestamp can be rounded using multiple frequency units: - >>> ts.round(freq='H') # hour + >>> ts.round(freq='h') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.round(freq='T') # minute + >>> ts.round(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.round(freq='S') # seconds + >>> ts.round(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.round(freq='L') # milliseconds + >>> ts.round(freq='ms') # milliseconds Timestamp('2020-03-14 15:32:52.193000') - ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): - >>> ts.round(freq='5T') + >>> ts.round(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30T' (i.e. 
1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 1 hour and 30 minutes): - >>> ts.round(freq='1H30T') + >>> ts.round(freq='1h30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -2026,10 +1989,10 @@ >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.round("H", ambiguous=False) + >>> ts_tz.round("h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.round("H", ambiguous=True) + >>> ts_tz.round("h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """ return self._round( @@ -2085,26 +2048,26 @@ A timestamp can be floored using multiple frequency units: - >>> ts.floor(freq='H') # hour + >>> ts.floor(freq='h') # hour Timestamp('2020-03-14 15:00:00') - >>> ts.floor(freq='T') # minute + >>> ts.floor(freq='min') # minute Timestamp('2020-03-14 15:32:00') - >>> ts.floor(freq='S') # seconds + >>> ts.floor(freq='s') # seconds Timestamp('2020-03-14 15:32:52') - >>> ts.floor(freq='N') # nanoseconds + >>> ts.floor(freq='ns') # nanoseconds Timestamp('2020-03-14 15:32:52.192548651') - ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): - >>> ts.floor(freq='5T') + >>> ts.floor(freq='5min') Timestamp('2020-03-14 15:30:00') - or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 1 hour and 30 minutes): - >>> ts.floor(freq='1H30T') + >>> ts.floor(freq='1h30min') Timestamp('2020-03-14 15:00:00') Analogous for ``pd.NaT``: @@ -2117,10 +2080,10 @@ >>> ts_tz = pd.Timestamp("2021-10-31 03:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.floor("2H", ambiguous=False) + >>> ts_tz.floor("2h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.floor("2H", ambiguous=True) + >>> ts_tz.floor("2h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """ return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) @@ -2174,26 +2137,26 @@ A timestamp can be ceiled using multiple frequency units: - >>> ts.ceil(freq='H') # hour + >>> ts.ceil(freq='h') # hour Timestamp('2020-03-14 16:00:00') - >>> ts.ceil(freq='T') # minute + >>> ts.ceil(freq='min') # minute Timestamp('2020-03-14 15:33:00') - >>> ts.ceil(freq='S') # seconds + >>> ts.ceil(freq='s') # seconds Timestamp('2020-03-14 15:32:53') - >>> ts.ceil(freq='U') # microseconds + >>> ts.ceil(freq='us') # microseconds Timestamp('2020-03-14 15:32:52.192549') - ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes): + ``freq`` can also be a multiple of a single unit, like '5min' (i.e. 5 minutes): - >>> ts.ceil(freq='5T') + >>> ts.ceil(freq='5min') Timestamp('2020-03-14 15:35:00') - or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes): + or a combination of multiple units, like '1h30min' (i.e. 
1 hour and 30 minutes): - >>> ts.ceil(freq='1H30T') + >>> ts.ceil(freq='1h30min') Timestamp('2020-03-14 16:30:00') Analogous for ``pd.NaT``: @@ -2206,10 +2169,10 @@ >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam") - >>> ts_tz.ceil("H", ambiguous=False) + >>> ts_tz.ceil("h", ambiguous=False) Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam') - >>> ts_tz.ceil("H", ambiguous=True) + >>> ts_tz.ceil("h", ambiguous=True) Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam') """ return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) @@ -2509,8 +2472,13 @@ # We can avoid going through pydatetime paths, which is robust # to datetimes outside of pydatetime range. ts = _TSObject() - check_dts_bounds(&dts, self._creso) - ts.value = npy_datetimestruct_to_datetime(self._creso, &dts) + try: + ts.value = npy_datetimestruct_to_datetime(self._creso, &dts) + except OverflowError as err: + fmt = dts_to_iso_string(&dts) + raise OutOfBoundsDatetime( + f"Out of bounds timestamp: {fmt} with frequency '{self.unit}'" + ) from err ts.dts = dts ts.creso = self._creso ts.fold = fold diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/timezones.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/timezones.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/timezones.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/timezones.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -67,7 +67,7 @@ return False # Warn if tzdata is too old, even if there is a system tzdata to alert # users about the mismatch between local/system tzdata - import_optional_dependency("tzdata", errors="warn", min_version="2022.1") + import_optional_dependency("tzdata", errors="warn", min_version="2022.7") return tz is utc_zoneinfo diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/tzconversion.pyi pandas-2.2.2+dfsg/pandas/_libs/tslibs/tzconversion.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/tzconversion.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/tzconversion.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -10,7 +10,7 @@ # tz_convert_from_utc_single exposed for testing def tz_convert_from_utc_single( - val: np.int64, tz: tzinfo, creso: int = ... + utc_val: np.int64, tz: tzinfo, creso: int = ... ) -> np.int64: ... def tz_localize_to_utc( vals: npt.NDArray[np.int64], diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/tzconversion.pyx pandas-2.2.2+dfsg/pandas/_libs/tslibs/tzconversion.pyx --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/tzconversion.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/tzconversion.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -416,8 +416,13 @@ else: delta_idx = bisect_right_i8(info.tdata, new_local, info.ntrans) - - delta_idx = delta_idx - delta_idx_offset + # Logic similar to the precompute section. 
But check the current + # delta in case we are moving between UTC+0 and non-zero timezone + if (shift_forward or shift_delta > 0) and \ + info.deltas[delta_idx - 1] >= 0: + delta_idx = delta_idx - 1 + else: + delta_idx = delta_idx - delta_idx_offset result[i] = new_local - info.deltas[delta_idx] elif fill_nonexist: result[i] = NPY_NAT @@ -428,7 +433,11 @@ return result.base # .base to get underlying ndarray -cdef Py_ssize_t bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n): +cdef Py_ssize_t bisect_right_i8( + const int64_t *data, + int64_t val, + Py_ssize_t n +) noexcept: # Caller is responsible for checking n > 0 # This looks very similar to local_search_right in the ndarray.searchsorted # implementation. @@ -465,8 +474,8 @@ cdef _get_utc_bounds( - ndarray vals, - int64_t* tdata, + ndarray[int64_t] vals, + const int64_t* tdata, Py_ssize_t ntrans, const int64_t[::1] deltas, NPY_DATETIMEUNIT creso, @@ -475,7 +484,7 @@ # result_a) or right of the DST transition (store in result_b) cdef: - ndarray result_a, result_b + ndarray[int64_t] result_a, result_b Py_ssize_t i, n = vals.size int64_t val, v_left, v_right Py_ssize_t isl, isr, pos_left, pos_right @@ -598,7 +607,8 @@ ndarray[uint8_t, cast=True] mismatch ndarray[int64_t] delta, dst_hours ndarray[intp_t] switch_idxs, trans_idx, grp, a_idx, b_idx, one_diff - list trans_grp + # TODO: Can uncomment when numpy >=2 is the minimum + # tuple trans_grp intp_t switch_idx int64_t left, right diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/util.pxd pandas-2.2.2+dfsg/pandas/_libs/tslibs/util.pxd --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/util.pxd 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/util.pxd 2024-04-10 17:42:52.000000000 +0000 @@ -1,5 +1,6 @@ from cpython.object cimport PyTypeObject +from cpython.unicode cimport PyUnicode_AsUTF8AndSize cdef extern from "Python.h": @@ -10,21 +11,18 @@ bint PyComplex_Check(object obj) nogil bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil - # TODO(cython3): cimport this, xref GH#49670 # Note that following functions can potentially raise an exception, - # thus they cannot be declared 'nogil'. Also PyUnicode_AsUTF8AndSize() can - # potentially allocate memory inside in unlikely case of when underlying - # unicode object was stored as non-utf8 and utf8 wasn't requested before. - const char* PyUnicode_AsUTF8AndSize(object obj, - Py_ssize_t* length) except NULL - + # thus they cannot be declared 'nogil'. 
object PyUnicode_EncodeLocale(object obj, const char *errors) nogil object PyUnicode_DecodeLocale(const char *str, const char *errors) nogil +cimport numpy as cnp from numpy cimport ( + PyArray_Check, float64_t, int64_t, + is_timedelta64_object, ) @@ -32,13 +30,10 @@ PyTypeObject PyFloatingArrType_Type cdef extern from "numpy/ndarrayobject.h": - PyTypeObject PyTimedeltaArrType_Type - PyTypeObject PyDatetimeArrType_Type PyTypeObject PyComplexFloatingArrType_Type PyTypeObject PyBoolArrType_Type bint PyArray_IsIntegerScalar(obj) nogil - bint PyArray_Check(obj) nogil cdef extern from "numpy/npy_common.h": int64_t NPY_MIN_INT64 @@ -51,11 +46,11 @@ # -------------------------------------------------------------------- # Type Checking -cdef inline bint is_integer_object(object obj) noexcept nogil: +cdef inline bint is_integer_object(object obj) noexcept: """ Cython equivalent of - `isinstance(val, (int, long, np.integer)) and not isinstance(val, bool)` + `isinstance(val, (int, np.integer)) and not isinstance(val, (bool, np.timedelta64))` Parameters ---------- @@ -69,13 +64,13 @@ ----- This counts np.timedelta64 objects as integers. """ - return (not PyBool_Check(obj) and PyArray_IsIntegerScalar(obj) + return (not PyBool_Check(obj) and isinstance(obj, (int, cnp.integer)) and not is_timedelta64_object(obj)) cdef inline bint is_float_object(object obj) noexcept nogil: """ - Cython equivalent of `isinstance(val, (float, np.float64))` + Cython equivalent of `isinstance(val, (float, np.floating))` Parameters ---------- @@ -91,7 +86,7 @@ cdef inline bint is_complex_object(object obj) noexcept nogil: """ - Cython equivalent of `isinstance(val, (complex, np.complex128))` + Cython equivalent of `isinstance(val, (complex, np.complexfloating))` Parameters ---------- @@ -121,40 +116,10 @@ PyObject_TypeCheck(obj, &PyBoolArrType_Type)) -cdef inline bint is_real_number_object(object obj) noexcept nogil: +cdef inline bint is_real_number_object(object obj) noexcept: return is_bool_object(obj) or is_integer_object(obj) or is_float_object(obj) -cdef inline bint is_timedelta64_object(object obj) noexcept nogil: - """ - Cython equivalent of `isinstance(val, np.timedelta64)` - - Parameters - ---------- - val : object - - Returns - ------- - is_timedelta64 : bool - """ - return PyObject_TypeCheck(obj, &PyTimedeltaArrType_Type) - - -cdef inline bint is_datetime64_object(object obj) noexcept nogil: - """ - Cython equivalent of `isinstance(val, np.datetime64)` - - Parameters - ---------- - val : object - - Returns - ------- - is_datetime64 : bool - """ - return PyObject_TypeCheck(obj, &PyDatetimeArrType_Type) - - cdef inline bint is_array(object val) noexcept: """ Cython equivalent of `isinstance(val, np.ndarray)` @@ -210,6 +175,9 @@ ------- buf : const char* """ + # Note PyUnicode_AsUTF8AndSize() can + # potentially allocate memory inside in unlikely case of when underlying + # unicode object was stored as non-utf8 and utf8 wasn't requested before. return PyUnicode_AsUTF8AndSize(py_string, length) diff -Nru pandas-2.1.4+dfsg/pandas/_libs/tslibs/vectorized.pyi pandas-2.2.2+dfsg/pandas/_libs/tslibs/vectorized.pyi --- pandas-2.1.4+dfsg/pandas/_libs/tslibs/vectorized.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/tslibs/vectorized.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -31,7 +31,7 @@ reso: int = ..., # NPY_DATETIMEUNIT ) -> Resolution: ... 
def ints_to_pydatetime( - arr: npt.NDArray[np.int64], + stamps: npt.NDArray[np.int64], tz: tzinfo | None = ..., box: str = ..., reso: int = ..., # NPY_DATETIMEUNIT diff -Nru pandas-2.1.4+dfsg/pandas/_libs/window/aggregations.pyi pandas-2.2.2+dfsg/pandas/_libs/window/aggregations.pyi --- pandas-2.1.4+dfsg/pandas/_libs/window/aggregations.pyi 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/window/aggregations.pyi 2024-04-10 17:42:52.000000000 +0000 @@ -111,8 +111,8 @@ com: float, # float64_t adjust: bool, ignore_na: bool, - deltas: np.ndarray, # const float64_t[:] - normalize: bool, + deltas: np.ndarray | None = None, # const float64_t[:] + normalize: bool = True, ) -> np.ndarray: ... # np.ndarray[np.float64] def ewmcov( input_x: np.ndarray, # const float64_t[:] diff -Nru pandas-2.1.4+dfsg/pandas/_libs/window/aggregations.pyx pandas-2.2.2+dfsg/pandas/_libs/window/aggregations.pyx --- pandas-2.1.4+dfsg/pandas/_libs/window/aggregations.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/window/aggregations.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -987,9 +987,8 @@ # ---------------------------------------------------------------------- -# Moving maximum / minimum code taken from Bottleneck under the terms -# of its Simplified BSD license -# https://github.com/pydata/bottleneck +# Moving maximum / minimum code taken from Bottleneck +# Licence at LICENSES/BOTTLENECK_LICENCE cdef float64_t init_mm(float64_t ai, Py_ssize_t *nobs, bint is_max) noexcept nogil: diff -Nru pandas-2.1.4+dfsg/pandas/_libs/window/indexers.pyx pandas-2.2.2+dfsg/pandas/_libs/window/indexers.pyx --- pandas-2.1.4+dfsg/pandas/_libs/window/indexers.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/window/indexers.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -138,6 +138,8 @@ break # end bound is previous end # or current index + elif index[end[i - 1]] == end_bound and not right_closed: + end[i] = end[i - 1] + 1 elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0: end[i] = i + 1 else: diff -Nru pandas-2.1.4+dfsg/pandas/_libs/writers.pyx pandas-2.2.2+dfsg/pandas/_libs/writers.pyx --- pandas-2.1.4+dfsg/pandas/_libs/writers.pyx 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_libs/writers.pyx 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,5 @@ cimport cython +from cython cimport Py_ssize_t import numpy as np from cpython cimport ( diff -Nru pandas-2.1.4+dfsg/pandas/_testing/__init__.py pandas-2.2.2+dfsg/pandas/_testing/__init__.py --- pandas-2.1.4+dfsg/pandas/_testing/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_testing/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,13 +1,8 @@ from __future__ import annotations -import collections -from collections import Counter -from datetime import datetime from decimal import Decimal import operator import os -import re -import string from sys import byteorder from typing import ( TYPE_CHECKING, @@ -15,6 +10,7 @@ ContextManager, cast, ) +import warnings import numpy as np @@ -24,29 +20,18 @@ set_locale, ) -from pandas.compat import pa_version_under7p0 +from pandas.compat import pa_version_under10p1 -from pandas.core.dtypes.common import ( - is_float_dtype, - is_sequence, - is_signed_integer_dtype, - is_unsigned_integer_dtype, - pandas_dtype, -) +from pandas.core.dtypes.common import is_string_dtype import pandas as pd from pandas import ( ArrowDtype, - Categorical, - CategoricalIndex, DataFrame, - DatetimeIndex, Index, - IntervalIndex, MultiIndex, RangeIndex, 
Series, - bdate_range, ) from pandas._testing._io import ( round_trip_localpath, @@ -88,6 +73,7 @@ get_obj, ) from pandas._testing.contexts import ( + assert_cow_warning, decompress_file, ensure_clean, raises_chained_assignment_error, @@ -104,23 +90,13 @@ from pandas.core.construction import extract_array if TYPE_CHECKING: - from collections.abc import Iterable - from pandas._typing import ( Dtype, - Frequency, NpDtype, ) - from pandas import ( - PeriodIndex, - TimedeltaIndex, - ) from pandas.core.arrays import ArrowExtensionArray -_N = 30 -_K = 4 - UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"] UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"] SIGNED_INT_NUMPY_DTYPES: list[NpDtype] = [int, "int8", "int16", "int32", "int64"] @@ -210,7 +186,7 @@ ] ] -if not pa_version_under7p0: +if not pa_version_under10p1: import pyarrow as pa UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()] @@ -260,14 +236,18 @@ + TIMEDELTA_PYARROW_DTYPES + BOOL_PYARROW_DTYPES ) + ALL_REAL_PYARROW_DTYPES_STR_REPR = ( + ALL_INT_PYARROW_DTYPES_STR_REPR + FLOAT_PYARROW_DTYPES_STR_REPR + ) else: FLOAT_PYARROW_DTYPES_STR_REPR = [] ALL_INT_PYARROW_DTYPES_STR_REPR = [] ALL_PYARROW_DTYPES = [] + ALL_REAL_PYARROW_DTYPES_STR_REPR = [] - -EMPTY_STRING_PATTERN = re.compile("^$") - +ALL_REAL_NULLABLE_DTYPES = ( + FLOAT_NUMPY_DTYPES + ALL_REAL_EXTENSION_DTYPES + ALL_REAL_PYARROW_DTYPES_STR_REPR +) arithmetic_dunder_methods = [ "__add__", @@ -289,24 +269,10 @@ comparison_dunder_methods = ["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"] -def reset_display_options() -> None: - """ - Reset the display options for printing and representing objects. - """ - pd.reset_option("^display.", silent=True) - - # ----------------------------------------------------------------------------- # Comparators -def equalContents(arr1, arr2) -> bool: - """ - Checks if the set of unique elements of arr1 and arr2 are equivalent. - """ - return frozenset(arr1) == frozenset(arr2) - - def box_expected(expected, box_cls, transpose: bool = True): """ Helper function to wrap the expected output of a test in a given box_class. @@ -327,11 +293,17 @@ else: expected = pd.array(expected, copy=False) elif box_cls is Index: - expected = Index(expected) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) + expected = Index(expected) elif box_cls is Series: - expected = Series(expected) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) + expected = Series(expected) elif box_cls is DataFrame: - expected = Series(expected).to_frame() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) + expected = Series(expected).to_frame() if transpose: # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame @@ -361,473 +333,6 @@ return extract_array(obj, extract_numpy=True) -# ----------------------------------------------------------------------------- -# Others - - -def rands_array( - nchars, size: int, dtype: NpDtype = "O", replace: bool = True -) -> np.ndarray: - """ - Generate an array of byte strings. 
- """ - chars = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) - retval = ( - np.random.default_rng(2) - .choice(chars, size=nchars * np.prod(size), replace=replace) - .view((np.str_, nchars)) - .reshape(size) - ) - return retval.astype(dtype) - - -def getCols(k) -> str: - return string.ascii_uppercase[:k] - - -# make index -def makeStringIndex(k: int = 10, name=None) -> Index: - return Index(rands_array(nchars=10, size=k), name=name) - - -def makeCategoricalIndex( - k: int = 10, n: int = 3, name=None, **kwargs -) -> CategoricalIndex: - """make a length k index or n categories""" - x = rands_array(nchars=4, size=n, replace=False) - return CategoricalIndex( - Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs - ) - - -def makeIntervalIndex(k: int = 10, name=None, **kwargs) -> IntervalIndex: - """make a length k IntervalIndex""" - x = np.linspace(0, 100, num=(k + 1)) - return IntervalIndex.from_breaks(x, name=name, **kwargs) - - -def makeBoolIndex(k: int = 10, name=None) -> Index: - if k == 1: - return Index([True], name=name) - elif k == 2: - return Index([False, True], name=name) - return Index([False, True] + [False] * (k - 2), name=name) - - -def makeNumericIndex(k: int = 10, *, name=None, dtype: Dtype | None) -> Index: - dtype = pandas_dtype(dtype) - assert isinstance(dtype, np.dtype) - - if dtype.kind in "iu": - values = np.arange(k, dtype=dtype) - if is_unsigned_integer_dtype(dtype): - values += 2 ** (dtype.itemsize * 8 - 1) - elif dtype.kind == "f": - values = np.random.default_rng(2).random(k) - np.random.default_rng(2).random(1) - values.sort() - values = values * (10 ** np.random.default_rng(2).integers(0, 9)) - else: - raise NotImplementedError(f"wrong dtype {dtype}") - - return Index(values, dtype=dtype, name=name) - - -def makeIntIndex(k: int = 10, *, name=None, dtype: Dtype = "int64") -> Index: - dtype = pandas_dtype(dtype) - if not is_signed_integer_dtype(dtype): - raise TypeError(f"Wrong dtype {dtype}") - return makeNumericIndex(k, name=name, dtype=dtype) - - -def makeUIntIndex(k: int = 10, *, name=None, dtype: Dtype = "uint64") -> Index: - dtype = pandas_dtype(dtype) - if not is_unsigned_integer_dtype(dtype): - raise TypeError(f"Wrong dtype {dtype}") - return makeNumericIndex(k, name=name, dtype=dtype) - - -def makeRangeIndex(k: int = 10, name=None, **kwargs) -> RangeIndex: - return RangeIndex(0, k, 1, name=name, **kwargs) - - -def makeFloatIndex(k: int = 10, *, name=None, dtype: Dtype = "float64") -> Index: - dtype = pandas_dtype(dtype) - if not is_float_dtype(dtype): - raise TypeError(f"Wrong dtype {dtype}") - return makeNumericIndex(k, name=name, dtype=dtype) - - -def makeDateIndex( - k: int = 10, freq: Frequency = "B", name=None, **kwargs -) -> DatetimeIndex: - dt = datetime(2000, 1, 1) - dr = bdate_range(dt, periods=k, freq=freq, name=name) - return DatetimeIndex(dr, name=name, **kwargs) - - -def makeTimedeltaIndex( - k: int = 10, freq: Frequency = "D", name=None, **kwargs -) -> TimedeltaIndex: - return pd.timedelta_range(start="1 day", periods=k, freq=freq, name=name, **kwargs) - - -def makePeriodIndex(k: int = 10, name=None, **kwargs) -> PeriodIndex: - dt = datetime(2000, 1, 1) - pi = pd.period_range(start=dt, periods=k, freq="D", name=name, **kwargs) - return pi - - -def makeMultiIndex(k: int = 10, names=None, **kwargs): - N = (k // 2) + 1 - rng = range(N) - mi = MultiIndex.from_product([("foo", "bar"), rng], names=names, **kwargs) - assert len(mi) >= k # GH#38795 - return mi[:k] - - -def 
index_subclass_makers_generator(): - make_index_funcs = [ - makeDateIndex, - makePeriodIndex, - makeTimedeltaIndex, - makeRangeIndex, - makeIntervalIndex, - makeCategoricalIndex, - makeMultiIndex, - ] - yield from make_index_funcs - - -def all_timeseries_index_generator(k: int = 10) -> Iterable[Index]: - """ - Generator which can be iterated over to get instances of all the classes - which represent time-series. - - Parameters - ---------- - k: length of each of the index instances - """ - make_index_funcs: list[Callable[..., Index]] = [ - makeDateIndex, - makePeriodIndex, - makeTimedeltaIndex, - ] - for make_index_func in make_index_funcs: - yield make_index_func(k=k) - - -# make series -def make_rand_series(name=None, dtype=np.float64) -> Series: - index = makeStringIndex(_N) - data = np.random.default_rng(2).standard_normal(_N) - with np.errstate(invalid="ignore"): - data = data.astype(dtype, copy=False) - return Series(data, index=index, name=name) - - -def makeFloatSeries(name=None) -> Series: - return make_rand_series(name=name) - - -def makeStringSeries(name=None) -> Series: - return make_rand_series(name=name) - - -def makeObjectSeries(name=None) -> Series: - data = makeStringIndex(_N) - data = Index(data, dtype=object) - index = makeStringIndex(_N) - return Series(data, index=index, name=name) - - -def getSeriesData() -> dict[str, Series]: - index = makeStringIndex(_N) - return { - c: Series(np.random.default_rng(i).standard_normal(_N), index=index) - for i, c in enumerate(getCols(_K)) - } - - -def makeTimeSeries(nper=None, freq: Frequency = "B", name=None) -> Series: - if nper is None: - nper = _N - return Series( - np.random.default_rng(2).standard_normal(nper), - index=makeDateIndex(nper, freq=freq), - name=name, - ) - - -def makePeriodSeries(nper=None, name=None) -> Series: - if nper is None: - nper = _N - return Series( - np.random.default_rng(2).standard_normal(nper), - index=makePeriodIndex(nper), - name=name, - ) - - -def getTimeSeriesData(nper=None, freq: Frequency = "B") -> dict[str, Series]: - return {c: makeTimeSeries(nper, freq) for c in getCols(_K)} - - -def getPeriodData(nper=None) -> dict[str, Series]: - return {c: makePeriodSeries(nper) for c in getCols(_K)} - - -# make frame -def makeTimeDataFrame(nper=None, freq: Frequency = "B") -> DataFrame: - data = getTimeSeriesData(nper, freq) - return DataFrame(data) - - -def makeDataFrame() -> DataFrame: - data = getSeriesData() - return DataFrame(data) - - -def getMixedTypeDict(): - index = Index(["a", "b", "c", "d", "e"]) - - data = { - "A": [0.0, 1.0, 2.0, 3.0, 4.0], - "B": [0.0, 1.0, 0.0, 1.0, 0.0], - "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], - "D": bdate_range("1/1/2009", periods=5), - } - - return index, data - - -def makeMixedDataFrame() -> DataFrame: - return DataFrame(getMixedTypeDict()[1]) - - -def makePeriodFrame(nper=None) -> DataFrame: - data = getPeriodData(nper) - return DataFrame(data) - - -def makeCustomIndex( - nentries, - nlevels, - prefix: str = "#", - names: bool | str | list[str] | None = False, - ndupe_l=None, - idx_type=None, -) -> Index: - """ - Create an index/multindex with given dimensions, levels, names, etc' - - nentries - number of entries in index - nlevels - number of levels (> 1 produces multindex) - prefix - a string prefix for labels - names - (Optional), bool or list of strings. if True will use default - names, if false will use no names, if a list is given, the name of - each level in the index will be taken from the list. 
- ndupe_l - (Optional), list of ints, the number of rows for which the - label will repeated at the corresponding level, you can specify just - the first few, the rest will use the default ndupe_l of 1. - len(ndupe_l) <= nlevels. - idx_type - "i"/"f"/"s"/"dt"/"p"/"td". - If idx_type is not None, `idx_nlevels` must be 1. - "i"/"f" creates an integer/float index, - "s" creates a string - "dt" create a datetime index. - "td" create a datetime index. - - if unspecified, string labels will be generated. - """ - if ndupe_l is None: - ndupe_l = [1] * nlevels - assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels - assert names is None or names is False or names is True or len(names) is nlevels - assert idx_type is None or ( - idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1 - ) - - if names is True: - # build default names - names = [prefix + str(i) for i in range(nlevels)] - if names is False: - # pass None to index constructor for no name - names = None - - # make singleton case uniform - if isinstance(names, str) and nlevels == 1: - names = [names] - - # specific 1D index type requested? - idx_func_dict: dict[str, Callable[..., Index]] = { - "i": makeIntIndex, - "f": makeFloatIndex, - "s": makeStringIndex, - "dt": makeDateIndex, - "td": makeTimedeltaIndex, - "p": makePeriodIndex, - } - idx_func = idx_func_dict.get(idx_type) - if idx_func: - idx = idx_func(nentries) - # but we need to fill in the name - if names: - idx.name = names[0] - return idx - elif idx_type is not None: - raise ValueError( - f"{repr(idx_type)} is not a legal value for `idx_type`, " - "use 'i'/'f'/'s'/'dt'/'p'/'td'." - ) - - if len(ndupe_l) < nlevels: - ndupe_l.extend([1] * (nlevels - len(ndupe_l))) - assert len(ndupe_l) == nlevels - - assert all(x > 0 for x in ndupe_l) - - list_of_lists = [] - for i in range(nlevels): - - def keyfunc(x): - numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_") - return [int(num) for num in numeric_tuple] - - # build a list of lists to create the index from - div_factor = nentries // ndupe_l[i] + 1 - - # Deprecated since version 3.9: collections.Counter now supports []. See PEP 585 - # and Generic Alias Type. - cnt: Counter[str] = collections.Counter() - for j in range(div_factor): - label = f"{prefix}_l{i}_g{j}" - cnt[label] = ndupe_l[i] - # cute Counter trick - result = sorted(cnt.elements(), key=keyfunc)[:nentries] - list_of_lists.append(result) - - tuples = list(zip(*list_of_lists)) - - # convert tuples to index - if nentries == 1: - # we have a single level of tuples, i.e. a regular Index - name = None if names is None else names[0] - index = Index(tuples[0], name=name) - elif nlevels == 1: - name = None if names is None else names[0] - index = Index((x[0] for x in tuples), name=name) - else: - index = MultiIndex.from_tuples(tuples, names=names) - return index - - -def makeCustomDataframe( - nrows, - ncols, - c_idx_names: bool | list[str] = True, - r_idx_names: bool | list[str] = True, - c_idx_nlevels: int = 1, - r_idx_nlevels: int = 1, - data_gen_f=None, - c_ndupe_l=None, - r_ndupe_l=None, - dtype=None, - c_idx_type=None, - r_idx_type=None, -) -> DataFrame: - """ - Create a DataFrame using supplied parameters. - - Parameters - ---------- - nrows, ncols - number of data rows/cols - c_idx_names, r_idx_names - False/True/list of strings, yields No names , - default names or uses the provided names for the levels of the - corresponding index. You can provide a single string when - c_idx_nlevels ==1. - c_idx_nlevels - number of levels in columns index. 
> 1 will yield MultiIndex - r_idx_nlevels - number of levels in rows index. > 1 will yield MultiIndex - data_gen_f - a function f(row,col) which return the data value - at that position, the default generator used yields values of the form - "RxCy" based on position. - c_ndupe_l, r_ndupe_l - list of integers, determines the number - of duplicates for each label at a given level of the corresponding - index. The default `None` value produces a multiplicity of 1 across - all levels, i.e. a unique index. Will accept a partial list of length - N < idx_nlevels, for just the first N levels. If ndupe doesn't divide - nrows/ncol, the last label might have lower multiplicity. - dtype - passed to the DataFrame constructor as is, in case you wish to - have more control in conjunction with a custom `data_gen_f` - r_idx_type, c_idx_type - "i"/"f"/"s"/"dt"/"td". - If idx_type is not None, `idx_nlevels` must be 1. - "i"/"f" creates an integer/float index, - "s" creates a string index - "dt" create a datetime index. - "td" create a timedelta index. - - if unspecified, string labels will be generated. - - Examples - -------- - # 5 row, 3 columns, default names on both, single index on both axis - >> makeCustomDataframe(5,3) - - # make the data a random int between 1 and 100 - >> mkdf(5,3,data_gen_f=lambda r,c:randint(1,100)) - - # 2-level multiindex on rows with each label duplicated - # twice on first level, default names on both axis, single - # index on both axis - >> a=makeCustomDataframe(5,3,r_idx_nlevels=2,r_ndupe_l=[2]) - - # DatetimeIndex on row, index with unicode labels on columns - # no names on either axis - >> a=makeCustomDataframe(5,3,c_idx_names=False,r_idx_names=False, - r_idx_type="dt",c_idx_type="u") - - # 4-level multindex on rows with names provided, 2-level multindex - # on columns with default labels and default names. - >> a=makeCustomDataframe(5,3,r_idx_nlevels=4, - r_idx_names=["FEE","FIH","FOH","FUM"], - c_idx_nlevels=2) - - >> a=mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) - """ - assert c_idx_nlevels > 0 - assert r_idx_nlevels > 0 - assert r_idx_type is None or ( - r_idx_type in ("i", "f", "s", "dt", "p", "td") and r_idx_nlevels == 1 - ) - assert c_idx_type is None or ( - c_idx_type in ("i", "f", "s", "dt", "p", "td") and c_idx_nlevels == 1 - ) - - columns = makeCustomIndex( - ncols, - nlevels=c_idx_nlevels, - prefix="C", - names=c_idx_names, - ndupe_l=c_ndupe_l, - idx_type=c_idx_type, - ) - index = makeCustomIndex( - nrows, - nlevels=r_idx_nlevels, - prefix="R", - names=r_idx_names, - ndupe_l=r_ndupe_l, - idx_type=r_idx_type, - ) - - # by default, generate data based on location - if data_gen_f is None: - data_gen_f = lambda r, c: f"R{r}C{c}" - - data = [[data_gen_f(r, c) for c in range(ncols)] for r in range(nrows)] - - return DataFrame(data, index, columns, dtype=dtype) - - class SubclassedSeries(Series): _metadata = ["testattr", "name"] @@ -857,42 +362,6 @@ return lambda *args, **kwargs: SubclassedSeries(*args, **kwargs) -class SubclassedCategorical(Categorical): - pass - - -def _make_skipna_wrapper(alternative, skipna_alternative=None): - """ - Create a function for calling on an array. - - Parameters - ---------- - alternative : function - The function to be called on the array with no NaNs. - Only used when 'skipna_alternative' is None. 
- skipna_alternative : function - The function to be called on the original array - - Returns - ------- - function - """ - if skipna_alternative: - - def skipna_wrapper(x): - return skipna_alternative(x.values) - - else: - - def skipna_wrapper(x): - nona = x.dropna() - if len(nona) == 0: - return np.nan - return alternative(nona) - - return skipna_wrapper - - def convert_rows_list_to_csv_str(rows_list: list[str]) -> str: """ Convert list of CSV rows to single CSV-formatted string for current OS. @@ -1018,6 +487,17 @@ # ----------------------------------------------------------------------------- +_UNITS = ["s", "ms", "us", "ns"] + + +def get_finest_unit(left: str, right: str): + """ + Find the higher of two datetime64 units. + """ + if _UNITS.index(left) >= _UNITS.index(right): + return left + return right + def shares_memory(left, right) -> bool: """ @@ -1043,10 +523,18 @@ if isinstance(left, pd.core.arrays.IntervalArray): return shares_memory(left._left, right) or shares_memory(left._right, right) - if isinstance(left, ExtensionArray) and left.dtype == "string[pyarrow]": + if ( + isinstance(left, ExtensionArray) + and is_string_dtype(left.dtype) + and left.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] + ): # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 left = cast("ArrowExtensionArray", left) - if isinstance(right, ExtensionArray) and right.dtype == "string[pyarrow]": + if ( + isinstance(right, ExtensionArray) + and is_string_dtype(right.dtype) + and right.dtype.storage in ("pyarrow", "pyarrow_numpy") # type: ignore[attr-defined] + ): right = cast("ArrowExtensionArray", right) left_pa_data = left._pa_array right_pa_data = right._pa_array @@ -1073,7 +561,6 @@ "ALL_INT_NUMPY_DTYPES", "ALL_NUMPY_DTYPES", "ALL_REAL_NUMPY_DTYPES", - "all_timeseries_index_generator", "assert_almost_equal", "assert_attr_equal", "assert_categorical_equal", @@ -1097,6 +584,7 @@ "assert_series_equal", "assert_sp_array_equal", "assert_timedelta_array_equal", + "assert_cow_warning", "at", "BOOL_DTYPES", "box_expected", @@ -1106,60 +594,27 @@ "convert_rows_list_to_csv_str", "DATETIME64_DTYPES", "decompress_file", - "EMPTY_STRING_PATTERN", "ENDIAN", "ensure_clean", - "equalContents", "external_error_raised", "FLOAT_EA_DTYPES", "FLOAT_NUMPY_DTYPES", - "getCols", "get_cython_table_params", "get_dtype", "getitem", "get_locales", - "getMixedTypeDict", + "get_finest_unit", "get_obj", "get_op_from_name", - "getPeriodData", - "getSeriesData", - "getTimeSeriesData", "iat", "iloc", - "index_subclass_makers_generator", "loc", - "makeBoolIndex", - "makeCategoricalIndex", - "makeCustomDataframe", - "makeCustomIndex", - "makeDataFrame", - "makeDateIndex", - "makeFloatIndex", - "makeFloatSeries", - "makeIntervalIndex", - "makeIntIndex", - "makeMixedDataFrame", - "makeMultiIndex", - "makeNumericIndex", - "makeObjectSeries", - "makePeriodFrame", - "makePeriodIndex", - "makePeriodSeries", - "make_rand_series", - "makeRangeIndex", - "makeStringIndex", - "makeStringSeries", - "makeTimeDataFrame", - "makeTimedeltaIndex", - "makeTimeSeries", - "makeUIntIndex", "maybe_produces_warning", "NARROW_NP_DTYPES", "NP_NAT_OBJECTS", "NULL_OBJECTS", "OBJECT_DTYPES", "raise_assert_detail", - "reset_display_options", "raises_chained_assignment_error", "round_trip_localpath", "round_trip_pathlib", @@ -1171,7 +626,6 @@ "SIGNED_INT_EA_DTYPES", "SIGNED_INT_NUMPY_DTYPES", "STRING_DTYPES", - "SubclassedCategorical", "SubclassedDataFrame", "SubclassedSeries", "TIMEDELTA64_DTYPES", diff -Nru 
pandas-2.1.4+dfsg/pandas/_testing/_hypothesis.py pandas-2.2.2+dfsg/pandas/_testing/_hypothesis.py --- pandas-2.1.4+dfsg/pandas/_testing/_hypothesis.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_testing/_hypothesis.py 2024-04-10 17:42:52.000000000 +0000 @@ -54,8 +54,12 @@ DATETIME_NO_TZ = st.datetimes() DATETIME_JAN_1_1900_OPTIONAL_TZ = st.datetimes( - min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(), - max_value=pd.Timestamp(1900, 1, 1).to_pydatetime(), + min_value=pd.Timestamp( + 1900, 1, 1 + ).to_pydatetime(), # pyright: ignore[reportGeneralTypeIssues] + max_value=pd.Timestamp( + 1900, 1, 1 + ).to_pydatetime(), # pyright: ignore[reportGeneralTypeIssues] timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()), ) diff -Nru pandas-2.1.4+dfsg/pandas/_testing/_io.py pandas-2.2.2+dfsg/pandas/_testing/_io.py --- pandas-2.1.4+dfsg/pandas/_testing/_io.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_testing/_io.py 2024-04-10 17:42:52.000000000 +0000 @@ -118,7 +118,7 @@ return obj -def write_to_compressed(compression, path, data, dest: str = "test"): +def write_to_compressed(compression, path, data, dest: str = "test") -> None: """ Write data to a compressed file. diff -Nru pandas-2.1.4+dfsg/pandas/_testing/_warnings.py pandas-2.2.2+dfsg/pandas/_testing/_warnings.py --- pandas-2.1.4+dfsg/pandas/_testing/_warnings.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_testing/_warnings.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,6 +4,7 @@ contextmanager, nullcontext, ) +import inspect import re import sys from typing import ( @@ -213,15 +214,19 @@ def _assert_raised_with_correct_stacklevel( actual_warning: warnings.WarningMessage, ) -> None: - from inspect import ( - getframeinfo, - stack, - ) - - caller = getframeinfo(stack()[4][0]) + # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow + frame = inspect.currentframe() + for _ in range(4): + frame = frame.f_back # type: ignore[union-attr] + try: + caller_filename = inspect.getfile(frame) # type: ignore[arg-type] + finally: + # See note in + # https://docs.python.org/3/library/inspect.html#inspect.Traceback + del frame msg = ( "Warning not set with correct stacklevel. " f"File where warning is raised: {actual_warning.filename} != " - f"{caller.filename}. Warning message: {actual_warning.message}" + f"{caller_filename}. 
Warning message: {actual_warning.message}" ) - assert actual_warning.filename == caller.filename, msg + assert actual_warning.filename == caller_filename, msg diff -Nru pandas-2.1.4+dfsg/pandas/_testing/asserters.py pandas-2.2.2+dfsg/pandas/_testing/asserters.py --- pandas-2.1.4+dfsg/pandas/_testing/asserters.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_testing/asserters.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,11 +4,13 @@ from typing import ( TYPE_CHECKING, Literal, + NoReturn, cast, ) import numpy as np +from pandas._libs import lib from pandas._libs.missing import is_matching_na from pandas._libs.sparse import SparseIndex import pandas._libs.testing as _testing @@ -16,6 +18,7 @@ from pandas.core.dtypes.common import ( is_bool, + is_float_dtype, is_integer_dtype, is_number, is_numeric_dtype, @@ -43,7 +46,6 @@ Series, TimedeltaIndex, ) -from pandas.core.algorithms import take_nd from pandas.core.arrays import ( DatetimeArray, ExtensionArray, @@ -143,7 +145,7 @@ ) -def _check_isinstance(left, right, cls): +def _check_isinstance(left, right, cls) -> None: """ Helper method for our assert_* methods that ensures that the two objects being compared have the right type before @@ -209,8 +211,6 @@ Whether to compare the order of index entries as well as their values. If True, both indexes must contain the same elements, in the same order. If False, both indexes must contain the same elements, but in any order. - - .. versionadded:: 1.2.0 rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. atol : float, default 1e-8 @@ -246,13 +246,6 @@ assert_attr_equal("dtype", left, right, obj=obj) - def _get_ilevel_values(index, level): - # accept level number only - unique = index.levels[level] - level_codes = index.codes[level] - filled = take_nd(unique._values, level_codes, fill_value=unique._na_value) - return unique._shallow_copy(filled, name=index.names[level]) - # instance validation _check_isinstance(left, right, Index) @@ -283,22 +276,36 @@ right = cast(MultiIndex, right) for level in range(left.nlevels): - # cannot use get_level_values here because it can change dtype - llevel = _get_ilevel_values(left, level) - rlevel = _get_ilevel_values(right, level) - lobj = f"MultiIndex level [{level}]" - assert_index_equal( - llevel, - rlevel, - exact=exact, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=lobj, - ) + try: + # try comparison on levels/codes to avoid densifying MultiIndex + assert_index_equal( + left.levels[level], + right.levels[level], + exact=exact, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=lobj, + ) + assert_numpy_array_equal(left.codes[level], right.codes[level]) + except AssertionError: + llevel = left.get_level_values(level) + rlevel = right.get_level_values(level) + + assert_index_equal( + llevel, + rlevel, + exact=exact, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=lobj, + ) # get_level_values may change dtype _check_types(left.levels[level], right.levels[level], obj=obj) @@ -416,22 +423,25 @@ def assert_is_valid_plot_return_object(objs) -> None: - import matplotlib.pyplot as plt + from matplotlib.artist import Artist + from matplotlib.axes import Axes if isinstance(objs, (Series, np.ndarray)): + if isinstance(objs, Series): + objs = objs._values for el in objs.ravel(): msg = ( "one of 'objs' 
is not a matplotlib Axes instance, " f"type encountered {repr(type(el).__name__)}" ) - assert isinstance(el, (plt.Axes, dict)), msg + assert isinstance(el, (Axes, dict)), msg else: msg = ( "objs is neither an ndarray of Artist instances nor a single " "ArtistArtist instance, tuple, or dict, 'objs' is a " f"{repr(type(objs).__name__)}" ) - assert isinstance(objs, (plt.Artist, tuple, dict)), msg + assert isinstance(objs, (Artist, tuple, dict)), msg def assert_is_sorted(seq) -> None: @@ -439,7 +449,10 @@ if isinstance(seq, (Index, Series)): seq = seq.values # sorting does not change precisions - assert_numpy_array_equal(seq, np.sort(np.array(seq))) + if isinstance(seq, np.ndarray): + assert_numpy_array_equal(seq, np.sort(np.array(seq))) + else: + assert_extension_array_equal(seq, seq[seq.argsort()]) def assert_categorical_equal( @@ -565,13 +578,16 @@ def raise_assert_detail( obj, message, left, right, diff=None, first_diff=None, index_values=None -): +) -> NoReturn: __tracebackhide__ = True msg = f"""{obj} are different {message}""" + if isinstance(index_values, Index): + index_values = np.asarray(index_values) + if isinstance(index_values, np.ndarray): msg += f"\n[index]: {pprint_thing(index_values)}" @@ -626,7 +642,7 @@ obj : str, default 'numpy array' Specify object name being compared, internally used to show appropriate assertion message. - index_values : numpy.ndarray, default None + index_values : Index | numpy.ndarray, default None optional index (shared by both left and right), used in output. """ __tracebackhide__ = True @@ -650,7 +666,7 @@ if left_base is right_base: raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") - def _raise(left, right, err_msg): + def _raise(left, right, err_msg) -> NoReturn: if err_msg is None: if left.shape != right.shape: raise_assert_detail( @@ -683,9 +699,9 @@ right, check_dtype: bool | Literal["equiv"] = True, index_values=None, - check_exact: bool = False, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + check_exact: bool | lib.NoDefault = lib.no_default, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "ExtensionArray", ) -> None: """ @@ -697,10 +713,15 @@ The two arrays to compare. check_dtype : bool, default True Whether to check if the ExtensionArray dtypes are identical. - index_values : numpy.ndarray, default None + index_values : Index | numpy.ndarray, default None Optional index (shared by both left and right), used in output. check_exact : bool, default False Whether to compare number exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. 
atol : float, default 1e-8 @@ -724,6 +745,23 @@ >>> b, c = a.array, a.array >>> tm.assert_extension_array_equal(b, c) """ + if ( + check_exact is lib.no_default + and rtol is lib.no_default + and atol is lib.no_default + ): + check_exact = ( + is_numeric_dtype(left.dtype) + and not is_float_dtype(left.dtype) + or is_numeric_dtype(right.dtype) + and not is_float_dtype(right.dtype) + ) + elif check_exact is lib.no_default: + check_exact = False + + rtol = rtol if rtol is not lib.no_default else 1.0e-5 + atol = atol if atol is not lib.no_default else 1.0e-8 + assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" if check_dtype: @@ -741,7 +779,7 @@ else: l_unit = np.datetime_data(left.dtype)[0] if not isinstance(right.dtype, np.dtype): - r_unit = cast(DatetimeTZDtype, left.dtype).unit + r_unit = cast(DatetimeTZDtype, right.dtype).unit else: r_unit = np.datetime_data(right.dtype)[0] if ( @@ -793,14 +831,14 @@ check_index_type: bool | Literal["equiv"] = "equiv", check_series_type: bool = True, check_names: bool = True, - check_exact: bool = False, + check_exact: bool | lib.NoDefault = lib.no_default, check_datetimelike_compat: bool = False, check_categorical: bool = True, check_category_order: bool = True, check_freq: bool = True, check_flags: bool = True, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "Series", *, check_index: bool = True, @@ -824,6 +862,11 @@ Whether to check the Series and Index names attribute. check_exact : bool, default False Whether to compare number exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True @@ -834,9 +877,6 @@ Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. check_flags : bool, default True Whether to check the `flags` attribute. - - .. versionadded:: 1.2.0 - rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. 
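The hunks above change the default comparison mode of the tm.assert_* helpers: when none of check_exact, rtol and atol are passed, integer (non-float numeric) dtypes are now compared exactly, while float dtypes keep the old approximate defaults (rtol=1e-5, atol=1e-8). A minimal sketch of the visible effect, not part of the patch, assuming pandas 2.2 is installed; the Series values are made up for illustration:

import pandas as pd
import pandas._testing as tm

s1 = pd.Series([1_000_000, 2_000_000], dtype="int64")
s2 = pd.Series([1_000_001, 2_000_000], dtype="int64")

# pandas >= 2.2: with check_exact/rtol/atol all left unspecified, integer
# dtypes are compared exactly, so the off-by-one value now raises.
try:
    tm.assert_series_equal(s1, s2)
except AssertionError:
    print("integer Series compared exactly and found different")

# The previous approximate behaviour can still be requested explicitly;
# a difference of 1 is within rtol=1e-5 of 1_000_001, so this passes.
tm.assert_series_equal(s1, s2, check_exact=False)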
atol : float, default 1e-8 @@ -862,6 +902,23 @@ >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True + check_exact_index = False if check_exact is lib.no_default else check_exact + if ( + check_exact is lib.no_default + and rtol is lib.no_default + and atol is lib.no_default + ): + check_exact = ( + is_numeric_dtype(left.dtype) + and not is_float_dtype(left.dtype) + or is_numeric_dtype(right.dtype) + and not is_float_dtype(right.dtype) + ) + elif check_exact is lib.no_default: + check_exact = False + + rtol = rtol if rtol is not lib.no_default else 1.0e-5 + atol = atol if atol is not lib.no_default else 1.0e-8 if not check_index and check_like: raise ValueError("check_like must be False if check_index is False") @@ -888,7 +945,7 @@ right.index, exact=check_index_type, check_names=check_names, - check_exact=check_exact, + check_exact=check_exact_index, check_categorical=check_categorical, check_order=not check_like, rtol=rtol, @@ -916,8 +973,7 @@ pass else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - - if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): + if check_exact: left_values = left._values right_values = right._values # Only check exact if dtype is numeric @@ -928,16 +984,22 @@ left_values, right_values, check_dtype=check_dtype, - index_values=np.asarray(left.index), + index_values=left.index, obj=str(obj), ) else: + # convert both to NumPy if not, check_dtype would raise earlier + lv, rv = left_values, right_values + if isinstance(left_values, ExtensionArray): + lv = left_values.to_numpy() + if isinstance(right_values, ExtensionArray): + rv = right_values.to_numpy() assert_numpy_array_equal( - left_values, - right_values, + lv, + rv, check_dtype=check_dtype, obj=str(obj), - index_values=np.asarray(left.index), + index_values=left.index, ) elif check_datetimelike_compat and ( needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype) @@ -968,7 +1030,7 @@ atol=atol, check_dtype=bool(check_dtype), obj=str(obj), - index_values=np.asarray(left.index), + index_values=left.index, ) elif isinstance(left.dtype, ExtensionDtype) and isinstance( right.dtype, ExtensionDtype @@ -979,7 +1041,7 @@ rtol=rtol, atol=atol, check_dtype=check_dtype, - index_values=np.asarray(left.index), + index_values=left.index, obj=str(obj), ) elif is_extension_array_dtype_and_needs_i8_conversion( @@ -989,7 +1051,7 @@ left._values, right._values, check_dtype=check_dtype, - index_values=np.asarray(left.index), + index_values=left.index, obj=str(obj), ) elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype): @@ -998,7 +1060,7 @@ left._values, right._values, check_dtype=check_dtype, - index_values=np.asarray(left.index), + index_values=left.index, obj=str(obj), ) else: @@ -1009,7 +1071,7 @@ atol=atol, check_dtype=bool(check_dtype), obj=str(obj), - index_values=np.asarray(left.index), + index_values=left.index, ) # metadata comparison @@ -1038,14 +1100,14 @@ check_frame_type: bool = True, check_names: bool = True, by_blocks: bool = False, - check_exact: bool = False, + check_exact: bool | lib.NoDefault = lib.no_default, check_datetimelike_compat: bool = False, check_categorical: bool = True, check_like: bool = False, check_freq: bool = True, check_flags: bool = True, - rtol: float = 1.0e-5, - atol: float = 1.0e-8, + rtol: float | lib.NoDefault = lib.no_default, + atol: float | lib.NoDefault = lib.no_default, obj: str = "DataFrame", ) -> None: """ @@ -1081,6 +1143,11 @@ If True, compare by blocks. 
check_exact : bool, default False Whether to compare number exactly. + + .. versionchanged:: 2.2.0 + + Defaults to True for integer dtypes if none of + ``check_exact``, ``rtol`` and ``atol`` are specified. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True @@ -1135,6 +1202,9 @@ >>> assert_frame_equal(df1, df2, check_dtype=False) """ __tracebackhide__ = True + _rtol = rtol if rtol is not lib.no_default else 1.0e-5 + _atol = atol if atol is not lib.no_default else 1.0e-8 + _check_exact = check_exact if check_exact is not lib.no_default else False # instance validation _check_isinstance(left, right, DataFrame) @@ -1158,11 +1228,11 @@ right.index, exact=check_index_type, check_names=check_names, - check_exact=check_exact, + check_exact=_check_exact, check_categorical=check_categorical, check_order=not check_like, - rtol=rtol, - atol=atol, + rtol=_rtol, + atol=_atol, obj=f"{obj}.index", ) @@ -1172,11 +1242,11 @@ right.columns, exact=check_column_type, check_names=check_names, - check_exact=check_exact, + check_exact=_check_exact, check_categorical=check_categorical, check_order=not check_like, - rtol=rtol, - atol=atol, + rtol=_rtol, + atol=_atol, obj=f"{obj}.columns", ) @@ -1185,8 +1255,8 @@ # compare by blocks if by_blocks: - rblocks = right._to_dict_of_blocks(copy=False) - lblocks = left._to_dict_of_blocks(copy=False) + rblocks = right._to_dict_of_blocks() + lblocks = left._to_dict_of_blocks() for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): assert dtype in lblocks assert dtype in rblocks diff -Nru pandas-2.1.4+dfsg/pandas/_testing/contexts.py pandas-2.2.2+dfsg/pandas/_testing/contexts.py --- pandas-2.1.4+dfsg/pandas/_testing/contexts.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_testing/contexts.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,6 +11,8 @@ ) import uuid +from pandas._config import using_copy_on_write + from pandas.compat import PYPY from pandas.errors import ChainedAssignmentError @@ -193,9 +195,14 @@ set_option("compute.use_numexpr", olduse) -def raises_chained_assignment_error(extra_warnings=(), extra_match=()): +def raises_chained_assignment_error(warn=True, extra_warnings=(), extra_match=()): from pandas._testing import assert_produces_warning + if not warn: + from contextlib import nullcontext + + return nullcontext() + if PYPY and not extra_warnings: from contextlib import nullcontext @@ -206,11 +213,45 @@ match="|".join(extra_match), ) else: - match = ( - "A value is trying to be set on a copy of a DataFrame or Series " - "through chained assignment" - ) + if using_copy_on_write(): + warning = ChainedAssignmentError + match = ( + "A value is trying to be set on a copy of a DataFrame or Series " + "through chained assignment" + ) + else: + warning = FutureWarning # type: ignore[assignment] + # TODO update match + match = "ChainedAssignmentError" + if extra_warnings: + warning = (warning, *extra_warnings) # type: ignore[assignment] return assert_produces_warning( - (ChainedAssignmentError, *extra_warnings), + warning, match="|".join((match, *extra_match)), ) + + +def assert_cow_warning(warn=True, match=None, **kwargs): + """ + Assert that a warning is raised in the CoW warning mode. + + Parameters + ---------- + warn : bool, default True + By default, check that a warning is raised. Can be turned off by passing False. + match : str + The warning message to match against, if different from the default. 
+ kwargs + Passed through to assert_produces_warning + """ + from pandas._testing import assert_produces_warning + + if not warn: + from contextlib import nullcontext + + return nullcontext() + + if not match: + match = "Setting a value on a view" + + return assert_produces_warning(FutureWarning, match=match, **kwargs) diff -Nru pandas-2.1.4+dfsg/pandas/_typing.py pandas-2.2.2+dfsg/pandas/_typing.py --- pandas-2.1.4+dfsg/pandas/_typing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_typing.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,6 +4,7 @@ Hashable, Iterator, Mapping, + MutableMapping, Sequence, ) from datetime import ( @@ -24,6 +25,7 @@ Type as type_t, TypeVar, Union, + overload, ) import numpy as np @@ -85,6 +87,8 @@ # Name "npt._ArrayLikeInt_co" is not defined [name-defined] NumpySorter = Optional[npt._ArrayLikeInt_co] # type: ignore[name-defined] + from typing import SupportsIndex + if sys.version_info >= (3, 10): from typing import TypeGuard # pyright: ignore[reportUnusedImport] else: @@ -100,6 +104,7 @@ TypeGuard: Any = None HashableT = TypeVar("HashableT", bound=Hashable) +MutableMappingT = TypeVar("MutableMappingT", bound=MutableMapping) # array-like @@ -109,10 +114,40 @@ # list-like -# Cannot use `Sequence` because a string is a sequence, and we don't want to -# accept that. Could refine if https://github.com/python/typing/issues/256 is -# resolved to differentiate between Sequence[str] and str -ListLike = Union[AnyArrayLike, list, range] +# from https://github.com/hauntsaninja/useful_types +# includes Sequence-like objects but excludes str and bytes +_T_co = TypeVar("_T_co", covariant=True) + + +class SequenceNotStr(Protocol[_T_co]): + @overload + def __getitem__(self, index: SupportsIndex, /) -> _T_co: + ... + + @overload + def __getitem__(self, index: slice, /) -> Sequence[_T_co]: + ... + + def __contains__(self, value: object, /) -> bool: + ... + + def __len__(self) -> int: + ... + + def __iter__(self) -> Iterator[_T_co]: + ... + + def index(self, value: Any, /, start: int = 0, stop: int = ...) -> int: + ... + + def count(self, value: Any, /) -> int: + ... + + def __reversed__(self) -> Iterator[_T_co]: + ... 
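For context on the SequenceNotStr protocol introduced just above: it is a typing-only structural type (vendored from the useful_types project, as the comment notes) that the new ListLike and UsecolsArgType aliases use so annotations accept list-like containers of labels while rejecting a bare str. A small, hypothetical sketch of the idea, not part of the patch, assuming pandas 2.2 is installed (SequenceNotStr lives in the private pandas._typing module); the function name is made up:

from __future__ import annotations

from pandas._typing import SequenceNotStr


def normalize_columns(cols: SequenceNotStr[str]) -> list[str]:
    # Runtime behaviour is ordinary iteration; the annotation only guides
    # static type checkers.
    return [str(c) for c in cols]


normalize_columns(["a", "b", "c"])  # accepted
normalize_columns(("a", "b"))       # tuples satisfy the protocol as well
normalize_columns("abc")            # still runs, but a type checker rejects
                                    # it: str does not structurally match the
                                    # protocol (commonly because str.__contains__
                                    # only accepts str, not object)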
+ + +ListLike = Union[AnyArrayLike, SequenceNotStr, range] # scalars @@ -120,7 +155,7 @@ DatetimeLikeScalar = Union["Period", "Timestamp", "Timedelta"] PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, date] -IntStrT = TypeVar("IntStrT", int, str) +IntStrT = TypeVar("IntStrT", bound=Union[int, str]) # timestamp and timedelta convertible types @@ -199,7 +234,9 @@ # types of `func` kwarg for DataFrame.aggregate and Series.aggregate AggFuncTypeBase = Union[Callable, str] -AggFuncTypeDict = dict[Hashable, Union[AggFuncTypeBase, list[AggFuncTypeBase]]] +AggFuncTypeDict = MutableMapping[ + Hashable, Union[AggFuncTypeBase, list[AggFuncTypeBase]] +] AggFuncType = Union[ AggFuncTypeBase, list[AggFuncTypeBase], @@ -375,6 +412,9 @@ # read_xml parsers XMLParsers = Literal["lxml", "etree"] +# read_html flavors +HTMLFlavors = Literal["lxml", "html5lib", "bs4"] + # Interval closed type IntervalLeftRight = Literal["left", "right"] IntervalClosedType = Union[IntervalLeftRight, Literal["both", "neither"]] @@ -474,3 +514,12 @@ # Offsets OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"] + +# read_csv: usecols +UsecolsArgType = Union[ + SequenceNotStr[Hashable], + range, + AnyArrayLike, + Callable[[HashableT], bool], + None, +] diff -Nru pandas-2.1.4+dfsg/pandas/_version.py pandas-2.2.2+dfsg/pandas/_version.py --- pandas-2.1.4+dfsg/pandas/_version.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/_version.py 2024-04-10 17:42:52.000000000 +0000 @@ -386,7 +386,7 @@ return pieces -def plus_or_dot(pieces): +def plus_or_dot(pieces) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." diff -Nru pandas-2.1.4+dfsg/pandas/arrays/__init__.py pandas-2.2.2+dfsg/pandas/arrays/__init__.py --- pandas-2.1.4+dfsg/pandas/arrays/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/arrays/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -36,7 +36,7 @@ ] -def __getattr__(name: str): +def __getattr__(name: str) -> type[NumpyExtensionArray]: if name == "PandasArray": # GH#53694 import warnings diff -Nru pandas-2.1.4+dfsg/pandas/compat/__init__.py pandas-2.2.2+dfsg/pandas/compat/__init__.py --- pandas-2.1.4+dfsg/pandas/compat/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/compat/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -25,13 +25,12 @@ import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( - pa_version_under7p0, - pa_version_under8p0, - pa_version_under9p0, + pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, pa_version_under14p1, + pa_version_under16p0, ) if TYPE_CHECKING: @@ -183,13 +182,12 @@ __all__ = [ "is_numpy_dev", - "pa_version_under7p0", - "pa_version_under8p0", - "pa_version_under9p0", + "pa_version_under10p1", "pa_version_under11p0", "pa_version_under13p0", "pa_version_under14p0", "pa_version_under14p1", + "pa_version_under16p0", "IS64", "ISMUSL", "PY310", diff -Nru pandas-2.1.4+dfsg/pandas/compat/_optional.py pandas-2.2.2+dfsg/pandas/compat/_optional.py --- pandas-2.1.4+dfsg/pandas/compat/_optional.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/compat/_optional.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,41 +15,44 @@ # Update install.rst & setup.cfg when updating versions! 
VERSIONS = { - "bs4": "4.11.1", - "blosc": "1.21.0", - "bottleneck": "1.3.4", + "adbc-driver-postgresql": "0.8.0", + "adbc-driver-sqlite": "0.8.0", + "bs4": "4.11.2", + "blosc": "1.21.3", + "bottleneck": "1.3.6", "dataframe-api-compat": "0.1.7", - "fastparquet": "0.8.1", - "fsspec": "2022.05.0", + "fastparquet": "2022.12.0", + "fsspec": "2022.11.0", "html5lib": "1.1", "hypothesis": "6.46.1", - "gcsfs": "2022.05.0", + "gcsfs": "2022.11.0", "jinja2": "3.1.2", - "lxml.etree": "4.8.0", - "matplotlib": "3.6.1", - "numba": "0.55.2", - "numexpr": "2.8.0", + "lxml.etree": "4.9.2", + "matplotlib": "3.6.3", + "numba": "0.56.4", + "numexpr": "2.8.4", "odfpy": "1.4.1", - "openpyxl": "3.0.10", - "pandas_gbq": "0.17.5", - "psycopg2": "2.9.3", # (dt dec pq3 ext lo64) + "openpyxl": "3.1.0", + "pandas_gbq": "0.19.0", + "psycopg2": "2.9.6", # (dt dec pq3 ext lo64) "pymysql": "1.0.2", - "pyarrow": "7.0.0", - "pyreadstat": "1.1.5", + "pyarrow": "10.0.1", + "pyreadstat": "1.2.0", "pytest": "7.3.2", - "pyxlsb": "1.0.9", - "s3fs": "2022.05.0", - "scipy": "1.8.1", - "sqlalchemy": "1.4.36", - "tables": "3.7.0", - "tabulate": "0.8.10", - "xarray": "2022.03.0", + "python-calamine": "0.1.7", + "pyxlsb": "1.0.10", + "s3fs": "2022.11.0", + "scipy": "1.10.0", + "sqlalchemy": "2.0.0", + "tables": "3.8.0", + "tabulate": "0.9.0", + "xarray": "2022.12.0", "xlrd": "2.0.1", - "xlsxwriter": "3.0.3", - "zstandard": "0.17.0", - "tzdata": "2022.1", - "qtpy": "2.2.0", - "pyqt5": "5.15.6", + "xlsxwriter": "3.0.5", + "zstandard": "0.19.0", + "tzdata": "2022.7", + "qtpy": "2.3.0", + "pyqt5": "5.15.9", } # A mapping from import name to package name (on PyPI) for packages where @@ -62,6 +65,7 @@ "lxml.etree": "lxml", "odf": "odfpy", "pandas_gbq": "pandas-gbq", + "python_calamine": "python-calamine", "sqlalchemy": "SQLAlchemy", "tables": "pytables", } @@ -116,9 +120,8 @@ The imported module, when found and the version is correct. None is returned when the package is not found and `errors` is False, or when the package's version is too old and `errors` - is ``'warn'``. + is ``'warn'`` or ``'ignore'``. 
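The docstring tweak above keeps the Returns section in sync with the code change further down in this hunk: with errors="warn" or errors="ignore", a missing or too-old optional dependency now yields None instead of raising (previously errors="ignore" returned the outdated module). A hedged usage sketch of this private helper, not part of the patch, assuming pandas 2.2; the package name is only an example:

from pandas.compat._optional import import_optional_dependency

# errors="raise" (the default) raises ImportError when the package is missing
# or older than the minimum version pinned in VERSIONS above.
# errors="warn" warns on a too-old version and returns None; errors="ignore"
# returns None silently in the same situations.
bn = import_optional_dependency("bottleneck", errors="ignore")
if bn is None:
    print("bottleneck missing or below the pinned minimum; falling back")
else:
    print("using bottleneck", bn.__version__)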
""" - assert errors in {"warn", "raise", "ignore"} package_name = INSTALL_MAPPING.get(name) @@ -159,5 +162,7 @@ return None elif errors == "raise": raise ImportError(msg) + else: + return None return module diff -Nru pandas-2.1.4+dfsg/pandas/compat/numpy/__init__.py pandas-2.2.2+dfsg/pandas/compat/numpy/__init__.py --- pandas-2.1.4+dfsg/pandas/compat/numpy/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/compat/numpy/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,7 @@ # numpy versioning _np_version = np.__version__ _nlv = Version(_np_version) +np_version_lt1p23 = _nlv < Version("1.23") np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") np_version_gte1p25 = _nlv >= Version("1.25") diff -Nru pandas-2.1.4+dfsg/pandas/compat/numpy/function.py pandas-2.2.2+dfsg/pandas/compat/numpy/function.py --- pandas-2.1.4+dfsg/pandas/compat/numpy/function.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/compat/numpy/function.py 2024-04-10 17:42:52.000000000 +0000 @@ -138,6 +138,7 @@ ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None ARGSORT_DEFAULTS["kind"] = None +ARGSORT_DEFAULTS["stable"] = None validate_argsort = CompatValidator( @@ -149,6 +150,7 @@ ARGSORT_DEFAULTS_KIND: dict[str, int | None] = {} ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None +ARGSORT_DEFAULTS_KIND["stable"] = None validate_argsort_kind = CompatValidator( ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both" ) diff -Nru pandas-2.1.4+dfsg/pandas/compat/pickle_compat.py pandas-2.2.2+dfsg/pandas/compat/pickle_compat.py --- pandas-2.1.4+dfsg/pandas/compat/pickle_compat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/compat/pickle_compat.py 2024-04-10 17:42:52.000000000 +0000 @@ -26,7 +26,7 @@ from collections.abc import Generator -def load_reduce(self): +def load_reduce(self) -> None: stack = self.stack args = stack.pop() func = stack[-1] diff -Nru pandas-2.1.4+dfsg/pandas/compat/pyarrow.py pandas-2.2.2+dfsg/pandas/compat/pyarrow.py --- pandas-2.1.4+dfsg/pandas/compat/pyarrow.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/compat/pyarrow.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,24 +8,20 @@ import pyarrow as pa _palv = Version(Version(pa.__version__).base_version) - pa_version_under7p0 = _palv < Version("7.0.0") - pa_version_under8p0 = _palv < Version("8.0.0") - pa_version_under9p0 = _palv < Version("9.0.0") - pa_version_under10p0 = _palv < Version("10.0.0") + pa_version_under10p1 = _palv < Version("10.0.1") pa_version_under11p0 = _palv < Version("11.0.0") pa_version_under12p0 = _palv < Version("12.0.0") pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") + pa_version_under16p0 = _palv < Version("16.0.0") except ImportError: - pa_version_under7p0 = True - pa_version_under8p0 = True - pa_version_under9p0 = True - pa_version_under10p0 = True + pa_version_under10p1 = True pa_version_under11p0 = True pa_version_under12p0 = True pa_version_under13p0 = True pa_version_under14p0 = True pa_version_under14p1 = True pa_version_under15p0 = True + pa_version_under16p0 = True diff -Nru pandas-2.1.4+dfsg/pandas/conftest.py pandas-2.2.2+dfsg/pandas/conftest.py --- pandas-2.1.4+dfsg/pandas/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/conftest.py 2024-04-10 
17:42:52.000000000 +0000 @@ -30,7 +30,6 @@ from decimal import Decimal import operator import os -from pathlib import Path from typing import ( TYPE_CHECKING, Callable, @@ -49,6 +48,8 @@ utc, ) +from pandas._config.config import _get_option + import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( @@ -58,12 +59,18 @@ import pandas as pd from pandas import ( + CategoricalIndex, DataFrame, Interval, + IntervalIndex, Period, + RangeIndex, Series, Timedelta, Timestamp, + date_range, + period_range, + timedelta_range, ) import pandas._testing as tm from pandas.core import ops @@ -141,10 +148,13 @@ ("is_datetime64tz_dtype", "is_datetime64tz_dtype is deprecated"), ("is_categorical_dtype", "is_categorical_dtype is deprecated"), ("is_sparse", "is_sparse is deprecated"), + ("DataFrameGroupBy.fillna", "DataFrameGroupBy.fillna is deprecated"), ("NDFrame.replace", "The 'method' keyword"), ("NDFrame.replace", "Series.replace without 'value'"), + ("NDFrame.clip", "Downcasting behavior in Series and DataFrame methods"), ("Series.idxmin", "The behavior of Series.idxmin"), ("Series.idxmax", "The behavior of Series.idxmax"), + ("SeriesGroupBy.fillna", "SeriesGroupBy.fillna is deprecated"), ("SeriesGroupBy.idxmin", "The behavior of Series.idxmin"), ("SeriesGroupBy.idxmax", "The behavior of Series.idxmax"), # Docstring divides by zero to show behavior difference @@ -175,21 +185,14 @@ "DataFrameGroupBy.fillna", "DataFrame.fillna with 'method' is deprecated", ), + ("read_parquet", "Passing a BlockManager to DataFrame is deprecated"), ] - for item in items: - if is_doctest: - # autouse=True for the add_doctest_imports can lead to expensive teardowns - # since doctest_namespace is a session fixture - item.add_marker(pytest.mark.usefixtures("add_doctest_imports")) - + if is_doctest: + for item in items: for path, message in ignored_doctest_warnings: ignore_doctest_warning(item, path, message) - # mark all tests in the pandas/tests/frame directory with "arraymanager" - if "/frame/" in item.nodeid: - item.add_marker(pytest.mark.arraymanager) - hypothesis_health_checks = [hypothesis.HealthCheck.too_slow] if Version(hypothesis.__version__) >= Version("6.83.2"): @@ -243,7 +246,14 @@ ) -@pytest.fixture +# ---------------------------------------------------------------- +# Autouse fixtures +# ---------------------------------------------------------------- + + +# https://github.com/pytest-dev/pytest/issues/11873 +# Would like to avoid autouse=True, but cannot as of pytest 8.0.0 +@pytest.fixture(autouse=True) def add_doctest_imports(doctest_namespace) -> None: """ Make `np` and `pd` names available for doctests. @@ -252,9 +262,6 @@ doctest_namespace["pd"] = pd -# ---------------------------------------------------------------- -# Autouse fixtures -# ---------------------------------------------------------------- @pytest.fixture(autouse=True) def configure_tests() -> None: """ @@ -496,7 +503,7 @@ @pytest.fixture -def dict_subclass(): +def dict_subclass() -> type[dict]: """ Fixture for a dictionary subclass. """ @@ -509,7 +516,7 @@ @pytest.fixture -def non_dict_mapping_subclass(): +def non_dict_mapping_subclass() -> type[abc.Mapping]: """ Fixture for a non-mapping dictionary subclass. 
""" @@ -539,7 +546,11 @@ DataFrame with 3 level MultiIndex (year, month, day) covering first 100 business days from 2000-01-01 with random data """ - tdf = tm.makeTimeDataFrame(100) + tdf = DataFrame( + np.random.default_rng(2).standard_normal((100, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100, freq="B"), + ) ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() # use int64 Index, to make sure things work ymd.index = ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels]) @@ -598,34 +609,38 @@ """ # GH#8367 round trip with pickle return MultiIndex.from_product( - [[1, 2], ["a", "b"], pd.date_range("20130101", periods=3, tz="US/Eastern")], + [[1, 2], ["a", "b"], date_range("20130101", periods=3, tz="US/Eastern")], names=["one", "two", "three"], ) indices_dict = { - "string": tm.makeStringIndex(100), - "datetime": tm.makeDateIndex(100), - "datetime-tz": tm.makeDateIndex(100, tz="US/Pacific"), - "period": tm.makePeriodIndex(100), - "timedelta": tm.makeTimedeltaIndex(100), - "range": tm.makeRangeIndex(100), - "int8": tm.makeIntIndex(100, dtype="int8"), - "int16": tm.makeIntIndex(100, dtype="int16"), - "int32": tm.makeIntIndex(100, dtype="int32"), - "int64": tm.makeIntIndex(100, dtype="int64"), - "uint8": tm.makeUIntIndex(100, dtype="uint8"), - "uint16": tm.makeUIntIndex(100, dtype="uint16"), - "uint32": tm.makeUIntIndex(100, dtype="uint32"), - "uint64": tm.makeUIntIndex(100, dtype="uint64"), - "float32": tm.makeFloatIndex(100, dtype="float32"), - "float64": tm.makeFloatIndex(100, dtype="float64"), - "bool-object": tm.makeBoolIndex(10).astype(object), - "bool-dtype": Index(np.random.default_rng(2).standard_normal(10) < 0), - "complex64": tm.makeNumericIndex(100, dtype="float64").astype("complex64"), - "complex128": tm.makeNumericIndex(100, dtype="float64").astype("complex128"), - "categorical": tm.makeCategoricalIndex(100), - "interval": tm.makeIntervalIndex(100), + "string": Index([f"pandas_{i}" for i in range(100)]), + "datetime": date_range("2020-01-01", periods=100), + "datetime-tz": date_range("2020-01-01", periods=100, tz="US/Pacific"), + "period": period_range("2020-01-01", periods=100, freq="D"), + "timedelta": timedelta_range(start="1 day", periods=100, freq="D"), + "range": RangeIndex(100), + "int8": Index(np.arange(100), dtype="int8"), + "int16": Index(np.arange(100), dtype="int16"), + "int32": Index(np.arange(100), dtype="int32"), + "int64": Index(np.arange(100), dtype="int64"), + "uint8": Index(np.arange(100), dtype="uint8"), + "uint16": Index(np.arange(100), dtype="uint16"), + "uint32": Index(np.arange(100), dtype="uint32"), + "uint64": Index(np.arange(100), dtype="uint64"), + "float32": Index(np.arange(100), dtype="float32"), + "float64": Index(np.arange(100), dtype="float64"), + "bool-object": Index([True, False] * 5, dtype=object), + "bool-dtype": Index([True, False] * 5, dtype=bool), + "complex64": Index( + np.arange(100, dtype="complex64") + 1.0j * np.arange(100, dtype="complex64") + ), + "complex128": Index( + np.arange(100, dtype="complex128") + 1.0j * np.arange(100, dtype="complex128") + ), + "categorical": CategoricalIndex(list("abcd") * 25), + "interval": IntervalIndex.from_breaks(np.linspace(0, 100, num=101)), "empty": Index([]), "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(), @@ -635,10 +650,12 @@ "nullable_uint": Index(np.arange(100), dtype="UInt16"), "nullable_float": Index(np.arange(100), 
dtype="Float32"), "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"), - "string-python": Index(pd.array(tm.makeStringIndex(100), dtype="string[python]")), + "string-python": Index( + pd.array([f"pandas_{i}" for i in range(100)], dtype="string[python]") + ), } if has_pyarrow: - idx = Index(pd.array(tm.makeStringIndex(100), dtype="string[pyarrow]")) + idx = Index(pd.array([f"pandas_{i}" for i in range(100)], dtype="string[pyarrow]")) indices_dict["string-pyarrow"] = idx @@ -723,9 +740,11 @@ """ Fixture for Series of floats with Index of unique strings """ - s = tm.makeStringSeries() - s.name = "series" - return s + return Series( + np.arange(30, dtype=np.float64) * 1.1, + index=Index([f"i_{i}" for i in range(30)], dtype=object), + name="series", + ) @pytest.fixture @@ -733,9 +752,9 @@ """ Fixture for Series of dtype object with Index of unique strings """ - s = tm.makeObjectSeries() - s.name = "objects" - return s + data = [f"foo_{i}" for i in range(30)] + index = Index([f"bar_{i}" for i in range(30)], dtype=object) + return Series(data, index=index, name="objects", dtype=object) @pytest.fixture @@ -743,9 +762,11 @@ """ Fixture for Series of floats with DatetimeIndex """ - s = tm.makeTimeSeries() - s.name = "ts" - return s + return Series( + np.random.default_rng(2).standard_normal(30), + index=date_range("2000-01-01", periods=30, freq="B"), + name="ts", + ) def _create_series(index): @@ -769,25 +790,10 @@ return _create_series(index) -@pytest.fixture -def series_with_multilevel_index() -> Series: - """ - Fixture with a Series with a 2-level MultiIndex. - """ - arrays = [ - ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], - ["one", "two", "one", "two", "one", "two", "one", "two"], - ] - tuples = zip(*arrays) - index = MultiIndex.from_tuples(tuples) - data = np.random.default_rng(2).standard_normal(8) - ser = Series(data, index=index) - ser.iloc[3] = np.nan - return ser - - _narrow_series = { - f"{dtype.__name__}-series": tm.make_rand_series(name="a", dtype=dtype) + f"{dtype.__name__}-series": Series( + range(30), index=[f"i-{i}" for i in range(30)], name="a", dtype=dtype + ) for dtype in tm.NARROW_NP_DTYPES } @@ -836,56 +842,12 @@ Fixture for DataFrame of ints with index of unique strings Columns are ['A', 'B', 'C', 'D'] - - A B C D - vpBeWjM651 1 0 1 0 - 5JyxmrP1En -1 0 0 0 - qEDaoD49U2 -1 1 0 0 - m66TkTfsFe 0 0 0 0 - EHPaNzEUFm -1 0 -1 0 - fpRJCevQhi 2 0 0 0 - OlQvnmfi3Q 0 0 -2 0 - ... .. .. .. .. - uB1FPlz4uP 0 0 0 1 - EcSe6yNzCU 0 0 -1 0 - L50VudaiI8 -1 1 -2 0 - y3bpw4nwIp 0 -1 0 0 - H0RdLLwrCT 1 1 0 0 - rY82K0vMwm 0 0 0 0 - 1OPIUjnkjk 2 0 0 0 - - [30 rows x 4 columns] """ - return DataFrame(tm.getSeriesData()).astype("int64") - - -@pytest.fixture -def datetime_frame() -> DataFrame: - """ - Fixture for DataFrame of floats with DatetimeIndex - - Columns are ['A', 'B', 'C', 'D'] - - A B C D - 2000-01-03 -1.122153 0.468535 0.122226 1.693711 - 2000-01-04 0.189378 0.486100 0.007864 -1.216052 - 2000-01-05 0.041401 -0.835752 -0.035279 -0.414357 - 2000-01-06 0.430050 0.894352 0.090719 0.036939 - 2000-01-07 -0.620982 -0.668211 -0.706153 1.466335 - 2000-01-10 -0.752633 0.328434 -0.815325 0.699674 - 2000-01-11 -2.236969 0.615737 -0.829076 -1.196106 - ... ... ... ... ... 
- 2000-02-03 1.642618 -0.579288 0.046005 1.385249 - 2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351 - 2000-02-07 -2.656149 -0.601387 1.410148 0.444150 - 2000-02-08 -1.201881 -1.289040 0.772992 -1.445300 - 2000-02-09 1.377373 0.398619 1.008453 -0.928207 - 2000-02-10 0.473194 -0.636677 0.984058 0.511519 - 2000-02-11 -0.965556 0.408313 -1.312844 -0.381948 - - [30 rows x 4 columns] - """ - return DataFrame(tm.getTimeSeriesData()) + return DataFrame( + np.ones((30, 4), dtype=np.int64), + index=Index([f"foo_{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD"), dtype=object), + ) @pytest.fixture @@ -894,44 +856,11 @@ Fixture for DataFrame of floats with index of unique strings Columns are ['A', 'B', 'C', 'D']. - - A B C D - P7GACiRnxd -0.465578 -0.361863 0.886172 -0.053465 - qZKh6afn8n -0.466693 -0.373773 0.266873 1.673901 - tkp0r6Qble 0.148691 -0.059051 0.174817 1.598433 - wP70WOCtv8 0.133045 -0.581994 -0.992240 0.261651 - M2AeYQMnCz -1.207959 -0.185775 0.588206 0.563938 - QEPzyGDYDo -0.381843 -0.758281 0.502575 -0.565053 - r78Jwns6dn -0.653707 0.883127 0.682199 0.206159 - ... ... ... ... ... - IHEGx9NO0T -0.277360 0.113021 -1.018314 0.196316 - lPMj8K27FA -1.313667 -0.604776 -1.305618 -0.863999 - qa66YMWQa5 1.110525 0.475310 -0.747865 0.032121 - yOa0ATsmcE -0.431457 0.067094 0.096567 -0.264962 - 65znX3uRNG 1.528446 0.160416 -0.109635 -0.032987 - eCOBvKqf3e 0.235281 1.622222 0.781255 0.392871 - xSucinXxuV -1.263557 0.252799 -0.552247 0.400426 - - [30 rows x 4 columns] - """ - return DataFrame(tm.getSeriesData()) - - -@pytest.fixture -def mixed_type_frame() -> DataFrame: - """ - Fixture for DataFrame of float/int/string columns with RangeIndex - Columns are ['a', 'b', 'c', 'float32', 'int32']. """ return DataFrame( - { - "a": 1.0, - "b": 2, - "c": "foo", - "float32": np.array([1.0] * 10, dtype="float32"), - "int32": np.array([1] * 10, dtype="int32"), - }, - index=np.arange(10), + np.random.default_rng(2).standard_normal((30, 4)), + index=Index([f"foo_{i}" for i in range(30)]), + columns=Index(list("ABCD")), ) @@ -1169,16 +1098,6 @@ @pytest.fixture -def tests_path() -> Path: - return Path(__file__).parent / "tests" - - -@pytest.fixture -def tests_io_data_path(tests_path) -> Path: - return tests_path / "io" / "data" - - -@pytest.fixture def datapath(strict_data_files: str) -> Callable[..., str]: """ Get the path to a data file. @@ -1212,14 +1131,6 @@ return deco -@pytest.fixture -def iris(datapath) -> DataFrame: - """ - The iris dataset as a DataFrame. - """ - return pd.read_csv(datapath("io", "data", "csv", "iris.csv")) - - # ---------------------------------------------------------------- # Time zones # ---------------------------------------------------------------- @@ -1291,6 +1202,17 @@ utc_fixture2 = utc_fixture +@pytest.fixture(params=["s", "ms", "us", "ns"]) +def unit(request): + """ + datetime64 units we support. + """ + return request.param + + +unit2 = unit + + # ---------------------------------------------------------------- # Dtypes # ---------------------------------------------------------------- @@ -1427,7 +1349,7 @@ """ Fixture emits fixed Timestamp.now() """ - return Timestamp( + return Timestamp( # pyright: ignore[reportGeneralTypeIssues] year=2021, month=1, day=1, hour=12, minute=4, second=13, microsecond=22 ) @@ -1720,6 +1642,38 @@ return request.param +@pytest.fixture(params=tm.ALL_REAL_NULLABLE_DTYPES) +def any_real_nullable_dtype(request): + """ + Parameterized fixture for all real dtypes that can hold NA. 
+ + * float + * 'float32' + * 'float64' + * 'Float32' + * 'Float64' + * 'UInt8' + * 'UInt16' + * 'UInt32' + * 'UInt64' + * 'Int8' + * 'Int16' + * 'Int32' + * 'Int64' + * 'uint8[pyarrow]' + * 'uint16[pyarrow]' + * 'uint32[pyarrow]' + * 'uint64[pyarrow]' + * 'int8[pyarrow]' + * 'int16[pyarrow]' + * 'int32[pyarrow]' + * 'int64[pyarrow]' + * 'float[pyarrow]' + * 'double[pyarrow]' + """ + return request.param + + @pytest.fixture(params=tm.ALL_NUMERIC_DTYPES) def any_numeric_dtype(request): """ @@ -1888,28 +1842,6 @@ return request.param -@pytest.fixture() -def fsspectest(): - pytest.importorskip("fsspec") - from fsspec import register_implementation - from fsspec.implementations.memory import MemoryFileSystem - from fsspec.registry import _registry as registry - - class TestMemoryFS(MemoryFileSystem): - protocol = "testmem" - test = [None] - - def __init__(self, **kwargs) -> None: - self.test[0] = kwargs.pop("test", None) - super().__init__(**kwargs) - - register_implementation("testmem", TestMemoryFS, clobber=True) - yield TestMemoryFS() - registry.pop("testmem", None) - TestMemoryFS.test[0] = None - TestMemoryFS.store.clear() - - @pytest.fixture( params=[ ("foo", None, None), @@ -1982,7 +1914,7 @@ """ Fixture to check if the array manager is being used. """ - return pd.options.mode.data_manager == "array" + return _get_option("mode.data_manager", silent=True) == "array" @pytest.fixture @@ -1990,7 +1922,29 @@ """ Fixture to check if Copy-on-Write is enabled. """ - return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" + return ( + pd.options.mode.copy_on_write is True + and _get_option("mode.data_manager", silent=True) == "block" + ) + + +@pytest.fixture +def warn_copy_on_write() -> bool: + """ + Fixture to check if Copy-on-Write is in warning mode. + """ + return ( + pd.options.mode.copy_on_write == "warn" + and _get_option("mode.data_manager", silent=True) == "block" + ) + + +@pytest.fixture +def using_infer_string() -> bool: + """ + Fixture to check if infer string option is enabled. 
+ """ + return pd.options.future.infer_string is True warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] diff -Nru pandas-2.1.4+dfsg/pandas/core/_numba/executor.py pandas-2.2.2+dfsg/pandas/core/_numba/executor.py --- pandas-2.1.4+dfsg/pandas/core/_numba/executor.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/_numba/executor.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,6 +16,45 @@ @functools.cache +def generate_apply_looper(func, nopython=True, nogil=True, parallel=False): + if TYPE_CHECKING: + import numba + else: + numba = import_optional_dependency("numba") + nb_compat_func = numba.extending.register_jitable(func) + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def nb_looper(values, axis): + # Operate on the first row/col in order to get + # the output shape + if axis == 0: + first_elem = values[:, 0] + dim0 = values.shape[1] + else: + first_elem = values[0] + dim0 = values.shape[0] + res0 = nb_compat_func(first_elem) + # Use np.asarray to get shape for + # https://github.com/numba/numba/issues/4202#issuecomment-1185981507 + buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape + if axis == 0: + buf_shape = buf_shape[::-1] + buff = np.empty(buf_shape) + + if axis == 1: + buff[0] = res0 + for i in numba.prange(1, values.shape[0]): + buff[i] = nb_compat_func(values[i]) + else: + buff[:, 0] = res0 + for j in numba.prange(1, values.shape[1]): + buff[:, j] = nb_compat_func(values[:, j]) + return buff + + return nb_looper + + +@functools.cache def make_looper(func, result_dtype, is_grouped_kernel, nopython, nogil, parallel): if TYPE_CHECKING: import numba diff -Nru pandas-2.1.4+dfsg/pandas/core/_numba/extensions.py pandas-2.2.2+dfsg/pandas/core/_numba/extensions.py --- pandas-2.1.4+dfsg/pandas/core/_numba/extensions.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/_numba/extensions.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,584 @@ +# Disable type checking for this module since numba's internals +# are not typed, and we use numba's internals via its extension API +# mypy: ignore-errors +""" +Utility classes/functions to let numba recognize +pandas Index/Series/DataFrame + +Mostly vendored from https://github.com/numba/numba/blob/main/numba/tests/pdlike_usecase.py +""" + +from __future__ import annotations + +from contextlib import contextmanager +import operator + +import numba +from numba import types +from numba.core import cgutils +from numba.core.datamodel import models +from numba.core.extending import ( + NativeValue, + box, + lower_builtin, + make_attribute_wrapper, + overload, + overload_attribute, + overload_method, + register_model, + type_callable, + typeof_impl, + unbox, +) +from numba.core.imputils import impl_ret_borrowed +import numpy as np + +from pandas._libs import lib + +from pandas.core.indexes.base import Index +from pandas.core.indexing import _iLocIndexer +from pandas.core.internals import SingleBlockManager +from pandas.core.series import Series + + +# Helper function to hack around fact that Index casts numpy string dtype to object +# +# Idea is to set an attribute on a Index called _numba_data +# that is the original data, or the object data casted to numpy string dtype, +# with a context manager that is unset afterwards +@contextmanager +def set_numba_data(index: Index): + numba_data = index._data + if numba_data.dtype == object: + if not lib.is_string_array(numba_data): + raise ValueError( + "The numba engine only supports using string or numeric column names" + ) + numba_data = 
numba_data.astype("U") + try: + index._numba_data = numba_data + yield index + finally: + del index._numba_data + + +# TODO: Range index support +# (this currently lowers OK, but does not round-trip) +class IndexType(types.Type): + """ + The type class for Index objects. + """ + + def __init__(self, dtype, layout, pyclass: any) -> None: + self.pyclass = pyclass + name = f"index({dtype}, {layout})" + self.dtype = dtype + self.layout = layout + super().__init__(name) + + @property + def key(self): + return self.pyclass, self.dtype, self.layout + + @property + def as_array(self): + return types.Array(self.dtype, 1, self.layout) + + def copy(self, dtype=None, ndim: int = 1, layout=None): + assert ndim == 1 + if dtype is None: + dtype = self.dtype + layout = layout or self.layout + return type(self)(dtype, layout, self.pyclass) + + +class SeriesType(types.Type): + """ + The type class for Series objects. + """ + + def __init__(self, dtype, index, namety) -> None: + assert isinstance(index, IndexType) + self.dtype = dtype + self.index = index + self.values = types.Array(self.dtype, 1, "C") + self.namety = namety + name = f"series({dtype}, {index}, {namety})" + super().__init__(name) + + @property + def key(self): + return self.dtype, self.index, self.namety + + @property + def as_array(self): + return self.values + + def copy(self, dtype=None, ndim: int = 1, layout: str = "C"): + assert ndim == 1 + assert layout == "C" + if dtype is None: + dtype = self.dtype + return type(self)(dtype, self.index, self.namety) + + +@typeof_impl.register(Index) +def typeof_index(val, c): + """ + This will assume that only strings are in object dtype + index. + (you should check this before this gets lowered down to numba) + """ + # arrty = typeof_impl(val._data, c) + arrty = typeof_impl(val._numba_data, c) + assert arrty.ndim == 1 + return IndexType(arrty.dtype, arrty.layout, type(val)) + + +@typeof_impl.register(Series) +def typeof_series(val, c): + index = typeof_impl(val.index, c) + arrty = typeof_impl(val.values, c) + namety = typeof_impl(val.name, c) + assert arrty.ndim == 1 + assert arrty.layout == "C" + return SeriesType(arrty.dtype, index, namety) + + +@type_callable(Series) +def type_series_constructor(context): + def typer(data, index, name=None): + if isinstance(index, IndexType) and isinstance(data, types.Array): + assert data.ndim == 1 + if name is None: + name = types.intp + return SeriesType(data.dtype, index, name) + + return typer + + +@type_callable(Index) +def type_index_constructor(context): + def typer(data, hashmap=None): + if isinstance(data, types.Array): + assert data.layout == "C" + assert data.ndim == 1 + assert hashmap is None or isinstance(hashmap, types.DictType) + return IndexType(data.dtype, layout=data.layout, pyclass=Index) + + return typer + + +# Backend extensions for Index and Series and Frame +@register_model(IndexType) +class IndexModel(models.StructModel): + def __init__(self, dmm, fe_type) -> None: + # We don't want the numpy string scalar type in our hashmap + members = [ + ("data", fe_type.as_array), + # This is an attempt to emulate our hashtable code with a numba + # typed dict + # It maps from values in the index to their integer positions in the array + ("hashmap", types.DictType(fe_type.dtype, types.intp)), + # Pointer to the Index object this was created from, or that it + # boxes to + # https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1 + ("parent", types.pyobject), + ] + models.StructModel.__init__(self, dmm, fe_type, 
members) + + +@register_model(SeriesType) +class SeriesModel(models.StructModel): + def __init__(self, dmm, fe_type) -> None: + members = [ + ("index", fe_type.index), + ("values", fe_type.as_array), + ("name", fe_type.namety), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(IndexType, "data", "_data") +make_attribute_wrapper(IndexType, "hashmap", "hashmap") + +make_attribute_wrapper(SeriesType, "index", "index") +make_attribute_wrapper(SeriesType, "values", "values") +make_attribute_wrapper(SeriesType, "name", "name") + + +@lower_builtin(Series, types.Array, IndexType) +def pdseries_constructor(context, builder, sig, args): + data, index = args + series = cgutils.create_struct_proxy(sig.return_type)(context, builder) + series.index = index + series.values = data + series.name = context.get_constant(types.intp, 0) + return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue()) + + +@lower_builtin(Series, types.Array, IndexType, types.intp) +@lower_builtin(Series, types.Array, IndexType, types.float64) +@lower_builtin(Series, types.Array, IndexType, types.unicode_type) +def pdseries_constructor_with_name(context, builder, sig, args): + data, index, name = args + series = cgutils.create_struct_proxy(sig.return_type)(context, builder) + series.index = index + series.values = data + series.name = name + return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue()) + + +@lower_builtin(Index, types.Array, types.DictType, types.pyobject) +def index_constructor_2arg(context, builder, sig, args): + (data, hashmap, parent) = args + index = cgutils.create_struct_proxy(sig.return_type)(context, builder) + + index.data = data + index.hashmap = hashmap + index.parent = parent + return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue()) + + +@lower_builtin(Index, types.Array, types.DictType) +def index_constructor_2arg_parent(context, builder, sig, args): + # Basically same as index_constructor_1arg, but also lets you specify the + # parent object + (data, hashmap) = args + index = cgutils.create_struct_proxy(sig.return_type)(context, builder) + + index.data = data + index.hashmap = hashmap + return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue()) + + +@lower_builtin(Index, types.Array) +def index_constructor_1arg(context, builder, sig, args): + from numba.typed import Dict + + key_type = sig.return_type.dtype + value_type = types.intp + + def index_impl(data): + return Index(data, Dict.empty(key_type, value_type)) + + return context.compile_internal(builder, index_impl, sig, args) + + +# Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type +# (regular string) +def maybe_cast_str(x): + # Dummy function that numba can overload + pass + + +@overload(maybe_cast_str) +def maybe_cast_str_impl(x): + """Converts numba UnicodeCharSeq (numpy string scalar) -> unicode type (string). + Is a no-op for other types.""" + if isinstance(x, types.UnicodeCharSeq): + return lambda x: str(x) + else: + return lambda x: x + + +@unbox(IndexType) +def unbox_index(typ, obj, c): + """ + Convert a Index object to a native structure. 
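`maybe_cast_str` above relies on numba's `@overload` mechanism: a pure-Python stub paired with a type-dispatched implementation. A minimal standalone sketch of that pattern, assuming numba is installed; the `exclaim` function is illustrative and not part of pandas:

    import numba
    from numba import types
    from numba.core.extending import overload

    def exclaim(x):
        # Pure-Python stub; numba substitutes an implementation via @overload.
        return x

    @overload(exclaim)
    def exclaim_impl(x):
        # Dispatch on the numba type, as maybe_cast_str does for UnicodeCharSeq.
        if isinstance(x, types.UnicodeType):
            return lambda x: x + "!"
        return lambda x: x

    @numba.njit
    def demo(value):
        return exclaim(value)

    print(demo("pandas"))  # "pandas!"
    print(demo(3))         # 3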
+ + Note: Object dtype is not allowed here + """ + data_obj = c.pyapi.object_getattr_string(obj, "_numba_data") + index = cgutils.create_struct_proxy(typ)(c.context, c.builder) + # If we see an object array, assume its been validated as only containing strings + # We still need to do the conversion though + index.data = c.unbox(typ.as_array, data_obj).value + typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict)) + # Create an empty typed dict in numba for the hashmap for indexing + # equiv of numba.typed.Dict.empty(typ.dtype, types.intp) + arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype)) + intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp)) + hashmap_obj = c.pyapi.call_method( + typed_dict_obj, "empty", (arr_type_obj, intp_type_obj) + ) + index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value + # Set the parent for speedy boxing. + index.parent = obj + + # Decrefs + c.pyapi.decref(data_obj) + c.pyapi.decref(arr_type_obj) + c.pyapi.decref(intp_type_obj) + c.pyapi.decref(typed_dict_obj) + + return NativeValue(index._getvalue()) + + +@unbox(SeriesType) +def unbox_series(typ, obj, c): + """ + Convert a Series object to a native structure. + """ + index_obj = c.pyapi.object_getattr_string(obj, "index") + values_obj = c.pyapi.object_getattr_string(obj, "values") + name_obj = c.pyapi.object_getattr_string(obj, "name") + + series = cgutils.create_struct_proxy(typ)(c.context, c.builder) + series.index = c.unbox(typ.index, index_obj).value + series.values = c.unbox(typ.values, values_obj).value + series.name = c.unbox(typ.namety, name_obj).value + + # Decrefs + c.pyapi.decref(index_obj) + c.pyapi.decref(values_obj) + c.pyapi.decref(name_obj) + + return NativeValue(series._getvalue()) + + +@box(IndexType) +def box_index(typ, val, c): + """ + Convert a native index structure to a Index object. + + If our native index is of a numpy string dtype, we'll cast it to + object. + """ + # First build a Numpy array object, then wrap it in a Index + index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) + + res = cgutils.alloca_once_value(c.builder, index.parent) + + # Does parent exist? 
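The empty typed dict created during unboxing above is the hashmap that `IndexType.get_loc` (defined further down in this file) fills lazily to map index values to positions. A standalone sketch of that lookup strategy, assuming numba is installed; the free-standing `get_loc` function and data here are illustrative:

    import numba
    import numpy as np
    from numba import types
    from numba.typed import Dict

    @numba.njit
    def get_loc(values, hashmap, item):
        # Fill the typed dict on first use, then answer lookups from it.
        if len(hashmap) == 0:
            for i, val in enumerate(values):
                hashmap[val] = i
        return hashmap[item]

    values = np.array([10.0, 20.0, 30.0])
    hashmap = Dict.empty(types.float64, types.intp)
    print(get_loc(values, hashmap, 20.0))  # 1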
+ # (it means already boxed once, or Index same as original df.index or df.columns) + # xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17 + with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as ( + has_parent, + otherwise, + ): + with has_parent: + c.pyapi.incref(index.parent) + with otherwise: + # TODO: preserve the original class for the index + # Also need preserve the name of the Index + # class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass)) + class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index)) + array_obj = c.box(typ.as_array, index.data) + if isinstance(typ.dtype, types.UnicodeCharSeq): + # We converted to numpy string dtype, convert back + # to object since _simple_new won't do that for uss + object_str_obj = c.pyapi.unserialize(c.pyapi.serialize_object("object")) + array_obj = c.pyapi.call_method(array_obj, "astype", (object_str_obj,)) + c.pyapi.decref(object_str_obj) + # this is basically Index._simple_new(array_obj, name_obj) in python + index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,)) + index.parent = index_obj + c.builder.store(index_obj, res) + + # Decrefs + c.pyapi.decref(class_obj) + c.pyapi.decref(array_obj) + return c.builder.load(res) + + +@box(SeriesType) +def box_series(typ, val, c): + """ + Convert a native series structure to a Series object. + """ + series = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) + series_const_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Series._from_mgr)) + mgr_const_obj = c.pyapi.unserialize( + c.pyapi.serialize_object(SingleBlockManager.from_array) + ) + index_obj = c.box(typ.index, series.index) + array_obj = c.box(typ.as_array, series.values) + name_obj = c.box(typ.namety, series.name) + # This is basically equivalent of + # pd.Series(data=array_obj, index=index_obj) + # To improve perf, we will construct the Series from a manager + # object to avoid checks. + # We'll also set the name attribute manually to avoid validation + mgr_obj = c.pyapi.call_function_objargs( + mgr_const_obj, + ( + array_obj, + index_obj, + ), + ) + mgr_axes_obj = c.pyapi.object_getattr_string(mgr_obj, "axes") + # Series._constructor_from_mgr(mgr, axes) + series_obj = c.pyapi.call_function_objargs( + series_const_obj, (mgr_obj, mgr_axes_obj) + ) + c.pyapi.object_setattr_string(series_obj, "_name", name_obj) + + # Decrefs + c.pyapi.decref(series_const_obj) + c.pyapi.decref(mgr_axes_obj) + c.pyapi.decref(mgr_obj) + c.pyapi.decref(mgr_const_obj) + c.pyapi.decref(index_obj) + c.pyapi.decref(array_obj) + c.pyapi.decref(name_obj) + + return series_obj + + +# Add common series reductions (e.g. mean, sum), +# and also add common binops (e.g. add, sub, mul, div) +def generate_series_reduction(ser_reduction, ser_method): + @overload_method(SeriesType, ser_reduction) + def series_reduction(series): + def series_reduction_impl(series): + return ser_method(series.values) + + return series_reduction_impl + + return series_reduction + + +def generate_series_binop(binop): + @overload(binop) + def series_binop(series1, value): + if isinstance(series1, SeriesType): + if isinstance(value, SeriesType): + + def series_binop_impl(series1, series2): + # TODO: Check index matching? 
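The reduction and binop factories in this hunk generate numba overloads whose behaviour mirrors plain pandas: a reduction lowers to the numpy function applied to `series.values`, and a binop rebuilds a Series around the raw result. A small equivalence check in ordinary, non-jitted pandas, with illustrative data:

    import numpy as np
    import pandas as pd

    ser = pd.Series([1.0, 2.0, 3.0], index=pd.Index([10, 20, 30]))

    # ser.sum() corresponds to np.sum(ser.values) ...
    assert ser.sum() == np.sum(ser.values)

    # ... and ser + 1 to Series(np.add(ser.values, 1), ser.index, ser.name).
    rebuilt = pd.Series(np.add(ser.values, 1), index=ser.index, name=ser.name)
    assert rebuilt.equals(ser + 1)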
+ return Series( + binop(series1.values, series2.values), + series1.index, + series1.name, + ) + + return series_binop_impl + else: + + def series_binop_impl(series1, value): + return Series( + binop(series1.values, value), series1.index, series1.name + ) + + return series_binop_impl + + return series_binop + + +series_reductions = [ + ("sum", np.sum), + ("mean", np.mean), + # Disabled due to discrepancies between numba std. dev + # and pandas std. dev (no way to specify dof) + # ("std", np.std), + # ("var", np.var), + ("min", np.min), + ("max", np.max), +] +for reduction, reduction_method in series_reductions: + generate_series_reduction(reduction, reduction_method) + +series_binops = [operator.add, operator.sub, operator.mul, operator.truediv] + +for ser_binop in series_binops: + generate_series_binop(ser_binop) + + +# get_loc on Index +@overload_method(IndexType, "get_loc") +def index_get_loc(index, item): + def index_get_loc_impl(index, item): + # Initialize the hash table if not initialized + if len(index.hashmap) == 0: + for i, val in enumerate(index._data): + index.hashmap[val] = i + return index.hashmap[item] + + return index_get_loc_impl + + +# Indexing for Series/Index +@overload(operator.getitem) +def series_indexing(series, item): + if isinstance(series, SeriesType): + + def series_getitem(series, item): + loc = series.index.get_loc(item) + return series.iloc[loc] + + return series_getitem + + +@overload(operator.getitem) +def index_indexing(index, idx): + if isinstance(index, IndexType): + + def index_getitem(index, idx): + return index._data[idx] + + return index_getitem + + +class IlocType(types.Type): + def __init__(self, obj_type) -> None: + self.obj_type = obj_type + name = f"iLocIndexer({obj_type})" + super().__init__(name=name) + + @property + def key(self): + return self.obj_type + + +@typeof_impl.register(_iLocIndexer) +def typeof_iloc(val, c): + objtype = typeof_impl(val.obj, c) + return IlocType(objtype) + + +@type_callable(_iLocIndexer) +def type_iloc_constructor(context): + def typer(obj): + if isinstance(obj, SeriesType): + return IlocType(obj) + + return typer + + +@lower_builtin(_iLocIndexer, SeriesType) +def iloc_constructor(context, builder, sig, args): + (obj,) = args + iloc_indexer = cgutils.create_struct_proxy(sig.return_type)(context, builder) + iloc_indexer.obj = obj + return impl_ret_borrowed( + context, builder, sig.return_type, iloc_indexer._getvalue() + ) + + +@register_model(IlocType) +class ILocModel(models.StructModel): + def __init__(self, dmm, fe_type) -> None: + members = [("obj", fe_type.obj_type)] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(IlocType, "obj", "obj") + + +@overload_attribute(SeriesType, "iloc") +def series_iloc(series): + def get(series): + return _iLocIndexer(series) + + return get + + +@overload(operator.getitem) +def iloc_getitem(iloc_indexer, i): + if isinstance(iloc_indexer, IlocType): + + def getitem_impl(iloc_indexer, i): + return iloc_indexer.obj.values[i] + + return getitem_impl diff -Nru pandas-2.1.4+dfsg/pandas/core/_numba/kernels/var_.py pandas-2.2.2+dfsg/pandas/core/_numba/kernels/var_.py --- pandas-2.1.4+dfsg/pandas/core/_numba/kernels/var_.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/_numba/kernels/var_.py 2024-04-10 17:42:52.000000000 +0000 @@ -116,7 +116,7 @@ ssqdm_x, compensation_add, num_consecutive_same_value, - prev_value, # pyright: ignore[reportGeneralTypeIssues] + prev_value, ) else: for j in range(start[i - 1], s): @@ -141,7 +141,7 @@ 
ssqdm_x, compensation_add, num_consecutive_same_value, - prev_value, # pyright: ignore[reportGeneralTypeIssues] + prev_value, ) if nobs >= min_periods and nobs > ddof: diff -Nru pandas-2.1.4+dfsg/pandas/core/accessor.py pandas-2.2.2+dfsg/pandas/core/accessor.py --- pandas-2.1.4+dfsg/pandas/core/accessor.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/accessor.py 2024-04-10 17:42:52.000000000 +0000 @@ -187,7 +187,7 @@ return add_delegate_accessors -# Ported with modifications from xarray +# Ported with modifications from xarray; licence at LICENSES/XARRAY_LICENSE # https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py # 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors # 2. We use a UserWarning instead of a custom Warning diff -Nru pandas-2.1.4+dfsg/pandas/core/algorithms.py pandas-2.2.2+dfsg/pandas/core/algorithms.py --- pandas-2.1.4+dfsg/pandas/core/algorithms.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/algorithms.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,6 +4,7 @@ """ from __future__ import annotations +import decimal import operator from textwrap import dedent from typing import ( @@ -55,7 +56,6 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ( - ArrowDtype, BaseMaskedDtype, CategoricalDtype, ExtensionDtype, @@ -924,7 +924,7 @@ else: values = _ensure_arraylike(values, func_name="value_counts") - keys, counts = value_counts_arraylike(values, dropna) + keys, counts, _ = value_counts_arraylike(values, dropna) if keys.dtype == np.float16: keys = keys.astype(np.float32) @@ -933,6 +933,19 @@ idx = Index(keys) if idx.dtype == bool and keys.dtype == object: idx = idx.astype(object) + elif ( + idx.dtype != keys.dtype # noqa: PLR1714 # # pylint: disable=R1714 + and idx.dtype != "string[pyarrow_numpy]" + ): + warnings.warn( + # GH#56161 + "The behavior of value_counts with object-dtype is deprecated. " + "In a future version, this will *not* perform dtype inference " + "on the resulting index. To retain the old behavior, use " + "`result.index = result.index.infer_objects()`", + FutureWarning, + stacklevel=find_stack_level(), + ) idx.name = index_name result = Series(counts, index=idx, name=name, copy=False) @@ -949,7 +962,7 @@ # Called once from SparseArray, otherwise could be private def value_counts_arraylike( values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None -) -> tuple[ArrayLike, npt.NDArray[np.int64]]: +) -> tuple[ArrayLike, npt.NDArray[np.int64], int]: """ Parameters ---------- @@ -965,7 +978,7 @@ original = values values = _ensure_data(values) - keys, counts = htable.value_count(values, dropna, mask=mask) + keys, counts, na_counter = htable.value_count(values, dropna, mask=mask) if needs_i8_conversion(original.dtype): # datetime, timedelta, or period @@ -975,18 +988,20 @@ keys, counts = keys[mask], counts[mask] res_keys = _reconstruct_data(keys, original.dtype, original) - return res_keys, counts + return res_keys, counts, na_counter def duplicated( - values: ArrayLike, keep: Literal["first", "last", False] = "first" + values: ArrayLike, + keep: Literal["first", "last", False] = "first", + mask: npt.NDArray[np.bool_] | None = None, ) -> npt.NDArray[np.bool_]: """ Return boolean ndarray denoting duplicate values. Parameters ---------- - values : nd.array, ExtensionArray or Series + values : np.ndarray or ExtensionArray Array over which to check for duplicate values. 
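The new FutureWarning added to `value_counts_internal` above (GH#56161) fires when counting object-dtype data would infer a different dtype for the result index. A short user-level illustration with made-up data; the last line is the forward-compatible spelling suggested by the warning message itself:

    import warnings

    import pandas as pd

    ser = pd.Series([1, 2, 2], dtype=object)

    with warnings.catch_warnings():
        # pandas 2.2 warns that the result index will stop being dtype-inferred
        warnings.simplefilter("ignore", FutureWarning)
        counts = ser.value_counts()

    # Forward-compatible spelling recommended by the warning message:
    counts.index = counts.index.infer_objects()
    print(counts)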
keep : {'first', 'last', False}, default 'first' - ``first`` : Mark duplicates as ``True`` except for the first @@ -994,21 +1009,15 @@ - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - False : Mark all duplicates as ``True``. + mask : ndarray[bool], optional + array indicating which elements to exclude from checking Returns ------- duplicated : ndarray[bool] """ - if hasattr(values, "dtype"): - if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub": - values = values._to_masked() # type: ignore[union-attr] - - if isinstance(values.dtype, BaseMaskedDtype): - values = cast("BaseMaskedArray", values) - return htable.duplicated(values._data, keep=keep, mask=values._mask) - values = _ensure_data(values) - return htable.duplicated(values, keep=keep) + return htable.duplicated(values, keep=keep, mask=mask) def mode( @@ -1039,7 +1048,10 @@ values = _ensure_data(values) - npresult = htable.mode(values, dropna=dropna, mask=mask) + npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask) + if res_mask is not None: + return npresult, res_mask # type: ignore[return-value] + try: npresult = np.sort(npresult) except TypeError as err: @@ -1111,98 +1123,6 @@ return ranks -def checked_add_with_arr( - arr: npt.NDArray[np.int64], - b: int | npt.NDArray[np.int64], - arr_mask: npt.NDArray[np.bool_] | None = None, - b_mask: npt.NDArray[np.bool_] | None = None, -) -> npt.NDArray[np.int64]: - """ - Perform array addition that checks for underflow and overflow. - - Performs the addition of an int64 array and an int64 integer (or array) - but checks that they do not result in overflow first. For elements that - are indicated to be NaN, whether or not there is overflow for that element - is automatically ignored. - - Parameters - ---------- - arr : np.ndarray[int64] addend. - b : array or scalar addend. - arr_mask : np.ndarray[bool] or None, default None - array indicating which elements to exclude from checking - b_mask : np.ndarray[bool] or None, default None - array or scalar indicating which element(s) to exclude from checking - - Returns - ------- - sum : An array for elements x + b for each element x in arr if b is - a scalar or an array for elements x + y for each element pair - (x, y) in (arr, b). - - Raises - ------ - OverflowError if any x + y exceeds the maximum or minimum int64 value. - """ - # For performance reasons, we broadcast 'b' to the new array 'b2' - # so that it has the same size as 'arr'. - b2 = np.broadcast_to(b, arr.shape) - if b_mask is not None: - # We do the same broadcasting for b_mask as well. - b2_mask = np.broadcast_to(b_mask, arr.shape) - else: - b2_mask = None - - # For elements that are NaN, regardless of their value, we should - # ignore whether they overflow or not when doing the checked add. 
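The helper removed in this hunk guards int64 addition by testing the remaining headroom instead of performing the possibly-overflowing sum, exactly as the surrounding comments describe. The core arithmetic, shown standalone with illustrative values:

    import numpy as np

    i8max = np.iinfo(np.int64).max
    arr = np.array([i8max - 1, 5], dtype=np.int64)
    b = np.int64(10)

    # With b > 0, a + b overflows int64 exactly when a > i8max - b,
    # which can be tested without ever forming the overflowing sum.
    would_overflow = bool((i8max - b < arr).any())
    print(would_overflow)  # True: (i8max - 1) + 10 exceeds int64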
- if arr_mask is not None and b2_mask is not None: - not_nan = np.logical_not(arr_mask | b2_mask) - elif arr_mask is not None: - not_nan = np.logical_not(arr_mask) - elif b_mask is not None: - # error: Argument 1 to "__call__" of "_UFunc_Nin1_Nout1" has - # incompatible type "Optional[ndarray[Any, dtype[bool_]]]"; - # expected "Union[_SupportsArray[dtype[Any]], _NestedSequence - # [_SupportsArray[dtype[Any]]], bool, int, float, complex, str - # , bytes, _NestedSequence[Union[bool, int, float, complex, str - # , bytes]]]" - not_nan = np.logical_not(b2_mask) # type: ignore[arg-type] - else: - not_nan = np.empty(arr.shape, dtype=bool) - not_nan.fill(True) - - # gh-14324: For each element in 'arr' and its corresponding element - # in 'b2', we check the sign of the element in 'b2'. If it is positive, - # we then check whether its sum with the element in 'arr' exceeds - # np.iinfo(np.int64).max. If so, we have an overflow error. If it - # it is negative, we then check whether its sum with the element in - # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow - # error as well. - i8max = lib.i8max - i8min = iNaT - - mask1 = b2 > 0 - mask2 = b2 < 0 - - if not mask1.any(): - to_raise = ((i8min - b2 > arr) & not_nan).any() - elif not mask2.any(): - to_raise = ((i8max - b2 < arr) & not_nan).any() - else: - to_raise = ((i8max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or ( - (i8min - b2[mask2] > arr[mask2]) & not_nan[mask2] - ).any() - - if to_raise: - raise OverflowError("Overflow in int64 addition") - - result = arr + b - if arr_mask is not None or b2_mask is not None: - np.putmask(result, ~not_nan, iNaT) - - return result - - # ---- # # take # # ---- # @@ -1595,7 +1515,7 @@ try: sorter = values.argsort() ordered = values.take(sorter) - except TypeError: + except (TypeError, decimal.InvalidOperation): # Previous sorters failed or were not applicable, try `_sort_mixed` # which would work, but which fails for special case of 1d arrays # with tuples. @@ -1635,11 +1555,12 @@ if use_na_sentinel: # take_nd is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() - new_codes = take_nd(order2, codes, fill_value=-1) if verify: mask = (codes < -len(values)) | (codes >= len(values)) + codes[mask] = 0 else: mask = None + new_codes = take_nd(order2, codes, fill_value=-1) else: reverse_indexer = np.empty(len(sorter), dtype=int) reverse_indexer.put(sorter, np.arange(len(sorter))) @@ -1713,8 +1634,16 @@ """ from pandas import Series - l_count = value_counts_internal(lvals, dropna=False) - r_count = value_counts_internal(rvals, dropna=False) + with warnings.catch_warnings(): + # filter warning from object dtype inference; we will end up discarding + # the index here, so the deprecation does not affect the end result here. 
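The block that follows suppresses only the new object-dtype `value_counts` FutureWarning, and only inside the `with` statement, because the index it would complain about is discarded anyway. The same idiom in isolation, with a stand-in warning call:

    import warnings

    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            "The behavior of value_counts with object-dtype is deprecated",
            category=FutureWarning,
        )
        # Stand-in for the value_counts calls below; matched by message prefix.
        warnings.warn(
            "The behavior of value_counts with object-dtype is deprecated",
            FutureWarning,
        )
    print("warning suppressed only within the block")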
+ warnings.filterwarnings( + "ignore", + "The behavior of value_counts with object-dtype is deprecated", + category=FutureWarning, + ) + l_count = value_counts_internal(lvals, dropna=False) + r_count = value_counts_internal(rvals, dropna=False) l_count, r_count = l_count.align(r_count, fill_value=0) final_count = np.maximum(l_count.values, r_count.values) final_count = Series(final_count, index=l_count.index, dtype="int", copy=False) diff -Nru pandas-2.1.4+dfsg/pandas/core/apply.py pandas-2.2.2+dfsg/pandas/core/apply.py --- pandas-2.1.4+dfsg/pandas/core/apply.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/apply.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,13 +2,13 @@ import abc from collections import defaultdict +import functools from functools import partial import inspect from typing import ( TYPE_CHECKING, Any, Callable, - DefaultDict, Literal, cast, ) @@ -19,6 +19,7 @@ from pandas._config import option_context from pandas._libs import lib +from pandas._libs.internals import BlockValuesRefs from pandas._typing import ( AggFuncType, AggFuncTypeBase, @@ -29,6 +30,7 @@ NDFrameT, npt, ) +from pandas.compat._optional import import_optional_dependency from pandas.errors import SpecificationError from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level @@ -36,7 +38,9 @@ from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import ( is_dict_like, + is_extension_array_dtype, is_list_like, + is_numeric_dtype, is_sequence, ) from pandas.core.dtypes.dtypes import ( @@ -49,14 +53,16 @@ ABCSeries, ) +from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike if TYPE_CHECKING: from collections.abc import ( + Generator, Hashable, Iterable, - Iterator, + MutableMapping, Sequence, ) @@ -80,6 +86,8 @@ raw: bool = False, result_type: str | None = None, by_row: Literal[False, "compat"] = "compat", + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, args=None, kwargs=None, ) -> FrameApply: @@ -100,6 +108,8 @@ raw=raw, result_type=result_type, by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) @@ -116,6 +126,8 @@ result_type: str | None, *, by_row: Literal[False, "compat", "_compat"] = "compat", + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, args, kwargs, ) -> None: @@ -128,6 +140,9 @@ self.args = args or () self.kwargs = kwargs or {} + self.engine = engine + self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs + if result_type not in [None, "reduce", "broadcast", "expand"]: raise ValueError( "invalid value for result_type, must be one " @@ -253,7 +268,7 @@ return result - def transform_dict_like(self, func): + def transform_dict_like(self, func) -> DataFrame: """ Compute transform in the case of a dict-like func """ @@ -315,7 +330,7 @@ op_name: Literal["agg", "apply"], selected_obj: Series | DataFrame, kwargs: dict[str, Any], - ) -> tuple[list[Hashable], list[Any]]: + ) -> tuple[list[Hashable] | Index, list[Any]]: """ Compute agg/apply results for like-like input. @@ -330,7 +345,7 @@ Returns ------- - keys : list[hashable] + keys : list[Hashable] or Index Index labels for result. results : list Data for result. 
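The new `engine`/`engine_kwargs` plumbing above surfaces as keyword arguments of `DataFrame.apply` in pandas 2.2. A minimal user-level sketch of the raw numba path, assuming numba is installed; `row_range` and the frame contents are illustrative, and `engine_kwargs` takes the same nopython/nogil/parallel switches stored on the applicator:

    import pandas as pd

    def row_range(x):
        # With raw=True the UDF receives each row as a 1-D numpy array.
        return x.max() - x.min()

    df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})

    # engine="numba" compiles row_range and loops over the rows in nopython mode.
    result = df.apply(row_range, axis=1, raw=True, engine="numba")
    print(result)  # 2.0 for both rows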
When aggregating with a Series, this can contain any @@ -370,12 +385,14 @@ new_res = getattr(colg, op_name)(func, *args, **kwargs) results.append(new_res) indices.append(index) - keys = selected_obj.columns.take(indices) + # error: Incompatible types in assignment (expression has type "Any | + # Index", variable has type "list[Any | Callable[..., Any] | str]") + keys = selected_obj.columns.take(indices) # type: ignore[assignment] return keys, results def wrap_results_list_like( - self, keys: list[Hashable], results: list[Series | DataFrame] + self, keys: Iterable[Hashable], results: list[Series | DataFrame] ): from pandas.core.reshape.concat import concat @@ -594,6 +611,13 @@ result: Series, DataFrame, or None Result when self.func is a list-like or dict-like, None otherwise. """ + + if self.engine == "numba": + raise NotImplementedError( + "The 'numba' engine doesn't support list-like/" + "dict likes of callables yet." + ) + if self.axis == 1 and isinstance(self.obj, ABCDataFrame): return self.obj.T.apply(self.func, 0, args=self.args, **self.kwargs).T @@ -754,13 +778,23 @@ result_type: str | None, *, by_row: Literal[False, "compat"] = False, + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, args, kwargs, ) -> None: if by_row is not False and by_row != "compat": raise ValueError(f"by_row={by_row} not allowed") super().__init__( - obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs + obj, + func, + raw, + result_type, + by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, ) # --------------------------------------------------------------- @@ -778,9 +812,35 @@ @property @abc.abstractmethod - def series_generator(self) -> Iterator[Series]: + def series_generator(self) -> Generator[Series, None, None]: pass + @staticmethod + @functools.cache + @abc.abstractmethod + def generate_numba_apply_func( + func, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: + pass + + @abc.abstractmethod + def apply_with_numba(self): + pass + + def validate_values_for_numba(self): + # Validate column dtyps all OK + for colname, dtype in self.obj.dtypes.items(): + if not is_numeric_dtype(dtype): + raise ValueError( + f"Column {colname} must have a numeric dtype. " + f"Found '{dtype}' instead" + ) + if is_extension_array_dtype(dtype): + raise ValueError( + f"Column {colname} is backed by an extension array, " + f"which is not supported by the numba engine." 
+ ) + @abc.abstractmethod def wrap_results_for_axis( self, results: ResType, res_index: Index @@ -803,8 +863,13 @@ def apply(self) -> DataFrame | Series: """compute the results""" + # dispatch to handle list-like or dict-like if is_list_like(self.func): + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support lists of callables yet" + ) return self.apply_list_or_dict_like() # all empty @@ -813,10 +878,20 @@ # string dispatch if isinstance(self.func, str): + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support using " + "a string as the callable function" + ) return self.apply_str() # ufunc elif isinstance(self.func, np.ufunc): + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support " + "using a numpy ufunc as the callable function" + ) with np.errstate(all="ignore"): results = self.obj._mgr.apply("apply", func=self.func) # _constructor will retain self.index and self.columns @@ -824,6 +899,10 @@ # broadcasting if self.result_type == "broadcast": + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support result_type='broadcast'" + ) return self.apply_broadcast(self.obj) # one axis empty @@ -832,7 +911,7 @@ # raw elif self.raw: - return self.apply_raw() + return self.apply_raw(engine=self.engine, engine_kwargs=self.engine_kwargs) return self.apply_standard() @@ -905,7 +984,7 @@ else: return self.obj.copy() - def apply_raw(self): + def apply_raw(self, engine="python", engine_kwargs=None): """apply to the values as a numpy array""" def wrap_function(func): @@ -923,9 +1002,27 @@ return wrapper - result = np.apply_along_axis( - wrap_function(self.func), self.axis, self.values, *self.args, **self.kwargs - ) + if engine == "numba": + engine_kwargs = {} if engine_kwargs is None else engine_kwargs + + # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has + # incompatible type "Callable[..., Any] | str | list[Callable + # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | + # list[Callable[..., Any] | str]]"; expected "Hashable" + nb_looper = generate_apply_looper( + self.func, **engine_kwargs # type: ignore[arg-type] + ) + result = nb_looper(self.values, self.axis) + # If we made the result 2-D, squeeze it back to 1-D + result = np.squeeze(result) + else: + result = np.apply_along_axis( + wrap_function(self.func), + self.axis, + self.values, + *self.args, + **self.kwargs, + ) # TODO: mixed type case if result.ndim == 2: @@ -962,7 +1059,10 @@ return result def apply_standard(self): - results, res_index = self.apply_series_generator() + if self.engine == "python": + results, res_index = self.apply_series_generator() + else: + results, res_index = self.apply_series_numba() # wrap results return self.wrap_results(results, res_index) @@ -986,6 +1086,19 @@ return results, res_index + def apply_series_numba(self): + if self.engine_kwargs.get("parallel", False): + raise NotImplementedError( + "Parallel apply is not supported when raw=False and engine='numba'" + ) + if not self.obj.index.is_unique or not self.columns.is_unique: + raise NotImplementedError( + "The index/columns must be unique when raw=False and engine='numba'" + ) + self.validate_values_for_numba() + results = self.apply_with_numba() + return results, self.result_index + def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series: from pandas import Series @@ -1022,9 +1135,58 @@ axis: AxisInt = 0 @property - def series_generator(self): + def 
series_generator(self) -> Generator[Series, None, None]: return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) + @staticmethod + @functools.cache + def generate_numba_apply_func( + func, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: + numba = import_optional_dependency("numba") + from pandas import Series + + # Import helper from extensions to cast string object -> np strings + # Note: This also has the side effect of loading our numba extensions + from pandas.core._numba.extensions import maybe_cast_str + + jitted_udf = numba.extending.register_jitable(func) + + # Currently the parallel argument doesn't get passed through here + # (it's disabled) since the dicts in numba aren't thread-safe. + @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) + def numba_func(values, col_names, df_index): + results = {} + for j in range(values.shape[1]): + # Create the series + ser = Series( + values[:, j], index=df_index, name=maybe_cast_str(col_names[j]) + ) + results[j] = jitted_udf(ser) + return results + + return numba_func + + def apply_with_numba(self) -> dict[int, Any]: + nb_func = self.generate_numba_apply_func( + cast(Callable, self.func), **self.engine_kwargs + ) + from pandas.core._numba.extensions import set_numba_data + + index = self.obj.index + if index.dtype == "string": + index = index.astype(object) + + columns = self.obj.columns + if columns.dtype == "string": + columns = columns.astype(object) + + # Convert from numba dict to regular dict + # Our isinstance checks in the df constructor don't pass for numbas typed dict + with set_numba_data(index) as index, set_numba_data(columns) as columns: + res = dict(nb_func(self.values, columns, index)) + return res + @property def result_index(self) -> Index: return self.columns @@ -1083,7 +1245,7 @@ return result.T @property - def series_generator(self): + def series_generator(self) -> Generator[Series, None, None]: values = self.values values = ensure_wrapped_if_datetimelike(values) assert len(values) > 0 @@ -1093,6 +1255,8 @@ ser = self.obj._ixs(0, axis=0) mgr = ser._mgr + is_view = mgr.blocks[0].refs.has_reference() # type: ignore[union-attr] + if isinstance(ser.dtype, ExtensionDtype): # values will be incorrect for this block # TODO(EA2D): special case would be unnecessary with 2D EAs @@ -1106,8 +1270,62 @@ ser._mgr = mgr mgr.set_values(arr) object.__setattr__(ser, "_name", name) + if not is_view: + # In apply_series_generator we store the a shallow copy of the + # result, which potentially increases the ref count of this reused + # `ser` object (depending on the result of the applied function) + # -> if that happened and `ser` is already a copy, then we reset + # the refs here to avoid triggering a unnecessary CoW inside the + # applied function (https://github.com/pandas-dev/pandas/pull/56212) + mgr.blocks[0].refs = BlockValuesRefs(mgr.blocks[0]) # type: ignore[union-attr] yield ser + @staticmethod + @functools.cache + def generate_numba_apply_func( + func, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: + numba = import_optional_dependency("numba") + from pandas import Series + from pandas.core._numba.extensions import maybe_cast_str + + jitted_udf = numba.extending.register_jitable(func) + + @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) + def numba_func(values, col_names_index, index): + results = {} + # Currently the parallel argument doesn't get passed through here + # (it's 
disabled) since the dicts in numba aren't thread-safe. + for i in range(values.shape[0]): + # Create the series + # TODO: values corrupted without the copy + ser = Series( + values[i].copy(), + index=col_names_index, + name=maybe_cast_str(index[i]), + ) + results[i] = jitted_udf(ser) + + return results + + return numba_func + + def apply_with_numba(self) -> dict[int, Any]: + nb_func = self.generate_numba_apply_func( + cast(Callable, self.func), **self.engine_kwargs + ) + + from pandas.core._numba.extensions import set_numba_data + + # Convert from numba dict to regular dict + # Our isinstance checks in the df constructor don't pass for numbas typed dict + with set_numba_data(self.obj.index) as index, set_numba_data( + self.columns + ) as columns: + res = dict(nb_func(self.values, columns, index)) + + return res + @property def result_index(self) -> Index: return self.index @@ -1424,7 +1642,7 @@ def reconstruct_func( func: AggFuncType | None, **kwargs -) -> tuple[bool, AggFuncType, list[str] | None, npt.NDArray[np.intp] | None]: +) -> tuple[bool, AggFuncType, tuple[str, ...] | None, npt.NDArray[np.intp] | None]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. @@ -1450,7 +1668,7 @@ ------- relabelling: bool, if there is relabelling or not func: normalized and mangled func - columns: list of column names + columns: tuple of column names order: array of columns indices Examples @@ -1462,7 +1680,7 @@ (False, 'min', None, None) """ relabeling = func is None and is_multi_agg_with_relabel(**kwargs) - columns: list[str] | None = None + columns: tuple[str, ...] | None = None order: npt.NDArray[np.intp] | None = None if not relabeling: @@ -1478,7 +1696,14 @@ raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") if relabeling: - func, columns, order = normalize_keyword_aggregation(kwargs) + # error: Incompatible types in assignment (expression has type + # "MutableMapping[Hashable, list[Callable[..., Any] | str]]", variable has type + # "Callable[..., Any] | str | list[Callable[..., Any] | str] | + # MutableMapping[Hashable, Callable[..., Any] | str | list[Callable[..., Any] | + # str]] | None") + func, columns, order = normalize_keyword_aggregation( # type: ignore[assignment] + kwargs + ) assert func is not None return relabeling, func, columns, order @@ -1512,7 +1737,11 @@ def normalize_keyword_aggregation( kwargs: dict, -) -> tuple[dict, list[str], npt.NDArray[np.intp]]: +) -> tuple[ + MutableMapping[Hashable, list[AggFuncTypeBase]], + tuple[str, ...], + npt.NDArray[np.intp], +]: """ Normalize user-provided "named aggregation" kwargs. Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs @@ -1526,7 +1755,7 @@ ------- aggspec : dict The transformed kwargs. - columns : List[str] + columns : tuple[str, ...] The user-provided keys. col_idx_order : List[int] List of columns indices. @@ -1541,9 +1770,7 @@ # Normalize the aggregation functions as Mapping[column, List[func]], # process normally, then fixup the names. 
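`normalize_keyword_aggregation` is the step that turns named-aggregation keywords into a column-to-functions mapping plus the requested output names and their order. What that path produces at the user level, with illustrative data:

    import pandas as pd

    df = pd.DataFrame({"kind": ["a", "a", "b"], "height": [1.0, 2.0, 3.0]})

    # min_height=("height", "min") etc. are the kwargs normalized above.
    out = df.groupby("kind").agg(
        min_height=("height", "min"),
        max_height=("height", "max"),
    )
    print(out)
    #       min_height  max_height
    # kind
    # a            1.0         2.0
    # b            3.0         3.0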
# TODO: aggspec type: typing.Dict[str, List[AggScalar]] - # May be hitting https://github.com/python/mypy/issues/5958 - # saying it doesn't have an attribute __name__ - aggspec: DefaultDict = defaultdict(list) + aggspec = defaultdict(list) order = [] columns, pairs = list(zip(*kwargs.items())) diff -Nru pandas-2.1.4+dfsg/pandas/core/array_algos/quantile.py pandas-2.2.2+dfsg/pandas/core/array_algos/quantile.py --- pandas-2.1.4+dfsg/pandas/core/array_algos/quantile.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/array_algos/quantile.py 2024-04-10 17:42:52.000000000 +0000 @@ -102,7 +102,7 @@ interpolation=interpolation, ) - result = np.array(result, copy=False) + result = np.asarray(result) result = result.T return result @@ -201,9 +201,9 @@ ] if values.dtype.kind == "f": # preserve itemsize - result = np.array(result, dtype=values.dtype, copy=False).T + result = np.asarray(result, dtype=values.dtype).T else: - result = np.array(result, copy=False).T + result = np.asarray(result).T if ( result.dtype != values.dtype and not mask.all() diff -Nru pandas-2.1.4+dfsg/pandas/core/array_algos/take.py pandas-2.2.2+dfsg/pandas/core/array_algos/take.py --- pandas-2.1.4+dfsg/pandas/core/array_algos/take.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/array_algos/take.py 2024-04-10 17:42:52.000000000 +0000 @@ -66,8 +66,7 @@ """ Specialized Cython take which sets NaN values in one pass - This dispatches to ``take`` defined on ExtensionArrays. It does not - currently dispatch to ``SparseArray.take`` for sparse ``arr``. + This dispatches to ``take`` defined on ExtensionArrays. Note: this function assumes that the indexer is a valid(ated) indexer with no out of bound indices. diff -Nru pandas-2.1.4+dfsg/pandas/core/arraylike.py pandas-2.2.2+dfsg/pandas/core/arraylike.py --- pandas-2.1.4+dfsg/pandas/core/arraylike.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arraylike.py 2024-04-10 17:42:52.000000000 +0000 @@ -263,7 +263,10 @@ Series, ) from pandas.core.generic import NDFrame - from pandas.core.internals import BlockManager + from pandas.core.internals import ( + ArrayManager, + BlockManager, + ) cls = type(self) @@ -347,7 +350,7 @@ if method == "outer": raise NotImplementedError return result - if isinstance(result, BlockManager): + if isinstance(result, (BlockManager, ArrayManager)): # we went through BlockManager.apply e.g. 
np.sqrt result = self._constructor_from_mgr(result, axes=result.axes) else: diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/_arrow_string_mixins.py pandas-2.2.2+dfsg/pandas/core/arrays/_arrow_string_mixins.py --- pandas-2.1.4+dfsg/pandas/core/arrays/_arrow_string_mixins.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/_arrow_string_mixins.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,9 +4,9 @@ import numpy as np -from pandas.compat import pa_version_under7p0 +from pandas.compat import pa_version_under10p1 -if not pa_version_under7p0: +if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/_mixins.py pandas-2.2.2+dfsg/pandas/core/arrays/_mixins.py --- pandas-2.1.4+dfsg/pandas/core/arrays/_mixins.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/_mixins.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,6 +13,7 @@ from pandas._libs import lib from pandas._libs.arrays import NDArrayBacked +from pandas._libs.tslibs import is_supported_dtype from pandas._typing import ( ArrayLike, AxisInt, @@ -128,17 +129,24 @@ dtype = pandas_dtype(dtype) arr = self._ndarray - if isinstance(dtype, (PeriodDtype, DatetimeTZDtype)): + if isinstance(dtype, PeriodDtype): cls = dtype.construct_array_type() return cls(arr.view("i8"), dtype=dtype) - elif dtype == "M8[ns]": + elif isinstance(dtype, DatetimeTZDtype): + dt_cls = dtype.construct_array_type() + dt64_values = arr.view(f"M8[{dtype.unit}]") + return dt_cls._simple_new(dt64_values, dtype=dtype) + elif lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype): from pandas.core.arrays import DatetimeArray - return DatetimeArray(arr.view("i8"), dtype=dtype) - elif dtype == "m8[ns]": + dt64_values = arr.view(dtype) + return DatetimeArray._simple_new(dt64_values, dtype=dtype) + + elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray(arr.view("i8"), dtype=dtype) + td64_values = arr.view(dtype) + return TimedeltaArray._simple_new(td64_values, dtype=dtype) # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None, @@ -297,7 +305,12 @@ func(self._ndarray.T, limit=limit, mask=mask.T) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: mask = self.isna() if mask.any(): @@ -307,7 +320,7 @@ npvalues = self._ndarray.T if copy: npvalues = npvalues.copy() - func(npvalues, limit=limit, mask=mask.T) + func(npvalues, limit=limit, limit_area=limit_area, mask=mask.T) npvalues = npvalues.T if copy: @@ -414,6 +427,12 @@ value = self._validate_setitem_value(value) res_values = np.where(mask, self._ndarray, value) + if res_values.dtype != self._ndarray.dtype: + raise AssertionError( + # GH#56410 + "Something has gone wrong, please report a bug at " + "github.com/pandas-dev/pandas/" + ) return self._from_backing_data(res_values) # ------------------------------------------------------------------------ diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/_ranges.py pandas-2.2.2+dfsg/pandas/core/arrays/_ranges.py --- pandas-2.1.4+dfsg/pandas/core/arrays/_ranges.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/_ranges.py 2024-04-10 17:42:52.000000000 +0000 @@ -54,12 
+54,10 @@ iend = end._value if end is not None else None freq.nanos # raises if non-fixed frequency td = Timedelta(freq) - b: int | np.int64 | np.uint64 - e: int | np.int64 | np.uint64 + b: int + e: int try: - td = td.as_unit( # pyright: ignore[reportGeneralTypeIssues] - unit, round_ok=False - ) + td = td.as_unit(unit, round_ok=False) except ValueError as err: raise ValueError( f"freq={freq} is incompatible with unit={unit}. " @@ -98,7 +96,7 @@ def _generate_range_overflow_safe( endpoint: int, periods: int, stride: int, side: str = "start" -) -> np.int64 | np.uint64: +) -> int: """ Calculate the second endpoint for passing to np.arange, checking to avoid an integer overflow. Catch OverflowError and re-raise @@ -117,7 +115,7 @@ Returns ------- - other_end : np.int64 | np.uint64 + other_end : int Raises ------ @@ -165,7 +163,7 @@ def _generate_range_overflow_safe_signed( endpoint: int, periods: int, stride: int, side: str -) -> np.int64 | np.uint64: +) -> int: """ A special case for _generate_range_overflow_safe where `periods * stride` can be calculated without overflowing int64 bounds. @@ -183,7 +181,7 @@ # Putting this into a DatetimeArray/TimedeltaArray # would incorrectly be interpreted as NaT raise OverflowError - return result + return int(result) except (FloatingPointError, OverflowError): # with endpoint negative and addend positive we risk # FloatingPointError; with reversed signed we risk OverflowError @@ -202,7 +200,7 @@ i64max = np.uint64(i8max) assert uresult > i64max if uresult <= i64max + np.uint64(stride): - return uresult + return int(uresult) raise OutOfBoundsDatetime( f"Cannot generate range with {side}={endpoint} and periods={periods}" diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/_utils.py pandas-2.2.2+dfsg/pandas/core/arrays/_utils.py --- pandas-2.1.4+dfsg/pandas/core/arrays/_utils.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/_utils.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,63 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, +) + +import numpy as np + +from pandas._libs import lib +from pandas.errors import LossySetitemError + +from pandas.core.dtypes.cast import np_can_hold_element +from pandas.core.dtypes.common import is_numeric_dtype + +if TYPE_CHECKING: + from pandas._typing import ( + ArrayLike, + npt, + ) + + +def to_numpy_dtype_inference( + arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool +) -> tuple[npt.DTypeLike, Any]: + if dtype is None and is_numeric_dtype(arr.dtype): + dtype_given = False + if hasna: + if arr.dtype.kind == "b": + dtype = np.dtype(np.object_) + else: + if arr.dtype.kind in "iu": + dtype = np.dtype(np.float64) + else: + dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] + if na_value is lib.no_default: + na_value = np.nan + else: + dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] + elif dtype is not None: + dtype = np.dtype(dtype) + dtype_given = True + else: + dtype_given = True + + if na_value is lib.no_default: + if dtype is None or not hasna: + na_value = arr.dtype.na_value + elif dtype.kind == "f": # type: ignore[union-attr] + na_value = np.nan + elif dtype.kind == "M": # type: ignore[union-attr] + na_value = np.datetime64("nat") + elif dtype.kind == "m": # type: ignore[union-attr] + na_value = np.timedelta64("nat") + else: + na_value = arr.dtype.na_value + + if not dtype_given and hasna: + try: + np_can_hold_element(dtype, na_value) # type: ignore[arg-type] + except LossySetitemError: + dtype = np.dtype(np.object_) + return 
dtype, na_value diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/arrow/__init__.py pandas-2.2.2+dfsg/pandas/core/arrays/arrow/__init__.py --- pandas-2.1.4+dfsg/pandas/core/arrays/arrow/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/arrow/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,3 +1,7 @@ +from pandas.core.arrays.arrow.accessors import ( + ListAccessor, + StructAccessor, +) from pandas.core.arrays.arrow.array import ArrowExtensionArray -__all__ = ["ArrowExtensionArray"] +__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"] diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/arrow/accessors.py pandas-2.2.2+dfsg/pandas/core/arrays/arrow/accessors.py --- pandas-2.1.4+dfsg/pandas/core/arrays/arrow/accessors.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/arrow/accessors.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,473 @@ +"""Accessors for arrow-backed data.""" + +from __future__ import annotations + +from abc import ( + ABCMeta, + abstractmethod, +) +from typing import ( + TYPE_CHECKING, + cast, +) + +from pandas.compat import ( + pa_version_under10p1, + pa_version_under11p0, +) + +from pandas.core.dtypes.common import is_list_like + +if not pa_version_under10p1: + import pyarrow as pa + import pyarrow.compute as pc + + from pandas.core.dtypes.dtypes import ArrowDtype + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pandas import ( + DataFrame, + Series, + ) + + +class ArrowAccessor(metaclass=ABCMeta): + @abstractmethod + def __init__(self, data, validation_msg: str) -> None: + self._data = data + self._validation_msg = validation_msg + self._validate(data) + + @abstractmethod + def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: + pass + + def _validate(self, data): + dtype = data.dtype + if not isinstance(dtype, ArrowDtype): + # Raise AttributeError so that inspect can handle non-struct Series. + raise AttributeError(self._validation_msg.format(dtype=dtype)) + + if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype): + # Raise AttributeError so that inspect can handle invalid Series. + raise AttributeError(self._validation_msg.format(dtype=dtype)) + + @property + def _pa_array(self): + return self._data.array._pa_array + + +class ListAccessor(ArrowAccessor): + """ + Accessor object for list data properties of the Series values. + + Parameters + ---------- + data : Series + Series containing Arrow list data. + """ + + def __init__(self, data=None) -> None: + super().__init__( + data, + validation_msg="Can only use the '.list' accessor with " + "'list[pyarrow]' dtype, not {dtype}.", + ) + + def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: + return ( + pa.types.is_list(pyarrow_dtype) + or pa.types.is_fixed_size_list(pyarrow_dtype) + or pa.types.is_large_list(pyarrow_dtype) + ) + + def len(self) -> Series: + """ + Return the length of each list in the Series. + + Returns + ------- + pandas.Series + The length of each list. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=pd.ArrowDtype(pa.list_( + ... pa.int64() + ... )) + ... ) + >>> s.list.len() + 0 3 + 1 1 + dtype: int32[pyarrow] + """ + from pandas import Series + + value_lengths = pc.list_value_length(self._pa_array) + return Series(value_lengths, dtype=ArrowDtype(value_lengths.type)) + + def __getitem__(self, key: int | slice) -> Series: + """ + Index or slice lists in the Series. 
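The `ListAccessor` defined above is exposed as `Series.list` for pyarrow list dtypes in pandas 2.2 (pyarrow >= 10.0.1 required; slicing additionally needs pyarrow 11). A user-level sketch mirroring the doctest examples in these docstrings:

    import pandas as pd
    import pyarrow as pa

    s = pd.Series(
        [[1, 2, 3], [3]],
        dtype=pd.ArrowDtype(pa.list_(pa.int64())),
    )
    print(s.list.len())      # lengths: 3 and 1
    print(s.list[0])         # first element of each list: 1 and 3
    print(s.list.flatten())  # 1, 2, 3, 3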
+ + Parameters + ---------- + key : int | slice + Index or slice of indices to access from each list. + + Returns + ------- + pandas.Series + The list at requested index. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=pd.ArrowDtype(pa.list_( + ... pa.int64() + ... )) + ... ) + >>> s.list[0] + 0 1 + 1 3 + dtype: int64[pyarrow] + """ + from pandas import Series + + if isinstance(key, int): + # TODO: Support negative key but pyarrow does not allow + # element index to be an array. + # if key < 0: + # key = pc.add(key, pc.list_value_length(self._pa_array)) + element = pc.list_element(self._pa_array, key) + return Series(element, dtype=ArrowDtype(element.type)) + elif isinstance(key, slice): + if pa_version_under11p0: + raise NotImplementedError( + f"List slice not supported by pyarrow {pa.__version__}." + ) + + # TODO: Support negative start/stop/step, ideally this would be added + # upstream in pyarrow. + start, stop, step = key.start, key.stop, key.step + if start is None: + # TODO: When adding negative step support + # this should be setto last element of array + # when step is negative. + start = 0 + if step is None: + step = 1 + sliced = pc.list_slice(self._pa_array, start, stop, step) + return Series(sliced, dtype=ArrowDtype(sliced.type)) + else: + raise ValueError(f"key must be an int or slice, got {type(key).__name__}") + + def __iter__(self) -> Iterator: + raise TypeError(f"'{type(self).__name__}' object is not iterable") + + def flatten(self) -> Series: + """ + Flatten list values. + + Returns + ------- + pandas.Series + The data from all lists in the series flattened. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=pd.ArrowDtype(pa.list_( + ... pa.int64() + ... )) + ... ) + >>> s.list.flatten() + 0 1 + 1 2 + 2 3 + 3 3 + dtype: int64[pyarrow] + """ + from pandas import Series + + flattened = pc.list_flatten(self._pa_array) + return Series(flattened, dtype=ArrowDtype(flattened.type)) + + +class StructAccessor(ArrowAccessor): + """ + Accessor object for structured data properties of the Series values. + + Parameters + ---------- + data : Series + Series containing Arrow struct data. + """ + + def __init__(self, data=None) -> None: + super().__init__( + data, + validation_msg=( + "Can only use the '.struct' accessor with 'struct[pyarrow]' " + "dtype, not {dtype}." + ), + ) + + def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: + return pa.types.is_struct(pyarrow_dtype) + + @property + def dtypes(self) -> Series: + """ + Return the dtype object of each child field of the struct. + + Returns + ------- + pandas.Series + The data type of each child field. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... 
) + >>> s.struct.dtypes + version int64[pyarrow] + project string[pyarrow] + dtype: object + """ + from pandas import ( + Index, + Series, + ) + + pa_type = self._data.dtype.pyarrow_dtype + types = [ArrowDtype(struct.type) for struct in pa_type] + names = [struct.name for struct in pa_type] + return Series(types, index=Index(names)) + + def field( + self, + name_or_index: list[str] + | list[bytes] + | list[int] + | pc.Expression + | bytes + | str + | int, + ) -> Series: + """ + Extract a child field of a struct as a Series. + + Parameters + ---------- + name_or_index : str | bytes | int | expression | list + Name or index of the child field to extract. + + For list-like inputs, this will index into a nested + struct. + + Returns + ------- + pandas.Series + The data corresponding to the selected child field. + + See Also + -------- + Series.struct.explode : Return all child fields as a DataFrame. + + Notes + ----- + The name of the resulting Series will be set using the following + rules: + + - For string, bytes, or integer `name_or_index` (or a list of these, for + a nested selection), the Series name is set to the selected + field's name. + - For a :class:`pyarrow.compute.Expression`, this is set to + the string form of the expression. + - For list-like `name_or_index`, the name will be set to the + name of the final field selected. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + Extract by field name. + + >>> s.struct.field("project") + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string[pyarrow] + + Extract by field index. + + >>> s.struct.field(0) + 0 1 + 1 2 + 2 1 + Name: version, dtype: int64[pyarrow] + + Or an expression + + >>> import pyarrow.compute as pc + >>> s.struct.field(pc.field("project")) + 0 pandas + 1 pandas + 2 numpy + Name: project, dtype: string[pyarrow] + + For nested struct types, you can pass a list of values to index + multiple levels: + + >>> version_type = pa.struct([ + ... ("major", pa.int64()), + ... ("minor", pa.int64()), + ... ]) + >>> s = pd.Series( + ... [ + ... {"version": {"major": 1, "minor": 5}, "project": "pandas"}, + ... {"version": {"major": 2, "minor": 1}, "project": "pandas"}, + ... {"version": {"major": 1, "minor": 26}, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", version_type), ("project", pa.string())] + ... )) + ... ) + >>> s.struct.field(["version", "minor"]) + 0 5 + 1 1 + 2 26 + Name: minor, dtype: int64[pyarrow] + >>> s.struct.field([0, 0]) + 0 1 + 1 2 + 2 1 + Name: major, dtype: int64[pyarrow] + """ + from pandas import Series + + def get_name( + level_name_or_index: list[str] + | list[bytes] + | list[int] + | pc.Expression + | bytes + | str + | int, + data: pa.ChunkedArray, + ): + if isinstance(level_name_or_index, int): + name = data.type.field(level_name_or_index).name + elif isinstance(level_name_or_index, (str, bytes)): + name = level_name_or_index + elif isinstance(level_name_or_index, pc.Expression): + name = str(level_name_or_index) + elif is_list_like(level_name_or_index): + # For nested input like [2, 1, 2] + # iteratively get the struct and field name. The last + # one is used for the name of the index. 
+ level_name_or_index = list(reversed(level_name_or_index)) + selected = data + while level_name_or_index: + # we need the cast, otherwise mypy complains about + # getting ints, bytes, or str here, which isn't possible. + level_name_or_index = cast(list, level_name_or_index) + name_or_index = level_name_or_index.pop() + name = get_name(name_or_index, selected) + selected = selected.type.field(selected.type.get_field_index(name)) + name = selected.name + else: + raise ValueError( + "name_or_index must be an int, str, bytes, " + "pyarrow.compute.Expression, or list of those" + ) + return name + + pa_arr = self._data.array._pa_array + name = get_name(name_or_index, pa_arr) + field_arr = pc.struct_field(pa_arr, name_or_index) + + return Series( + field_arr, + dtype=ArrowDtype(field_arr.type), + index=self._data.index, + name=name, + ) + + def explode(self) -> DataFrame: + """ + Extract all child fields of a struct as a DataFrame. + + Returns + ------- + pandas.DataFrame + The data corresponding to all child fields. + + See Also + -------- + Series.struct.field : Return a single child field as a Series. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, + ... ], + ... dtype=pd.ArrowDtype(pa.struct( + ... [("version", pa.int64()), ("project", pa.string())] + ... )) + ... ) + + >>> s.struct.explode() + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy + """ + from pandas import concat + + pa_type = self._pa_array.type + return concat( + [self.field(i) for i in range(pa_type.num_fields)], axis="columns" + ) diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/arrow/array.py pandas-2.2.2+dfsg/pandas/core/arrays/arrow/array.py --- pandas-2.1.4+dfsg/pandas/core/arrays/arrow/array.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/arrow/array.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,39 +17,46 @@ from pandas._libs import lib from pandas._libs.tslibs import ( + NaT, Timedelta, Timestamp, timezones, ) from pandas.compat import ( - pa_version_under7p0, - pa_version_under8p0, - pa_version_under9p0, + pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, ) from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs -from pandas.core.dtypes.cast import infer_dtype_from_scalar +from pandas.core.dtypes.cast import ( + can_hold_element, + infer_dtype_from_scalar, +) from pandas.core.dtypes.common import ( CategoricalDtype, is_array_like, is_bool_dtype, + is_float_dtype, is_integer, is_list_like, - is_object_dtype, + is_numeric_dtype, is_scalar, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas.core import ( + algorithms as algos, missing, + ops, roperator, ) +from pandas.core.algorithms import map_array from pandas.core.arraylike import OpsMixin from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin +from pandas.core.arrays._utils import to_numpy_dtype_inference from pandas.core.arrays.base import ( ExtensionArray, ExtensionArraySupportsAnyAll, @@ -67,7 +74,7 @@ from pandas.io._util import _arrow_dtype_mapping from pandas.tseries.frequencies import to_offset -if not pa_version_under7p0: +if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc @@ -102,25 +109,50 @@ def cast_for_truediv( arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar - ) -> pa.ChunkedArray: + ) 
-> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]: # Ensure int / int -> float mirroring Python/Numpy behavior # as pc.divide_checked(int, int) -> int if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( pa_object.type ): - return arrow_array.cast(pa.float64()) - return arrow_array + # GH: 56645. + # https://github.com/apache/arrow/issues/35563 + return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast( + pa_object, pa.float64(), safe=False + ) + + return arrow_array, pa_object def floordiv_compat( left: pa.ChunkedArray | pa.Array | pa.Scalar, right: pa.ChunkedArray | pa.Array | pa.Scalar, ) -> pa.ChunkedArray: - # Ensure int // int -> int mirroring Python/Numpy behavior - # as pc.floor(pc.divide_checked(int, int)) -> float - converted_left = cast_for_truediv(left, right) - result = pc.floor(pc.divide(converted_left, right)) + # TODO: Replace with pyarrow floordiv kernel. + # https://github.com/apache/arrow/issues/39386 if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): + divided = pc.divide_checked(left, right) + if pa.types.is_signed_integer(divided.type): + # GH 56676 + has_remainder = pc.not_equal(pc.multiply(divided, right), left) + has_one_negative_operand = pc.less( + pc.bit_wise_xor(left, right), + pa.scalar(0, type=divided.type), + ) + result = pc.if_else( + pc.and_( + has_remainder, + has_one_negative_operand, + ), + # GH: 55561 + pc.subtract(divided, pa.scalar(1, type=divided.type)), + divided, + ) + else: + result = divided result = result.cast(left.type) + else: + divided = pc.divide(left, right) + result = pc.floor(divided) return result ARROW_ARITHMETIC_FUNCS = { @@ -130,8 +162,8 @@ "rsub": lambda x, y: pc.subtract_checked(y, x), "mul": pc.multiply_checked, "rmul": lambda x, y: pc.multiply_checked(y, x), - "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y), - "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)), + "truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)), + "rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)), "floordiv": lambda x, y: floordiv_compat(x, y), "rfloordiv": lambda x, y: floordiv_compat(y, x), "mod": NotImplemented, @@ -150,6 +182,7 @@ AxisInt, Dtype, FillnaOptions, + InterpolateOptions, Iterator, NpDtype, NumpySorter, @@ -254,8 +287,8 @@ _dtype: ArrowDtype def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: - if pa_version_under7p0: - msg = "pyarrow>=7.0.0 is required for PyArrow backed ArrowExtensionArray." + if pa_version_under10p1: + msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray." 
raise ImportError(msg) if isinstance(values, pa.Array): self._pa_array = pa.chunked_array([values]) @@ -289,6 +322,7 @@ pa_type is None or pa.types.is_binary(pa_type) or pa.types.is_string(pa_type) + or pa.types.is_large_string(pa_type) ): # pa_type is None: Let pa.array infer # pa_type is string/binary: scalars already correct type @@ -487,7 +521,23 @@ if pa.types.is_dictionary(pa_type): pa_array = pa_array.dictionary_encode() else: - pa_array = pa_array.cast(pa_type) + try: + pa_array = pa_array.cast(pa_type) + except ( + pa.ArrowInvalid, + pa.ArrowTypeError, + pa.ArrowNotImplementedError, + ): + if pa.types.is_string(pa_array.type) or pa.types.is_large_string( + pa_array.type + ): + # TODO: Move logic in _from_sequence_of_strings into + # _box_pa_array + return cls._from_sequence_of_strings( + value, dtype=pa_type + )._pa_array + else: + raise return pa_array @@ -606,7 +656,9 @@ """Convert myself to a pyarrow ChunkedArray.""" return self._pa_array - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" return self.to_numpy(dtype=dtype) @@ -614,6 +666,11 @@ # This is a bit wise op for integer types if pa.types.is_integer(self._pa_array.type): return type(self)(pc.bit_wise_not(self._pa_array)) + elif pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( + self._pa_array.type + ): + # Raise TypeError instead of pa.ArrowNotImplementedError + raise TypeError("__invert__ is not supported for string dtypes") else: return type(self)(pc.invert(self._pa_array)) @@ -654,7 +711,11 @@ mask = isna(self) | isna(other) valid = ~mask result = np.zeros(len(self), dtype="bool") - result[valid] = op(np.array(self)[valid], other) + np_array = np.array(self) + try: + result[valid] = op(np_array[valid], other) + except TypeError: + result = ops.invalid_comparison(np_array, other, op) result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) else: @@ -667,17 +728,38 @@ pa_type = self._pa_array.type other = self._box_pa(other) - if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [ - operator.add, - roperator.radd, - ]: - sep = pa.scalar("", type=pa_type) - if op is operator.add: - result = pc.binary_join_element_wise(self._pa_array, other, sep) - else: - result = pc.binary_join_element_wise(other, self._pa_array, sep) + if ( + pa.types.is_string(pa_type) + or pa.types.is_large_string(pa_type) + or pa.types.is_binary(pa_type) + ): + if op in [operator.add, roperator.radd]: + sep = pa.scalar("", type=pa_type) + if op is operator.add: + result = pc.binary_join_element_wise(self._pa_array, other, sep) + elif op is roperator.radd: + result = pc.binary_join_element_wise(other, self._pa_array, sep) + return type(self)(result) + elif op in [operator.mul, roperator.rmul]: + binary = self._pa_array + integral = other + if not pa.types.is_integer(integral.type): + raise TypeError("Can only string multiply by an integer.") + pa_integral = pc.if_else(pc.less(integral, 0), 0, integral) + result = pc.binary_repeat(binary, pa_integral) + return type(self)(result) + elif ( + pa.types.is_string(other.type) + or pa.types.is_binary(other.type) + or pa.types.is_large_string(other.type) + ) and op in [operator.mul, roperator.rmul]: + binary = other + integral = self._pa_array + if not pa.types.is_integer(integral.type): + raise TypeError("Can only string multiply by an integer.") + pa_integral 
= pc.if_else(pc.less(integral, 0), 0, integral) + result = pc.binary_repeat(binary, pa_integral) return type(self)(result) - if ( isinstance(other, pa.Scalar) and pc.is_null(other).as_py() @@ -946,13 +1028,18 @@ return type(self)(pc.drop_null(self._pa_array)) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: if not self._hasna: # TODO(CoW): Not necessary anymore when CoW is the default return self.copy() - if limit is None: + if limit is None and limit_area is None: method = missing.clean_fill_method(method) try: if method == "pad": @@ -968,7 +1055,9 @@ # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. - return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) @doc(ExtensionArray.fillna) def fillna( @@ -1016,7 +1105,7 @@ return super().fillna(value=value, method=method, limit=limit, copy=copy) - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: # short-circuit to return all False array. if not len(values): return np.zeros(len(self), dtype=bool) @@ -1123,7 +1212,16 @@ if isinstance(value, ExtensionArray): value = value.astype(object) # Base class searchsorted would cast to object, which is *much* slower. - return self.to_numpy().searchsorted(value, side=side, sorter=sorter) + dtype = None + if isinstance(self.dtype, ArrowDtype): + pa_dtype = self.dtype.pyarrow_dtype + if ( + pa.types.is_timestamp(pa_dtype) or pa.types.is_duration(pa_dtype) + ) and pa_dtype.unit == "ns": + # np.array[datetime/timedelta].searchsorted(datetime/timedelta) + # erroneously fails when numpy type resolution is nanoseconds + dtype = object + return self.to_numpy(dtype=dtype).searchsorted(value, side=side, sorter=sorter) def take( self, @@ -1252,6 +1350,11 @@ np_array = np_array.astype(np_dtype) return TimedeltaArray._simple_new(np_array, dtype=np_dtype) + def _values_for_json(self) -> np.ndarray: + if is_numeric_dtype(self.dtype): + return np.asarray(self, dtype=object) + return super()._values_for_json() + @doc(ExtensionArray.to_numpy) def to_numpy( self, @@ -1259,50 +1362,88 @@ copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: - if dtype is not None: - dtype = np.dtype(dtype) - elif self._hasna: - dtype = np.dtype(object) - - if na_value is lib.no_default: - na_value = self.dtype.na_value - + original_na_value = na_value + dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna) pa_type = self._pa_array.type + if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): + data = self + else: + data = self.fillna(na_value) + copy = False + if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): - result = self._maybe_convert_datelike_array() - if dtype is None or dtype.kind == "O": - result = result.to_numpy(dtype=object, na_value=na_value) - else: - result = result.to_numpy(dtype=dtype) - return result + # GH 55997 + if dtype != object and na_value is self.dtype.na_value: + na_value = lib.no_default + result = data._maybe_convert_datelike_array().to_numpy( + dtype=dtype, na_value=na_value + ) elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type): # convert to list of python datetime.time objects before # wrapping in 
ndarray - result = np.array(list(self), dtype=dtype) - elif is_object_dtype(dtype) and self._hasna: - result = np.empty(len(self), dtype=object) - mask = ~self.isna() - result[mask] = np.asarray(self[mask]._pa_array) - elif pa.types.is_null(self._pa_array.type): - fill_value = None if isna(na_value) else na_value - return np.full(len(self), fill_value=fill_value, dtype=dtype) - elif self._hasna: - data = self.fillna(na_value) + result = np.array(list(data), dtype=dtype) + if data._hasna: + result[data.isna()] = na_value + elif pa.types.is_null(pa_type): + if dtype is not None and isna(na_value): + na_value = None + result = np.full(len(data), fill_value=na_value, dtype=dtype) + elif not data._hasna or ( + pa.types.is_floating(pa_type) + and ( + na_value is np.nan + or original_na_value is lib.no_default + and is_float_dtype(dtype) + ) + ): result = data._pa_array.to_numpy() if dtype is not None: result = result.astype(dtype, copy=False) - return result - else: - result = self._pa_array.to_numpy() - if dtype is not None: - result = result.astype(dtype, copy=False) if copy: result = result.copy() - return result - if self._hasna: - result[self.isna()] = na_value + else: + if dtype is None: + empty = pa.array([], type=pa_type).to_numpy(zero_copy_only=False) + if can_hold_element(empty, na_value): + dtype = empty.dtype + else: + dtype = np.object_ + result = np.empty(len(data), dtype=dtype) + mask = data.isna() + result[mask] = na_value + result[~mask] = data[~mask]._pa_array.to_numpy() return result + def map(self, mapper, na_action=None): + if is_numeric_dtype(self.dtype): + return map_array(self.to_numpy(), mapper, na_action=na_action) + else: + return super().map(mapper, na_action) + + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + pa_type = self._pa_array.type + if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type): + values = self.to_numpy(na_value=0) + elif pa.types.is_boolean(pa_type): + values = self.to_numpy(na_value=False) + elif pa.types.is_temporal(pa_type): + if pa_type.bit_width == 32: + pa_type = pa.int32() + else: + pa_type = pa.int64() + arr = self.astype(ArrowDtype(pa_type)) + values = arr.to_numpy(na_value=0) + else: + # factorize the values to avoid the performance penalty of + # converting to object dtype + values = self.factorize()[0] + + mask = self.isna() if self._hasna else None + return algos.duplicated(values, keep=keep, mask=mask) + def unique(self) -> Self: """ Compute the ArrowExtensionArray of unique values. @@ -1389,7 +1530,7 @@ chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] if to_concat[0].dtype == "string": # StringDtype has no attribute pyarrow_dtype - pa_dtype = pa.string() + pa_dtype = pa.large_string() else: pa_dtype = to_concat[0].dtype.pyarrow_dtype arr = pa.chunked_array(chunks, type=pa_dtype) @@ -1642,6 +1783,10 @@ """ See Series.explode.__doc__. """ + # child class explode method supports only list types; return + # default implementation for non list types. 
+ if not pa.types.is_list(self.dtype.pyarrow_dtype): + return super()._explode() values = self counts = pa.compute.list_value_length(values._pa_array) counts = counts.fill_null(1).to_numpy() @@ -1744,7 +1889,7 @@ ascending: bool = True, pct: bool = False, ): - if pa_version_under9p0 or axis != 0: + if axis != 0: ranked = super()._rank( axis=axis, method=method, @@ -1911,6 +2056,45 @@ raise TypeError(msg) from err return value + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> Self: + """ + See NDFrame.interpolate.__doc__. + """ + # NB: we return type(self) even if copy=False + mask = self.isna() + if self.dtype.kind == "f": + data = self._pa_array.to_numpy() + elif self.dtype.kind in "iu": + data = self.to_numpy(dtype="f8", na_value=0.0) + else: + raise NotImplementedError( + f"interpolate is not implemented for dtype={self.dtype}" + ) + + missing.interpolate_2d_inplace( + data, + method=method, + axis=0, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + mask=mask, + **kwargs, + ) + return type(self)(self._box_pa_array(pa.array(data, mask=mask))) + @classmethod def _if_else( cls, @@ -1982,16 +2166,6 @@ if isinstance(replacements, pa.ChunkedArray): # replacements must be array or scalar, not ChunkedArray replacements = replacements.combine_chunks() - if pa_version_under8p0: - # pc.replace_with_mask seems to be a bit unreliable for versions < 8.0: - # version <= 7: segfaults with various types - # version <= 6: fails to replace nulls - if isinstance(replacements, pa.Array): - indices = np.full(len(values), None) - indices[mask] = np.arange(len(replacements)) - indices = pa.array(indices, type=pa.int64()) - replacements = replacements.take(indices) - return cls._if_else(mask, replacements, values) if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type): # GH#52059 replace_with_mask segfaults for chunked array # https://github.com/apache/arrow/issues/34634 @@ -2046,9 +2220,17 @@ **kwargs, ) - masked = self._to_masked() + # maybe convert to a compatible dtype optimized for groupby + values: ExtensionArray + pa_type = self._pa_array.type + if pa.types.is_timestamp(pa_type): + values = self._to_datetimearray() + elif pa.types.is_duration(pa_type): + values = self._to_timedeltaarray() + else: + values = self._to_masked() - result = masked._groupby_op( + result = values._groupby_op( how=how, has_dropped_na=has_dropped_na, min_count=min_count, @@ -2090,14 +2272,36 @@ result = result.fill_null(na) return type(self)(result) - def _str_startswith(self, pat: str, na=None): - result = pc.starts_with(self._pa_array, pattern=pat) + def _str_startswith(self, pat: str | tuple[str, ...], na=None): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. 
+ result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) return type(self)(result) - def _str_endswith(self, pat: str, na=None): - result = pc.ends_with(self._pa_array, pattern=pat) + def _str_endswith(self, pat: str | tuple[str, ...], na=None): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) return type(self)(result) @@ -2118,7 +2322,15 @@ ) func = pc.replace_substring_regex if regex else pc.replace_substring - result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. + pa_max_replacements = None if n < 0 else n + result = func( + self._pa_array, + pattern=pat, + replacement=repl, + max_replacements=pa_max_replacements, + ) return type(self)(result) def _str_repeat(self, repeats: int | Sequence[int]): @@ -2139,7 +2351,7 @@ def _str_fullmatch( self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None ): - if not pat.endswith("$") or pat.endswith("//$"): + if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" return self._str_match(pat, case, flags, na) @@ -2148,7 +2360,8 @@ slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) result = pc.find_substring(slices, sub) not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) + start_offset = max(0, start) + offset_result = pc.add(result, start_offset) result = pc.if_else(not_found, result, offset_result) elif start == 0 and end is None: slices = self._pa_array @@ -2160,7 +2373,9 @@ return type(self)(result) def _str_join(self, sep: str): - if pa.types.is_string(self._pa_array.type): + if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( + self._pa_array.type + ): result = self._apply_elementwise(list) result = pa.chunked_array(result, type=pa.list_(pa.string())) else: @@ -2246,11 +2461,11 @@ return type(self)(result) def _str_removeprefix(self, prefix: str): - # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed - # starts_with = pc.starts_with(self._pa_array, pattern=prefix) - # removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - # result = pc.if_else(starts_with, removed, self._pa_array) - # return type(self)(result) + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) predicate = lambda val: val.removeprefix(prefix) result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) @@ -2266,9 +2481,19 @@ return type(self)(pa.chunked_array(result)) def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): - raise NotImplementedError( - "str.extract not supported with pd.ArrowDtype(pa.string())." 
- ) + if flags: + raise NotImplementedError("Only flags=0 is implemented.") + groups = re.compile(pat).groupindex.keys() + if len(groups) == 0: + raise ValueError(f"{pat=} must contain a symbolic group name.") + result = pc.extract_regex(self._pa_array, pat) + if expand: + return { + col: type(self)(pc.struct_field(result, [i])) + for col, i in zip(groups, range(result.type.num_fields)) + } + else: + return type(self)(pc.struct_field(result, [0])) def _str_findall(self, pat: str, flags: int = 0): regex = re.compile(pat, flags=flags) @@ -2354,6 +2579,92 @@ return type(self)(pa.chunked_array(result)) @property + def _dt_days(self): + return type(self)( + pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32()) + ) + + @property + def _dt_hours(self): + return type(self)( + pa.array( + [ + td.components.hours if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_minutes(self): + return type(self)( + pa.array( + [ + td.components.minutes if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_seconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32() + ) + ) + + @property + def _dt_milliseconds(self): + return type(self)( + pa.array( + [ + td.components.milliseconds if td is not NaT else None + for td in self._to_timedeltaarray() + ], + type=pa.int32(), + ) + ) + + @property + def _dt_microseconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().microseconds, + from_pandas=True, + type=pa.int32(), + ) + ) + + @property + def _dt_nanoseconds(self): + return type(self)( + pa.array( + self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32() + ) + ) + + def _dt_to_pytimedelta(self): + data = self._pa_array.to_pylist() + if self._dtype.pyarrow_dtype.unit == "ns": + data = [None if ts is None else ts.to_pytimedelta() for ts in data] + return np.array(data, dtype=object) + + def _dt_total_seconds(self): + return type(self)( + pa.array(self._to_timedeltaarray().total_seconds(), from_pandas=True) + ) + + def _dt_as_unit(self, unit: str): + if pa.types.is_date(self.dtype.pyarrow_dtype): + raise NotImplementedError("as_unit not implemented for date types") + pd_array = self._maybe_convert_datelike_array() + # Don't just cast _pa_array in order to follow pandas unit conversion rules + return type(self)(pa.array(pd_array.as_unit(unit), from_pandas=True)) + + @property def _dt_year(self): return type(self)(pc.year(self._pa_array)) @@ -2513,20 +2824,20 @@ if offset is None: raise ValueError(f"Must specify a valid frequency: {freq}") pa_supported_unit = { - "A": "year", - "AS": "year", + "Y": "year", + "YS": "year", "Q": "quarter", "QS": "quarter", "M": "month", "MS": "month", "W": "week", "D": "day", - "H": "hour", - "T": "minute", - "S": "second", - "L": "millisecond", - "U": "microsecond", - "N": "nanosecond", + "h": "hour", + "min": "minute", + "s": "second", + "ms": "millisecond", + "us": "microsecond", + "ns": "nanosecond", } unit = pa_supported_unit.get(offset._prefix, None) if unit is None: diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/arrow/extension_types.py pandas-2.2.2+dfsg/pandas/core/arrays/arrow/extension_types.py --- pandas-2.1.4+dfsg/pandas/core/arrays/arrow/extension_types.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/arrow/extension_types.py 2024-04-10 17:42:52.000000000 +0000 @@ -50,7 +50,7 @@ def __hash__(self) 
-> int: return hash((str(self), self.freq)) - def to_pandas_dtype(self): + def to_pandas_dtype(self) -> PeriodDtype: return PeriodDtype(freq=self.freq) @@ -107,7 +107,7 @@ def __hash__(self) -> int: return hash((str(self), str(self.subtype), self.closed)) - def to_pandas_dtype(self): + def to_pandas_dtype(self) -> IntervalDtype: return IntervalDtype(self.subtype.to_pandas_dtype(), self.closed) diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/base.py pandas-2.2.2+dfsg/pandas/core/arrays/base.py --- pandas-2.1.4+dfsg/pandas/core/arrays/base.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/base.py 2024-04-10 17:42:52.000000000 +0000 @@ -61,6 +61,7 @@ roperator, ) from pandas.core.algorithms import ( + duplicated, factorize_array, isin, map_array, @@ -69,6 +70,7 @@ unique, ) from pandas.core.array_algos.quantile import quantile_with_mask +from pandas.core.missing import _fill_limit_area_1d from pandas.core.sorting import ( nargminmax, nargsort, @@ -85,6 +87,7 @@ AstypeArg, AxisInt, Dtype, + DtypeObj, FillnaOptions, InterpolateOptions, NumpySorter, @@ -125,6 +128,7 @@ astype copy dropna + duplicated factorize fillna equals @@ -142,6 +146,7 @@ view _accumulate _concat_same_type + _explode _formatter _from_factorized _from_sequence @@ -291,6 +296,38 @@ raise AbstractMethodError(cls) @classmethod + def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: + """ + Strict analogue to _from_sequence, allowing only sequences of scalars + that should be specifically inferred to the given dtype. + + Parameters + ---------- + scalars : sequence + dtype : ExtensionDtype + + Raises + ------ + TypeError or ValueError + + Notes + ----- + This is called in a try/except block when casting the result of a + pointwise operation. + """ + try: + return cls._from_sequence(scalars, dtype=dtype, copy=False) + except (ValueError, TypeError): + raise + except Exception: + warnings.warn( + "_from_scalars should only raise ValueError or TypeError. " + "Consider overriding _from_scalars where appropriate.", + stacklevel=find_stack_level(), + ) + raise + + @classmethod def _from_sequence_of_strings( cls, strings, *, dtype: Dtype | None = None, copy: bool = False ): @@ -419,7 +456,7 @@ ------- None """ - # Some notes to the ExtensionArray implementor who may have ended up + # Some notes to the ExtensionArray implementer who may have ended up # here. While this method is not required for the interface, if you # *do* choose to implement __setitem__, then some semantics should be # observed: @@ -479,7 +516,7 @@ return (item == self).any() # type: ignore[union-attr] # error: Signature of "__eq__" incompatible with supertype "object" - def __eq__(self, other: Any) -> ArrayLike: # type: ignore[override] + def __eq__(self, other: object) -> ArrayLike: # type: ignore[override] """ Return for `self == other` (element-wise equality). """ @@ -492,11 +529,12 @@ raise AbstractMethodError(self) # error: Signature of "__ne__" incompatible with supertype "object" - def __ne__(self, other: Any) -> ArrayLike: # type: ignore[override] + def __ne__(self, other: object) -> ArrayLike: # type: ignore[override] """ Return for `self != other` (element-wise in-equality). 
""" - return ~(self == other) + # error: Unsupported operand type for ~ ("ExtensionArray") + return ~(self == other) # type: ignore[operator] def to_numpy( self, @@ -681,7 +719,10 @@ return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy) - return np.array(self, dtype=dtype, copy=copy) + if not copy: + return np.asarray(self, dtype=dtype) + else: + return np.array(self, dtype=dtype, copy=copy) def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: """ @@ -738,7 +779,7 @@ Notes ----- The caller is responsible for *not* modifying these values in-place, so - it is safe for implementors to give views on ``self``. + it is safe for implementers to give views on ``self``. Functions that use this (e.g. ``ExtensionArray.argsort``) should ignore entries with missing values in the original array (according to @@ -796,7 +837,7 @@ >>> arr.argsort() array([1, 2, 0, 4, 3]) """ - # Implementor note: You have two places to override the behavior of + # Implementer note: You have two places to override the behavior of # argsort. # 1. _values_for_argsort : construct the values passed to np.argsort # 2. argsort : total control over sorting. In case of overriding this, @@ -837,7 +878,7 @@ >>> arr.argmin() 1 """ - # Implementor note: You have two places to override the behavior of + # Implementer note: You have two places to override the behavior of # argmin. # 1. _values_for_argsort : construct the values used in nargminmax # 2. argmin itself : total control over sorting. @@ -871,7 +912,7 @@ >>> arr.argmax() 3 """ - # Implementor note: You have two places to override the behavior of + # Implementer note: You have two places to override the behavior of # argmax. # 1. _values_for_argsort : construct the values used in nargminmax # 2. argmax itself : total control over sorting. @@ -917,7 +958,12 @@ ) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: """ Pad or backfill values, used by Series/DataFrame ffill and bfill. @@ -975,6 +1021,12 @@ DeprecationWarning, stacklevel=find_stack_level(), ) + if limit_area is not None: + raise NotImplementedError( + f"{type(self).__name__} does not implement limit_area " + "(added in pandas 2.2). 3rd-party ExtnsionArray authors " + "need to add this argument to _pad_or_backfill." + ) return self.fillna(method=method, limit=limit) mask = self.isna() @@ -984,6 +1036,8 @@ meth = missing.clean_fill_method(method) npmask = np.asarray(mask) + if limit_area is not None and not npmask.all(): + _fill_limit_area_1d(npmask, limit_area) if meth == "pad": indexer = libalgos.get_fill_indexer(npmask, limit=limit) return self.take(indexer, allow_fill=True) @@ -1113,6 +1167,31 @@ # error: Unsupported operand type for ~ ("ExtensionArray") return self[~self.isna()] # type: ignore[operator] + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + """ + Return boolean ndarray denoting duplicate values. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. 
+ + Returns + ------- + ndarray[bool] + + Examples + -------- + >>> pd.array([1, 1, 2, 3, 3], dtype="Int64").duplicated() + array([False, True, False, False, True]) + """ + mask = self.isna().astype(np.bool_, copy=False) + return duplicated(values=self, keep=keep, mask=mask) + def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: """ Shift values by desired number. @@ -1293,7 +1372,7 @@ equal_na = self.isna() & other.isna() # type: ignore[operator] return bool((equal_values | equal_na).all()) - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: """ Pointwise comparison for set containment in the given values. @@ -1301,7 +1380,7 @@ Parameters ---------- - values : Sequence + values : np.ndarray or ExtensionArray Returns ------- @@ -1634,7 +1713,14 @@ self, self._formatter(), indent_for_name=False ).rstrip(", \n") class_name = f"<{type(self).__name__}>\n" - return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}" + footer = self._get_repr_footer() + return f"{class_name}{data}\n{footer}" + + def _get_repr_footer(self) -> str: + # GH#24278 + if self.ndim > 1: + return f"Shape: {self.shape}, dtype: {self.dtype}" + return f"Length: {len(self)}, dtype: {self.dtype}" def _repr_2d(self) -> str: from pandas.io.formats.printing import format_object_summary @@ -1650,7 +1736,8 @@ ] data = ",\n".join(lines) class_name = f"<{type(self).__name__}>" - return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}" + footer = self._get_repr_footer() + return f"{class_name}\n[\n{data}\n]\n{footer}" def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]: """ @@ -1700,6 +1787,17 @@ Because ExtensionArrays are always 1D, this is a no-op. It is included for compatibility with np.ndarray. + + Returns + ------- + ExtensionArray + + Examples + -------- + >>> pd.array([1, 2, 3]).transpose() + + [1, 2, 3] + Length: 3, dtype: Int64 """ return self[:] @@ -1913,7 +2011,7 @@ ... hash_key="1000000000000000", ... categorize=False ... ) - array([11381023671546835630, 4641644667904626417], dtype=uint64) + array([ 6238072747940578789, 15839785061582574730], dtype=uint64) """ from pandas.core.util.hashing import hash_array @@ -1922,6 +2020,41 @@ values, encoding=encoding, hash_key=hash_key, categorize=categorize ) + def _explode(self) -> tuple[Self, npt.NDArray[np.uint64]]: + """ + Transform each element of list-like to a row. + + For arrays that do not contain list-like elements the default + implementation of this method just returns a copy and an array + of ones (unchanged index). + + Returns + ------- + ExtensionArray + Array with the exploded values. + np.ndarray[uint64] + The original lengths of each list-like for determining the + resulting index. + + See Also + -------- + Series.explode : The method on the ``Series`` object that this + extension array method is meant to support. + + Examples + -------- + >>> import pyarrow as pa + >>> a = pd.array([[1, 2, 3], [4], [5, 6]], + ... dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + >>> a._explode() + ( + [1, 2, 3, 4, 5, 6] + Length: 6, dtype: int64[pyarrow], array([3, 1, 2], dtype=int32)) + """ + values = self.copy() + counts = np.ones(shape=(len(self),), dtype=np.uint64) + return values, counts + def tolist(self) -> list: """ Return a list of the values. 
@@ -2034,6 +2167,7 @@ result[~mask] = val return result + # TODO(3.0): this can be removed once GH#33302 deprecation is enforced def _fill_mask_inplace( self, method: str, limit: int | None, mask: npt.NDArray[np.bool_] ) -> None: diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/boolean.py pandas-2.2.2+dfsg/pandas/core/arrays/boolean.py --- pandas-2.1.4+dfsg/pandas/core/arrays/boolean.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/boolean.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,7 @@ import numbers from typing import ( TYPE_CHECKING, + ClassVar, cast, ) @@ -60,7 +61,7 @@ BooleanDtype """ - name = "boolean" + name: ClassVar[str] = "boolean" # https://github.com/python/mypy/issues/4125 # error: Signature of "type" incompatible with supertype "BaseMaskedDtype" diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/categorical.py pandas-2.2.2+dfsg/pandas/core/arrays/categorical.py --- pandas-2.1.4+dfsg/pandas/core/arrays/categorical.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/categorical.py 2024-04-10 17:42:52.000000000 +0000 @@ -44,7 +44,9 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, + CategoricalDtypeType, ExtensionDtype, ) from pandas.core.dtypes.generic import ( @@ -101,6 +103,7 @@ AstypeArg, AxisInt, Dtype, + DtypeObj, NpDtype, Ordered, Self, @@ -442,24 +445,32 @@ values = arr if dtype.categories is None: - if not isinstance(values, ABCIndex): - # in particular RangeIndex xref test_index_equal_range_categories - values = sanitize_array(values, None) - try: - codes, categories = factorize(values, sort=True) - except TypeError as err: - codes, categories = factorize(values, sort=False) - if dtype.ordered: - # raise, as we don't have a sortable data structure and so - # the user should give us one by specifying categories - raise TypeError( - "'values' is not ordered, please " - "explicitly specify the categories order " - "by passing in a categories argument." - ) from err + if isinstance(values.dtype, ArrowDtype) and issubclass( + values.dtype.type, CategoricalDtypeType + ): + arr = values._pa_array.combine_chunks() + categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype) + codes = arr.indices.to_numpy() + dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) + else: + if not isinstance(values, ABCIndex): + # in particular RangeIndex xref test_index_equal_range_categories + values = sanitize_array(values, None) + try: + codes, categories = factorize(values, sort=True) + except TypeError as err: + codes, categories = factorize(values, sort=False) + if dtype.ordered: + # raise, as we don't have a sortable data structure and so + # the user should give us one by specifying categories + raise TypeError( + "'values' is not ordered, please " + "explicitly specify the categories order " + "by passing in a categories argument." + ) from err - # we're inferring from values - dtype = CategoricalDtype(categories, dtype.ordered) + # we're inferring from values + dtype = CategoricalDtype(categories, dtype.ordered) elif isinstance(values.dtype, CategoricalDtype): old_codes = extract_array(values)._codes @@ -509,6 +520,22 @@ ) -> Self: return cls(scalars, dtype=dtype, copy=copy) + @classmethod + def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: + if dtype is None: + # The _from_scalars strictness doesn't make much sense in this case. 
+ raise NotImplementedError + + res = cls._from_sequence(scalars, dtype=dtype) + + # if there are any non-category elements in scalars, these will be + # converted to NAs in res. + mask = isna(scalars) + if not (mask == res.isna()).all(): + # Some non-category element in scalars got converted to NA in res. + raise ValueError + return res + @overload def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ... @@ -1609,7 +1636,9 @@ # ------------------------------------------------------------- @ravel_compat - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ The numpy array interface. @@ -1819,7 +1848,7 @@ return arr._from_backing_data(backing) - def _internal_get_values(self): + def _internal_get_values(self) -> ArrayLike: """ Return the values. @@ -1827,15 +1856,19 @@ Returns ------- - np.ndarray or Index - A numpy array of the same dtype as categorical.categories.dtype or - Index if datetime / periods. + np.ndarray or ExtensionArray + A numpy array or ExtensionArray of the same dtype as + categorical.categories.dtype. """ # if we are a datetime and period index, return Index to keep metadata if needs_i8_conversion(self.categories.dtype): - return self.categories.take(self._codes, fill_value=NaT) + return self.categories.take(self._codes, fill_value=NaT)._values elif is_integer_dtype(self.categories.dtype) and -1 in self._codes: - return self.categories.astype("object").take(self._codes, fill_value=np.nan) + return ( + self.categories.astype("object") + .take(self._codes, fill_value=np.nan) + ._values + ) return np.array(self) def check_for_ordered(self, op) -> None: @@ -2144,24 +2177,9 @@ # Rendering Methods def _formatter(self, boxed: bool = False): - # Defer to CategoricalFormatter's formatter. + # Returning None here will cause format_array to do inference. return None - def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str: - """ - a short repr displaying only max_vals and an optional (but default - footer) - """ - num = max_vals // 2 - head = self[:num]._get_repr(length=False, footer=False) - tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) - - result = f"{head[:-1]}, ..., {tail[1:]}" - if footer: - result = f"{result}\n{self._repr_footer()}" - - return str(result) - def _repr_categories(self) -> list[str]: """ return the base repr for the categories @@ -2178,17 +2196,17 @@ ) if len(self.categories) > max_categories: num = max_categories // 2 - head = format_array(self.categories[:num]) - tail = format_array(self.categories[-num:]) + head = format_array(self.categories[:num]._values) + tail = format_array(self.categories[-num:]._values) category_strs = head + ["..."] + tail else: - category_strs = format_array(self.categories) + category_strs = format_array(self.categories._values) # Strip all leading spaces, which format_array adds for columns... category_strs = [x.strip() for x in category_strs] return category_strs - def _repr_categories_info(self) -> str: + def _get_repr_footer(self) -> str: """ Returns a string representation of the footer. """ @@ -2217,33 +2235,49 @@ # replace to simple save space by return f"{levheader}[{levstring.replace(' < ... < ', ' ... 
')}]" - def _repr_footer(self) -> str: - info = self._repr_categories_info() - return f"Length: {len(self)}\n{info}" - - def _get_repr( - self, length: bool = True, na_rep: str = "NaN", footer: bool = True - ) -> str: + def _get_values_repr(self) -> str: from pandas.io.formats import format as fmt - formatter = fmt.CategoricalFormatter( - self, length=length, na_rep=na_rep, footer=footer + assert len(self) > 0 + + vals = self._internal_get_values() + fmt_values = fmt.format_array( + vals, + None, + float_format=None, + na_rep="NaN", + quoting=QUOTE_NONNUMERIC, ) - result = formatter.to_string() - return str(result) + + fmt_values = [i.strip() for i in fmt_values] + joined = ", ".join(fmt_values) + result = "[" + joined + "]" + return result def __repr__(self) -> str: """ String representation. """ - _maxlen = 10 - if len(self._codes) > _maxlen: - result = self._tidy_repr(_maxlen) - elif len(self._codes) > 0: - result = self._get_repr(length=len(self) > _maxlen) - else: - msg = self._get_repr(length=False, footer=True).replace("\n", ", ") - result = f"[], {msg}" + footer = self._get_repr_footer() + length = len(self) + max_len = 10 + if length > max_len: + # In long cases we do not display all entries, so we add Length + # information to the __repr__. + num = max_len // 2 + head = self[:num]._get_values_repr() + tail = self[-(max_len - num) :]._get_values_repr() + body = f"{head[:-1]}, ..., {tail[1:]}" + length_info = f"Length: {len(self)}" + result = f"{body}\n{length_info}\n{footer}" + elif length > 0: + body = self._get_values_repr() + result = f"{body}\n{footer}" + else: + # In the empty case we use a comma instead of newline to get + # a more compact __repr__ + body = "[]" + result = f"{body}, {footer}" return result @@ -2410,7 +2444,7 @@ # ------------------------------------------------------------------ # ExtensionArray Interface - def unique(self): + def unique(self) -> Self: """ Return the ``Categorical`` which ``categories`` and ``codes`` are unique. @@ -2548,7 +2582,7 @@ return result - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: """ Check whether `values` are contained in Categorical. @@ -2558,7 +2592,7 @@ Parameters ---------- - values : set or list-like + values : np.ndarray or ExtensionArray The sequence of values to test. Passing in a single string will raise a ``TypeError``. Instead, turn a single string into a list of one element. @@ -2589,21 +2623,16 @@ >>> s.isin(['lama']) array([ True, False, True, False, True, False]) """ - if not is_list_like(values): - values_type = type(values).__name__ - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a `{values_type}`" - ) - values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) - code_values = self.categories.get_indexer(values) + code_values = self.categories.get_indexer_for(values) code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) def _replace(self, *, to_replace, value, inplace: bool = False): from pandas import Index + orig_dtype = self.dtype + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() @@ -2634,6 +2663,17 @@ new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered) NDArrayBacked.__init__(cat, new_codes, new_dtype) + if new_dtype != orig_dtype: + warnings.warn( + # GH#55147 + "The behavior of Series.replace (and DataFrame.replace) with " + "CategoricalDtype is deprecated. 
In a future version, replace " + "will only be used for cases that preserve the categories. " + "To change the categories, use ser.cat.rename_categories " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) if not inplace: return cat @@ -2679,12 +2719,22 @@ dtype = self.dtype if how in ["sum", "prod", "cumsum", "cumprod", "skew"]: raise TypeError(f"{dtype} type does not support {how} operations") - if how in ["min", "max", "rank"] and not dtype.ordered: + if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered: # raise TypeError instead of NotImplementedError to ensure we # don't go down a group-by-group path, since in the empty-groups # case that would fail to raise raise TypeError(f"Cannot perform {how} with non-ordered Categorical") - if how not in ["rank", "any", "all", "first", "last", "min", "max"]: + if how not in [ + "rank", + "any", + "all", + "first", + "last", + "min", + "max", + "idxmin", + "idxmax", + ]: if kind == "transform": raise TypeError(f"{dtype} type does not support {how} operations") raise TypeError(f"{dtype} dtype does not support aggregation '{how}'") @@ -2694,7 +2744,7 @@ if how == "rank": assert self.ordered # checked earlier npvalues = self._ndarray - elif how in ["first", "last", "min", "max"]: + elif how in ["first", "last", "min", "max", "idxmin", "idxmax"]: npvalues = self._ndarray result_mask = np.zeros(ngroups, dtype=bool) else: @@ -2856,9 +2906,7 @@ if not isinstance(data.dtype, CategoricalDtype): raise AttributeError("Can only use .cat accessor with a 'category' dtype") - # error: Signature of "_delegate_property_get" incompatible with supertype - # "PandasDelegate" - def _delegate_property_get(self, name: str): # type: ignore[override] + def _delegate_property_get(self, name: str): return getattr(self._parent, name) # error: Signature of "_delegate_property_set" incompatible with supertype @@ -2898,17 +2946,15 @@ # utility routines -def _get_codes_for_values(values, categories: Index) -> np.ndarray: +def _get_codes_for_values( + values: Index | Series | ExtensionArray | np.ndarray, + categories: Index, +) -> np.ndarray: """ utility routine to turn values into codes given the specified categories If `values` is known to be a Categorical, use recode_for_categories instead. 
""" - if values.ndim > 1: - flat = values.ravel() - codes = _get_codes_for_values(flat, categories) - return codes.reshape(values.shape) - codes = categories.get_indexer_for(values) return coerce_indexer_dtype(codes, categories) diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/datetimelike.py pandas-2.2.2+dfsg/pandas/core/arrays/datetimelike.py --- pandas-2.1.4+dfsg/pandas/core/arrays/datetimelike.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/datetimelike.py 2024-04-10 17:42:52.000000000 +0000 @@ -35,12 +35,13 @@ Tick, Timedelta, Timestamp, + add_overflowsafe, astype_overflowsafe, - delta_to_nanoseconds, get_unit_from_dtype, iNaT, ints_to_pydatetime, ints_to_pytimedelta, + periods_per_day, to_offset, ) from pandas._libs.tslibs.fields import ( @@ -48,6 +49,7 @@ round_nsint64, ) from pandas._libs.tslibs.np_datetime import compare_mismatched_resolutions +from pandas._libs.tslibs.timedeltas import get_unit_for_round from pandas._libs.tslibs.timestamps import integer_op_not_supported from pandas._typing import ( ArrayLike, @@ -80,6 +82,7 @@ ) from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( is_all_strings, is_integer_dtype, @@ -89,6 +92,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import ( + ArrowDtype, CategoricalDtype, DatetimeTZDtype, ExtensionDtype, @@ -110,7 +114,6 @@ ops, ) from pandas.core.algorithms import ( - checked_add_with_arr, isin, map_array, unique1d, @@ -267,7 +270,7 @@ Examples -------- - >>> arr = pd.arrays.DatetimeArray(np.array(['1970-01-01'], 'datetime64[ns]')) + >>> arr = pd.array(np.array(['1970-01-01'], 'datetime64[ns]')) >>> arr._unbox_scalar(arr[0]) numpy.datetime64('1970-01-01T00:00:00.000000000') """ @@ -348,7 +351,9 @@ # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): return np.array(list(self), dtype=object) @@ -598,7 +603,9 @@ raise TypeError(msg) elif isinstance(value, self._recognized_scalars): - value = self._scalar_type(value) + # error: Argument 1 to "Timestamp" has incompatible type "object"; expected + # "integer[Any] | float | str | date | datetime | datetime64" + value = self._scalar_type(value) # type: ignore[arg-type] else: msg = self._validation_error_message(value, allow_listlike) @@ -626,20 +633,27 @@ ------- str """ + if hasattr(value, "dtype") and getattr(value, "ndim", 0) > 0: + msg_got = f"{value.dtype} array" + else: + msg_got = f"'{type(value).__name__}'" if allow_listlike: msg = ( f"value should be a '{self._scalar_type.__name__}', 'NaT', " - f"or array of those. Got '{type(value).__name__}' instead." + f"or array of those. Got {msg_got} instead." ) else: msg = ( f"value should be a '{self._scalar_type.__name__}' or 'NaT'. " - f"Got '{type(value).__name__}' instead." + f"Got {msg_got} instead." 
) return msg def _validate_listlike(self, value, allow_object: bool = False): if isinstance(value, type(self)): + if self.dtype.kind in "mM" and not allow_object: + # error: "DatetimeLikeArrayMixin" has no attribute "as_unit" + value = value.as_unit(self.unit, round_ok=False) # type: ignore[attr-defined] return value if isinstance(value, list) and len(value) == 0: @@ -688,6 +702,9 @@ msg = self._validation_error_message(value, True) raise TypeError(msg) + if self.dtype.kind in "mM" and not allow_object: + # error: "DatetimeLikeArrayMixin" has no attribute "as_unit" + value = value.as_unit(self.unit, round_ok=False) # type: ignore[attr-defined] return value def _validate_setitem_value(self, value): @@ -728,26 +745,25 @@ else: return result.array - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: """ Compute boolean array of whether each value is found in the passed set of values. Parameters ---------- - values : set or sequence of values + values : np.ndarray or ExtensionArray Returns ------- ndarray[bool] """ - if not hasattr(values, "dtype"): - values = np.asarray(values) - if values.dtype.kind in "fiuc": # TODO: de-duplicate with equals, validate_comparison_value return np.zeros(self.shape, dtype=bool) + values = ensure_wrapped_if_datetimelike(values) + if not isinstance(values, type(self)): inferable = [ "timedelta", @@ -758,6 +774,14 @@ "period", ] if values.dtype == object: + values = lib.maybe_convert_objects( + values, # type: ignore[arg-type] + convert_non_numeric=True, + dtype_if_all_nat=self.dtype, + ) + if values.dtype != object: + return self.isin(values) + inferred = lib.infer_dtype(values, skipna=False) if inferred not in inferable: if inferred == "string": @@ -772,18 +796,36 @@ values = type(self)._from_sequence(values) except ValueError: return isin(self.astype(object), values) + else: + warnings.warn( + # GH#53111 + f"The behavior of 'isin' with dtype={self.dtype} and " + "castable values (e.g. strings) is deprecated. In a " + "future version, these will not be considered matching " + "by isin. Explicitly cast to the appropriate dtype before " + "calling isin instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) if self.dtype.kind in "mM": self = cast("DatetimeArray | TimedeltaArray", self) - values = values.as_unit(self.unit) + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "as_unit" + values = values.as_unit(self.unit) # type: ignore[union-attr] try: - self._check_compatible_with(values) + # error: Argument 1 to "_check_compatible_with" of "DatetimeLikeArrayMixin" + # has incompatible type "ExtensionArray | ndarray[Any, Any]"; expected + # "Period | Timestamp | Timedelta | NaTType" + self._check_compatible_with(values) # type: ignore[arg-type] except (TypeError, ValueError): # Includes tzawareness mismatch and IncompatibleFrequencyError return np.zeros(self.shape, dtype=bool) - return isin(self.asi8, values.asi8) + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "asi8" + return isin(self.asi8, values.asi8) # type: ignore[union-attr] # ------------------------------------------------------------------ # Null Handling @@ -1007,7 +1049,7 @@ self, other ) -> tuple[int | npt.NDArray[np.int64], None | npt.NDArray[np.bool_]]: """ - Get the int64 values and b_mask to pass to checked_add_with_arr. + Get the int64 values and b_mask to pass to add_overflowsafe. 
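Note on the isin hunk: the new FutureWarning (GH#53111) fires when the passed values are merely castable to the datetime-like dtype, e.g. strings. A hedged illustration of the migration it asks for (not part of the patch):

    >>> ser = pd.Series(pd.to_datetime(["2020-01-01", "2020-01-02"]))
    >>> ser.isin(["2020-01-01"])                   # deprecated: emits FutureWarning
    >>> ser.isin(pd.to_datetime(["2020-01-01"]))   # explicit cast, no warning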
""" if isinstance(other, Period): i8values = other.ordinal @@ -1063,9 +1105,7 @@ self = cast("TimedeltaArray", self) other_i8, o_mask = self._get_i8_values_and_mask(other) - result = checked_add_with_arr( - self.asi8, other_i8, arr_mask=self._isnan, b_mask=o_mask - ) + result = add_overflowsafe(self.asi8, np.asarray(other_i8, dtype="i8")) res_values = result.view(f"M8[{self.unit}]") dtype = tz_to_dtype(tz=other.tz, unit=self.unit) @@ -1128,9 +1168,7 @@ raise type(err)(new_message) from err other_i8, o_mask = self._get_i8_values_and_mask(other) - res_values = checked_add_with_arr( - self.asi8, -other_i8, arr_mask=self._isnan, b_mask=o_mask - ) + res_values = add_overflowsafe(self.asi8, np.asarray(-other_i8, dtype="i8")) res_m8 = res_values.view(f"timedelta64[{self.unit}]") new_freq = self._get_arithmetic_result_freq(other) @@ -1196,9 +1234,7 @@ self = cast("DatetimeArray | TimedeltaArray", self) other_i8, o_mask = self._get_i8_values_and_mask(other) - new_values = checked_add_with_arr( - self.asi8, other_i8, arr_mask=self._isnan, b_mask=o_mask - ) + new_values = add_overflowsafe(self.asi8, np.asarray(other_i8, dtype="i8")) res_values = new_values.view(self._ndarray.dtype) new_freq = self._get_arithmetic_result_freq(other) @@ -1266,9 +1302,7 @@ self._check_compatible_with(other) other_i8, o_mask = self._get_i8_values_and_mask(other) - new_i8_data = checked_add_with_arr( - self.asi8, -other_i8, arr_mask=self._isnan, b_mask=o_mask - ) + new_i8_data = add_overflowsafe(self.asi8, np.asarray(-other_i8, dtype="i8")) new_data = np.array([self.freq.base * x for x in new_i8_data]) if o_mask is None: @@ -1378,7 +1412,7 @@ if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray(result) + return TimedeltaArray._from_sequence(result) return result def __radd__(self, other): @@ -1438,7 +1472,7 @@ if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray(result) + return TimedeltaArray._from_sequence(result) return result def __rsub__(self, other): @@ -1457,7 +1491,7 @@ # Avoid down-casting DatetimeIndex from pandas.core.arrays import DatetimeArray - other = DatetimeArray(other) + other = DatetimeArray._from_sequence(other) return other - self elif self.dtype.kind == "M" and hasattr(other, "dtype") and not other_is_dt64: # GH#19959 datetime - datetime is well-defined as timedelta, @@ -1694,7 +1728,7 @@ self = cast("DatetimeArray | TimedeltaArray", self) new_dtype = f"m8[{self.unit}]" res_values = res_values.view(new_dtype) - return TimedeltaArray(res_values) + return TimedeltaArray._simple_new(res_values, dtype=res_values.dtype) res_values = res_values.view(self._ndarray.dtype) return self._from_backing_data(res_values) @@ -1818,17 +1852,17 @@ >>> rng DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', '2018-01-01 12:01:00'], - dtype='datetime64[ns]', freq='T') + dtype='datetime64[ns]', freq='min') """ -_round_example = """>>> rng.round('H') +_round_example = """>>> rng.round('h') DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', '2018-01-01 12:00:00'], dtype='datetime64[ns]', freq=None) **Series** - >>> pd.Series(rng).dt.round("H") + >>> pd.Series(rng).dt.round("h") 0 2018-01-01 12:00:00 1 2018-01-01 12:00:00 2 2018-01-01 12:00:00 @@ -1839,23 +1873,23 @@ >>> rng_tz = pd.DatetimeIndex(["2021-10-31 03:30:00"], tz="Europe/Amsterdam") - >>> rng_tz.floor("2H", ambiguous=False) + >>> rng_tz.floor("2h", ambiguous=False) 
DatetimeIndex(['2021-10-31 02:00:00+01:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) - >>> rng_tz.floor("2H", ambiguous=True) + >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) """ -_floor_example = """>>> rng.floor('H') +_floor_example = """>>> rng.floor('h') DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', '2018-01-01 12:00:00'], dtype='datetime64[ns]', freq=None) **Series** - >>> pd.Series(rng).dt.floor("H") + >>> pd.Series(rng).dt.floor("h") 0 2018-01-01 11:00:00 1 2018-01-01 12:00:00 2 2018-01-01 12:00:00 @@ -1866,23 +1900,23 @@ >>> rng_tz = pd.DatetimeIndex(["2021-10-31 03:30:00"], tz="Europe/Amsterdam") - >>> rng_tz.floor("2H", ambiguous=False) + >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) - >>> rng_tz.floor("2H", ambiguous=True) + >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) """ -_ceil_example = """>>> rng.ceil('H') +_ceil_example = """>>> rng.ceil('h') DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', '2018-01-01 13:00:00'], dtype='datetime64[ns]', freq=None) **Series** - >>> pd.Series(rng).dt.ceil("H") + >>> pd.Series(rng).dt.ceil("h") 0 2018-01-01 12:00:00 1 2018-01-01 12:00:00 2 2018-01-01 13:00:00 @@ -1893,11 +1927,11 @@ >>> rng_tz = pd.DatetimeIndex(["2021-10-31 01:30:00"], tz="Europe/Amsterdam") - >>> rng_tz.ceil("H", ambiguous=False) + >>> rng_tz.ceil("h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) - >>> rng_tz.ceil("H", ambiguous=True) + >>> rng_tz.ceil("h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], dtype='datetime64[ns, Europe/Amsterdam]', freq=None) """ @@ -1913,6 +1947,16 @@ def __init__( self, values, dtype=None, freq=lib.no_default, copy: bool = False ) -> None: + warnings.warn( + # GH#55623 + f"{type(self).__name__}.__init__ is deprecated and will be " + "removed in a future version. Use pd.array instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if dtype is not None: + dtype = pandas_dtype(dtype) + values = extract_array(values, extract_numpy=True) if isinstance(values, IntegerArray): values = values.to_numpy("int64", na_value=iNaT) @@ -1929,15 +1973,13 @@ freq = values.freq elif freq and values.freq: freq = to_offset(freq) - freq, _ = validate_inferred_freq(freq, values.freq, False) + freq = _validate_inferred_freq(freq, values.freq) - if dtype is not None: - dtype = pandas_dtype(dtype) - if dtype != values.dtype: - # TODO: we only have tests for this for DTA, not TDA (2022-07-01) - raise TypeError( - f"dtype={dtype} does not match data dtype {values.dtype}" - ) + if dtype is not None and dtype != values.dtype: + # TODO: we only have tests for this for DTA, not TDA (2022-07-01) + raise TypeError( + f"dtype={dtype} does not match data dtype {values.dtype}" + ) dtype = values.dtype values = values._ndarray @@ -1947,6 +1989,8 @@ dtype = values.dtype else: dtype = self._default_dtype + if isinstance(values, np.ndarray) and values.dtype == "i8": + values = values.view(dtype) if not isinstance(values, np.ndarray): raise ValueError( @@ -1961,7 +2005,15 @@ # for compat with datetime/timedelta/period shared methods, # we can sometimes get here with int64 values. 
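Note on the __init__ hunk (GH#55623): constructing DatetimeArray/TimedeltaArray directly is deprecated in favour of pd.array. A hedged illustration of the replacement (warning text abbreviated):

    >>> pd.arrays.DatetimeArray(np.array(["2020-01-01"], dtype="datetime64[ns]"))  # doctest: +SKIP
    FutureWarning: DatetimeArray.__init__ is deprecated ... Use pd.array instead.
    >>> pd.array(np.array(["2020-01-01"], dtype="datetime64[ns]"))
    <DatetimeArray>
    ['2020-01-01 00:00:00']
    Length: 1, dtype: datetime64[ns]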
These represent # nanosecond UTC (or tz-naive) unix timestamps - values = values.view(self._default_dtype) + if dtype is None: + dtype = self._default_dtype + values = values.view(self._default_dtype) + elif lib.is_np_dtype(dtype, "mM"): + values = values.view(dtype) + elif isinstance(dtype, DatetimeTZDtype): + kind = self._default_dtype.kind + new_dtype = f"{kind}8[{dtype.unit}]" + values = values.view(new_dtype) dtype = self._validate_dtype(values, dtype) @@ -2008,8 +2060,42 @@ self._freq = value + @final + def _maybe_pin_freq(self, freq, validate_kwds: dict): + """ + Constructor helper to pin the appropriate `freq` attribute. Assumes + that self._freq is currently set to any freq inferred in + _from_sequence_not_strict. + """ + if freq is None: + # user explicitly passed None -> override any inferred_freq + self._freq = None + elif freq == "infer": + # if self._freq is *not* None then we already inferred a freq + # and there is nothing left to do + if self._freq is None: + # Set _freq directly to bypass duplicative _validate_frequency + # check. + self._freq = to_offset(self.inferred_freq) + elif freq is lib.no_default: + # user did not specify anything, keep inferred freq if the original + # data had one, otherwise do nothing + pass + elif self._freq is None: + # We cannot inherit a freq from the data, so we need to validate + # the user-passed freq + freq = to_offset(freq) + type(self)._validate_frequency(self, freq, **validate_kwds) + self._freq = freq + else: + # Otherwise we just need to check that the user-passed freq + # doesn't conflict with the one we already have. + freq = to_offset(freq) + _validate_inferred_freq(freq, self._freq) + + @final @classmethod - def _validate_frequency(cls, index, freq, **kwargs): + def _validate_frequency(cls, index, freq: BaseOffset, **kwargs): """ Validate that a frequency is compatible with the values of a given Datetime Array/Index or Timedelta Array/Index @@ -2052,7 +2138,9 @@ ) from err @classmethod - def _generate_range(cls, start, end, periods, freq, *args, **kwargs) -> Self: + def _generate_range( + cls, start, end, periods: int | None, freq, *args, **kwargs + ) -> Self: raise AbstractMethodError(cls) # -------------------------------------------------------------- @@ -2068,12 +2156,12 @@ # "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]" return dtype_to_unit(self.dtype) # type: ignore[arg-type] - def as_unit(self, unit: str) -> Self: + def as_unit(self, unit: str, round_ok: bool = True) -> Self: if unit not in ["s", "ms", "us", "ns"]: raise ValueError("Supported units are 's', 'ms', 'us', 'ns'") dtype = np.dtype(f"{self.dtype.kind}8[{unit}]") - new_values = astype_overflowsafe(self._ndarray, dtype, round_ok=True) + new_values = astype_overflowsafe(self._ndarray, dtype, round_ok=round_ok) if isinstance(self.dtype, np.dtype): new_dtype = new_values.dtype @@ -2124,9 +2212,7 @@ values = self.view("i8") values = cast(np.ndarray, values) - offset = to_offset(freq) - offset.nanos # raises on non-fixed frequencies - nanos = delta_to_nanoseconds(offset, self._creso) + nanos = get_unit_for_round(freq, self._creso) if nanos == 0: # GH 52761 return self.copy() @@ -2265,8 +2351,7 @@ return new_obj def copy(self, order: str = "C") -> Self: - # error: Unexpected keyword argument "order" for "copy" - new_obj = super().copy(order=order) # type: ignore[call-arg] + new_obj = super().copy(order=order) new_obj._freq = self.freq return new_obj @@ -2308,18 +2393,45 @@ return self return type(self)._simple_new(out_data, dtype=self.dtype) + # 
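Note on _maybe_pin_freq: it centralizes the four freq cases (explicit None, "infer", lib.no_default, explicit offset) that the array/index constructors previously handled inline. A hedged illustration of the "infer" branch as seen from DatetimeIndex (offset repr may vary by version):

    >>> dti = pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"], freq="infer")
    >>> dti.freq
    <Day>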
-------------------------------------------------------------- + # Unsorted + + @property + def _is_dates_only(self) -> bool: + """ + Check if we are round times at midnight (and no timezone), which will + be given a more compact __repr__ than other cases. For TimedeltaArray + we are checking for multiples of 24H. + """ + if not lib.is_np_dtype(self.dtype): + # i.e. we have a timezone + return False + + values_int = self.asi8 + consider_values = values_int != iNaT + reso = get_unit_from_dtype(self.dtype) + ppd = periods_per_day(reso) + + # TODO: can we reuse is_date_array_normalized? would need a skipna kwd + # (first attempt at this was less performant than this implementation) + even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0 + return even_days + # ------------------------------------------------------------------- # Shared Constructor Helpers -def ensure_arraylike_for_datetimelike(data, copy: bool, cls_name: str): +def ensure_arraylike_for_datetimelike( + data, copy: bool, cls_name: str +) -> tuple[ArrayLike, bool]: if not hasattr(data, "dtype"): # e.g. list, tuple if not isinstance(data, (list, tuple)) and np.ndim(data) == 0: # i.e. generator data = list(data) - data = np.asarray(data) + + data = construct_1d_object_array_from_listlike(data) copy = False elif isinstance(data, ABCMultiIndex): raise TypeError(f"Cannot create a {cls_name} from a MultiIndex.") @@ -2379,15 +2491,23 @@ """ if periods is not None: if lib.is_float(periods): + warnings.warn( + # GH#56036 + "Non-integer 'periods' in pd.date_range, pd.timedelta_range, " + "pd.period_range, and pd.interval_range are deprecated and " + "will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) periods = int(periods) elif not lib.is_integer(periods): raise TypeError(f"periods must be a number, got {periods}") return periods -def validate_inferred_freq( - freq, inferred_freq, freq_infer -) -> tuple[BaseOffset | None, bool]: +def _validate_inferred_freq( + freq: BaseOffset | None, inferred_freq: BaseOffset | None +) -> BaseOffset | None: """ If the user passes a freq and another freq is inferred from passed data, require that they match. @@ -2396,17 +2516,10 @@ ---------- freq : DateOffset or None inferred_freq : DateOffset or None - freq_infer : bool Returns ------- freq : DateOffset or None - freq_infer : bool - - Notes - ----- - We assume at this point that `maybe_infer_freq` has been called, so - `freq` is either a DateOffset object or None. """ if inferred_freq is not None: if freq is not None and freq != inferred_freq: @@ -2417,40 +2530,11 @@ ) if freq is None: freq = inferred_freq - freq_infer = False - - return freq, freq_infer - - -def maybe_infer_freq(freq): - """ - Comparing a DateOffset to the string "infer" raises, so we need to - be careful about comparisons. Make a dummy variable `freq_infer` to - signify the case where the given freq is "infer" and set freq to None - to avoid comparison trouble later on. - - Parameters - ---------- - freq : {DateOffset, None, str} - Returns - ------- - freq : {DateOffset, None} - freq_infer : bool - Whether we should inherit the freq of passed data. 
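Note on the validate_periods hunk (GH#56036): float periods now warn before being truncated to int. A hedged illustration (warning text abbreviated):

    >>> pd.date_range("2020-01-01", periods=3.0, freq="D")  # doctest: +SKIP
    FutureWarning: Non-integer 'periods' in pd.date_range, pd.timedelta_range,
    pd.period_range, and pd.interval_range are deprecated and will raise in a
    future version.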
- """ - freq_infer = False - if not isinstance(freq, BaseOffset): - # if a passed freq is None, don't infer automatically - if freq != "infer": - freq = to_offset(freq) - else: - freq_infer = True - freq = None - return freq, freq_infer + return freq -def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str: +def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype | ArrowDtype) -> str: """ Return the unit str corresponding to the dtype's resolution. @@ -2465,4 +2549,8 @@ """ if isinstance(dtype, DatetimeTZDtype): return dtype.unit + elif isinstance(dtype, ArrowDtype): + if dtype.kind not in "mM": + raise ValueError(f"{dtype=} does not have a resolution.") + return dtype.pyarrow_dtype.unit return np.datetime_data(dtype)[0] diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/datetimes.py pandas-2.2.2+dfsg/pandas/core/arrays/datetimes.py --- pandas-2.1.4+dfsg/pandas/core/arrays/datetimes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/datetimes.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,7 @@ from typing import ( TYPE_CHECKING, cast, + overload, ) import warnings @@ -26,14 +27,13 @@ astype_overflowsafe, fields, get_resolution, - get_supported_reso, + get_supported_dtype, get_unit_from_dtype, ints_to_pydatetime, is_date_array_normalized, - is_supported_unit, + is_supported_dtype, is_unitless, normalize_i8_timestamps, - npy_unit_to_abbrev, timezones, to_offset, tz_convert_from_utc, @@ -73,7 +73,9 @@ from collections.abc import Iterator from pandas._typing import ( + ArrayLike, DateTimeErrorChoices, + DtypeObj, IntervalClosedType, Self, TimeAmbiguous, @@ -85,6 +87,19 @@ from pandas.core.arrays import PeriodArray +_ITER_CHUNKSIZE = 10_000 + + +@overload +def tz_to_dtype(tz: tzinfo, unit: str = ...) -> DatetimeTZDtype: + ... + + +@overload +def tz_to_dtype(tz: None, unit: str = ...) -> np.dtype[np.datetime64]: + ... + + def tz_to_dtype( tz: tzinfo | None, unit: str = "ns" ) -> np.dtype[np.datetime64] | DatetimeTZDtype: @@ -184,8 +199,8 @@ Examples -------- - >>> pd.arrays.DatetimeArray(pd.DatetimeIndex(['2023-01-01', '2023-01-02']), - ... freq='D') + >>> pd.arrays.DatetimeArray._from_sequence( + ... pd.DatetimeIndex(['2023-01-01', '2023-01-02'], freq='D')) ['2023-01-01 00:00:00', '2023-01-02 00:00:00'] Length: 2, dtype: datetime64[ns] @@ -264,10 +279,25 @@ _default_dtype = DT64NS_DTYPE # used in TimeLikeOps.__init__ @classmethod + def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: + if lib.infer_dtype(scalars, skipna=True) not in ["datetime", "datetime64"]: + # TODO: require any NAs be valid-for-DTA + # TODO: if dtype is passed, check for tzawareness compat? + raise ValueError + return cls._from_sequence(scalars, dtype=dtype) + + @classmethod def _validate_dtype(cls, values, dtype): # used in TimeLikeOps.__init__ - _validate_dt64_dtype(values.dtype) dtype = _validate_dt64_dtype(dtype) + _validate_dt64_dtype(values.dtype) + if isinstance(dtype, np.dtype): + if values.dtype != dtype: + raise ValueError("Values resolution does not match dtype.") + else: + vunit = np.datetime_data(values.dtype)[0] + if vunit != dtype.unit: + raise ValueError("Values resolution does not match dtype.") return dtype # error: Signature of "_simple_new" incompatible with supertype "NDArrayBacked" @@ -308,13 +338,10 @@ dayfirst: bool = False, yearfirst: bool = False, ambiguous: TimeAmbiguous = "raise", - ): + ) -> Self: """ A non-strict version of _from_sequence, called from DatetimeIndex.__new__. 
""" - explicit_none = freq is None - freq = freq if freq is not lib.no_default else None - freq, freq_infer = dtl.maybe_infer_freq(freq) # if the user either explicitly passes tz=None or a tz-naive dtype, we # disallows inferring a tz. @@ -330,13 +357,16 @@ unit = None if dtype is not None: - if isinstance(dtype, np.dtype): - unit = np.datetime_data(dtype)[0] - else: - # DatetimeTZDtype - unit = dtype.unit + unit = dtl.dtype_to_unit(dtype) + + data, copy = dtl.ensure_arraylike_for_datetimelike( + data, copy, cls_name="DatetimeArray" + ) + inferred_freq = None + if isinstance(data, DatetimeArray): + inferred_freq = data.freq - subarr, tz, inferred_freq = _sequence_to_dt64ns( + subarr, tz = _sequence_to_dt64( data, copy=copy, tz=tz, @@ -353,36 +383,23 @@ "Use obj.tz_localize(None) instead." ) - freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer) - if explicit_none: - freq = None - data_unit = np.datetime_data(subarr.dtype)[0] data_dtype = tz_to_dtype(tz, data_unit) - result = cls._simple_new(subarr, freq=freq, dtype=data_dtype) + result = cls._simple_new(subarr, freq=inferred_freq, dtype=data_dtype) if unit is not None and unit != result.unit: # If unit was specified in user-passed dtype, cast to it here result = result.as_unit(unit) - if inferred_freq is None and freq is not None: - # this condition precludes `freq_infer` - cls._validate_frequency(result, freq, ambiguous=ambiguous) - - elif freq_infer: - # Set _freq directly to bypass duplicative _validate_frequency - # check. - result._freq = to_offset(result.inferred_freq) - + validate_kwds = {"ambiguous": ambiguous} + result._maybe_pin_freq(freq, validate_kwds) return result - # error: Signature of "_generate_range" incompatible with supertype - # "DatetimeLikeArrayMixin" @classmethod - def _generate_range( # type: ignore[override] + def _generate_range( cls, start, end, - periods, + periods: int | None, freq, tz=None, normalize: bool = False, @@ -418,9 +435,9 @@ else: unit = "ns" - if start is not None and unit is not None: + if start is not None: start = start.as_unit(unit, round_ok=False) - if end is not None and unit is not None: + if end is not None: end = end.as_unit(unit, round_ok=False) left_inclusive, right_inclusive = validate_inclusive(inclusive) @@ -429,14 +446,8 @@ if tz is not None: # Localize the start and end arguments - start_tz = None if start is None else start.tz - end_tz = None if end is None else end.tz - start = _maybe_localize_point( - start, start_tz, start, freq, tz, ambiguous, nonexistent - ) - end = _maybe_localize_point( - end, end_tz, end, freq, tz, ambiguous, nonexistent - ) + start = _maybe_localize_point(start, freq, tz, ambiguous, nonexistent) + end = _maybe_localize_point(end, freq, tz, ambiguous, nonexistent) if freq is not None: # We break Day arithmetic (fixed 24 hour) here and opt for @@ -482,6 +493,7 @@ # Nanosecond-granularity timestamps aren't always correctly # representable with doubles, so we limit the range that we # pass to np.linspace as much as possible + periods = cast(int, periods) i8values = ( np.linspace(0, end._value - start._value, periods, dtype="int64") + start._value @@ -540,7 +552,7 @@ # error: Return type "Union[dtype, DatetimeTZDtype]" of "dtype" # incompatible with return type "ExtensionDtype" in supertype # "ExtensionArray" - def dtype(self) -> np.dtype[np.datetime64] | DatetimeTZDtype: # type: ignore[override] # noqa: E501 + def dtype(self) -> np.dtype[np.datetime64] | DatetimeTZDtype: # type: ignore[override] """ The dtype for the DatetimeArray. 
@@ -623,12 +635,12 @@ # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: if dtype is None and self.tz: # The default for tz-aware is object, to preserve tz info dtype = object - return super().__array__(dtype=dtype) + return super().__array__(dtype=dtype, copy=copy) def __iter__(self) -> Iterator: """ @@ -645,7 +657,7 @@ # convert in chunks of 10k for efficiency data = self.asi8 length = len(self) - chunksize = 10000 + chunksize = _ITER_CHUNKSIZE chunks = (length // chunksize) + 1 for i in range(chunks): @@ -694,7 +706,7 @@ self.tz is None and lib.is_np_dtype(dtype, "M") and not is_unitless(dtype) - and is_supported_unit(get_unit_from_dtype(dtype)) + and is_supported_dtype(dtype) ): # unit conversion e.g. datetime64[s] res_values = astype_overflowsafe(self._ndarray, dtype, copy=True) @@ -732,12 +744,12 @@ def _format_native_types( self, *, na_rep: str | float = "NaT", date_format=None, **kwargs ) -> npt.NDArray[np.object_]: - from pandas.io.formats.format import get_format_datetime64_from_values - - fmt = get_format_datetime64_from_values(self, date_format) + if date_format is None and self._is_dates_only: + # Only dates and no timezone: provide a default format + date_format = "%Y-%m-%d" return tslib.format_array_from_datetime( - self.asi8, tz=self.tz, format=fmt, na_rep=na_rep, reso=self._creso + self.asi8, tz=self.tz, format=date_format, na_rep=na_rep, reso=self._creso ) # ----------------------------------------------------------------- @@ -778,7 +790,7 @@ # ----------------------------------------------------------------- # Arithmetic Methods - def _add_offset(self, offset) -> Self: + def _add_offset(self, offset: BaseOffset) -> Self: assert not isinstance(offset, Tick) if self.tz is not None: @@ -787,21 +799,31 @@ values = self try: - result = offset._apply_array(values).view(values.dtype) + res_values = offset._apply_array(values._ndarray) + if res_values.dtype.kind == "i": + # error: Argument 1 to "view" of "ndarray" has incompatible type + # "dtype[datetime64] | DatetimeTZDtype"; expected + # "dtype[Any] | type[Any] | _SupportsDType[dtype[Any]]" + res_values = res_values.view(values.dtype) # type: ignore[arg-type] except NotImplementedError: warnings.warn( "Non-vectorized DateOffset being applied to Series or DatetimeIndex.", PerformanceWarning, stacklevel=find_stack_level(), ) - result = self.astype("O") + offset - result = type(self)._from_sequence(result).as_unit(self.unit) + res_values = self.astype("O") + offset + # TODO(GH#55564): as_unit will be unnecessary + result = type(self)._from_sequence(res_values).as_unit(self.unit) if not len(self): # GH#30336 _from_sequence won't be able to infer self.tz return result.tz_localize(self.tz) else: - result = type(self)._simple_new(result, dtype=result.dtype) + result = type(self)._simple_new(res_values, dtype=res_values.dtype) + if offset.normalize: + result = result.normalize() + result._freq = None + if self.tz is not None: result = result.tz_localize(self.tz) @@ -854,37 +876,37 @@ to other time zones: >>> dti = pd.date_range(start='2014-08-01 09:00', - ... freq='H', periods=3, tz='Europe/Berlin') + ... 
freq='h', periods=3, tz='Europe/Berlin') >>> dti DatetimeIndex(['2014-08-01 09:00:00+02:00', '2014-08-01 10:00:00+02:00', '2014-08-01 11:00:00+02:00'], - dtype='datetime64[ns, Europe/Berlin]', freq='H') + dtype='datetime64[ns, Europe/Berlin]', freq='h') >>> dti.tz_convert('US/Central') DatetimeIndex(['2014-08-01 02:00:00-05:00', '2014-08-01 03:00:00-05:00', '2014-08-01 04:00:00-05:00'], - dtype='datetime64[ns, US/Central]', freq='H') + dtype='datetime64[ns, US/Central]', freq='h') With the ``tz=None``, we can remove the timezone (after converting to UTC if necessary): - >>> dti = pd.date_range(start='2014-08-01 09:00', freq='H', + >>> dti = pd.date_range(start='2014-08-01 09:00', freq='h', ... periods=3, tz='Europe/Berlin') >>> dti DatetimeIndex(['2014-08-01 09:00:00+02:00', '2014-08-01 10:00:00+02:00', '2014-08-01 11:00:00+02:00'], - dtype='datetime64[ns, Europe/Berlin]', freq='H') + dtype='datetime64[ns, Europe/Berlin]', freq='h') >>> dti.tz_convert(None) DatetimeIndex(['2014-08-01 07:00:00', '2014-08-01 08:00:00', '2014-08-01 09:00:00'], - dtype='datetime64[ns]', freq='H') + dtype='datetime64[ns]', freq='h') """ tz = timezones.maybe_get_tz(tz) @@ -1039,7 +1061,7 @@ 1 2015-03-29 03:30:00+02:00 dtype: datetime64[ns, Europe/Warsaw] - >>> s.dt.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H')) + >>> s.dt.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h')) 0 2015-03-29 03:30:00+02:00 1 2015-03-29 03:30:00+02:00 dtype: datetime64[ns, Europe/Warsaw] @@ -1129,13 +1151,13 @@ Examples -------- - >>> idx = pd.date_range(start='2014-08-01 10:00', freq='H', + >>> idx = pd.date_range(start='2014-08-01 10:00', freq='h', ... periods=3, tz='Asia/Calcutta') >>> idx DatetimeIndex(['2014-08-01 10:00:00+05:30', '2014-08-01 11:00:00+05:30', '2014-08-01 12:00:00+05:30'], - dtype='datetime64[ns, Asia/Calcutta]', freq='H') + dtype='datetime64[ns, Asia/Calcutta]', freq='h') >>> idx.normalize() DatetimeIndex(['2014-08-01 00:00:00+05:30', '2014-08-01 00:00:00+05:30', @@ -1207,6 +1229,10 @@ if freq is None: freq = self.freqstr or self.inferred_freq + if isinstance(self.freq, BaseOffset) and hasattr( + self.freq, "_period_dtype_code" + ): + freq = PeriodDtype(self.freq)._freqstr if freq is None: raise ValueError( @@ -1220,7 +1246,6 @@ res = freq freq = res - return PeriodArray._from_datetime64(self._ndarray, freq, tz=self.tz) # ----------------------------------------------------------------- @@ -1245,7 +1270,7 @@ Examples -------- - >>> s = pd.Series(pd.date_range(start='2018-01', freq='M', periods=3)) + >>> s = pd.Series(pd.date_range(start='2018-01', freq='ME', periods=3)) >>> s 0 2018-01-31 1 2018-02-28 @@ -1257,10 +1282,10 @@ 2 March dtype: object - >>> idx = pd.date_range(start='2018-01', freq='M', periods=3) + >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], - dtype='datetime64[ns]', freq='M') + dtype='datetime64[ns]', freq='ME') >>> idx.month_name() Index(['January', 'February', 'March'], dtype='object') @@ -1268,11 +1293,11 @@ for example: ``idx.month_name(locale='pt_BR.utf8')`` will return month names in Brazilian Portuguese language. 
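Note on the docstring churn in this file: the examples are being updated for the renamed offset aliases in pandas 2.2 ('H' -> 'h', 'T' -> 'min', 'M' -> 'ME', 'Y' -> 'YE'); the old aliases are deprecated. A hedged illustration (warning text abbreviated):

    >>> pd.date_range("2018-01-01", periods=3, freq="ME")
    DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'],
                  dtype='datetime64[ns]', freq='ME')
    >>> pd.date_range("2018-01-01", periods=3, freq="M")  # doctest: +SKIP
    FutureWarning: 'M' is deprecated ... please use 'ME' instead.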
- >>> idx = pd.date_range(start='2018-01', freq='M', periods=3) + >>> idx = pd.date_range(start='2018-01', freq='ME', periods=3) >>> idx DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'], - dtype='datetime64[ns]', freq='M') - >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP + dtype='datetime64[ns]', freq='ME') + >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object') """ values = self._local_timestamps() @@ -1497,7 +1522,7 @@ Examples -------- >>> datetime_series = pd.Series( - ... pd.date_range("2000-01-01", periods=3, freq="Y") + ... pd.date_range("2000-01-01", periods=3, freq="YE") ... ) >>> datetime_series 0 2000-12-31 @@ -1520,7 +1545,7 @@ Examples -------- >>> datetime_series = pd.Series( - ... pd.date_range("2000-01-01", periods=3, freq="M") + ... pd.date_range("2000-01-01", periods=3, freq="ME") ... ) >>> datetime_series 0 2000-01-31 @@ -1589,7 +1614,7 @@ Examples -------- >>> datetime_series = pd.Series( - ... pd.date_range("2000-01-01", periods=3, freq="T") + ... pd.date_range("2000-01-01", periods=3, freq="min") ... ) >>> datetime_series 0 2000-01-01 00:00:00 @@ -2035,10 +2060,10 @@ This method is available on Series with datetime values under the ``.dt`` accessor, and directly on DatetimeIndex. - >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="Y") + >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="YE") >>> idx DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'], - dtype='datetime64[ns]', freq='A-DEC') + dtype='datetime64[ns]', freq='YE-DEC') >>> idx.is_leap_year array([ True, False, False]) @@ -2156,8 +2181,8 @@ # Constructor Helpers -def _sequence_to_dt64ns( - data, +def _sequence_to_dt64( + data: ArrayLike, *, copy: bool = False, tz: tzinfo | None = None, @@ -2169,7 +2194,8 @@ """ Parameters ---------- - data : list-like + data : np.ndarray or ExtensionArray + dtl.ensure_arraylike_for_datetimelike has already been called. copy : bool, default False tz : tzinfo or None, default None dayfirst : bool, default False @@ -2182,62 +2208,65 @@ Returns ------- result : numpy.ndarray - The sequence converted to a numpy array with dtype ``datetime64[ns]``. + The sequence converted to a numpy array with dtype ``datetime64[unit]``. + Where `unit` is "ns" unless specified otherwise by `out_unit`. tz : tzinfo or None Either the user-provided tzinfo or one inferred from the data. - inferred_freq : Tick or None - The inferred frequency of the sequence. Raises ------ TypeError : PeriodDType data is passed """ - inferred_freq = None - - data, copy = dtl.ensure_arraylike_for_datetimelike( - data, copy, cls_name="DatetimeArray" - ) - - if isinstance(data, DatetimeArray): - inferred_freq = data.freq # By this point we are assured to have either a numpy array or Index data, copy = maybe_convert_dtype(data, copy, tz=tz) data_dtype = getattr(data, "dtype", None) - out_dtype = DT64NS_DTYPE - if out_unit is not None: - out_dtype = np.dtype(f"M8[{out_unit}]") + if out_unit is None: + out_unit = "ns" + out_dtype = np.dtype(f"M8[{out_unit}]") if data_dtype == object or is_string_dtype(data_dtype): # TODO: We do not have tests specific to string-dtypes, # also complex or categorical or other extension + data = cast(np.ndarray, data) copy = False if lib.infer_dtype(data, skipna=False) == "integer": + # Much more performant than going through array_to_datetime data = data.astype(np.int64) elif tz is not None and ambiguous == "raise": - # TODO: yearfirst/dayfirst/etc? 
obj_data = np.asarray(data, dtype=object) - i8data = tslib.array_to_datetime_with_tz(obj_data, tz) - return i8data.view(DT64NS_DTYPE), tz, None + result = tslib.array_to_datetime_with_tz( + obj_data, + tz=tz, + dayfirst=dayfirst, + yearfirst=yearfirst, + creso=abbrev_to_npy_unit(out_unit), + ) + return result, tz else: - # data comes back here as either i8 to denote UTC timestamps - # or M8[ns] to denote wall times - data, inferred_tz = objects_to_datetime64ns( + converted, inferred_tz = objects_to_datetime64( data, dayfirst=dayfirst, yearfirst=yearfirst, allow_object=False, + out_unit=out_unit or "ns", ) + copy = False if tz and inferred_tz: # two timezones: convert to intended from base UTC repr - assert data.dtype == "i8" - # GH#42505 - # by convention, these are _already_ UTC, e.g - return data.view(DT64NS_DTYPE), tz, None + # GH#42505 by convention, these are _already_ UTC + result = converted elif inferred_tz: tz = inferred_tz + result = converted + + else: + result, _ = _construct_from_dt64_naive( + converted, tz=tz, copy=copy, ambiguous=ambiguous + ) + return result, tz data_dtype = data.dtype @@ -2245,52 +2274,27 @@ # so we need to handle these types. if isinstance(data_dtype, DatetimeTZDtype): # DatetimeArray -> ndarray + data = cast(DatetimeArray, data) tz = _maybe_infer_tz(tz, data.tz) result = data._ndarray elif lib.is_np_dtype(data_dtype, "M"): # tz-naive DatetimeArray or ndarray[datetime64] - data = getattr(data, "_ndarray", data) - new_dtype = data.dtype - data_unit = get_unit_from_dtype(new_dtype) - if not is_supported_unit(data_unit): - # Cast to the nearest supported unit, generally "s" - new_reso = get_supported_reso(data_unit) - new_unit = npy_unit_to_abbrev(new_reso) - new_dtype = np.dtype(f"M8[{new_unit}]") - data = astype_overflowsafe(data, dtype=new_dtype, copy=False) - data_unit = get_unit_from_dtype(new_dtype) - copy = False - - if data.dtype.byteorder == ">": - # TODO: better way to handle this? non-copying alternative? - # without this, test_constructor_datetime64_bigendian fails - data = data.astype(data.dtype.newbyteorder("<")) - new_dtype = data.dtype - copy = False + if isinstance(data, DatetimeArray): + data = data._ndarray - if tz is not None: - # Convert tz-naive to UTC - # TODO: if tz is UTC, are there situations where we *don't* want a - # copy? tz_localize_to_utc always makes one. - shape = data.shape - if data.ndim > 1: - data = data.ravel() - - data = tzconversion.tz_localize_to_utc( - data.view("i8"), tz, ambiguous=ambiguous, creso=data_unit - ) - data = data.view(new_dtype) - data = data.reshape(shape) - - assert data.dtype == new_dtype, data.dtype - result = data + data = cast(np.ndarray, data) + result, copy = _construct_from_dt64_naive( + data, tz=tz, copy=copy, ambiguous=ambiguous + ) else: # must be integer dtype otherwise # assume this data are epoch timestamps if data.dtype != INT64_DTYPE: data = data.astype(np.int64, copy=False) + copy = False + data = cast(np.ndarray, data) result = data.view(out_dtype) if copy: @@ -2299,17 +2303,62 @@ assert isinstance(result, np.ndarray), type(result) assert result.dtype.kind == "M" assert result.dtype != "M8" - assert is_supported_unit(get_unit_from_dtype(result.dtype)) - return result, tz, inferred_freq + assert is_supported_dtype(result.dtype) + return result, tz + + +def _construct_from_dt64_naive( + data: np.ndarray, *, tz: tzinfo | None, copy: bool, ambiguous: TimeAmbiguous +) -> tuple[np.ndarray, bool]: + """ + Convert datetime64 data to a supported dtype, localizing if necessary. 
+ """ + # Caller is responsible for ensuring + # lib.is_np_dtype(data.dtype) + + new_dtype = data.dtype + if not is_supported_dtype(new_dtype): + # Cast to the nearest supported unit, generally "s" + new_dtype = get_supported_dtype(new_dtype) + data = astype_overflowsafe(data, dtype=new_dtype, copy=False) + copy = False + + if data.dtype.byteorder == ">": + # TODO: better way to handle this? non-copying alternative? + # without this, test_constructor_datetime64_bigendian fails + data = data.astype(data.dtype.newbyteorder("<")) + new_dtype = data.dtype + copy = False + + if tz is not None: + # Convert tz-naive to UTC + # TODO: if tz is UTC, are there situations where we *don't* want a + # copy? tz_localize_to_utc always makes one. + shape = data.shape + if data.ndim > 1: + data = data.ravel() + data_unit = get_unit_from_dtype(new_dtype) + data = tzconversion.tz_localize_to_utc( + data.view("i8"), tz, ambiguous=ambiguous, creso=data_unit + ) + data = data.view(new_dtype) + data = data.reshape(shape) + + assert data.dtype == new_dtype, data.dtype + result = data + + return result, copy -def objects_to_datetime64ns( + +def objects_to_datetime64( data: np.ndarray, dayfirst, yearfirst, utc: bool = False, errors: DateTimeErrorChoices = "raise", allow_object: bool = False, + out_unit: str = "ns", ): """ Convert data to array of timestamps. @@ -2325,23 +2374,26 @@ allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. + out_unit : str, default "ns" Returns ------- result : ndarray - np.int64 dtype if returned values represent UTC timestamps - np.datetime64[ns] if returned values represent wall times + np.datetime64[out_unit] if returned values represent wall times or UTC + timestamps. object if mixed timezones inferred_tz : tzinfo or None + If not None, then the datetime64 values in `result` denote UTC timestamps. Raises ------ ValueError : if data cannot be converted to datetimes + TypeError : When a type cannot be converted to datetime """ assert errors in ["raise", "ignore", "coerce"] # if str-dtype, convert - data = np.array(data, copy=False, dtype=np.object_) + data = np.asarray(data, dtype=np.object_) result, tz_parsed = tslib.array_to_datetime( data, @@ -2349,16 +2401,14 @@ utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, + creso=abbrev_to_npy_unit(out_unit), ) if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array # is in UTC - # Return i8 values to denote unix timestamps - return result.view("i8"), tz_parsed + return result, tz_parsed elif result.dtype.kind == "M": - # returning M8[ns] denotes wall-times; since tz is None - # the distinction is a thin one return result, tz_parsed elif result.dtype == object: # GH#23675 when called via `pd.to_datetime`, returning an object-dtype @@ -2495,7 +2545,7 @@ if ( isinstance(dtype, np.dtype) - and (dtype.kind != "M" or not is_supported_unit(get_unit_from_dtype(dtype))) + and (dtype.kind != "M" or not is_supported_dtype(dtype)) ) or not isinstance(dtype, (np.dtype, DatetimeTZDtype)): raise ValueError( f"Unexpected value for 'dtype': '{dtype}'. 
" @@ -2626,7 +2676,9 @@ return start, end -def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexistent): +def _maybe_localize_point( + ts: Timestamp | None, freq, tz, ambiguous, nonexistent +) -> Timestamp | None: """ Localize a start or end Timestamp to the timezone of the corresponding start or end Timestamp @@ -2634,8 +2686,6 @@ Parameters ---------- ts : start or end Timestamp to potentially localize - is_none : argument that should be None - is_not_none : argument that should not be None freq : Tick, DateOffset, or None tz : str, timezone object or None ambiguous: str, localization behavior for ambiguous times @@ -2648,7 +2698,7 @@ # Make sure start and end are timezone localized if: # 1) freq = a Timedelta-like frequency (Tick) # 2) freq = None i.e. generating a linspaced range - if is_none is None and is_not_none is not None: + if ts is not None and ts.tzinfo is None: # Note: We can't ambiguous='infer' a singular ambiguous time; however, # we have historically defaulted ambiguous=False ambiguous = ambiguous if ambiguous != "infer" else False @@ -2748,13 +2798,7 @@ break # faster than cur + offset - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Discarding nonzero nanoseconds in conversion", - category=UserWarning, - ) - next_date = offset._apply(cur) + next_date = offset._apply(cur) next_date = next_date.as_unit(unit) if next_date <= cur: raise ValueError(f"Offset {offset} did not increment date") @@ -2769,13 +2813,7 @@ break # faster than cur + offset - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Discarding nonzero nanoseconds in conversion", - category=UserWarning, - ) - next_date = offset._apply(cur) + next_date = offset._apply(cur) next_date = next_date.as_unit(unit) if next_date >= cur: raise ValueError(f"Offset {offset} did not decrement date") diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/floating.py pandas-2.2.2+dfsg/pandas/core/arrays/floating.py --- pandas-2.1.4+dfsg/pandas/core/arrays/floating.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/floating.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import ClassVar + import numpy as np from pandas.core.dtypes.base import register_extension_dtype @@ -55,8 +57,6 @@ """ Array of floating (optional missing) values. - .. versionadded:: 1.2.0 - .. 
warning:: FloatingArray is currently experimental, and its API or internal @@ -156,14 +156,14 @@ @register_extension_dtype class Float32Dtype(FloatingDtype): type = np.float32 - name = "Float32" + name: ClassVar[str] = "Float32" __doc__ = _dtype_docstring.format(dtype="float32") @register_extension_dtype class Float64Dtype(FloatingDtype): type = np.float64 - name = "Float64" + name: ClassVar[str] = "Float64" __doc__ = _dtype_docstring.format(dtype="float64") diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/integer.py pandas-2.2.2+dfsg/pandas/core/arrays/integer.py --- pandas-2.1.4+dfsg/pandas/core/arrays/integer.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/integer.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import ClassVar + import numpy as np from pandas.core.dtypes.base import register_extension_dtype @@ -205,56 +207,56 @@ @register_extension_dtype class Int8Dtype(IntegerDtype): type = np.int8 - name = "Int8" + name: ClassVar[str] = "Int8" __doc__ = _dtype_docstring.format(dtype="int8") @register_extension_dtype class Int16Dtype(IntegerDtype): type = np.int16 - name = "Int16" + name: ClassVar[str] = "Int16" __doc__ = _dtype_docstring.format(dtype="int16") @register_extension_dtype class Int32Dtype(IntegerDtype): type = np.int32 - name = "Int32" + name: ClassVar[str] = "Int32" __doc__ = _dtype_docstring.format(dtype="int32") @register_extension_dtype class Int64Dtype(IntegerDtype): type = np.int64 - name = "Int64" + name: ClassVar[str] = "Int64" __doc__ = _dtype_docstring.format(dtype="int64") @register_extension_dtype class UInt8Dtype(IntegerDtype): type = np.uint8 - name = "UInt8" + name: ClassVar[str] = "UInt8" __doc__ = _dtype_docstring.format(dtype="uint8") @register_extension_dtype class UInt16Dtype(IntegerDtype): type = np.uint16 - name = "UInt16" + name: ClassVar[str] = "UInt16" __doc__ = _dtype_docstring.format(dtype="uint16") @register_extension_dtype class UInt32Dtype(IntegerDtype): type = np.uint32 - name = "UInt32" + name: ClassVar[str] = "UInt32" __doc__ = _dtype_docstring.format(dtype="uint32") @register_extension_dtype class UInt64Dtype(IntegerDtype): type = np.uint64 - name = "UInt64" + name: ClassVar[str] = "UInt64" __doc__ = _dtype_docstring.format(dtype="uint64") diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/interval.py pandas-2.2.2+dfsg/pandas/core/arrays/interval.py --- pandas-2.1.4+dfsg/pandas/core/arrays/interval.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/interval.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,11 +12,10 @@ Union, overload, ) +import warnings import numpy as np -from pandas._config import get_option - from pandas._libs import lib from pandas._libs.interval import ( VALID_CLOSED, @@ -80,6 +79,7 @@ unique, value_counts_internal as value_counts, ) +from pandas.core.arrays import ArrowExtensionArray from pandas.core.arrays.base import ( ExtensionArray, _extension_array_shared_docs, @@ -110,7 +110,7 @@ ) -IntervalSideT = Union[TimeArrayLike, np.ndarray] +IntervalSide = Union[TimeArrayLike, np.ndarray] IntervalOrNA = Union[Interval, float] _interval_shared_docs: dict[str, str] = {} @@ -219,8 +219,8 @@ return 1 # To make mypy recognize the fields - _left: IntervalSideT - _right: IntervalSideT + _left: IntervalSide + _right: IntervalSide _dtype: IntervalDtype # --------------------------------------------------------------------- @@ -237,8 +237,8 @@ data = extract_array(data, extract_numpy=True) if isinstance(data, cls): - left: 
IntervalSideT = data._left - right: IntervalSideT = data._right + left: IntervalSide = data._left + right: IntervalSide = data._right closed = closed or data.closed dtype = IntervalDtype(left.dtype, closed=closed) else: @@ -280,8 +280,8 @@ @classmethod def _simple_new( cls, - left: IntervalSideT, - right: IntervalSideT, + left: IntervalSide, + right: IntervalSide, dtype: IntervalDtype, ) -> Self: result = IntervalMixin.__new__(cls) @@ -299,7 +299,7 @@ closed: IntervalClosedType | None = None, copy: bool = False, dtype: Dtype | None = None, - ) -> tuple[IntervalSideT, IntervalSideT, IntervalDtype]: + ) -> tuple[IntervalSide, IntervalSide, IntervalDtype]: """Ensure correctness of input parameters for cls._simple_new.""" from pandas.core.indexes.base import ensure_index @@ -359,6 +359,11 @@ f"'{left.tz}' and '{right.tz}'" ) raise ValueError(msg) + elif needs_i8_conversion(left.dtype) and left.unit != right.unit: + # e.g. m8[s] vs m8[ms], try to cast to a common dtype GH#55714 + left_arr, right_arr = left._data._ensure_matching_resos(right._data) + left = ensure_index(left_arr) + right = ensure_index(right_arr) # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray left = ensure_wrapped_if_datetimelike(left) @@ -366,11 +371,18 @@ right = ensure_wrapped_if_datetimelike(right) right = extract_array(right, extract_numpy=True) - lbase = getattr(left, "_ndarray", left).base - rbase = getattr(right, "_ndarray", right).base - if lbase is not None and lbase is rbase: - # If these share data, then setitem could corrupt our IA - right = right.copy() + if isinstance(left, ArrowExtensionArray) or isinstance( + right, ArrowExtensionArray + ): + pass + else: + lbase = getattr(left, "_ndarray", left) + lbase = getattr(lbase, "_data", lbase).base + rbase = getattr(right, "_ndarray", right) + rbase = getattr(rbase, "_data", rbase).base + if lbase is not None and lbase is rbase: + # If these share data, then setitem could corrupt our IA + right = right.copy() dtype = IntervalDtype(left.dtype, closed=closed) @@ -388,12 +400,7 @@ @classmethod def _from_factorized(cls, values: np.ndarray, original: IntervalArray) -> Self: - if len(values) == 0: - # An empty array returns object-dtype here. We can't create - # a new IA from an (empty) object-dtype array, so turn it into the - # correct dtype. 
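Note on the GH#55714 hunk: left/right endpoints with mismatched datetime64/timedelta64 resolutions are now cast to a common unit instead of failing. A hedged illustration (the common unit is expected to be the finer of the two; exact output not taken from the patch):

    >>> left = pd.to_datetime(["2020-01-01"]).as_unit("s")
    >>> right = pd.to_datetime(["2020-01-02"]).as_unit("ms")
    >>> pd.IntervalIndex.from_arrays(left, right).dtype
    interval[datetime64[ms], right]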
- values = values.astype(original.dtype.subtype) - return cls(values, closed=original.closed) + return cls._from_sequence(values, dtype=original.dtype) _interval_shared_docs["from_breaks"] = textwrap.dedent( """ @@ -764,7 +771,7 @@ if self.closed != other.categories.closed: return invalid_comparison(self, other, op) - other = other.categories.take( + other = other.categories._values.take( other.codes, allow_fill=True, fill_value=other.categories._na_value ) @@ -847,7 +854,7 @@ ascending = nv.validate_argsort_with_ascending(ascending, (), kwargs) if ascending and kind == "quicksort" and na_position == "last": - # TODO: in an IntervalIndex we can re-use the cached + # TODO: in an IntervalIndex we can reuse the cached # IntervalTree.left_sorter return np.lexsort((self.right, self.left)) @@ -891,11 +898,18 @@ return obj[indexer] def _pad_or_backfill( # pylint: disable=useless-parent-delegation - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove # this method entirely. - return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) def fillna( self, value=None, method=None, limit: int | None = None, copy: bool = True @@ -1031,8 +1045,8 @@ raise ValueError("Intervals must all be closed on the same side.") closed = closed_set.pop() - left = np.concatenate([interval.left for interval in to_concat]) - right = np.concatenate([interval.right for interval in to_concat]) + left: IntervalSide = np.concatenate([interval.left for interval in to_concat]) + right: IntervalSide = np.concatenate([interval.right for interval in to_concat]) left, right, dtype = cls._ensure_simple_new_inputs(left, right, closed=closed) @@ -1072,7 +1086,7 @@ fill_value = Index(self._left, copy=False)._na_value empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) else: - empty = self._from_sequence([fill_value] * empty_len) + empty = self._from_sequence([fill_value] * empty_len, dtype=self.dtype) if periods > 0: a = empty @@ -1228,62 +1242,30 @@ Series.value_counts """ # TODO: implement this is a non-naive way! 
- return value_counts(np.asarray(self), dropna=dropna) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "The behavior of value_counts with object-dtype is deprecated", + category=FutureWarning, + ) + result = value_counts(np.asarray(self), dropna=dropna) + # Once the deprecation is enforced, we will need to do + # `result.index = result.index.astype(self.dtype)` + return result # --------------------------------------------------------------------- # Rendering Methods - def _format_data(self) -> str: - # TODO: integrate with categorical and make generic - # name argument is unused here; just for compat with base / categorical - n = len(self) - max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) - - formatter = str - - if n == 0: - summary = "[]" - elif n == 1: - first = formatter(self[0]) - summary = f"[{first}]" - elif n == 2: - first = formatter(self[0]) - last = formatter(self[-1]) - summary = f"[{first}, {last}]" - else: - if n > max_seq_items: - n = min(max_seq_items // 2, 10) - head = [formatter(x) for x in self[:n]] - tail = [formatter(x) for x in self[-n:]] - head_str = ", ".join(head) - tail_str = ", ".join(tail) - summary = f"[{head_str} ... {tail_str}]" - else: - tail = [formatter(x) for x in self] - tail_str = ", ".join(tail) - summary = f"[{tail_str}]" - - return summary - - def __repr__(self) -> str: - # the short repr has no trailing newline, while the truncated - # repr does. So we include a newline in our template, and strip - # any trailing newlines from format_object_summary - data = self._format_data() - class_name = f"<{type(self).__name__}>\n" - - template = f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}" - return template - - def _format_space(self) -> str: - space = " " * (len(type(self).__name__) + 1) - return f"\n{space}" + def _formatter(self, boxed: bool = False): + # returning 'str' here causes us to render as e.g. "(0, 1]" instead of + # "Interval(0, 1, closed='right')" + return str # --------------------------------------------------------------------- # Vectorized Interval Properties/Attributes @property - def left(self): + def left(self) -> Index: """ Return the left endpoints of each Interval in the IntervalArray as an Index. @@ -1303,7 +1285,7 @@ return Index(self._left, copy=False) @property - def right(self): + def right(self) -> Index: """ Return the right endpoints of each Interval in the IntervalArray as an Index. 
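Note on the rendering hunk: with _formatter returning str, IntervalArray drops its bespoke _format_data/__repr__/_format_space in favour of the shared ExtensionArray repr, and elements render as "(0, 1]" rather than "Interval(0, 1, closed='right')". A hedged illustration:

    >>> pd.arrays.IntervalArray.from_breaks([0, 1, 2])
    <IntervalArray>
    [(0, 1], (1, 2]]
    Length: 2, dtype: interval[int64, right]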
@@ -1585,7 +1567,9 @@ # --------------------------------------------------------------------- # Conversion - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') @@ -1822,12 +1806,8 @@ other < self._right if self.open_right else other <= self._right ) - def isin(self, values) -> npt.NDArray[np.bool_]: - if not hasattr(values, "dtype"): - values = np.array(values) - values = extract_array(values, extract_numpy=True) - - if isinstance(values.dtype, IntervalDtype): + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + if isinstance(values, IntervalArray): if self.closed != values.closed: # not comparable -> no overlap return np.zeros(self.shape, dtype=bool) @@ -1855,11 +1835,17 @@ return isin(self.astype(object), values.astype(object)) @property - def _combined(self) -> IntervalSideT: - left = self.left._values.reshape(-1, 1) - right = self.right._values.reshape(-1, 1) + def _combined(self) -> IntervalSide: + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "reshape" [union-attr] + left = self.left._values.reshape(-1, 1) # type: ignore[union-attr] + right = self.right._values.reshape(-1, 1) # type: ignore[union-attr] if needs_i8_conversion(left.dtype): - comb = left._concat_same_type([left, right], axis=1) + # error: Item "ndarray[Any, Any]" of "Any | ndarray[Any, Any]" has + # no attribute "_concat_same_type" + comb = left._concat_same_type( # type: ignore[union-attr] + [left, right], axis=1 + ) else: comb = np.concatenate([left, right], axis=1) return comb diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/masked.py pandas-2.2.2+dfsg/pandas/core/arrays/masked.py --- pandas-2.1.4+dfsg/pandas/core/arrays/masked.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/masked.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,16 +15,14 @@ lib, missing as libmissing, ) -from pandas._libs.tslibs import ( - get_unit_from_dtype, - is_supported_unit, -) +from pandas._libs.tslibs import is_supported_dtype from pandas._typing import ( ArrayLike, AstypeArg, AxisInt, DtypeObj, FillnaOptions, + InterpolateOptions, NpDtype, PositionalIndexer, Scalar, @@ -69,6 +67,8 @@ from pandas.core.algorithms import ( factorize_array, isin, + map_array, + mode, take, ) from pandas.core.array_algos import ( @@ -77,6 +77,7 @@ ) from pandas.core.array_algos.quantile import quantile_with_mask from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._utils import to_numpy_dtype_inference from pandas.core.arrays.base import ExtensionArray from pandas.core.construction import ( array as pd_array, @@ -85,6 +86,7 @@ ) from pandas.core.indexers import check_array_indexer from pandas.core.ops import invalid_comparison +from pandas.core.util.hashing import hash_array if TYPE_CHECKING: from collections.abc import ( @@ -97,6 +99,7 @@ NumpySorter, NumpyValueArrayLike, ) + from pandas.core.arrays import FloatingArray from pandas.compat.numpy import function as nv @@ -191,7 +194,12 @@ return self._simple_new(self._data[item], newmask) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: mask = self._mask @@ -203,7 +211,21 @@ if copy: npvalues = 
npvalues.copy() new_mask = new_mask.copy() + elif limit_area is not None: + mask = mask.copy() func(npvalues, limit=limit, mask=new_mask) + + if limit_area is not None and not mask.all(): + mask = mask.T + neg_mask = ~mask + first = neg_mask.argmax() + last = len(neg_mask) - neg_mask[::-1].argmax() - 1 + if limit_area == "inside": + new_mask[:first] |= mask[:first] + new_mask[last + 1 :] |= mask[last + 1 :] + elif limit_area == "outside": + new_mask[first + 1 : last] |= mask[first + 1 : last] + if copy: return self._simple_new(npvalues.T, new_mask.T) else: @@ -383,6 +405,8 @@ DataFrame.round : Round values of a DataFrame. Series.round : Round values of a Series. """ + if self.dtype.kind == "b": + return self nv.validate_round(args, kwargs) values = np.round(self._data, decimals=decimals, **kwargs) @@ -406,6 +430,9 @@ # ------------------------------------------------------------------ + def _values_for_json(self) -> np.ndarray: + return np.asarray(self, dtype=object) + def to_numpy( self, dtype: npt.DTypeLike | None = None, @@ -472,13 +499,12 @@ >>> a.to_numpy(dtype="bool", na_value=False) array([ True, False, False]) """ - if na_value is lib.no_default: - na_value = libmissing.NA + hasna = self._hasna + dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna) if dtype is None: dtype = object - else: - dtype = np.dtype(dtype) - if self._hasna: + + if hasna: if ( dtype != object and not is_string_dtype(dtype) @@ -505,7 +531,7 @@ if self.ndim > 1: return [x.tolist() for x in self] dtype = None if self._hasna else self._data.dtype - return self.to_numpy(dtype=dtype).tolist() + return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist() @overload def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: @@ -567,7 +593,9 @@ __array_priority__ = 1000 # higher than ndarray so ops dispatch to us - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: """ the array interface, return my values We return an object array here to preserve our scalar values @@ -848,9 +876,7 @@ return BooleanArray(result, mask, copy=False) - elif lib.is_np_dtype(result.dtype, "m") and is_supported_unit( - get_unit_from_dtype(result.dtype) - ): + elif lib.is_np_dtype(result.dtype, "m") and is_supported_dtype(result.dtype): # e.g. 
test_numeric_arr_mul_tdscalar_numexpr_path from pandas.core.arrays import TimedeltaArray @@ -891,6 +917,15 @@ mask = np.concatenate([x._mask for x in to_concat], axis=axis) return cls(data, mask) + def _hash_pandas_object( + self, *, encoding: str, hash_key: str, categorize: bool + ) -> npt.NDArray[np.uint64]: + hashed_array = hash_array( + self._data, encoding=encoding, hash_key=hash_key, categorize=categorize + ) + hashed_array[self.isna()] = hash(self.dtype.na_value) + return hashed_array + def take( self, indexer, @@ -927,7 +962,7 @@ # error: Return type "BooleanArray" of "isin" incompatible with return type # "ndarray" in supertype "ExtensionArray" - def isin(self, values) -> BooleanArray: # type: ignore[override] + def isin(self, values: ArrayLike) -> BooleanArray: # type: ignore[override] from pandas.core.arrays import BooleanArray # algorithms.isin will eventually convert values to an ndarray, so no extra @@ -952,6 +987,14 @@ mask = self._mask.copy() return self._simple_new(data, mask) + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + values = self._data + mask = self._mask + return algos.duplicated(values, keep=keep, mask=mask) + def unique(self) -> Self: """ Compute the BaseMaskedArray of unique values. @@ -1044,28 +1087,31 @@ ) from pandas.arrays import IntegerArray - keys, value_counts = algos.value_counts_arraylike( - self._data, dropna=True, mask=self._mask + keys, value_counts, na_counter = algos.value_counts_arraylike( + self._data, dropna=dropna, mask=self._mask ) + mask_index = np.zeros((len(value_counts),), dtype=np.bool_) + mask = mask_index.copy() - if dropna: - res = Series(value_counts, index=keys, name="count", copy=False) - res.index = res.index.astype(self.dtype) - res = res.astype("Int64") - return res - - # if we want nans, count the mask - counts = np.empty(len(value_counts) + 1, dtype="int64") - counts[:-1] = value_counts - counts[-1] = self._mask.sum() - - index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value) - index = index.astype(self.dtype) + if na_counter > 0: + mask_index[-1] = True - mask = np.zeros(len(counts), dtype="bool") - counts_array = IntegerArray(counts, mask) + arr = IntegerArray(value_counts, mask) + index = Index( + self.dtype.construct_array_type()( + keys, mask_index # type: ignore[arg-type] + ) + ) + return Series(arr, index=index, name="count", copy=False) - return Series(counts_array, index=index, name="count", copy=False) + def _mode(self, dropna: bool = True) -> Self: + if dropna: + result = mode(self._data, dropna=dropna, mask=self._mask) + res_mask = np.zeros(result.shape, dtype=np.bool_) + else: + result, res_mask = mode(self._data, dropna=dropna, mask=self._mask) + result = type(self)(result, res_mask) # type: ignore[arg-type] + return result[result.argsort()] @doc(ExtensionArray.equals) def equals(self, other) -> bool: @@ -1288,6 +1334,9 @@ ) return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) + def map(self, mapper, na_action=None): + return map_array(self.to_numpy(), mapper, na_action=na_action) + def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ Return whether any element is truthy. @@ -1451,6 +1500,58 @@ else: return self.dtype.na_value + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index, + limit, + limit_direction, + limit_area, + copy: bool, + **kwargs, + ) -> FloatingArray: + """ + See NDFrame.interpolate.__doc__. 
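For context, a doctest-style sketch of the value_counts behavior the rewritten hunk above implements for masked arrays, assuming pandas 2.2 semantics (example not taken from the patch): with dropna=False the NA count is included and the result index keeps the masked dtype.

>>> import pandas as pd
>>> s = pd.Series([1, 2, 2, None], dtype="Int64")
>>> s.value_counts(dropna=False).index.dtype
Int64Dtype()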
+ """ + # NB: we return type(self) even if copy=False + if self.dtype.kind == "f": + if copy: + data = self._data.copy() + mask = self._mask.copy() + else: + data = self._data + mask = self._mask + elif self.dtype.kind in "iu": + copy = True + data = self._data.astype("f8") + mask = self._mask.copy() + else: + raise NotImplementedError( + f"interpolate is not implemented for dtype={self.dtype}" + ) + + missing.interpolate_2d_inplace( + data, + method=method, + axis=0, + index=index, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + mask=mask, + **kwargs, + ) + if not copy: + return self # type: ignore[return-value] + if self.dtype.kind == "f": + return type(self)._simple_new(data, mask) # type: ignore[return-value] + else: + from pandas.core.arrays import FloatingArray + + return FloatingArray._simple_new(data, mask) + def _accumulate( self, name: str, *, skipna: bool = True, **kwargs ) -> BaseMaskedArray: @@ -1487,6 +1588,9 @@ else: result_mask = np.zeros(ngroups, dtype=bool) + if how == "rank" and kwargs.get("na_option") in ["top", "bottom"]: + result_mask[:] = False + res_values = op._cython_op_ndim_compat( self._data, min_count=min_count, @@ -1501,9 +1605,13 @@ arity = op._cython_arity.get(op.how, 1) result_mask = np.tile(result_mask, (arity, 1)).T - # res_values should already have the correct dtype, we just need to - # wrap in a MaskedArray - return self._maybe_mask_result(res_values, result_mask) + if op.how in ["idxmin", "idxmax"]: + # Result values are indexes to take, keep as ndarray + return res_values + else: + # res_values should already have the correct dtype, we just need to + # wrap in a MaskedArray + return self._maybe_mask_result(res_values, result_mask) def transpose_homogeneous_masked_arrays( @@ -1515,13 +1623,24 @@ same dtype. The caller is responsible for ensuring validity of input data. 
""" masked_arrays = list(masked_arrays) + dtype = masked_arrays[0].dtype + values = [arr._data.reshape(1, -1) for arr in masked_arrays] - transposed_values = np.concatenate(values, axis=0) + transposed_values = np.concatenate( + values, + axis=0, + out=np.empty( + (len(masked_arrays), len(masked_arrays[0])), + order="F", + dtype=dtype.numpy_dtype, + ), + ) masks = [arr._mask.reshape(1, -1) for arr in masked_arrays] - transposed_masks = np.concatenate(masks, axis=0) + transposed_masks = np.concatenate( + masks, axis=0, out=np.empty_like(transposed_values, dtype=bool) + ) - dtype = masked_arrays[0].dtype arr_type = dtype.construct_array_type() transposed_arrays: list[BaseMaskedArray] = [] for i in range(transposed_values.shape[1]): diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/numeric.py pandas-2.2.2+dfsg/pandas/core/arrays/numeric.py --- pandas-2.1.4+dfsg/pandas/core/arrays/numeric.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/numeric.py 2024-04-10 17:42:52.000000000 +0000 @@ -132,9 +132,12 @@ raise AbstractMethodError(cls) -def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype): +def _coerce_to_data_and_mask( + values, dtype, copy: bool, dtype_cls: type[NumericDtype], default_dtype: np.dtype +): checker = dtype_cls._checker + mask = None inferred_type = None if dtype is None and hasattr(values, "dtype"): @@ -156,7 +159,10 @@ return values, mask, dtype, inferred_type original = values - values = np.array(values, copy=copy) + if not copy: + values = np.asarray(values) + else: + values = np.array(values, copy=copy) inferred_type = None if values.dtype == object or is_string_dtype(values.dtype): inferred_type = lib.infer_dtype(values, skipna=True) @@ -165,7 +171,10 @@ raise TypeError(f"{values.dtype} cannot be converted to {name}") elif values.dtype.kind == "b" and checker(dtype): - values = np.array(values, dtype=default_dtype, copy=copy) + if not copy: + values = np.asarray(values, dtype=default_dtype) + else: + values = np.array(values, dtype=default_dtype, copy=copy) elif values.dtype.kind not in "iuf": name = dtype_cls.__name__.strip("_") @@ -190,7 +199,7 @@ if dtype is None: dtype = default_dtype else: - dtype = dtype.type + dtype = dtype.numpy_dtype if is_integer_dtype(dtype) and values.dtype.kind == "f" and len(values) > 0: if mask.all(): @@ -204,9 +213,9 @@ inferred_type not in ["floating", "mixed-integer-float"] and not mask.any() ): - values = np.array(original, dtype=dtype, copy=False) + values = np.asarray(original, dtype=dtype) else: - values = np.array(original, dtype="object", copy=False) + values = np.asarray(original, dtype="object") # we copy as need to coerce here if mask.any(): @@ -260,9 +269,8 @@ ) -> tuple[np.ndarray, np.ndarray]: dtype_cls = cls._dtype_cls default_dtype = dtype_cls._default_np_dtype - mask = None values, mask, _, _ = _coerce_to_data_and_mask( - value, mask, dtype, copy, dtype_cls, default_dtype + value, dtype, copy, dtype_cls, default_dtype ) return values, mask diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/numpy_.py pandas-2.2.2+dfsg/pandas/core/arrays/numpy_.py --- pandas-2.1.4+dfsg/pandas/core/arrays/numpy_.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/numpy_.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,10 +8,7 @@ import numpy as np from pandas._libs import lib -from pandas._libs.tslibs import ( - get_unit_from_dtype, - is_supported_unit, -) +from pandas._libs.tslibs import is_supported_dtype from pandas.compat.numpy import function as nv from 
pandas.core.dtypes.astype import astype_array @@ -153,7 +150,9 @@ # ------------------------------------------------------------------------ # NumPy Array Interface - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): @@ -553,9 +552,7 @@ def _wrap_ndarray_result(self, result: np.ndarray): # If we have timedelta64[ns] result, return a TimedeltaArray instead # of a NumpyExtensionArray - if result.dtype.kind == "m" and is_supported_unit( - get_unit_from_dtype(result.dtype) - ): + if result.dtype.kind == "m" and is_supported_dtype(result.dtype): from pandas.core.arrays import TimedeltaArray return TimedeltaArray._simple_new(result, dtype=result.dtype) diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/period.py pandas-2.2.2+dfsg/pandas/core/arrays/period.py --- pandas-2.1.4+dfsg/pandas/core/arrays/period.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/period.py 2024-04-10 17:42:52.000000000 +0000 @@ -25,6 +25,7 @@ NaT, NaTType, Timedelta, + add_overflowsafe, astype_overflowsafe, dt64arr_to_periodarr as c_dt64arr_to_periodarr, get_unit_from_dtype, @@ -33,7 +34,11 @@ period as libperiod, to_offset, ) -from pandas._libs.tslibs.dtypes import FreqGroup +from pandas._libs.tslibs.dtypes import ( + FreqGroup, + PeriodDtypeBase, + freq_to_period_freqstr, +) from pandas._libs.tslibs.fields import isleapyear_arr from pandas._libs.tslibs.offsets import ( Tick, @@ -68,7 +73,6 @@ ) from pandas.core.dtypes.missing import isna -import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com @@ -252,7 +256,10 @@ raise raise_on_incompatible(values, dtype.freq) values, dtype = values._ndarray, values.dtype - values = np.array(values, dtype="int64", copy=copy) + if not copy: + values = np.asarray(values, dtype="int64") + else: + values = np.array(values, dtype="int64", copy=copy) if dtype is None: raise ValueError("dtype is not specified and cannot be inferred") dtype = cast(PeriodDtype, dtype) @@ -319,31 +326,32 @@ ------- PeriodArray[freq] """ + if isinstance(freq, BaseOffset): + freq = freq_to_period_freqstr(freq.n, freq.name) data, freq = dt64arr_to_periodarr(data, freq, tz) dtype = PeriodDtype(freq) return cls(data, dtype=dtype) @classmethod - def _generate_range(cls, start, end, periods, freq, fields): + def _generate_range(cls, start, end, periods, freq): periods = dtl.validate_periods(periods) if freq is not None: freq = Period._maybe_convert_freq(freq) - field_count = len(fields) if start is not None or end is not None: - if field_count > 0: - raise ValueError( - "Can either instantiate from fields or endpoints, but not both" - ) subarr, freq = _get_ordinal_range(start, end, periods, freq) - elif field_count > 0: - subarr, freq = _range_from_fields(freq=freq, **fields) else: raise ValueError("Not enough parameters to construct Period range") return subarr, freq + @classmethod + def _from_fields(cls, *, fields: dict, freq) -> Self: + subarr, freq = _range_from_fields(freq=freq, **fields) + dtype = PeriodDtype(freq) + return cls._simple_new(subarr, dtype=dtype) + # ----------------------------------------------------------------- # DatetimeLike Interface @@ -366,10 +374,15 @@ def _scalar_from_string(self, value: str) -> Period: return Period(value, freq=self.freq) - def 
_check_compatible_with(self, other) -> None: + # error: Argument 1 of "_check_compatible_with" is incompatible with + # supertype "DatetimeLikeArrayMixin"; supertype defines the argument type + # as "Period | Timestamp | Timedelta | NaTType" + def _check_compatible_with(self, other: Period | NaTType | PeriodArray) -> None: # type: ignore[override] if other is NaT: return - self._require_matching_freq(other) + # error: Item "NaTType" of "Period | NaTType | PeriodArray" has no + # attribute "freq" + self._require_matching_freq(other.freq) # type: ignore[union-attr] # -------------------------------------------------------------------- # Data / Attributes @@ -386,7 +399,13 @@ """ return self.dtype.freq - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + @property + def freqstr(self) -> str: + return freq_to_period_freqstr(self.freq.n, self.freq.name) + + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: if dtype == "i8": return self.asi8 elif dtype == bool: @@ -468,7 +487,7 @@ Examples -------- - >>> idx = pd.PeriodIndex(["2023-01-01 10:00", "2023-01-01 11:00"], freq='H') + >>> idx = pd.PeriodIndex(["2023-01-01 10:00", "2023-01-01 11:00"], freq='h') >>> idx.hour Index([10, 11], dtype='int64') """, @@ -539,7 +558,7 @@ >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y") >>> idx - PeriodIndex(['2023', '2024', '2025'], dtype='period[A-DEC]') + PeriodIndex(['2023', '2024', '2025'], dtype='period[Y-DEC]') >>> idx.dayofyear Index([365, 366, 365], dtype='int64') """, @@ -609,7 +628,7 @@ ---------- freq : str or DateOffset, optional Target frequency. The default is 'D' for week or longer, - 'S' otherwise. + 's' otherwise. how : {'s', 'e', 'start', 'end'} Whether to use the start or end of the time period being converted. @@ -639,8 +658,10 @@ return (self + self.freq).to_timestamp(how="start") - adjust if freq is None: - freq = self._dtype._get_to_timestamp_base() - base = freq + freq_code = self._dtype._get_to_timestamp_base() + dtype = PeriodDtypeBase(freq_code, 1) + freq = dtype._freqstr + base = freq_code else: freq = Period._maybe_convert_freq(freq) base = freq._period_dtype_code @@ -648,7 +669,7 @@ new_parr = self.asfreq(freq, how=how) new_data = libperiod.periodarr_to_dt64arr(new_parr.asi8, base) - dta = DatetimeArray(new_data) + dta = DatetimeArray._from_sequence(new_data) if self.freq.name == "B": # See if we can retain BDay instead of Day in cases where @@ -703,10 +724,10 @@ Examples -------- - >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='A') + >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='Y') >>> pidx PeriodIndex(['2010', '2011', '2012', '2013', '2014', '2015'], - dtype='period[A-DEC]') + dtype='period[Y-DEC]') >>> pidx.asfreq('M') PeriodIndex(['2010-12', '2011-12', '2012-12', '2013-12', '2014-12', @@ -717,7 +738,8 @@ '2015-01'], dtype='period[M]') """ how = libperiod.validate_end_alias(how) - + if isinstance(freq, BaseOffset) and hasattr(freq, "_period_dtype_code"): + freq = PeriodDtype(freq)._freqstr freq = Period._maybe_convert_freq(freq) base1 = self._dtype._dtype_code @@ -774,7 +796,8 @@ if lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype): # GH#45038 match PeriodIndex behavior. 
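The doctest updates above track the renamed period/offset aliases ('H' -> 'h', 'A' -> 'Y'). A doctest-style sketch assuming pandas 2.2 semantics (not taken from the patch):

>>> import pandas as pd
>>> pd.period_range("2023-01-01 10:00", periods=2, freq="h")
PeriodIndex(['2023-01-01 10:00', '2023-01-01 11:00'], dtype='period[h]')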
tz = getattr(dtype, "tz", None) - return self.to_timestamp().tz_localize(tz) + unit = dtl.dtype_to_unit(dtype) + return self.to_timestamp().tz_localize(tz).as_unit(unit) return super().astype(dtype, copy=copy) @@ -792,12 +815,19 @@ return m8arr.searchsorted(npvalue, side=side, sorter=sorter) def _pad_or_backfill( - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: # view as dt64 so we get treated as timelike in core.missing, # similar to dtl._period_dispatch dta = self.view("M8[ns]") - result = dta._pad_or_backfill(method=method, limit=limit, copy=copy) + result = dta._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) if copy: return cast("Self", result.view(self.dtype)) else: @@ -837,7 +867,7 @@ assert op in [operator.add, operator.sub] if op is operator.sub: other = -other - res_values = algos.checked_add_with_arr(self.asi8, other, arr_mask=self._isnan) + res_values = add_overflowsafe(self.asi8, np.asarray(other, dtype="i8")) return type(self)(res_values, dtype=self.dtype) def _add_offset(self, other: BaseOffset): @@ -902,12 +932,7 @@ "not an integer multiple of the PeriodArray's freq." ) from err - b_mask = np.isnat(delta) - - res_values = algos.checked_add_with_arr( - self.asi8, delta.view("i8"), arr_mask=self._isnan, b_mask=b_mask - ) - np.putmask(res_values, self._isnan | b_mask, iNaT) + res_values = add_overflowsafe(self.asi8, np.asarray(delta.view("i8"))) return type(self)(res_values, dtype=self.dtype) def _check_timedeltalike_freq_compat(self, other): @@ -948,7 +973,7 @@ return lib.item_from_zerodim(delta) -def raise_on_incompatible(left, right): +def raise_on_incompatible(left, right) -> IncompatibleFrequency: """ Helper function to render a consistent error message when raising IncompatibleFrequency. @@ -966,13 +991,16 @@ # GH#24283 error message format depends on whether right is scalar if isinstance(right, (np.ndarray, ABCTimedeltaArray)) or right is None: other_freq = None - elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period, BaseOffset)): + elif isinstance(right, BaseOffset): + other_freq = freq_to_period_freqstr(right.n, right.name) + elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period)): other_freq = right.freqstr else: other_freq = delta_to_tick(Timedelta(right)).freqstr + own_freq = freq_to_period_freqstr(left.freq.n, left.freq.name) msg = DIFFERENT_FREQ.format( - cls=type(left).__name__, own_freq=left.freqstr, other_freq=other_freq + cls=type(left).__name__, own_freq=own_freq, other_freq=other_freq ) return IncompatibleFrequency(msg) @@ -1012,18 +1040,18 @@ Examples -------- - >>> period_array([pd.Period('2017', freq='A'), - ... pd.Period('2018', freq='A')]) + >>> period_array([pd.Period('2017', freq='Y'), + ... pd.Period('2018', freq='Y')]) ['2017', '2018'] - Length: 2, dtype: period[A-DEC] + Length: 2, dtype: period[Y-DEC] - >>> period_array([pd.Period('2017', freq='A'), - ... pd.Period('2018', freq='A'), + >>> period_array([pd.Period('2017', freq='Y'), + ... pd.Period('2018', freq='Y'), ... 
pd.NaT]) ['2017', '2018', 'NaT'] - Length: 3, dtype: period[A-DEC] + Length: 3, dtype: period[Y-DEC] Integers that look like years are handled @@ -1074,7 +1102,9 @@ return PeriodArray(ordinals, dtype=dtype) data = ensure_object(arrdata) - + if freq is None: + freq = libperiod.extract_freq(data) + dtype = PeriodDtype(freq) return PeriodArray._from_sequence(data, dtype=dtype) @@ -1089,7 +1119,7 @@ def validate_dtype_freq( - dtype, freq: BaseOffsetT | timedelta | str | None + dtype, freq: BaseOffsetT | BaseOffset | timedelta | str | None ) -> BaseOffsetT: """ If both a dtype and a freq are available, ensure they match. If only @@ -1110,10 +1140,7 @@ IncompatibleFrequency : mismatch between dtype and freq """ if freq is not None: - # error: Incompatible types in assignment (expression has type - # "BaseOffset", variable has type "Union[BaseOffsetT, timedelta, - # str, None]") - freq = to_offset(freq) # type: ignore[assignment] + freq = to_offset(freq, is_period=True) if dtype is not None: dtype = pandas_dtype(dtype) @@ -1176,7 +1203,7 @@ ) if freq is not None: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) mult = freq.n if start is not None: @@ -1240,10 +1267,10 @@ if quarter is not None: if freq is None: - freq = to_offset("Q") + freq = to_offset("Q", is_period=True) base = FreqGroup.FR_QTR.value else: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) base = libperiod.freq_to_dtype_code(freq) if base != FreqGroup.FR_QTR.value: raise AssertionError("base must equal FR_QTR") @@ -1257,7 +1284,7 @@ ) ordinals.append(val) else: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) base = libperiod.freq_to_dtype_code(freq) arrays = _make_field_arrays(year, month, day, hour, minute, second) for y, mth, d, h, mn, s in zip(*arrays): diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/sparse/accessor.py pandas-2.2.2+dfsg/pandas/core/arrays/sparse/accessor.py --- pandas-2.1.4+dfsg/pandas/core/arrays/sparse/accessor.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/sparse/accessor.py 2024-04-10 17:42:52.000000000 +0000 @@ -270,12 +270,12 @@ Examples -------- >>> import scipy.sparse - >>> mat = scipy.sparse.eye(3) + >>> mat = scipy.sparse.eye(3, dtype=float) >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 - 0 1.0 0.0 0.0 - 1 0.0 1.0 0.0 - 2 0.0 0.0 1.0 + 0 1.0 0 0 + 1 0 1.0 0 + 2 0 0 1.0 """ from pandas._libs.sparse import IntIndex diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/sparse/array.py pandas-2.2.2+dfsg/pandas/core/arrays/sparse/array.py --- pandas-2.1.4+dfsg/pandas/core/arrays/sparse/array.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/sparse/array.py 2024-04-10 17:42:52.000000000 +0000 @@ -28,6 +28,7 @@ from pandas._libs.tslibs import NaT from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning +from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_bool_kwarg, @@ -550,7 +551,9 @@ return cls._simple_new(arr, index, dtype) - def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: + def __array__( + self, dtype: NpDtype | None = None, copy: bool | None = None + ) -> np.ndarray: fill_value = self.fill_value if self.sp_index.ngaps == 0: @@ -702,7 +705,9 @@ """ return self.sp_index.npoints - def isna(self): + # error: Return type "SparseArray" of "isna" incompatible with return type + # "ndarray[Any, Any] | ExtensionArraySupportsAnyAll" in supertype "ExtensionArray" + 
def isna(self) -> Self: # type: ignore[override] # If null fill value, we want SparseDtype[bool, true] # to preserve the same memory usage. dtype = SparseDtype(bool, self._null_fill_value) @@ -713,11 +718,18 @@ return type(self)(mask, fill_value=False, dtype=dtype) def _pad_or_backfill( # pylint: disable=useless-parent-delegation - self, *, method: FillnaOptions, limit: int | None = None, copy: bool = True + self, + *, + method: FillnaOptions, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + copy: bool = True, ) -> Self: # TODO(3.0): We can remove this method once deprecation for fillna method # keyword is enforced. - return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + return super()._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area, copy=copy + ) def fillna( self, @@ -828,6 +840,14 @@ diff = np.r_[np.diff(indices), 2] return indices[(diff > 1).argmax()] + 1 + @doc(ExtensionArray.duplicated) + def duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> npt.NDArray[np.bool_]: + values = np.asarray(self) + mask = np.asarray(self.isna()) + return algos.duplicated(values, keep=keep, mask=mask) + def unique(self) -> Self: uniques = algos.unique(self.sp_values) if len(self.sp_values) != len(self): @@ -879,7 +899,7 @@ Series, ) - keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna) + keys, counts, _ = algos.value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps if fcounts > 0 and (not self._null_fill_value or not dropna): mask = isna(keys) if self._null_fill_value else keys == self.fill_value @@ -1075,9 +1095,10 @@ ) elif self.sp_index.npoints == 0: - # Avoid taking from the empty self.sp_values + # Use the old fill_value unless we took for an index of -1 _dtype = np.result_type(self.dtype.subtype, type(fill_value)) taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype) + taken[old_fill_indices] = self.fill_value else: taken = self.sp_values.take(sp_indexer) @@ -1421,7 +1442,7 @@ return values.all() - def any(self, axis: AxisInt = 0, *args, **kwargs): + def any(self, axis: AxisInt = 0, *args, **kwargs) -> bool: """ Tests whether at least one of elements evaluate True diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/string_.py pandas-2.2.2+dfsg/pandas/core/arrays/string_.py --- pandas-2.1.4+dfsg/pandas/core/arrays/string_.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/string_.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,6 +2,7 @@ from typing import ( TYPE_CHECKING, + ClassVar, Literal, ) @@ -15,7 +16,7 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import ensure_string_array -from pandas.compat import pa_version_under7p0 +from pandas.compat import pa_version_under10p1 from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -55,9 +56,11 @@ from pandas._typing import ( AxisInt, Dtype, + DtypeObj, NumpySorter, NumpyValueArrayLike, Scalar, + Self, npt, type_t, ) @@ -97,7 +100,9 @@ string[pyarrow] """ - name = "string" + # error: Cannot override instance variable (previously declared on + # base class "StorageExtensionDtype") with class variable + name: ClassVar[str] = "string" # type: ignore[misc] #: StringDtype().na_value uses pandas.NA except the implementation that # follows NumPy semantics, which uses nan. @@ -122,9 +127,9 @@ f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. " f"Got {storage} instead." 
) - if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0: + if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1: raise ImportError( - "pyarrow>=7.0.0 is required for PyArrow backed StringArray." + "pyarrow>=10.0.1 is required for PyArrow backed StringArray." ) self.storage = storage @@ -133,7 +138,7 @@ return str @classmethod - def construct_from_string(cls, string): + def construct_from_string(cls, string) -> Self: """ Construct a StringDtype from a string. @@ -257,6 +262,13 @@ return [x.tolist() for x in self] return list(self.to_numpy()) + @classmethod + def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: + if lib.infer_dtype(scalars, skipna=True) not in ["string", "empty"]: + # TODO: require any NAs be valid-for-string + raise ValueError + return cls._from_sequence(scalars, dtype=dtype) + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" @@ -603,7 +615,7 @@ arr = np.asarray(self) if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray] | type[BooleanArray] + constructor: type[IntegerArray | BooleanArray] if is_integer_dtype(dtype): constructor = IntegerArray else: @@ -612,6 +624,8 @@ na_value_is_na = isna(na_value) if na_value_is_na: na_value = 1 + elif dtype == np.dtype("bool"): + na_value = bool(na_value) result = lib.map_infer_mask( arr, f, diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/string_arrow.py pandas-2.2.2+dfsg/pandas/core/arrays/string_arrow.py --- pandas-2.1.4+dfsg/pandas/core/arrays/string_arrow.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/string_arrow.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,7 +16,10 @@ lib, missing as libmissing, ) -from pandas.compat import pa_version_under7p0 +from pandas.compat import ( + pa_version_under10p1, + pa_version_under13p0, +) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( @@ -38,9 +41,10 @@ BaseStringArray, StringDtype, ) +from pandas.core.ops import invalid_comparison from pandas.core.strings.object_array import ObjectStringArrayMixin -if not pa_version_under7p0: +if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc @@ -48,20 +52,25 @@ if TYPE_CHECKING: + from collections.abc import Sequence + from pandas._typing import ( + ArrayLike, AxisInt, Dtype, Scalar, npt, ) + from pandas import Series + ArrowStringScalarOrNAT = Union[str, libmissing.NAType] def _chk_pyarrow_available() -> None: - if pa_version_under7p0: - msg = "pyarrow>=7.0.0 is required for PyArrow backed ArrowExtensionArray." + if pa_version_under10p1: + msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray." raise ImportError(msg) @@ -74,8 +83,6 @@ """ Extension array for string data in a ``pyarrow.ChunkedArray``. - .. versionadded:: 1.2.0 - .. warning:: ArrowStringArray is considered experimental. 
The implementation and @@ -120,17 +127,40 @@ _storage = "pyarrow" def __init__(self, values) -> None: + _chk_pyarrow_available() + if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( + values.type + ): + values = pc.cast(values, pa.large_string()) + super().__init__(values) self._dtype = StringDtype(storage=self._storage) - if not pa.types.is_string(self._pa_array.type) and not ( + if not pa.types.is_large_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_string(self._pa_array.type.value_type) + and pa.types.is_large_string(self._pa_array.type.value_type) ): raise ValueError( - "ArrowStringArray requires a PyArrow (chunked) array of string type" + "ArrowStringArray requires a PyArrow (chunked) array of " + "large_string type" ) + @classmethod + def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: + pa_scalar = super()._box_pa_scalar(value, pa_type) + if pa.types.is_string(pa_scalar.type) and pa_type is None: + pa_scalar = pc.cast(pa_scalar, pa.large_string()) + return pa_scalar + + @classmethod + def _box_pa_array( + cls, value, pa_type: pa.DataType | None = None, copy: bool = False + ) -> pa.Array | pa.ChunkedArray: + pa_array = super()._box_pa_array(value, pa_type) + if pa.types.is_string(pa_array.type) and pa_type is None: + pa_array = pc.cast(pa_array, pa.large_string()) + return pa_array + def __len__(self) -> int: """ Length of this array. @@ -142,7 +172,7 @@ return len(self._pa_array) @classmethod - def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): from pandas.core.arrays.masked import BaseMaskedArray _chk_pyarrow_available() @@ -205,7 +235,7 @@ raise TypeError("Scalar must be NA or str") return super()._maybe_convert_setitem_value(value) - def isin(self, values) -> npt.NDArray[np.bool_]: + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: value_set = [ pa_scalar.as_py() for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] @@ -276,7 +306,7 @@ arr = np.asarray(self) if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: type[IntegerArray] | type[BooleanArray] + constructor: type[IntegerArray | BooleanArray] if is_integer_dtype(dtype): constructor = IntegerArray else: @@ -332,23 +362,43 @@ result[isna(result)] = bool(na) return result - def _str_startswith(self, pat: str, na=None): - result = pc.starts_with(self._pa_array, pattern=pat) + def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.starts_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # mimic existing behaviour of string extension array + # and python string method + result = pa.array( + np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) + ) + else: + result = pc.starts_with(self._pa_array, pattern=pat[0]) + + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) - result = self._result_converter(result) - if not isna(na): - result[isna(result)] = bool(na) - return result + return self._result_converter(result) + + def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + if isinstance(pat, str): + result = pc.ends_with(self._pa_array, pattern=pat) + else: + if len(pat) == 0: + # mimic existing behaviour of string extension array + # and python string method + 
result = pa.array( + np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array) + ) + else: + result = pc.ends_with(self._pa_array, pattern=pat[0]) - def _str_endswith(self, pat: str, na=None): - result = pc.ends_with(self._pa_array, pattern=pat) + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) - result = self._result_converter(result) - if not isna(na): - result[isna(result)] = bool(na) - return result + return self._result_converter(result) def _str_replace( self, @@ -367,6 +417,12 @@ result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n) return type(self)(result) + def _str_repeat(self, repeats: int | Sequence[int]): + if not isinstance(repeats, int): + return super()._str_repeat(repeats) + else: + return type(self)(pc.binary_repeat(self._pa_array, repeats)) + def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None ): @@ -377,10 +433,23 @@ def _str_fullmatch( self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None ): - if not pat.endswith("$") or pat.endswith("//$"): + if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" return self._str_match(pat, case, flags, na) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if stop is None: + return super()._str_slice(start, stop, step) + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) return self._result_converter(result) @@ -419,7 +488,7 @@ def _str_len(self): result = pc.utf8_length(self._pa_array) - return Int64Dtype().__from_arrow__(result) + return self._convert_int_dtype(result) def _str_lower(self): return type(self)(pc.utf8_lower(self._pa_array)) @@ -448,6 +517,50 @@ result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + return super()._str_removeprefix(prefix) + + def _str_removesuffix(self, suffix: str): + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) + return type(self)(result) + + def _str_count(self, pat: str, flags: int = 0): + if flags: + return super()._str_count(pat, flags) + result = pc.count_substring_regex(self._pa_array, pat) + return self._convert_int_dtype(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if start != 0 and end is not None: + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + not_found = pc.equal(result, -1) + offset_result = pc.add(result, end - start) + result = pc.if_else(not_found, result, offset_result) + elif start == 0 and end is None: + slices = self._pa_array + result = pc.find_substring(slices, sub) + else: + return super()._str_find(sub, start, end) + return self._convert_int_dtype(result) + + def _str_get_dummies(self, sep: str = "|"): + dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) + if 
len(labels) == 0: + return np.empty(shape=(0, 0), dtype=np.int64), labels + dummies = np.vstack(dummies_pa.to_numpy()) + return dummies.astype(np.int64, copy=False), labels + + def _convert_int_dtype(self, result): + return Int64Dtype().__from_arrow__(result) + def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): @@ -459,9 +572,6 @@ else: return result - def _convert_int_dtype(self, result): - return Int64Dtype().__from_arrow__(result) - def _rank( self, *, @@ -488,15 +598,6 @@ class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" - def __init__(self, values) -> None: - _chk_pyarrow_available() - - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_large_string( - values.type - ): - values = pc.cast(values, pa.string()) - super().__init__(values) - @classmethod def _result_converter(cls, values, na=None): if not isna(na): @@ -569,44 +670,23 @@ def _convert_int_dtype(self, result): if isinstance(result, pa.Array): result = result.to_numpy(zero_copy_only=False) - elif not isinstance(result, np.ndarray): + else: result = result.to_numpy() if result.dtype == np.int32: result = result.astype(np.int64) return result - def _str_count(self, pat: str, flags: int = 0): - if flags: - return super()._str_count(pat, flags) - result = pc.count_substring_regex(self._pa_array, pat).to_numpy() - return self._convert_int_dtype(result) - - def _str_len(self): - result = pc.utf8_length(self._pa_array).to_numpy() - return self._convert_int_dtype(result) - - def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: - return super()._str_find(sub, start, end) - return self._convert_int_dtype(result.to_numpy()) - def _cmp_method(self, other, op): - result = super()._cmp_method(other, op) + try: + result = super()._cmp_method(other, op) + except pa.ArrowNotImplementedError: + return invalid_comparison(self, other, op) if op == operator.ne: return result.to_numpy(np.bool_, na_value=True) else: return result.to_numpy(np.bool_, na_value=False) - def value_counts(self, dropna: bool = True): + def value_counts(self, dropna: bool = True) -> Series: from pandas import Series result = super().value_counts(dropna) diff -Nru pandas-2.1.4+dfsg/pandas/core/arrays/timedeltas.py pandas-2.2.2+dfsg/pandas/core/arrays/timedeltas.py --- pandas-2.1.4+dfsg/pandas/core/arrays/timedeltas.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/arrays/timedeltas.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,7 +6,6 @@ TYPE_CHECKING, cast, ) -import warnings import numpy as np @@ -20,15 +19,12 @@ Tick, Timedelta, astype_overflowsafe, - get_supported_reso, - get_unit_from_dtype, + get_supported_dtype, iNaT, - is_supported_unit, - npy_unit_to_abbrev, + is_supported_dtype, periods_per_second, - to_offset, ) -from pandas._libs.tslibs.conversion import precision_from_unit +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized from pandas._libs.tslibs.fields import ( get_timedelta_days, get_timedelta_field, @@ -91,7 +87,7 @@ # error: Incompatible types in assignment ( # expression has type "ndarray[Any, dtype[signedinteger[_32Bit]]]", 
# variable has type "ndarray[Any, dtype[signedinteger[_64Bit]]] - result = get_timedelta_field(values, alias, reso=self._creso) # type: ignore[assignment] # noqa: E501 + result = get_timedelta_field(values, alias, reso=self._creso) # type: ignore[assignment] if self._hasna: result = self._maybe_mask_results( result, fill_value=None, convert="float64" @@ -136,7 +132,7 @@ Examples -------- - >>> pd.arrays.TimedeltaArray(pd.TimedeltaIndex(['1H', '2H'])) + >>> pd.arrays.TimedeltaArray._from_sequence(pd.TimedeltaIndex(['1h', '2h'])) ['0 days 01:00:00', '0 days 02:00:00'] Length: 2, dtype: timedelta64[ns] @@ -205,8 +201,10 @@ @classmethod def _validate_dtype(cls, values, dtype): # used in TimeLikeOps.__init__ - _validate_td64_dtype(values.dtype) dtype = _validate_td64_dtype(dtype) + _validate_td64_dtype(values.dtype) + if dtype != values.dtype: + raise ValueError("Values resolution does not match dtype.") return dtype # error: Signature of "_simple_new" incompatible with supertype "NDArrayBacked" @@ -233,9 +231,7 @@ if dtype: dtype = _validate_td64_dtype(dtype) - data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=None) - freq, _ = dtl.validate_inferred_freq(None, inferred_freq, False) - freq = cast("Tick | None", freq) + data, freq = sequence_to_td64ns(data, copy=copy, unit=None) if dtype is not None: data = astype_overflowsafe(data, dtype=dtype, copy=False) @@ -253,44 +249,26 @@ unit=None, ) -> Self: """ - A non-strict version of _from_sequence, called from TimedeltaIndex.__new__. + _from_sequence_not_strict but without responsibility for finding the + result's `freq`. """ if dtype: dtype = _validate_td64_dtype(dtype) assert unit not in ["Y", "y", "M"] # caller is responsible for checking - explicit_none = freq is None - freq = freq if freq is not lib.no_default else None - - freq, freq_infer = dtl.maybe_infer_freq(freq) - data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit) - freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer) - freq = cast("Tick | None", freq) - if explicit_none: - freq = None if dtype is not None: data = astype_overflowsafe(data, dtype=dtype, copy=False) - result = cls._simple_new(data, dtype=data.dtype, freq=freq) - - if inferred_freq is None and freq is not None: - # this condition precludes `freq_infer` - cls._validate_frequency(result, freq) - - elif freq_infer: - # Set _freq directly to bypass duplicative _validate_frequency - # check. - result._freq = to_offset(result.inferred_freq) + result = cls._simple_new(data, dtype=data.dtype, freq=inferred_freq) + result._maybe_pin_freq(freq, {}) return result - # Signature of "_generate_range" incompatible with supertype - # "DatetimeLikeArrayMixin" @classmethod - def _generate_range( # type: ignore[override] + def _generate_range( cls, start, end, periods, freq, closed=None, *, unit: str | None = None ) -> Self: periods = dtl.validate_periods(periods) @@ -370,7 +348,7 @@ return self.copy() return self - if is_supported_unit(get_unit_from_dtype(dtype)): + if is_supported_dtype(dtype): # unit conversion e.g. 
timedelta64[s] res_values = astype_overflowsafe(self._ndarray, dtype, copy=False) return type(self)._simple_new( @@ -471,7 +449,7 @@ from pandas.io.formats.format import get_format_timedelta64 # Relies on TimeDelta._repr_base - formatter = get_format_timedelta64(self._ndarray, na_rep) + formatter = get_format_timedelta64(self, na_rep) # equiv: np.array([formatter(x) for x in self._ndarray]) # but independent of dimension return np.frompyfunc(formatter, 1, 1)(self._ndarray) @@ -731,11 +709,13 @@ return type(self)._simple_new(-self._ndarray, dtype=self.dtype, freq=freq) def __pos__(self) -> TimedeltaArray: - return type(self)(self._ndarray.copy(), freq=self.freq) + return type(self)._simple_new( + self._ndarray.copy(), dtype=self.dtype, freq=self.freq + ) def __abs__(self) -> TimedeltaArray: # Note: freq is not preserved - return type(self)(np.abs(self._ndarray)) + return type(self)._simple_new(np.abs(self._ndarray), dtype=self.dtype) # ---------------------------------------------------------------- # Conversion Methods - Vectorized analogues of Timedelta methods @@ -854,7 +834,7 @@ -------- For Series: - >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='S')) + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='s')) >>> ser 0 0 days 00:00:01 1 0 days 00:00:02 @@ -868,7 +848,7 @@ For TimedeltaIndex: - >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='S') + >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='s') >>> tdelta_idx TimedeltaIndex(['0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03'], dtype='timedelta64[ns]', freq=None) @@ -888,7 +868,7 @@ -------- For Series: - >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='U')) + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='us')) >>> ser 0 0 days 00:00:00.000001 1 0 days 00:00:00.000002 @@ -902,7 +882,7 @@ For TimedeltaIndex: - >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='U') + >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='us') >>> tdelta_idx TimedeltaIndex(['0 days 00:00:00.000001', '0 days 00:00:00.000002', '0 days 00:00:00.000003'], @@ -923,7 +903,7 @@ -------- For Series: - >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='N')) + >>> ser = pd.Series(pd.to_timedelta([1, 2, 3], unit='ns')) >>> ser 0 0 days 00:00:00.000000001 1 0 days 00:00:00.000000002 @@ -937,7 +917,7 @@ For TimedeltaIndex: - >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='N') + >>> tdelta_idx = pd.to_timedelta([1, 2, 3], unit='ns') >>> tdelta_idx TimedeltaIndex(['0 days 00:00:00.000000001', '0 days 00:00:00.000000002', '0 days 00:00:00.000000003'], @@ -1075,32 +1055,16 @@ data = data._data else: mask = np.isnan(data) - # The next few lines are effectively a vectorized 'cast_from_unit' - m, p = precision_from_unit(unit or "ns") - with warnings.catch_warnings(): - # Suppress RuntimeWarning about All-NaN slice - warnings.filterwarnings( - "ignore", "invalid value encountered in cast", RuntimeWarning - ) - base = data.astype(np.int64) - frac = data - base - if p: - frac = np.round(frac, p) - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", "invalid value encountered in cast", RuntimeWarning - ) - data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") + + data = cast_from_unit_vectorized(data, unit or "ns") data[mask] = iNaT + data = data.view("m8[ns]") copy = False elif lib.is_np_dtype(data.dtype, "m"): - data_unit = get_unit_from_dtype(data.dtype) - if not is_supported_unit(data_unit): + if not is_supported_dtype(data.dtype): # cast to closest supported unit, i.e. 
s or ns - new_reso = get_supported_reso(data_unit) - new_unit = npy_unit_to_abbrev(new_reso) - new_dtype = np.dtype(f"m8[{new_unit}]") + new_dtype = get_supported_dtype(data.dtype) data = astype_overflowsafe(data, dtype=new_dtype, copy=False) copy = False @@ -1108,7 +1072,10 @@ # This includes datetime64-dtype, see GH#23539, GH#29794 raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]") - data = np.array(data, copy=copy) + if not copy: + data = np.asarray(data) + else: + data = np.array(data, copy=copy) assert data.dtype.kind == "m" assert data.dtype != "m8" # i.e. not unit-less @@ -1186,7 +1153,7 @@ higher level. """ # coerce Index to np.ndarray, converting string-dtype if necessary - values = np.array(data, dtype=np.object_, copy=False) + values = np.asarray(data, dtype=np.object_) result = array_to_timedelta64(values, unit=unit, errors=errors) return result.view("timedelta64[ns]") @@ -1202,11 +1169,9 @@ ) raise ValueError(msg) - if ( - not isinstance(dtype, np.dtype) - or dtype.kind != "m" - or not is_supported_unit(get_unit_from_dtype(dtype)) - ): - raise ValueError(f"dtype {dtype} cannot be converted to timedelta64[ns]") + if not lib.is_np_dtype(dtype, "m"): + raise ValueError(f"dtype '{dtype}' is invalid, should be np.timedelta64 dtype") + elif not is_supported_dtype(dtype): + raise ValueError("Supported timedelta64 resolutions are 's', 'ms', 'us', 'ns'") return dtype diff -Nru pandas-2.1.4+dfsg/pandas/core/base.py pandas-2.2.2+dfsg/pandas/core/base.py --- pandas-2.1.4+dfsg/pandas/core/base.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/base.py 2024-04-10 17:42:52.000000000 +0000 @@ -108,7 +108,7 @@ @property def _constructor(self): """ - Class constructor (for this class it's just `__class__`. + Class constructor (for this class it's just `__class__`). """ return type(self) @@ -485,8 +485,8 @@ types, this is the actual array. For NumPy native types, this is a thin (no copy) wrapper around :class:`numpy.ndarray`. - ``.array`` differs ``.values`` which may require converting the - data to a different form. + ``.array`` differs from ``.values``, which may require converting + the data to a different form. See Also -------- @@ -1310,12 +1310,10 @@ # This overload is needed so that the call to searchsorted in # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result - @overload - # The following ignore is also present in numpy/__init__.pyi - # Possibly a mypy bug?? 
# error: Overloaded function signatures 1 and 2 overlap with incompatible - # return types [misc] - def searchsorted( # type: ignore[misc] + # return types + @overload + def searchsorted( # type: ignore[overload-overlap] self, value: ScalarLike_co, side: Literal["left", "right"] = ..., @@ -1365,7 +1363,10 @@ @final def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: - return algorithms.duplicated(self._values, keep=keep) + arr = self._values + if isinstance(arr, ExtensionArray): + return arr.duplicated(keep=keep) + return algorithms.duplicated(arr, keep=keep) def _arith_method(self, other, op): res_name = ops.get_op_result_name(self, other) diff -Nru pandas-2.1.4+dfsg/pandas/core/common.py pandas-2.2.2+dfsg/pandas/core/common.py --- pandas-2.1.4+dfsg/pandas/core/common.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/common.py 2024-04-10 17:42:52.000000000 +0000 @@ -42,6 +42,7 @@ from pandas.core.dtypes.generic import ( ABCExtensionArray, ABCIndex, + ABCMultiIndex, ABCSeries, ) from pandas.core.dtypes.inference import iterable_not_string @@ -121,7 +122,9 @@ check_array_indexer : Check that `key` is a valid array to index, and convert to an ndarray. """ - if isinstance(key, (ABCSeries, np.ndarray, ABCIndex, ABCExtensionArray)): + if isinstance( + key, (ABCSeries, np.ndarray, ABCIndex, ABCExtensionArray) + ) and not isinstance(key, ABCMultiIndex): if key.dtype == np.object_: key_array = np.asarray(key) @@ -138,7 +141,7 @@ elif isinstance(key, list): # check if np.array(key).dtype would be bool if len(key) > 0: - if type(key) is not list: + if type(key) is not list: # noqa: E721 # GH#42461 cython will raise TypeError if we pass a subclass key = list(key) return lib.is_bool_list(key) @@ -230,6 +233,8 @@ values = list(values) elif isinstance(values, ABCIndex): return values._values + elif isinstance(values, ABCSeries): + return values._values if isinstance(values, list) and dtype in [np.object_, object]: return construct_1d_object_array_from_listlike(values) @@ -531,17 +536,24 @@ def temp_setattr( obj, attr: str, value, condition: bool = True ) -> Generator[None, None, None]: - """Temporarily set attribute on an object. + """ + Temporarily set attribute on an object. - Args: - obj: Object whose attribute will be modified. - attr: Attribute to modify. - value: Value to temporarily set attribute to. - condition: Whether to set the attribute. Provided in order to not have to - conditionally use this context manager. + Parameters + ---------- + obj : object + Object whose attribute will be modified. + attr : str + Attribute to modify. + value : Any + Value to temporarily set attribute to. + condition : bool, default True + Whether to set the attribute. Provided in order to not have to + conditionally use this context manager. - Yields: - obj with modified attribute. + Yields + ------ + object : obj with modified attribute. 
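The _duplicated change above lets Series/Index dispatch to ExtensionArray.duplicated, so masked dtypes handle NA without conversion. A doctest-style sketch assuming pandas 2.2 semantics (not taken from the patch):

>>> import pandas as pd
>>> pd.Series([1, 1, None, None], dtype="Int64").duplicated()
0    False
1     True
2    False
3     True
dtype: bool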
""" if condition: old_value = getattr(obj, attr) diff -Nru pandas-2.1.4+dfsg/pandas/core/computation/align.py pandas-2.2.2+dfsg/pandas/core/computation/align.py --- pandas-2.1.4+dfsg/pandas/core/computation/align.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/computation/align.py 2024-04-10 17:42:52.000000000 +0000 @@ -110,7 +110,7 @@ ax, itm = axis, items if not axes[ax].is_(itm): - axes[ax] = axes[ax].join(itm, how="outer") + axes[ax] = axes[ax].union(itm) for i, ndim in ndims.items(): for axis, items in zip(range(ndim), axes): diff -Nru pandas-2.1.4+dfsg/pandas/core/computation/check.py pandas-2.2.2+dfsg/pandas/core/computation/check.py --- pandas-2.1.4+dfsg/pandas/core/computation/check.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/computation/check.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,9 +4,5 @@ ne = import_optional_dependency("numexpr", errors="warn") NUMEXPR_INSTALLED = ne is not None -if NUMEXPR_INSTALLED: - NUMEXPR_VERSION = ne.__version__ -else: - NUMEXPR_VERSION = None -__all__ = ["NUMEXPR_INSTALLED", "NUMEXPR_VERSION"] +__all__ = ["NUMEXPR_INSTALLED"] diff -Nru pandas-2.1.4+dfsg/pandas/core/computation/eval.py pandas-2.2.2+dfsg/pandas/core/computation/eval.py --- pandas-2.1.4+dfsg/pandas/core/computation/eval.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/computation/eval.py 2024-04-10 17:42:52.000000000 +0000 @@ -388,14 +388,10 @@ # we will ignore numpy warnings here; e.g. if trying # to use a non-numeric indexer try: - with warnings.catch_warnings(record=True): - # TODO: Filter the warnings we actually care about here. - if inplace and isinstance(target, NDFrame): - target.loc[:, assigner] = ret - else: - target[ # pyright: ignore[reportGeneralTypeIssues] - assigner - ] = ret + if inplace and isinstance(target, NDFrame): + target.loc[:, assigner] = ret + else: + target[assigner] = ret # pyright: ignore[reportGeneralTypeIssues] except (TypeError, IndexError) as err: raise ValueError("Cannot assign expression output to target") from err diff -Nru pandas-2.1.4+dfsg/pandas/core/computation/expr.py pandas-2.2.2+dfsg/pandas/core/computation/expr.py --- pandas-2.1.4+dfsg/pandas/core/computation/expr.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/computation/expr.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,6 +12,7 @@ import tokenize from typing import ( Callable, + ClassVar, TypeVar, ) @@ -349,8 +350,8 @@ preparser : callable """ - const_type: type[Term] = Constant - term_type = Term + const_type: ClassVar[type[Term]] = Constant + term_type: ClassVar[type[Term]] = Term binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS binary_op_nodes = ( @@ -540,7 +541,7 @@ operand = self.visit(node.operand) return op(operand) - def visit_Name(self, node, **kwargs): + def visit_Name(self, node, **kwargs) -> Term: return self.term_type(node.id, self.env, **kwargs) # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min @@ -555,11 +556,11 @@ return self.const_type(node.value, self.env) # TODO(py314): deprecated since Python 3.8. 
Remove after Python 3.14 is min - def visit_Str(self, node, **kwargs): + def visit_Str(self, node, **kwargs) -> Term: name = self.env.add_tmp(node.s) return self.term_type(name, self.env) - def visit_List(self, node, **kwargs): + def visit_List(self, node, **kwargs) -> Term: name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts]) return self.term_type(name, self.env) @@ -569,7 +570,7 @@ """df.index[4]""" return self.visit(node.value) - def visit_Subscript(self, node, **kwargs): + def visit_Subscript(self, node, **kwargs) -> Term: from pandas import eval as pd_eval value = self.visit(node.value) @@ -589,7 +590,7 @@ name = self.env.add_tmp(v) return self.term_type(name, env=self.env) - def visit_Slice(self, node, **kwargs): + def visit_Slice(self, node, **kwargs) -> slice: """df.index[slice(4,6)]""" lower = node.lower if lower is not None: @@ -694,8 +695,8 @@ if not isinstance(key, ast.keyword): # error: "expr" has no attribute "id" raise ValueError( - "keyword error in function call " # type: ignore[attr-defined] - f"'{node.func.id}'" + "keyword error in function call " + f"'{node.func.id}'" # type: ignore[attr-defined] ) if key.arg: diff -Nru pandas-2.1.4+dfsg/pandas/core/computation/ops.py pandas-2.2.2+dfsg/pandas/core/computation/ops.py --- pandas-2.1.4+dfsg/pandas/core/computation/ops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/computation/ops.py 2024-04-10 17:42:52.000000000 +0000 @@ -617,5 +617,5 @@ self.name = name self.func = getattr(np, name) - def __call__(self, *args): + def __call__(self, *args) -> MathCall: return MathCall(self, args) diff -Nru pandas-2.1.4+dfsg/pandas/core/computation/pytables.py pandas-2.2.2+dfsg/pandas/core/computation/pytables.py --- pandas-2.1.4+dfsg/pandas/core/computation/pytables.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/computation/pytables.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,6 +10,7 @@ from typing import ( TYPE_CHECKING, Any, + ClassVar, ) import numpy as np @@ -40,7 +41,10 @@ ) if TYPE_CHECKING: - from pandas._typing import npt + from pandas._typing import ( + Self, + npt, + ) class PyTablesScope(_scope.Scope): @@ -213,7 +217,7 @@ kind = ensure_decoded(self.kind) meta = ensure_decoded(self.meta) - if kind in ("datetime64", "datetime"): + if kind == "datetime" or (kind and kind.startswith("datetime64")): if isinstance(v, (int, float)): v = stringify(v) v = ensure_decoded(v) @@ -283,7 +287,7 @@ return "Filter: Not Initialized" return pprint_thing(f"[Filter : [{self.filter[0]}] -> [{self.filter[1]}]") - def invert(self): + def invert(self) -> Self: """invert the filter""" if self.filter is not None: self.filter = ( @@ -297,7 +301,8 @@ """return the actual filter format""" return [self.filter] - def evaluate(self): + # error: Signature of "evaluate" incompatible with supertype "BinOp" + def evaluate(self) -> Self | None: # type: ignore[override] if not self.is_valid: raise ValueError(f"query term is not valid [{self}]") @@ -336,7 +341,8 @@ def format(self): raise NotImplementedError("unable to collapse Joint Filters") - def evaluate(self): + # error: Signature of "evaluate" incompatible with supertype "BinOp" + def evaluate(self) -> Self: # type: ignore[override] return self @@ -357,7 +363,8 @@ """return the actual ne format""" return self.condition - def evaluate(self): + # error: Signature of "evaluate" incompatible with supertype "BinOp" + def evaluate(self) -> Self | None: # type: ignore[override] if not self.is_valid: raise ValueError(f"query term is not valid 
[{self}]") @@ -385,7 +392,8 @@ class JointConditionBinOp(ConditionBinOp): - def evaluate(self): + # error: Signature of "evaluate" incompatible with supertype "BinOp" + def evaluate(self) -> Self: # type: ignore[override] self.condition = f"({self.lhs.condition} {self.op} {self.rhs.condition})" return self @@ -410,8 +418,8 @@ class PyTablesExprVisitor(BaseExprVisitor): - const_type = Constant - term_type = Term + const_type: ClassVar[type[ops.Term]] = Constant + term_type: ClassVar[type[Term]] = Term def __init__(self, env, engine, parser, **kwargs) -> None: super().__init__(env, engine, parser) @@ -423,13 +431,15 @@ lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs), ) - def visit_UnaryOp(self, node, **kwargs): + def visit_UnaryOp(self, node, **kwargs) -> ops.Term | UnaryOp | None: if isinstance(node.op, (ast.Not, ast.Invert)): return UnaryOp("~", self.visit(node.operand)) elif isinstance(node.op, ast.USub): return self.const_type(-self.visit(node.operand).value, self.env) elif isinstance(node.op, ast.UAdd): raise NotImplementedError("Unary addition not supported") + # TODO: return None might never be reached + return None def visit_Index(self, node, **kwargs): return self.visit(node.value).value @@ -440,7 +450,7 @@ ) return self.visit(cmpr) - def visit_Subscript(self, node, **kwargs): + def visit_Subscript(self, node, **kwargs) -> ops.Term: # only allow simple subscripts value = self.visit(node.value) diff -Nru pandas-2.1.4+dfsg/pandas/core/config_init.py pandas-2.2.2+dfsg/pandas/core/config_init.py --- pandas-2.1.4+dfsg/pandas/core/config_init.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/config_init.py 2024-04-10 17:42:52.000000000 +0000 @@ -265,7 +265,7 @@ """ pc_max_info_rows_doc = """ -: int or None +: int df.info() will usually show null-counts for each column. For large frames this can be quite slow. max_info_rows and max_info_cols limit this null check only to frames with smaller dimensions than @@ -322,7 +322,7 @@ "max_info_rows", 1690785, pc_max_info_rows_doc, - validator=is_instance_factory((int, type(None))), + validator=is_int, ) cf.register_option("max_rows", 60, pc_max_rows_doc, validator=is_nonnegative_int) cf.register_option( @@ -420,6 +420,7 @@ def use_inf_as_na_cb(key) -> None: + # TODO(3.0): enforcing this deprecation will close GH#52501 from pandas.core.dtypes.missing import _use_inf_as_na _use_inf_as_na(key) @@ -453,6 +454,13 @@ validator=is_one_of_factory(["block", "array"]), ) +cf.deprecate_option( + # GH#55043 + "mode.data_manager", + "data_manager option is deprecated and will be removed in a future " + "version. Only the BlockManager will be available.", +) + # TODO better name? copy_on_write_doc = """ @@ -468,9 +476,11 @@ "copy_on_write", # Get the default from an environment variable, if set, otherwise defaults # to False. This environment variable can be set for testing. - os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1", + "warn" + if os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "warn" + else os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1", copy_on_write_doc, - validator=is_bool, + validator=is_one_of_factory([True, False, "warn"]), ) @@ -512,11 +522,11 @@ auto, {others}. 
""" -_xls_options = ["xlrd"] -_xlsm_options = ["xlrd", "openpyxl"] -_xlsx_options = ["xlrd", "openpyxl"] -_ods_options = ["odf"] -_xlsb_options = ["pyxlsb"] +_xls_options = ["xlrd", "calamine"] +_xlsm_options = ["xlrd", "openpyxl", "calamine"] +_xlsx_options = ["xlrd", "openpyxl", "calamine"] +_ods_options = ["odf", "calamine"] +_xlsb_options = ["pyxlsb", "calamine"] with cf.config_prefix("io.excel.xls"): @@ -901,3 +911,14 @@ "(at which point this option will be deprecated).", validator=is_one_of_factory([True, False]), ) + + cf.register_option( + "no_silent_downcasting", + False, + "Whether to opt-in to the future behavior which will *not* silently " + "downcast results from Series and DataFrame `where`, `mask`, and `clip` " + "methods. " + "Silent downcasting will be removed in pandas 3.0 " + "(at which point this option will be deprecated).", + validator=is_one_of_factory([True, False]), + ) diff -Nru pandas-2.1.4+dfsg/pandas/core/construction.py pandas-2.2.2+dfsg/pandas/core/construction.py --- pandas-2.1.4+dfsg/pandas/core/construction.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/construction.py 2024-04-10 17:42:52.000000000 +0000 @@ -24,8 +24,8 @@ from pandas._libs import lib from pandas._libs.tslibs import ( Period, - get_unit_from_dtype, - is_supported_unit, + get_supported_dtype, + is_supported_dtype, ) from pandas._typing import ( AnyArrayLike, @@ -127,12 +127,6 @@ ``pd.options.mode.string_storage`` if the dtype is not explicitly given. For all other cases, NumPy's usual inference rules will be used. - - .. versionchanged:: 1.2.0 - - Pandas now also infers nullable-floating dtype for float-like - input data - copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require @@ -204,7 +198,7 @@ ['2015-01-01 00:00:00', '2016-01-01 00:00:00'] Length: 2, dtype: datetime64[ns] - >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]') + >>> pd.array(["1h", "2h"], dtype='timedelta64[ns]') ['0 days 01:00:00', '0 days 02:00:00'] Length: 2, dtype: timedelta64[ns] @@ -349,7 +343,9 @@ elif inferred_dtype == "string": # StringArray/ArrowStringArray depending on pd.options.mode.string_storage - return StringDtype().construct_array_type()._from_sequence(data, copy=copy) + dtype = StringDtype() + cls = dtype.construct_array_type() + return cls._from_sequence(data, dtype=dtype, copy=copy) elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) @@ -364,15 +360,15 @@ return FloatingArray._from_sequence(data, copy=copy) elif inferred_dtype == "boolean": - return BooleanArray._from_sequence(data, copy=copy) + return BooleanArray._from_sequence(data, dtype="boolean", copy=copy) # Pandas overrides NumPy for # 1. datetime64[ns,us,ms,s] # 2. timedelta64[ns,us,ms,s] # so that a DatetimeArray is returned. 
- if lib.is_np_dtype(dtype, "M") and is_supported_unit(get_unit_from_dtype(dtype)): + if lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype): return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy) - if lib.is_np_dtype(dtype, "m") and is_supported_unit(get_unit_from_dtype(dtype)): + if lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype): return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy) elif lib.is_np_dtype(dtype, "mM"): @@ -490,12 +486,14 @@ if arr.dtype.kind == "M": from pandas.core.arrays import DatetimeArray - return DatetimeArray._from_sequence(arr) + dtype = get_supported_dtype(arr.dtype) + return DatetimeArray._from_sequence(arr, dtype=dtype) elif arr.dtype.kind == "m": from pandas.core.arrays import TimedeltaArray - return TimedeltaArray._from_sequence(arr) + dtype = get_supported_dtype(arr.dtype) + return TimedeltaArray._from_sequence(arr, dtype=dtype) return arr @@ -628,7 +626,10 @@ elif hasattr(data, "__array__"): # e.g. dask array GH#38645 - data = np.array(data, copy=copy) + if not copy: + data = np.asarray(data) + else: + data = np.array(data, copy=copy) return sanitize_array( data, index=index, @@ -746,8 +747,11 @@ # GH#19853: If data is a scalar, result has already the result if not lib.is_scalar(data): if not np.all(isna(data)): - data = np.array(data, dtype=dtype, copy=False) - result = np.array(data, dtype=object, copy=copy) + data = np.asarray(data, dtype=dtype) + if not copy: + result = np.asarray(data, dtype=object) + else: + result = np.array(data, dtype=object, copy=copy) return result @@ -812,6 +816,8 @@ # this will raise if we have e.g. floats subarr = maybe_cast_to_integer_array(arr, dtype) + elif not copy: + subarr = np.asarray(arr, dtype=dtype) else: subarr = np.array(arr, dtype=dtype, copy=copy) diff -Nru pandas-2.1.4+dfsg/pandas/core/dtypes/astype.py pandas-2.2.2+dfsg/pandas/core/dtypes/astype.py --- pandas-2.1.4+dfsg/pandas/core/dtypes/astype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/dtypes/astype.py 2024-04-10 17:42:52.000000000 +0000 @@ -105,11 +105,10 @@ # then coerce to datetime64[ns] and use DatetimeArray.astype if lib.is_np_dtype(dtype, "M"): - from pandas import to_datetime + from pandas.core.arrays import DatetimeArray - dti = to_datetime(arr.ravel()) - dta = dti._data.reshape(arr.shape) - return dta.astype(dtype, copy=False)._ndarray + dta = DatetimeArray._from_sequence(arr, dtype=dtype) + return dta._ndarray elif lib.is_np_dtype(dtype, "m"): from pandas.core.construction import ensure_wrapped_if_datetimelike diff -Nru pandas-2.1.4+dfsg/pandas/core/dtypes/base.py pandas-2.2.2+dfsg/pandas/core/dtypes/base.py --- pandas-2.1.4+dfsg/pandas/core/dtypes/base.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/dtypes/base.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,6 +15,7 @@ from pandas._libs import missing as libmissing from pandas._libs.hashtable import object_hash +from pandas._libs.properties import cache_readonly from pandas.errors import AbstractMethodError from pandas.core.dtypes.generic import ( @@ -32,6 +33,7 @@ type_t, ) + from pandas import Index from pandas.core.arrays import ExtensionArray # To parameterize on same ExtensionDtype @@ -110,7 +112,7 @@ def __str__(self) -> str: return self.name - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: object) -> bool: """ Check whether 'other' is equal to self. 
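The construction.py hunks above swap the old unit-based checks (``is_supported_unit`` plus ``get_unit_from_dtype``) for ``is_supported_dtype`` and pass an explicit supported dtype into ``DatetimeArray._from_sequence`` / ``TimedeltaArray._from_sequence``. A minimal sketch of the user-visible behaviour this supports, assuming pandas 2.x with NumPy installed (illustrative only, not the internal code path):

    import numpy as np
    import pandas as pd

    # Second-resolution input with a matching requested dtype: the branch
    # changed above accepts any supported resolution (s/ms/us/ns) and
    # returns a DatetimeArray that keeps that unit instead of forcing "ns".
    values = np.array(["2024-01-01", "2024-01-02"], dtype="datetime64[s]")
    arr = pd.array(values, dtype="datetime64[s]")
    print(type(arr).__name__, arr.dtype)  # DatetimeArray datetime64[s]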
@@ -144,7 +146,7 @@ # we need to avoid that and thus use hash function with old behavior return object_hash(tuple(getattr(self, attr) for attr in self._metadata)) - def __ne__(self, other: Any) -> bool: + def __ne__(self, other: object) -> bool: return not self.__eq__(other) @property @@ -240,7 +242,7 @@ This is useful mainly for data types that accept parameters. For example, a period dtype accepts a frequency parameter that - can be set as ``period[H]`` (where H means hourly frequency). + can be set as ``period[h]`` (where H means hourly frequency). By default, in the abstract class, just the name of the type is expected. But subclasses can overwrite this method to accept @@ -406,6 +408,43 @@ """ return False + @cache_readonly + def index_class(self) -> type_t[Index]: + """ + The Index subclass to return from Index.__new__ when this dtype is + encountered. + """ + from pandas import Index + + return Index + + @property + def _supports_2d(self) -> bool: + """ + Do ExtensionArrays with this dtype support 2D arrays? + + Historically ExtensionArrays were limited to 1D. By returning True here, + authors can indicate that their arrays support 2D instances. This can + improve performance in some cases, particularly operations with `axis=1`. + + Arrays that support 2D values should: + + - implement Array.reshape + - subclass the Dim2CompatTests in tests.extension.base + - _concat_same_type should support `axis` keyword + - _reduce and reductions should support `axis` keyword + """ + return False + + @property + def _can_fast_transpose(self) -> bool: + """ + Is transposing an array with this dtype zero-copy? + + Only relevant for cases where _supports_2d is True. + """ + return False + class StorageExtensionDtype(ExtensionDtype): """ExtensionDtype that may be backed by more than one implementation.""" @@ -422,7 +461,7 @@ def __str__(self) -> str: return self.name - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: object) -> bool: if isinstance(other, str) and other == self.name: return True return super().__eq__(other) diff -Nru pandas-2.1.4+dfsg/pandas/core/dtypes/cast.py pandas-2.2.2+dfsg/pandas/core/dtypes/cast.py --- pandas-2.1.4+dfsg/pandas/core/dtypes/cast.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/dtypes/cast.py 2024-04-10 17:42:52.000000000 +0000 @@ -20,7 +20,11 @@ from pandas._config import using_pyarrow_string_dtype -from pandas._libs import lib +from pandas._libs import ( + Interval, + Period, + lib, +) from pandas._libs.missing import ( NA, NAType, @@ -32,10 +36,10 @@ OutOfBoundsTimedelta, Timedelta, Timestamp, - get_unit_from_dtype, - is_supported_unit, + is_supported_dtype, ) from pandas._libs.tslibs.timedeltas import array_to_timedelta64 +from pandas.compat.numpy import np_version_gt2 from pandas.errors import ( IntCastingNaNError, LossySetitemError, @@ -68,6 +72,7 @@ PeriodDtype, ) from pandas.core.dtypes.generic import ( + ABCExtensionArray, ABCIndex, ABCSeries, ) @@ -256,6 +261,8 @@ try to cast to the specified dtype (e.g. 
convert back to bool/int or could be an astype of float64->float32 """ + if isinstance(result, ABCSeries): + result = result._values do_round = False if isinstance(dtype, str): @@ -356,15 +363,11 @@ # if we don't have any elements, just astype it return trans(result).astype(dtype) - # do a test on the first element, if it fails then we are done - r = result.ravel() - arr = np.array([r[0]]) - - if isna(arr).any(): - # if we have any nulls, then we are done - return result - - elif not isinstance(r[0], (np.integer, np.floating, int, float, bool)): + if isinstance(result, np.ndarray): + element = result.item(0) + else: + element = result.iloc[0] + if not isinstance(element, (np.integer, np.floating, int, float, bool)): # a comparable, e.g. a Decimal may slip in here return result @@ -463,16 +466,11 @@ """ if isinstance(dtype, ExtensionDtype): - if not isinstance(dtype, (CategoricalDtype, DatetimeTZDtype)): - # TODO: avoid this special-casing - # We have to special case categorical so as not to upcast - # things like counts back to categorical - - cls = dtype.construct_array_type() - if same_dtype: - result = _maybe_cast_to_extension_array(cls, result, dtype=dtype) - else: - result = _maybe_cast_to_extension_array(cls, result) + cls = dtype.construct_array_type() + if same_dtype: + result = _maybe_cast_to_extension_array(cls, result, dtype=dtype) + else: + result = _maybe_cast_to_extension_array(cls, result) elif (numeric_only and dtype.kind in "iufcb") or not numeric_only: result = maybe_downcast_to_dtype(result, dtype) @@ -497,11 +495,14 @@ ------- ExtensionArray or obj """ - from pandas.core.arrays.string_ import BaseStringArray + result: ArrayLike - # Everything can be converted to StringArrays, but we may not want to convert - if issubclass(cls, BaseStringArray) and lib.infer_dtype(obj) != "string": - return obj + if dtype is not None: + try: + result = cls._from_scalars(obj, dtype=dtype) + except (TypeError, ValueError): + return obj + return result try: result = cls._from_sequence(obj, dtype=dtype) @@ -851,9 +852,9 @@ elif is_complex(val): dtype = np.dtype(np.complex128) - if lib.is_period(val): + if isinstance(val, Period): dtype = PeriodDtype(freq=val.freq) - elif lib.is_interval(val): + elif isinstance(val, Interval): subtype = infer_dtype_from_scalar(val.left)[0] dtype = IntervalDtype(subtype=subtype, closed=val.closed) @@ -1132,7 +1133,16 @@ base_dtype = np.dtype(str) else: base_dtype = inferred_dtype - pa_type = to_pyarrow_type(base_dtype) + if ( + base_dtype.kind == "O" # type: ignore[union-attr] + and input_array.size > 0 + and isna(input_array).all() + ): + import pyarrow as pa + + pa_type = pa.null() + else: + pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: inferred_dtype = ArrowDtype(pa_type) elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): @@ -1256,8 +1266,7 @@ pass elif dtype.kind in "mM": - reso = get_unit_from_dtype(dtype) - if not is_supported_unit(reso): + if not is_supported_dtype(dtype): # pre-2.0 we would silently swap in nanos for lower-resolutions, # raise for above-nano resolutions if dtype.name in ["datetime64", "timedelta64"]: @@ -1306,6 +1315,30 @@ # which will make us upcast too far. 
if lib.is_float(right) and right.is_integer() and left_dtype.kind != "f": right = int(right) + # After NEP 50, numpy won't inspect Python scalars + # TODO: do we need to recreate numpy's inspection logic for floats too + # (this breaks some tests) + if isinstance(right, int) and not isinstance(right, np.integer): + # This gives an unsigned type by default + # (if our number is positive) + + # If our left dtype is signed, we might not want this since + # this might give us 1 dtype too big + # We should check if the corresponding int dtype (e.g. int64 for uint64) + # can hold the number + right_dtype = np.min_scalar_type(right) + if right == 0: + # Special case 0 + right = left_dtype + elif ( + not np.issubdtype(left_dtype, np.unsignedinteger) + and 0 < right <= np.iinfo(right_dtype).max + ): + # If left dtype isn't unsigned, check if it fits in the signed dtype + right = np.dtype(f"i{right_dtype.itemsize}") + else: + right = right_dtype + new_dtype = np.result_type(left_dtype, right) elif is_valid_na_for_dtype(right, left_dtype): @@ -1468,7 +1501,10 @@ # Attempt to coerce to a numpy array try: - arr = np.array(value, dtype=dtype, copy=copy) + if not copy: + arr = np.asarray(value, dtype=dtype) + else: + arr = np.array(value, dtype=dtype, copy=copy) except (ValueError, TypeError) as err: raise TypeError( f"DataFrame constructor called with incompatible data and dtype: {err}" @@ -1611,12 +1647,14 @@ with warnings.catch_warnings(): # We already disallow dtype=uint w/ negative numbers # (test_constructor_coercion_signed_to_unsigned) so safe to ignore. - warnings.filterwarnings( - "ignore", - "NumPy will stop allowing conversion of out-of-bound Python int", - DeprecationWarning, - ) - casted = np.array(arr, dtype=dtype, copy=False) + if not np_version_gt2: + warnings.filterwarnings( + "ignore", + "NumPy will stop allowing conversion of " + "out-of-bound Python int", + DeprecationWarning, + ) + casted = np.asarray(arr, dtype=dtype) else: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=RuntimeWarning) @@ -1647,11 +1685,13 @@ arr = np.asarray(arr) if np.issubdtype(arr.dtype, str): + # TODO(numpy-2.0 min): This case will raise an OverflowError above if (casted.astype(str) == arr).all(): return casted raise ValueError(f"string values cannot be losslessly cast to {dtype}") if dtype.kind == "u" and (arr < 0).any(): + # TODO: can this be hit anymore after numpy 2.0? raise OverflowError("Trying to coerce negative values to unsigned integers") if arr.dtype.kind == "f": @@ -1664,6 +1704,7 @@ raise ValueError("Trying to coerce float values to integers") if casted.dtype < arr.dtype: + # TODO: Can this path be hit anymore with numpy > 2 # GH#41734 e.g. [1, 200, 923442] and dtype="int8" -> overflows raise ValueError( f"Values are too large to be losslessly converted to {dtype}. " @@ -1706,8 +1747,6 @@ arr._validate_setitem_value(element) return True except (ValueError, TypeError): - # TODO: re-use _catch_deprecated_value_error to ensure we are - # strict about what exceptions we allow through here. return False # This is technically incorrect, but maintains the behavior of @@ -1774,6 +1813,23 @@ return casted raise LossySetitemError + elif isinstance(element, ABCExtensionArray) and isinstance( + element.dtype, CategoricalDtype + ): + # GH#52927 setting Categorical value into non-EA frame + # TODO: general-case for EAs? 
+ try: + casted = element.astype(dtype) + except (ValueError, TypeError): + raise LossySetitemError + # Check for cases of either + # a) lossy overflow/rounding or + # b) semantic changes like dt64->int64 + comp = casted == element + if not comp.all(): + raise LossySetitemError + return casted + # Anything other than integer we cannot hold raise LossySetitemError if ( @@ -1793,7 +1849,8 @@ if not isinstance(tipo, np.dtype): # i.e. nullable IntegerDtype; we can put this into an ndarray # losslessly iff it has no NAs - if element._hasna: + arr = element._values if isinstance(element, ABCSeries) else element + if arr._hasna: raise LossySetitemError return element diff -Nru pandas-2.1.4+dfsg/pandas/core/dtypes/common.py pandas-2.2.2+dfsg/pandas/core/dtypes/common.py --- pandas-2.1.4+dfsg/pandas/core/dtypes/common.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/dtypes/common.py 2024-04-10 17:42:52.000000000 +0000 @@ -169,6 +169,9 @@ """ Check whether an array-like is a 1-D pandas sparse array. + .. deprecated:: 2.1.0 + Use isinstance(dtype, pd.SparseDtype) instead. + Check that the one-dimensional array-like is a pandas sparse array. Returns True if it is a pandas sparse array, not another type of sparse array. @@ -295,6 +298,9 @@ """ Check whether an array-like or dtype is of a DatetimeTZDtype dtype. + .. deprecated:: 2.1.0 + Use isinstance(dtype, pd.DatetimeTZDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -381,6 +387,9 @@ """ Check whether an array-like or dtype is of the Period dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.Period) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -402,7 +411,7 @@ False >>> is_period_dtype(pd.Period("2017-01-01")) False - >>> is_period_dtype(pd.PeriodIndex([], freq="A")) + >>> is_period_dtype(pd.PeriodIndex([], freq="Y")) True """ warnings.warn( @@ -424,6 +433,9 @@ """ Check whether an array-like or dtype is of the Interval dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.IntervalDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -470,6 +482,9 @@ """ Check whether an array-like or dtype is of the Categorical dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.CategoricalDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -1256,13 +1271,7 @@ """ Analogue to is_extension_array_dtype but excluding DatetimeTZDtype. """ - # Note: if other EA dtypes are ever held in HybridBlock, exclude those - # here too. 
- # NB: need to check DatetimeTZDtype and not is_datetime64tz_dtype - # to exclude ArrowTimestampUSDtype - return isinstance(dtype, ExtensionDtype) and not isinstance( - dtype, (DatetimeTZDtype, PeriodDtype) - ) + return isinstance(dtype, ExtensionDtype) and not dtype._supports_2d def is_extension_array_dtype(arr_or_dtype) -> bool: @@ -1670,9 +1679,12 @@ dtype = value.dtype if isinstance(dtype, np.dtype): - return dtype == np.dtype("object") and lib.is_string_array( - np.asarray(value), skipna=False - ) + if len(value) == 0: + return dtype == np.dtype("object") + else: + return dtype == np.dtype("object") and lib.is_string_array( + np.asarray(value), skipna=False + ) elif isinstance(dtype, CategoricalDtype): return dtype.categories.inferred_type == "string" return dtype == "string" diff -Nru pandas-2.1.4+dfsg/pandas/core/dtypes/concat.py pandas-2.2.2+dfsg/pandas/core/dtypes/concat.py --- pandas-2.1.4+dfsg/pandas/core/dtypes/concat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/dtypes/concat.py 2024-04-10 17:42:52.000000000 +0000 @@ -129,7 +129,16 @@ # i.e. isinstance(to_concat[0], ExtensionArray) to_concat_eas = cast("Sequence[ExtensionArray]", to_concat) cls = type(to_concat[0]) - return cls._concat_same_type(to_concat_eas) + # GH#53640: eg. for datetime array, axis=1 but 0 is default + # However, class method `_concat_same_type()` for some classes + # may not support the `axis` keyword + if ea_compat_axis or axis == 0: + return cls._concat_same_type(to_concat_eas) + else: + return cls._concat_same_type( + to_concat_eas, + axis=axis, # type: ignore[call-arg] + ) else: to_concat_arrs = cast("Sequence[np.ndarray]", to_concat) result = np.concatenate(to_concat_arrs, axis=axis) diff -Nru pandas-2.1.4+dfsg/pandas/core/dtypes/dtypes.py pandas-2.2.2+dfsg/pandas/core/dtypes/dtypes.py --- pandas-2.1.4+dfsg/pandas/core/dtypes/dtypes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/dtypes/dtypes.py 2024-04-10 17:42:52.000000000 +0000 @@ -43,7 +43,7 @@ abbrev_to_npy_unit, ) from pandas._libs.tslibs.offsets import BDay -from pandas.compat import pa_version_under7p0 +from pandas.compat import pa_version_under10p1 from pandas.errors import PerformanceWarning from pandas.util._exceptions import find_stack_level @@ -55,33 +55,41 @@ from pandas.core.dtypes.generic import ( ABCCategoricalIndex, ABCIndex, + ABCRangeIndex, ) from pandas.core.dtypes.inference import ( is_bool, is_list_like, ) -if not pa_version_under7p0: +from pandas.util import capitalize_first_letter + +if not pa_version_under10p1: import pyarrow as pa if TYPE_CHECKING: from collections.abc import MutableMapping from datetime import tzinfo - import pyarrow as pa # noqa: F811, TCH004 + import pyarrow as pa # noqa: TCH004 from pandas._typing import ( Dtype, DtypeObj, IntervalClosedType, Ordered, + Self, npt, type_t, ) from pandas import ( Categorical, + CategoricalIndex, + DatetimeIndex, Index, + IntervalIndex, + PeriodIndex, ) from pandas.core.arrays import ( BaseMaskedArray, @@ -206,6 +214,8 @@ base = np.dtype("O") _metadata = ("categories", "ordered") _cache_dtypes: dict[str_type, PandasExtensionDtype] = {} + _supports_2d = False + _can_fast_transpose = False def __init__(self, categories=None, ordered: Ordered = False) -> None: self._finalize(categories, ordered, fastpath=False) @@ -388,7 +398,7 @@ # We *do* want to include the real self.ordered here return int(self._hash_categories) - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: object) -> bool: """ Rules for CDT 
equality: 1) Any CDT is equal to the string 'category' @@ -447,7 +457,7 @@ # With object-dtype we need a comparison that identifies # e.g. int(2) as distinct from float(2) - return hash(self) == hash(other) + return set(left) == set(right) def __repr__(self) -> str_type: if self.categories is None: @@ -455,8 +465,7 @@ dtype = "None" else: data = self.categories._format_data(name=type(self).__name__) - if data is None: - # self.categories is RangeIndex + if isinstance(self.categories, ABCRangeIndex): data = str(self.categories._range) data = data.rstrip(", ") dtype = self.categories.dtype @@ -671,6 +680,12 @@ return find_common_type(non_cat_dtypes) + @cache_readonly + def index_class(self) -> type_t[CategoricalIndex]: + from pandas import CategoricalIndex + + return CategoricalIndex + @register_extension_dtype class DatetimeTZDtype(PandasExtensionDtype): @@ -717,6 +732,8 @@ _metadata = ("unit", "tz") _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") _cache_dtypes: dict[str_type, PandasExtensionDtype] = {} + _supports_2d = True + _can_fast_transpose = True @property def na_value(self) -> NaTType: @@ -860,7 +877,7 @@ # TODO: update this. return hash(str(self)) - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: object) -> bool: if isinstance(other, str): if other.startswith("M8["): other = f"datetime64[{other[3:]}" @@ -902,7 +919,7 @@ else: np_arr = array.to_numpy() - return DatetimeArray(np_arr, dtype=self, copy=False) + return DatetimeArray._simple_new(np_arr, dtype=self) def __setstate__(self, state) -> None: # for pickle compat. __get_state__ is defined in the @@ -911,6 +928,19 @@ self._tz = state["tz"] self._unit = state["unit"] + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + if all(isinstance(t, DatetimeTZDtype) and t.tz == self.tz for t in dtypes): + np_dtype = np.max([cast(DatetimeTZDtype, t).base for t in [self, *dtypes]]) + unit = np.datetime_data(np_dtype)[0] + return type(self)(unit=unit, tz=self.tz) + return super()._get_common_dtype(dtypes) + + @cache_readonly + def index_class(self) -> type_t[DatetimeIndex]: + from pandas import DatetimeIndex + + return DatetimeIndex + @register_extension_dtype class PeriodDtype(PeriodDtypeBase, PandasExtensionDtype): @@ -954,8 +984,10 @@ _cache_dtypes: dict[BaseOffset, int] = {} # type: ignore[assignment] __hash__ = PeriodDtypeBase.__hash__ _freq: BaseOffset + _supports_2d = True + _can_fast_transpose = True - def __new__(cls, freq): + def __new__(cls, freq) -> PeriodDtype: # noqa: PYI034 """ Parameters ---------- @@ -969,6 +1001,7 @@ if isinstance(freq, BDay): # GH#53446 + # TODO(3.0): enforcing this will close GH#10575 warnings.warn( "PeriodDtype[B] is deprecated and will be removed in a future " "version. Use a DatetimeIndex with freq='B' instead", @@ -985,11 +1018,11 @@ u._freq = freq return u - def __reduce__(self): + def __reduce__(self) -> tuple[type_t[Self], tuple[str_type]]: return type(self), (self.name,) @property - def freq(self): + def freq(self) -> BaseOffset: """ The frequency object of this PeriodDtype. 
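The ``PeriodDtype`` hunks above type the ``freq`` property as a ``BaseOffset``, parse string aliases with ``to_offset(freq, is_period=True)``, and relax ``__eq__`` so the alias with only its first letter capitalised also matches. A short usage sketch, assuming pandas 2.2 behaviour:

    import pandas as pd

    dtype = pd.PeriodDtype(freq="M")
    print(dtype.name)            # period[M]
    print(dtype == "period[M]")  # True
    print(dtype == "Period[M]")  # True, first letter capitalised
    print(dtype.freq)            # <MonthEnd> offset object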
@@ -1009,7 +1042,7 @@ if m is not None: freq = m.group("freq") - freq_offset = to_offset(freq) + freq_offset = to_offset(freq, is_period=True) if freq_offset is not None: return freq_offset @@ -1052,13 +1085,13 @@ def na_value(self) -> NaTType: return NaT - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: object) -> bool: if isinstance(other, str): - return other in [self.name, self.name.title()] + return other in [self.name, capitalize_first_letter(self.name)] return super().__eq__(other) - def __ne__(self, other: Any) -> bool: + def __ne__(self, other: object) -> bool: return not self.__eq__(other) @classmethod @@ -1121,6 +1154,12 @@ return PeriodArray(np.array([], dtype="int64"), dtype=self, copy=False) return PeriodArray._concat_same_type(results) + @cache_readonly + def index_class(self) -> type_t[PeriodIndex]: + from pandas import PeriodIndex + + return PeriodIndex + @register_extension_dtype class IntervalDtype(PandasExtensionDtype): @@ -1301,7 +1340,7 @@ # make myself hashable return hash(str(self)) - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: object) -> bool: if isinstance(other, str): return other.lower() in (self.name.lower(), str(self).lower()) elif not isinstance(other, IntervalDtype): @@ -1384,6 +1423,12 @@ return np.dtype(object) return IntervalDtype(common, closed=closed) + @cache_readonly + def index_class(self) -> type_t[IntervalIndex]: + from pandas import IntervalIndex + + return IntervalIndex + class NumpyEADtype(ExtensionDtype): """ @@ -1403,10 +1448,12 @@ """ _metadata = ("_dtype",) + _supports_2d = False + _can_fast_transpose = False def __init__(self, dtype: npt.DTypeLike | NumpyEADtype | None) -> None: if isinstance(dtype, NumpyEADtype): - # make constructor univalent + # make constructor idempotent dtype = dtype.numpy_dtype self._dtype = np.dtype(dtype) @@ -1488,7 +1535,6 @@ Base class for dtypes for BaseMaskedArray subclasses. """ - name: str base = None type: type @@ -1647,7 +1693,7 @@ # __eq__, so we explicitly do it here. return super().__hash__() - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: object) -> bool: # We have to override __eq__ to handle NA values in _metadata. # The base class does simple == checks, which fail for NA. if isinstance(other, str): @@ -1699,7 +1745,7 @@ """ return self._fill_value - def _check_fill_value(self): + def _check_fill_value(self) -> None: if not lib.is_scalar(self._fill_value): raise ValueError( f"fill_value must be a scalar. 
Got {self._fill_value} instead" @@ -2046,8 +2092,8 @@ def __init__(self, pyarrow_dtype: pa.DataType) -> None: super().__init__("pyarrow") - if pa_version_under7p0: - raise ImportError("pyarrow>=7.0.0 is required for ArrowDtype") + if pa_version_under10p1: + raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") if not isinstance(pyarrow_dtype, pa.DataType): raise ValueError( f"pyarrow_dtype ({pyarrow_dtype}) must be an instance " @@ -2062,7 +2108,7 @@ # make myself hashable return hash(str(self)) - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: object) -> bool: if not isinstance(other, type(self)): return super().__eq__(other) return self.pyarrow_dtype == other.pyarrow_dtype @@ -2144,7 +2190,9 @@ # This can be removed if/when pyarrow addresses it: # https://github.com/apache/arrow/issues/34462 return np.dtype(f"timedelta64[{self.pyarrow_dtype.unit}]") - if pa.types.is_string(self.pyarrow_dtype): + if pa.types.is_string(self.pyarrow_dtype) or pa.types.is_large_string( + self.pyarrow_dtype + ): # pa.string().to_pandas_dtype() = object which we don't want return np.dtype(str) try: diff -Nru pandas-2.1.4+dfsg/pandas/core/dtypes/inference.py pandas-2.2.2+dfsg/pandas/core/dtypes/inference.py --- pandas-2.1.4+dfsg/pandas/core/dtypes/inference.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/dtypes/inference.py 2024-04-10 17:42:52.000000000 +0000 @@ -401,7 +401,7 @@ return False -def is_dataclass(item): +def is_dataclass(item) -> bool: """ Checks if the object is a data-class instance diff -Nru pandas-2.1.4+dfsg/pandas/core/dtypes/missing.py pandas-2.2.2+dfsg/pandas/core/dtypes/missing.py --- pandas-2.1.4+dfsg/pandas/core/dtypes/missing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/dtypes/missing.py 2024-04-10 17:42:52.000000000 +0000 @@ -562,12 +562,29 @@ def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bool): - if not strict_nan: - # isna considers NaN and None to be equivalent. + left = ensure_object(left) + right = ensure_object(right) - return lib.array_equivalent_object(ensure_object(left), ensure_object(right)) + mask: npt.NDArray[np.bool_] | None = None + if strict_nan: + mask = isna(left) & isna(right) + if not mask.any(): + mask = None + + try: + if mask is None: + return lib.array_equivalent_object(left, right) + if not lib.array_equivalent_object(left[~mask], right[~mask]): + return False + left_remaining = left[mask] + right_remaining = right[mask] + except ValueError: + # can raise a ValueError if left and right cannot be + # compared (e.g. 
nested arrays) + left_remaining = left + right_remaining = right - for left_value, right_value in zip(left, right): + for left_value, right_value in zip(left_remaining, right_remaining): if left_value is NaT and right_value is not NaT: return False @@ -615,7 +632,7 @@ """ if not is_list_like(val): val = [val] - val = np.array(val, copy=False) + val = np.asarray(val) if val.dtype.kind in "mM": return np.array("NaT", dtype=val.dtype) elif val.dtype == object: @@ -630,6 +647,20 @@ return np.nan +def construct_1d_array_from_inferred_fill_value( + value: object, length: int +) -> ArrayLike: + # Find our empty_value dtype by constructing an array + # from our value and doing a .take on it + from pandas.core.algorithms import take_nd + from pandas.core.construction import sanitize_array + from pandas.core.indexes.base import Index + + arr = sanitize_array(value, Index(range(1)), copy=False) + taker = -1 * np.ones(length, dtype=np.intp) + return take_nd(arr, taker) + + def maybe_fill(arr: np.ndarray) -> np.ndarray: """ Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype. @@ -669,7 +700,8 @@ if isinstance(dtype, ExtensionDtype): return dtype.na_value elif dtype.kind in "mM": - return dtype.type("NaT", "ns") + unit = np.datetime_data(dtype)[0] + return dtype.type("NaT", unit) elif dtype.kind == "f": return np.nan elif dtype.kind in "iu": diff -Nru pandas-2.1.4+dfsg/pandas/core/flags.py pandas-2.2.2+dfsg/pandas/core/flags.py --- pandas-2.1.4+dfsg/pandas/core/flags.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/flags.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,8 +11,6 @@ """ Flags that apply to pandas objects. - .. versionadded:: 1.2.0 - Parameters ---------- obj : Series or DataFrame diff -Nru pandas-2.1.4+dfsg/pandas/core/frame.py pandas-2.2.2+dfsg/pandas/core/frame.py --- pandas-2.1.4+dfsg/pandas/core/frame.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/frame.py 2024-04-10 17:42:52.000000000 +0000 @@ -42,7 +42,9 @@ from pandas._config import ( get_option, using_copy_on_write, + warn_copy_on_write, ) +from pandas._config.config import _get_option from pandas._libs import ( algos as libalgos, @@ -60,13 +62,19 @@ InvalidIndexError, _chained_assignment_method_msg, _chained_assignment_msg, + _chained_assignment_warning_method_msg, + _chained_assignment_warning_msg, ) from pandas.util._decorators import ( Appender, Substitution, + deprecate_nonkeyword_arguments, doc, ) -from pandas.util._exceptions import find_stack_level +from pandas.util._exceptions import ( + find_stack_level, + rewrite_warning, +) from pandas.util._validators import ( validate_ascending, validate_bool_kwarg, @@ -135,7 +143,6 @@ from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import ( ensure_wrapped_if_datetimelike, - extract_array, sanitize_array, sanitize_masked_array, ) @@ -228,6 +235,7 @@ Level, MergeHow, MergeValidate, + MutableMappingT, NaAction, NaPosition, NsmallestNlargestKeep, @@ -238,6 +246,7 @@ Renamer, Scalar, Self, + SequenceNotStr, SortKind, StorageOptions, Suffixes, @@ -326,9 +335,6 @@ join; preserve the order of the left keys. * cross: creates the cartesian product from both frames, preserves the order of the left keys. - - .. versionadded:: 1.2.0 - on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -360,6 +366,18 @@ values must not be None. 
copy : bool, default True If False, avoid copy if possible. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` indicator : bool or str, default False If True, adds a column to the output DataFrame called "_merge" with information on the source of each row. The column can be given a different @@ -417,10 +435,10 @@ lkey value_x rkey value_y 0 foo 1 foo 5 1 foo 1 foo 8 -2 foo 5 foo 5 -3 foo 5 foo 8 -4 bar 2 bar 6 -5 baz 3 baz 7 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 Merge DataFrames df1 and df2 with specified left and right suffixes appended to any overlapping columns. @@ -430,10 +448,10 @@ lkey value_left rkey value_right 0 foo 1 foo 5 1 foo 1 foo 8 -2 foo 5 foo 5 -3 foo 5 foo 8 -4 bar 2 bar 6 -5 baz 3 baz 7 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 Merge DataFrames df1 and df2, but raise an exception if the DataFrames have any overlapping columns. @@ -638,26 +656,37 @@ def _constructor(self) -> Callable[..., DataFrame]: return DataFrame - def _constructor_from_mgr(self, mgr, axes): - if self._constructor is DataFrame: - # we are pandas.DataFrame (or a subclass that doesn't override _constructor) - return DataFrame._from_mgr(mgr, axes=axes) - else: - assert axes is mgr.axes + def _constructor_from_mgr(self, mgr, axes) -> DataFrame: + df = DataFrame._from_mgr(mgr, axes=axes) + + if type(self) is DataFrame: + # This would also work `if self._constructor is DataFrame`, but + # this check is slightly faster, benefiting the most-common case. + return df + + elif type(self).__name__ == "GeoDataFrame": + # Shim until geopandas can override their _constructor_from_mgr + # bc they have different behavior for Managers than for DataFrames return self._constructor(mgr) - _constructor_sliced: Callable[..., Series] = Series + # We assume that the subclass __init__ knows how to handle a + # pd.DataFrame object. + return self._constructor(df) - def _sliced_from_mgr(self, mgr, axes) -> Series: - return Series._from_mgr(mgr, axes) + _constructor_sliced: Callable[..., Series] = Series - def _constructor_sliced_from_mgr(self, mgr, axes): - if self._constructor_sliced is Series: - ser = self._sliced_from_mgr(mgr, axes) - ser._name = None # caller is responsible for setting real name + def _constructor_sliced_from_mgr(self, mgr, axes) -> Series: + ser = Series._from_mgr(mgr, axes) + ser._name = None # caller is responsible for setting real name + + if type(self) is DataFrame: + # This would also work `if self._constructor_sliced is Series`, but + # this check is slightly faster, benefiting the most-common case. return ser - assert axes is mgr.axes - return self._constructor_sliced(mgr) + + # We assume that the subclass __init__ knows how to handle a + # pd.Series object. 
+ return self._constructor_sliced(ser) # ---------------------------------------------------------------------- # Constructors @@ -670,17 +699,29 @@ dtype: Dtype | None = None, copy: bool | None = None, ) -> None: + allow_mgr = False if dtype is not None: dtype = self._validate_dtype(dtype) if isinstance(data, DataFrame): data = data._mgr + allow_mgr = True if not copy: # if not copying data, ensure to still return a shallow copy # to avoid the result sharing the same Manager data = data.copy(deep=False) if isinstance(data, (BlockManager, ArrayManager)): + if not allow_mgr: + # GH#52419 + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. " + "Use public APIs instead.", + DeprecationWarning, + stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix + ) + if using_copy_on_write(): data = data.copy(deep=False) # first check if a Manager is passed without any other arguments @@ -690,7 +731,11 @@ NDFrame.__init__(self, data) return - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) + + is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) + data_dtype = getattr(data, "dtype", None) + original_dtype = dtype # GH47215 if isinstance(index, set): @@ -878,6 +923,18 @@ NDFrame.__init__(self, mgr) + if original_dtype is None and is_pandas_object and data_dtype == np.object_: + if self.dtypes.iloc[0] != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The DataFrame " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old " + "behavior.", + FutureWarning, + stacklevel=2, + ) + # ---------------------------------------------------------------------- def __dataframe__( @@ -889,8 +946,8 @@ Parameters ---------- nan_as_null : bool, default False - Whether to tell the DataFrame to overwrite null values in the data - with ``NaN`` (or ``NaT``). + `nan_as_null` is DEPRECATED and has no effect. Please avoid using + it; it will be removed in a future release. allow_copy : bool, default True Whether to allow memory copying when exporting. If set to False it would cause non-zero-copy exports to fail. @@ -905,9 +962,6 @@ Details on the interchange protocol: https://data-apis.org/dataframe-protocol/latest/index.html - `nan_as_null` currently has no effect; once support for nullable extension - dtypes is added, this value should be propagated to columns. - Examples -------- >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) @@ -927,7 +981,7 @@ from pandas.core.interchange.dataframe import PandasDataFrameXchg - return PandasDataFrameXchg(self, nan_as_null, allow_copy) + return PandasDataFrameXchg(self, allow_copy=allow_copy) def __dataframe_consortium_standard__( self, *, api_version: str | None = None @@ -944,6 +998,33 @@ ) return convert_to_standard_compliant_dataframe(self, api_version=api_version) + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the pandas DataFrame as an Arrow C stream PyCapsule. + + This relies on pyarrow to convert the pandas DataFrame to the Arrow + format (and follows the default behaviour of ``pyarrow.Table.from_pandas`` + in its handling of the index, i.e. store the index as a column except + for RangeIndex). + This conversion is not necessarily zero-copy. 
+ + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + """ + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + if requested_schema is not None: + requested_schema = pa.Schema._import_from_c_capsule(requested_schema) + table = pa.Table.from_pandas(self, schema=requested_schema) + return table.__arrow_c_stream__() + # ---------------------------------------------------------------------- @property @@ -1182,7 +1263,7 @@ buf: None = ..., columns: Axes | None = ..., col_space: int | list[int] | dict[Hashable, int] | None = ..., - header: bool | list[str] = ..., + header: bool | SequenceNotStr[str] = ..., index: bool = ..., na_rep: str = ..., formatters: fmt.FormattersType | None = ..., @@ -1207,7 +1288,7 @@ buf: FilePath | WriteBuffer[str], columns: Axes | None = ..., col_space: int | list[int] | dict[Hashable, int] | None = ..., - header: bool | list[str] = ..., + header: bool | SequenceNotStr[str] = ..., index: bool = ..., na_rep: str = ..., formatters: fmt.FormattersType | None = ..., @@ -1226,6 +1307,9 @@ ) -> None: ... + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "buf"], name="to_string" + ) @Substitution( header_type="bool or list of str", header="Write out the column names. If a list of columns " @@ -1242,7 +1326,7 @@ buf: FilePath | WriteBuffer[str] | None = None, columns: Axes | None = None, col_space: int | list[int] | dict[Hashable, int] | None = None, - header: bool | list[str] = True, + header: bool | SequenceNotStr[str] = True, index: bool = True, na_rep: str = "NaN", formatters: fmt.FormattersType | None = None, @@ -1313,6 +1397,26 @@ line_width=line_width, ) + def _get_values_for_csv( + self, + *, + float_format: FloatFormatType | None, + date_format: str | None, + decimal: str, + na_rep: str, + quoting, # int csv.QUOTE_FOO from stdlib + ) -> Self: + # helper used by to_csv + mgr = self._mgr.get_values_for_csv( + float_format=float_format, + date_format=date_format, + decimal=decimal, + na_rep=na_rep, + quoting=quoting, + ) + # error: Incompatible return value type (got "DataFrame", expected "Self") + return self._constructor_from_mgr(mgr, axes=mgr.axes) # type: ignore[return-value] + # ---------------------------------------------------------------------- @property @@ -1888,7 +1992,7 @@ dtype = np.dtype(dtype) result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) if result.dtype is not dtype: - result = np.array(result, dtype=dtype, copy=False) + result = np.asarray(result, dtype=dtype) return result @@ -1918,22 +2022,56 @@ def to_dict( self, orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., + ) -> MutableMappingT: + ... + + @overload + def to_dict( + self, + orient: Literal["records"], + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., + ) -> list[MutableMappingT]: + ... + + @overload + def to_dict( + self, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, into: type[dict] = ..., + index: bool = ..., ) -> dict: ... @overload - def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]: + def to_dict( + self, + orient: Literal["records"], + *, + into: type[dict] = ..., + index: bool = ..., + ) -> list[dict]: ... 
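The ``to_dict`` overloads above widen ``into`` from ``dict`` to any ``MutableMapping`` subclass or instance (the new ``MutableMappingT`` TypeVar). A brief usage sketch of the behaviour those signatures describe, assuming pandas 2.2 (the frame contents are made up for the example):

    from collections import OrderedDict, defaultdict

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    # Pass a MutableMapping class ...
    nested = df.to_dict(orient="dict", into=OrderedDict)
    # ... or an initialised instance, which is required for defaultdict.
    records = df.to_dict(orient="records", into=defaultdict(list))
    print(type(nested).__name__, type(records[0]).__name__)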
+ # error: Incompatible default for argument "into" (default has type "type + # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "orient"], name="to_dict" + ) def to_dict( self, orient: Literal[ "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", - into: type[dict] = dict, + into: type[MutableMappingT] + | MutableMappingT = dict, # type: ignore[assignment] index: bool = True, - ) -> dict | list[dict]: + ) -> MutableMappingT | list[MutableMappingT]: """ Convert the DataFrame to a dictionary. @@ -1961,7 +2099,7 @@ 'tight' as an allowed value for the ``orient`` argument into : class, default dict - The collections.abc.Mapping subclass used for all Mappings + The collections.abc.MutableMapping subclass used for all Mappings in the return value. Can be the actual class or an empty instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. @@ -1975,9 +2113,10 @@ Returns ------- - dict, list or collections.abc.Mapping - Return a collections.abc.Mapping object representing the DataFrame. - The resulting transformation depends on the `orient` parameter. + dict, list or collections.abc.MutableMapping + Return a collections.abc.MutableMapping object representing the + DataFrame. The resulting transformation depends on the `orient` + parameter. See Also -------- @@ -2036,8 +2175,11 @@ """ from pandas.core.methods.to_dict import to_dict - return to_dict(self, orient, into, index) + return to_dict(self, orient, into=into, index=index) + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "destination_table"], name="to_gbq" + ) def to_gbq( self, destination_table: str, @@ -2054,6 +2196,10 @@ """ Write a DataFrame to a Google BigQuery table. + .. deprecated:: 2.2.0 + + Please use ``pandas_gbq.to_gbq`` instead. + This function requires the `pandas-gbq package `__. @@ -2392,10 +2538,10 @@ columns = columns.drop(exclude) - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) - return cls(mgr) + return cls._from_mgr(mgr, axes=mgr.axes) def to_records( self, index: bool = True, column_dtypes=None, index_dtypes=None @@ -2593,7 +2739,7 @@ if dtype is not None: dtype = pandas_dtype(dtype) - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) columns = ensure_index(columns) if len(columns) != len(arrays): raise ValueError("len(columns) must match len(arrays)") @@ -2605,7 +2751,7 @@ verify_integrity=verify_integrity, typ=manager, ) - return cls(mgr) + return cls._from_mgr(mgr, axes=mgr.axes) @doc( storage_options=_shared_docs["storage_options"], @@ -2686,8 +2832,6 @@ {storage_options} - .. versionadded:: 1.2.0 - value_labels : dict of dicts Dictionary containing columns as keys and dictionaries of column value to labels as values. Labels for a single variable must be 32,000 @@ -2793,6 +2937,9 @@ to_feather(self, path, **kwargs) + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "buf"], name="to_markdown" + ) @doc( Series.to_markdown, klass=_shared_doc_kwargs["klass"], @@ -2868,6 +3015,9 @@ ) -> None: ... 
+ @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path"], name="to_parquet" + ) @doc(storage_options=_shared_docs["storage_options"]) def to_parquet( self, @@ -2894,11 +3044,6 @@ object implementing a binary ``write()`` function. If None, the result is returned as bytes. If a string or path, it will be used as Root Directory path when writing a partitioned dataset. - - .. versionchanged:: 1.2.0 - - Previously this was "fname" - engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' Parquet library to use. If 'auto', then the option ``io.parquet.engine`` is used. The default ``io.parquet.engine`` @@ -2921,8 +3066,6 @@ Must be None if path is not a string. {storage_options} - .. versionadded:: 1.2.0 - **kwargs Additional arguments passed to the parquet library. See :ref:`pandas io ` for more details. @@ -3000,7 +3143,7 @@ (e.g. via builtin open function). If path is None, a bytes object is returned. engine : {'pyarrow'}, default 'pyarrow' - ORC library to use. Pyarrow must be >= 7.0.0. + ORC library to use. index : bool, optional If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. @@ -3124,6 +3267,9 @@ ) -> str: ... + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "buf"], name="to_html" + ) @Substitution( header_type="bool", header="Whether to print column labels, default True", @@ -3209,7 +3355,7 @@ ...
''' >>> assert html_string == df.to_html() """ - if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS: + if justify is not None and justify not in fmt.VALID_JUSTIFY_PARAMETERS: raise ValueError("Invalid value for justify parameter") formatter = fmt.DataFrameFormatter( @@ -3242,6 +3388,55 @@ render_links=render_links, ) + @overload + def to_xml( + self, + path_or_buffer: None = ..., + *, + index: bool = ..., + root_name: str | None = ..., + row_name: str | None = ..., + na_rep: str | None = ..., + attr_cols: list[str] | None = ..., + elem_cols: list[str] | None = ..., + namespaces: dict[str | None, str] | None = ..., + prefix: str | None = ..., + encoding: str = ..., + xml_declaration: bool | None = ..., + pretty_print: bool | None = ..., + parser: XMLParsers | None = ..., + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., + compression: CompressionOptions = ..., + storage_options: StorageOptions | None = ..., + ) -> str: + ... + + @overload + def to_xml( + self, + path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str], + *, + index: bool = ..., + root_name: str | None = ..., + row_name: str | None = ..., + na_rep: str | None = ..., + attr_cols: list[str] | None = ..., + elem_cols: list[str] | None = ..., + namespaces: dict[str | None, str] | None = ..., + prefix: str | None = ..., + encoding: str = ..., + xml_declaration: bool | None = ..., + pretty_print: bool | None = ..., + parser: XMLParsers | None = ..., + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., + compression: CompressionOptions = ..., + storage_options: StorageOptions | None = ..., + ) -> None: + ... + + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path_or_buffer"], name="to_xml" + ) @doc( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buffer", @@ -3410,7 +3605,7 @@ lxml = import_optional_dependency("lxml.etree", errors="ignore") - TreeBuilder: type[EtreeXMLFormatter] | type[LxmlXMLFormatter] + TreeBuilder: type[EtreeXMLFormatter | LxmlXMLFormatter] if parser == "lxml": if lxml is not None: @@ -3588,6 +3783,18 @@ Note that a copy is always required for mixed dtype DataFrames, or for DataFrames with any extension types. + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. 
+ + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + Returns ------- DataFrame @@ -3848,7 +4055,9 @@ copy=False, only_slice=True, ) - return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + result = result.__finalize__(self) + return result def __getitem__(self, key): check_dict_or_set_indexers(key) @@ -4065,6 +4274,17 @@ warnings.warn( _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 ) + elif not PYPY and not using_copy_on_write(): + if sys.getrefcount(self) <= 3 and ( + warn_copy_on_write() + or ( + not warn_copy_on_write() + and any(b.refs.has_reference() for b in self._mgr.blocks) # type: ignore[union-attr] + ) + ): + warnings.warn( + _chained_assignment_warning_msg, FutureWarning, stacklevel=2 + ) key = com.apply_if_callable(key, self) @@ -4235,11 +4455,15 @@ return self.isetitem(locs, value) - if len(value.columns) != 1: + if len(value.columns) > 1: raise ValueError( "Cannot set a DataFrame with multiple columns to the single " f"column {key}" ) + elif len(value.columns) == 0: + raise ValueError( + f"Cannot set a DataFrame without columns to the column {key}" + ) self[key] = value[value.columns[0]] @@ -4401,7 +4625,7 @@ def _get_item_cache(self, item: Hashable) -> Series: """Return the cached item, item represents a label indexer.""" - if using_copy_on_write(): + if using_copy_on_write() or warn_copy_on_write(): loc = self.columns.get_loc(item) return self._ixs(loc, axis=1) @@ -4865,7 +5089,8 @@ return True mgr = self._mgr._get_data_subset(predicate).copy(deep=None) - return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) + # error: Incompatible return value type (got "DataFrame", expected "Self") + return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) # type: ignore[return-value] def insert( self, @@ -4887,7 +5112,9 @@ column : str, number, or hashable object Label of the inserted column. value : Scalar, Series, or array-like + Content of the inserted column. allow_duplicates : bool, optional, default lib.no_default + Allow duplicate column labels to be created. See Also -------- @@ -5037,16 +5264,26 @@ if is_list_like(value): com.require_length_match(value, self.index) - return sanitize_array(value, self.index, copy=True, allow_2d=True), None + arr = sanitize_array(value, self.index, copy=True, allow_2d=True) + if ( + isinstance(value, Index) + and value.dtype == "object" + and arr.dtype != value.dtype + ): # + # TODO: Remove kludge in sanitize_array for string mode when enforcing + # this deprecation + warnings.warn( + "Setting an Index with object dtype into a DataFrame will stop " + "inferring another dtype in a future version. Cast the Index " + "explicitly before setting it into the DataFrame.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return arr, None @property def _series(self): - return { - item: Series( - self._mgr.iget(idx), index=self.index, name=item, fastpath=True - ) - for idx, item in enumerate(self.columns) - } + return {item: self._ixs(idx, axis=1) for idx, item in enumerate(self.columns)} # ---------------------------------------------------------------------- # Reindexing and alignment @@ -5435,6 +5672,18 @@ ('index', 'columns') or number (0, 1). The default is 'index'. copy : bool, default True Also copy underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. 
+ `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` inplace : bool, default False Whether to modify the DataFrame rather than creating a new one. If True then value of copy is ignored. @@ -5623,6 +5872,9 @@ ) fill_value = lib.no_default + if self.empty: + return self.copy() + axis = self._get_axis_number(axis) if is_list_like(periods): @@ -6705,14 +6957,7 @@ vals = (col.values for name, col in self.items() if name in subset) labels, shape = map(list, zip(*map(f, vals))) - ids = get_group_index( - labels, - # error: Argument 1 to "tuple" has incompatible type "List[_T]"; - # expected "Iterable[int]" - tuple(shape), # type: ignore[arg-type] - sort=False, - xnull=False, - ) + ids = get_group_index(labels, tuple(shape), sort=False, xnull=False) result = self._constructor_sliced(duplicated(ids, keep), index=self.index) return result.__finalize__(self, method="duplicated") @@ -7261,7 +7506,7 @@ subset = self.columns.tolist() name = "proportion" if normalize else "count" - counts = self.groupby(subset, dropna=dropna, observed=False).grouper.size() + counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size() counts.name = name if sort: @@ -7302,8 +7547,8 @@ - ``first`` : prioritize the first occurrence(s) - ``last`` : prioritize the last occurrence(s) - - ``all`` : do not drop any duplicates, even it means - selecting more than `n` items. + - ``all`` : keep all the ties of the smallest item even if it means + selecting more than ``n`` items. Returns ------- @@ -7365,7 +7610,9 @@ Italy 59000000 1937894 IT Brunei 434000 12128 BN - When using ``keep='all'``, all duplicate items are maintained: + When using ``keep='all'``, the number of element kept can go beyond ``n`` + if there are duplicate values for the smallest element, all the + ties are kept: >>> df.nlargest(3, 'population', keep='all') population GDP alpha-2 @@ -7375,6 +7622,16 @@ Maldives 434000 4520 MV Brunei 434000 12128 BN + However, ``nlargest`` does not keep ``n`` distinct largest elements: + + >>> df.nlargest(5, 'population', keep='all') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + To order by the largest values in column "population" and then "GDP", we can specify multiple columns like in the next example. @@ -7411,8 +7668,8 @@ - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. - - ``all`` : do not drop any duplicates, even it means - selecting more than `n` items. + - ``all`` : keep all the ties of the largest item even if it means + selecting more than ``n`` items. Returns ------- @@ -7466,7 +7723,9 @@ Tuvalu 11300 38 TV Nauru 337000 182 NR - When using ``keep='all'``, all duplicate items are maintained: + When using ``keep='all'``, the number of element kept can go beyond ``n`` + if there are duplicate values for the largest element, all the + ties are kept. 
>>> df.nsmallest(3, 'population', keep='all') population GDP alpha-2 @@ -7475,6 +7734,16 @@ Iceland 337000 17036 IS Nauru 337000 182 NR + However, ``nsmallest`` does not keep ``n`` distinct + smallest elements: + + >>> df.nsmallest(4, 'population', keep='all') + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Iceland 337000 17036 IS + Nauru 337000 182 NR + To order by the smallest values in column "population" and then "GDP", we can specify multiple columns like in the next example. @@ -8050,43 +8319,49 @@ return self._construct_result(new_data) @Appender(ops.make_flex_doc("eq", "dataframe")) - def eq(self, other, axis: Axis = "columns", level=None): + def eq(self, other, axis: Axis = "columns", level=None) -> DataFrame: return self._flex_cmp_method(other, operator.eq, axis=axis, level=level) @Appender(ops.make_flex_doc("ne", "dataframe")) - def ne(self, other, axis: Axis = "columns", level=None): + def ne(self, other, axis: Axis = "columns", level=None) -> DataFrame: return self._flex_cmp_method(other, operator.ne, axis=axis, level=level) @Appender(ops.make_flex_doc("le", "dataframe")) - def le(self, other, axis: Axis = "columns", level=None): + def le(self, other, axis: Axis = "columns", level=None) -> DataFrame: return self._flex_cmp_method(other, operator.le, axis=axis, level=level) @Appender(ops.make_flex_doc("lt", "dataframe")) - def lt(self, other, axis: Axis = "columns", level=None): + def lt(self, other, axis: Axis = "columns", level=None) -> DataFrame: return self._flex_cmp_method(other, operator.lt, axis=axis, level=level) @Appender(ops.make_flex_doc("ge", "dataframe")) - def ge(self, other, axis: Axis = "columns", level=None): + def ge(self, other, axis: Axis = "columns", level=None) -> DataFrame: return self._flex_cmp_method(other, operator.ge, axis=axis, level=level) @Appender(ops.make_flex_doc("gt", "dataframe")) - def gt(self, other, axis: Axis = "columns", level=None): + def gt(self, other, axis: Axis = "columns", level=None) -> DataFrame: return self._flex_cmp_method(other, operator.gt, axis=axis, level=level) @Appender(ops.make_flex_doc("add", "dataframe")) - def add(self, other, axis: Axis = "columns", level=None, fill_value=None): + def add( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.add, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("radd", "dataframe")) - def radd(self, other, axis: Axis = "columns", level=None, fill_value=None): + def radd( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.radd, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("sub", "dataframe")) - def sub(self, other, axis: Axis = "columns", level=None, fill_value=None): + def sub( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.sub, level=level, fill_value=fill_value, axis=axis ) @@ -8094,13 +8369,17 @@ subtract = sub @Appender(ops.make_flex_doc("rsub", "dataframe")) - def rsub(self, other, axis: Axis = "columns", level=None, fill_value=None): + def rsub( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.rsub, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("mul", "dataframe")) - def mul(self, other, axis: Axis = "columns", level=None, fill_value=None): + 
def mul( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.mul, level=level, fill_value=fill_value, axis=axis ) @@ -8108,13 +8387,17 @@ multiply = mul @Appender(ops.make_flex_doc("rmul", "dataframe")) - def rmul(self, other, axis: Axis = "columns", level=None, fill_value=None): + def rmul( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.rmul, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("truediv", "dataframe")) - def truediv(self, other, axis: Axis = "columns", level=None, fill_value=None): + def truediv( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.truediv, level=level, fill_value=fill_value, axis=axis ) @@ -8123,7 +8406,9 @@ divide = truediv @Appender(ops.make_flex_doc("rtruediv", "dataframe")) - def rtruediv(self, other, axis: Axis = "columns", level=None, fill_value=None): + def rtruediv( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis ) @@ -8131,37 +8416,49 @@ rdiv = rtruediv @Appender(ops.make_flex_doc("floordiv", "dataframe")) - def floordiv(self, other, axis: Axis = "columns", level=None, fill_value=None): + def floordiv( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.floordiv, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rfloordiv", "dataframe")) - def rfloordiv(self, other, axis: Axis = "columns", level=None, fill_value=None): + def rfloordiv( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("mod", "dataframe")) - def mod(self, other, axis: Axis = "columns", level=None, fill_value=None): + def mod( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.mod, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rmod", "dataframe")) - def rmod(self, other, axis: Axis = "columns", level=None, fill_value=None): + def rmod( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.rmod, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("pow", "dataframe")) - def pow(self, other, axis: Axis = "columns", level=None, fill_value=None): + def pow( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, operator.pow, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rpow", "dataframe")) - def rpow(self, other, axis: Axis = "columns", level=None, fill_value=None): + def rpow( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: return self._flex_arith_method( other, roperator.rpow, level=level, fill_value=fill_value, axis=axis ) @@ -8529,11 +8826,11 @@ """ from pandas.core.computation import expressions - def combiner(x, y): - mask = extract_array(isna(x)) + def combiner(x: Series, y: Series): + mask = x.isna()._values - x_values = extract_array(x, 
extract_numpy=True) - y_values = extract_array(y, extract_numpy=True) + x_values = x._values + y_values = y._values # If the column y in other DataFrame is not in first DataFrame, # just return y_values. @@ -8643,40 +8940,42 @@ 1 b e 2 c f - For Series, its name attribute must be set. - >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], ... 'B': ['x', 'y', 'z']}) - >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2]) - >>> df.update(new_column) + >>> new_df = pd.DataFrame({'B': ['d', 'f']}, index=[0, 2]) + >>> df.update(new_df) >>> df A B 0 a d 1 b y - 2 c e + 2 c f + + For Series, its name attribute must be set. + >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], ... 'B': ['x', 'y', 'z']}) - >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2]) - >>> df.update(new_df) + >>> new_column = pd.Series(['d', 'e', 'f'], name='B') + >>> df.update(new_column) >>> df A B - 0 a x - 1 b d - 2 c e + 0 a d + 1 b e + 2 c f If `other` contains NaNs the corresponding values are not updated in the original dataframe. >>> df = pd.DataFrame({'A': [1, 2, 3], - ... 'B': [400, 500, 600]}) + ... 'B': [400., 500., 600.]}) >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) >>> df.update(new_df) >>> df - A B - 0 1 4 - 1 2 500 - 2 3 6 + A B + 0 1 4.0 + 1 2 500.0 + 2 3 6.0 """ + if not PYPY and using_copy_on_write(): if sys.getrefcount(self) <= REF_COUNT: warnings.warn( @@ -8684,8 +8983,13 @@ ChainedAssignmentError, stacklevel=2, ) - - from pandas.core.computation import expressions + elif not PYPY and not using_copy_on_write() and self._is_view_after_cow_rules(): + if sys.getrefcount(self) <= REF_COUNT: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) # TODO: Support other joins if join != "left": # pragma: no cover @@ -8720,7 +9024,17 @@ if mask.all(): continue - self.loc[:, col] = expressions.where(mask, this, that) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="Downcasting behavior", + category=FutureWarning, + ) + # GH#57124 - `that` might get upcasted because of NA values, and then + # downcasted in where because of the mask. Ignoring the warning + # is a stopgap, will replace with a new implementation of update + # in 3.0. + self.loc[:, col] = self[col].where(mask, that) # ---------------------------------------------------------------------- # Data reshaping @@ -8813,20 +9127,20 @@ >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', ... 'Parrot', 'Parrot'], ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df.groupby("Animal", group_keys=True).apply(lambda x: x) - Animal Max Speed + >>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x) + Max Speed Animal - Falcon 0 Falcon 380.0 - 1 Falcon 370.0 - Parrot 2 Parrot 24.0 - 3 Parrot 26.0 - - >>> df.groupby("Animal", group_keys=False).apply(lambda x: x) - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 + Falcon 0 380.0 + 1 370.0 + Parrot 2 24.0 + 3 26.0 + + >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x) + Max Speed + 0 380.0 + 1 370.0 + 2 24.0 + 3 26.0 """ ) ) @@ -9072,6 +9386,11 @@ If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. + .. deprecated:: 2.2.0 + + The default value of ``False`` is deprecated and will change to + ``True`` in a future version of pandas. + sort : bool, default True Specifies if the result should be sorted. 
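# --- Editor's sketch (not part of the upstream diff) ---
# The docstring hunk above deprecates the default ``observed=False`` for
# categorical groupers (2.2.0), and the next hunk changes the pivot_table
# signature accordingly. A minimal, hedged illustration of opting in to the
# future default explicitly; the column names are made up for the example:
import pandas as pd

df = pd.DataFrame(
    {
        "key": pd.Categorical(["a", "a"], categories=["a", "b"]),
        "val": [1, 2],
    }
)
# Passing observed=True silences the FutureWarning about the changing
# default and keeps only the category actually present in the data.
print(df.pivot_table(index="key", values="val", aggfunc="sum", observed=True))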
@@ -9182,7 +9501,7 @@ margins: bool = False, dropna: bool = True, margins_name: Level = "All", - observed: bool = False, + observed: bool | lib.NoDefault = lib.no_default, sort: bool = True, ) -> DataFrame: from pandas.core.reshape.pivot import pivot_table @@ -9349,33 +9668,6 @@ dog weight kg 3.0 height m 4.0 dtype: float64 - - **Dropping missing values** - - >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]], - ... index=['cat', 'dog'], - ... columns=multicol2) - - Note that rows where all values are missing are dropped by - default but this behaviour can be controlled via the dropna - keyword parameter: - - >>> df_multi_level_cols3 - weight height - kg m - cat NaN 1.0 - dog 2.0 3.0 - >>> df_multi_level_cols3.stack(dropna=False) - weight height - cat kg NaN NaN - m NaN 1.0 - dog kg 2.0 NaN - m NaN 3.0 - >>> df_multi_level_cols3.stack(dropna=True) - weight height - cat m NaN 1.0 - dog kg 2.0 NaN - m NaN 3.0 """ if not future_stack: from pandas.core.reshape.reshape import ( @@ -9383,6 +9675,20 @@ stack_multiple, ) + if ( + dropna is not lib.no_default + or sort is not lib.no_default + or self.columns.nlevels > 1 + ): + warnings.warn( + "The previous implementation of stack is deprecated and will be " + "removed in a future version of pandas. See the What's New notes " + "for pandas 2.1.0 for details. Specify future_stack=True to adopt " + "the new implementation and silence this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if dropna is lib.no_default: dropna = True if sort is lib.no_default: @@ -9773,12 +10079,12 @@ -------- DataFrame.apply : Perform any type of operations. DataFrame.transform : Perform transformation type operations. - core.groupby.GroupBy : Perform operations over groups. - core.resample.Resampler : Perform operations over resampled bins. - core.window.Rolling : Perform operations over rolling window. - core.window.Expanding : Perform operations over expanding window. - core.window.ExponentialMovingWindow : Perform operation over exponential weighted - window. + pandas.DataFrame.groupby : Perform operations over groups. + pandas.DataFrame.resample : Perform operations over resampled bins. + pandas.DataFrame.rolling : Perform operations over rolling window. + pandas.DataFrame.expanding : Perform operations over expanding window. + pandas.core.window.ewm.ExponentialMovingWindow : Perform operation over exponential + weighted window. """ ) @@ -9869,6 +10175,8 @@ result_type: Literal["expand", "reduce", "broadcast"] | None = None, args=(), by_row: Literal[False, "compat"] = "compat", + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, **kwargs, ): """ @@ -9928,6 +10236,36 @@ If False, the funcs will be passed the whole Series at once. .. versionadded:: 2.1.0 + + engine : {'python', 'numba'}, default 'python' + Choose between the python (default) engine or the numba engine in apply. + + The numba engine will attempt to JIT compile the passed function, + which may result in speedups for large DataFrames. + It also supports the following engine_kwargs : + + - nopython (compile the function in nopython mode) + - nogil (release the GIL inside the JIT compiled function) + - parallel (try to apply the function in parallel over the DataFrame) + + Note: Due to limitations within numba/how pandas interfaces with numba, + you should only use this if raw=True + + Note: The numba compiler only supports a subset of + valid Python/numpy operations. 
+ + Please read more about the `supported python features + `_ + and `supported numpy features + `_ + in numba to learn what you can or cannot use in the passed function. + + .. versionadded:: 2.2.0 + + engine_kwargs : dict + Pass keyword arguments to the engine. + This is currently only used by the numba engine, + see the documentation for the engine argument for more information. **kwargs Additional keyword arguments to pass as keywords arguments to `func`. @@ -10028,6 +10366,8 @@ raw=raw, result_type=result_type, by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, ) @@ -10089,6 +10429,14 @@ 0 NaN 4 1 5.0 5 + It is also possible to use `map` with functions that are not + `lambda` functions: + + >>> df.map(round, ndigits=1) + 0 1 + 0 1.0 2.1 + 1 3.4 4.6 + Note that a vectorized version of `func` often exists, which will be much faster. You could square each number elementwise. @@ -10270,9 +10618,6 @@ of the calling's one. * cross: creates the cartesian product from both frames, preserves the order of the left keys. - - .. versionadded:: 1.2.0 - lsuffix : str, default '' Suffix to use from left frame's overlapping columns. rsuffix : str, default '' @@ -10471,9 +10816,9 @@ self, right: DataFrame | Series, how: MergeHow = "inner", - on: IndexLabel | None = None, - left_on: IndexLabel | None = None, - right_on: IndexLabel | None = None, + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, left_index: bool = False, right_index: bool = False, sort: bool = False, @@ -11189,7 +11534,20 @@ row_index = np.tile(np.arange(nrows), ncols) col_index = np.repeat(np.arange(ncols), nrows) ser = Series(arr, index=col_index, copy=False) - result = ser.groupby(row_index).agg(name, **kwds) + # GroupBy will raise a warning with SeriesGroupBy as the object, + # likely confusing users + with rewrite_warning( + target_message=( + f"The behavior of SeriesGroupBy.{name} with all-NA values" + ), + target_category=FutureWarning, + new_message=( + f"The behavior of {type(self).__name__}.{name} with all-NA " + "values, or any-NA and skipna=False, is deprecated. In " + "a future version this will raise ValueError" + ), + ): + result = ser.groupby(row_index).agg(name, **kwds) result.index = df.index if not skipna and name not in ("any", "all"): mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1) @@ -11247,7 +11605,7 @@ def any( # type: ignore[override] self, *, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool = False, skipna: bool = True, **kwargs, @@ -11262,7 +11620,7 @@ @doc(make_doc("all", ndim=2)) def all( self, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool = False, skipna: bool = True, **kwargs, @@ -11661,6 +12019,7 @@ axis: Axis = ..., numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., ) -> Series: ... @@ -11671,6 +12030,7 @@ axis: Axis = ..., numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., ) -> Series | DataFrame: ... @@ -11681,6 +12041,7 @@ axis: Axis = ..., numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., + method: Literal["single", "table"] = ..., ) -> Series | DataFrame: ... 
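# --- Editor's sketch (not part of the upstream diff) ---
# The apply() docstring added above describes the new engine/engine_kwargs
# keywords (2.2.0). A hedged usage sketch, assuming the optional numba
# dependency is installed; per the note above the numba engine is only
# meant to be used together with raw=True:
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})

# With raw=True the function receives plain ndarrays, which numba can
# attempt to JIT-compile in nopython mode.
result = df.apply(
    lambda row: row.sum(),
    axis=1,
    raw=True,
    engine="numba",
    engine_kwargs={"nopython": True},
)
print(result)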
@@ -11780,11 +12141,10 @@ if not is_list_like(q): # BlockManager.quantile expects listlike, so we wrap and unwrap here - # error: List item 0 has incompatible type "Union[float, Union[Union[ - # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]"; - # expected "float" - res_df = self.quantile( # type: ignore[call-overload] - [q], + # error: List item 0 has incompatible type "float | ExtensionArray | + # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float" + res_df = self.quantile( + [q], # type: ignore[list-item] axis=axis, numeric_only=numeric_only, interpolation=interpolation, @@ -11882,6 +12242,18 @@ copy : bool, default True If False then underlying input data is not copied. + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + Returns ------- DataFrame @@ -11948,6 +12320,18 @@ copy : bool, default True If False then underlying input data is not copied. + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + Returns ------- DataFrame @@ -11973,7 +12357,7 @@ For the yearly frequency >>> idx.to_period("Y") - PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]') + PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]') """ new_obj = self.copy(deep=copy and not using_copy_on_write()) @@ -12188,7 +12572,7 @@ # ---------------------------------------------------------------------- # Internal Interface Methods - def _to_dict_of_blocks(self, copy: bool = True): + def _to_dict_of_blocks(self): """ Return a dict of dtype -> Constructor Types that each is a homogeneous dtype. 
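# --- Editor's sketch (not part of the upstream diff) ---
# Several hunks above insert the same Copy-on-Write note for the ``copy``
# keyword. A small illustration of opting in today, as the note suggests:
import pandas as pd

pd.options.mode.copy_on_write = True

df = pd.DataFrame({"a": [1, 2, 3]})
# Under Copy-on-Write, copy=False stays safe: data is shared lazily and
# only copied when one of the two objects is modified.
renamed = df.rename(columns=str.upper, copy=False)
renamed.iloc[0, 0] = 99
print(df.iloc[0, 0])  # the original is unchanged -> 1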
@@ -12197,11 +12581,10 @@ """ mgr = self._mgr # convert to BlockManager if needed -> this way support ArrayManager as well - mgr = mgr_to_mgr(mgr, "block") - mgr = cast(BlockManager, mgr) + mgr = cast(BlockManager, mgr_to_mgr(mgr, "block")) return { k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self) - for k, v, in mgr.to_dict(copy=copy).items() + for k, v, in mgr.to_dict().items() } @property diff -Nru pandas-2.1.4+dfsg/pandas/core/generic.py pandas-2.2.2+dfsg/pandas/core/generic.py --- pandas-2.1.4+dfsg/pandas/core/generic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/generic.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,6 +2,7 @@ from __future__ import annotations import collections +from copy import deepcopy import datetime as dt from functools import partial import gc @@ -29,6 +30,7 @@ from pandas._config import ( config, using_copy_on_write, + warn_copy_on_write, ) from pandas._libs import lib @@ -39,6 +41,7 @@ Timestamp, to_offset, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas._typing import ( AlignJoin, AnyArrayLike, @@ -71,6 +74,7 @@ Renamer, Scalar, Self, + SequenceNotStr, SortKind, StorageOptions, Suffixes, @@ -96,6 +100,8 @@ SettingWithCopyError, SettingWithCopyWarning, _chained_assignment_method_msg, + _chained_assignment_warning_method_msg, + _check_cacher, ) from pandas.util._decorators import ( deprecate_nonkeyword_arguments, @@ -255,8 +261,6 @@ "_is_copy", "_name", "_metadata", - "__array_struct__", - "__array_interface__", "_flags", ] _internal_names_set: set[str] = set(_internal_names) @@ -332,6 +336,7 @@ # fastpath of passing a manager doesn't check the option/manager class return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self) + @final @classmethod def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self: """ @@ -368,6 +373,13 @@ -------- DataFrame.flags : Global flags applying to this object. + Notes + ----- + Many operations that create new datasets will copy ``attrs``. Copies + are always deep so that changing ``attrs`` will only affect the + present dataset. ``pandas.concat`` copies ``attrs`` only if all input + datasets have the same ``attrs``. + Examples -------- For Series: @@ -445,6 +457,18 @@ ---------- copy : bool, default False Specify if a copy of the object should be made. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` allows_duplicate_labels : bool, optional Whether the returned object allows duplicate labels. @@ -627,12 +651,17 @@ Used in :meth:`DataFrame.eval`. 
""" from pandas.core.computation.parsing import clean_column_name + from pandas.core.series import Series if isinstance(self, ABCSeries): return {clean_column_name(self.name): self} return { - clean_column_name(k): v for k, v in self.items() if not isinstance(k, int) + clean_column_name(k): Series( + v, copy=False, index=self.index, name=k, dtype=self.dtypes[k] + ).__finalize__(self) + for k, v in zip(self.columns, self._iter_column_arrays()) + if not isinstance(k, int) } @final @@ -640,6 +669,14 @@ def _info_axis(self) -> Index: return getattr(self, self._info_axis_name) + def _is_view_after_cow_rules(self): + # Only to be used in cases of chained assignment checks, this is a + # simplified check that assumes that either the whole object is a view + # or a copy + if len(self._mgr.blocks) == 0: # type: ignore[union-attr] + return False + return self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr] + @property def shape(self) -> tuple[int, ...]: """ @@ -731,7 +768,17 @@ copy : bool, default True Whether to make a copy of the underlying data. - .. versionadded:: 1.5.0 + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` Returns ------- @@ -817,13 +864,12 @@ assert isinstance(new_mgr, BlockManager) assert isinstance(self._mgr, BlockManager) new_mgr.blocks[0].refs = self._mgr.blocks[0].refs - new_mgr.blocks[0].refs.add_reference( - new_mgr.blocks[0] # type: ignore[arg-type] - ) + new_mgr.blocks[0].refs.add_reference(new_mgr.blocks[0]) if not using_copy_on_write() and copy is not False: new_mgr = new_mgr.copy(deep=True) - return self._constructor(new_mgr).__finalize__(self, method="swapaxes") + out = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + return out.__finalize__(self, method="swapaxes") return self._constructor( new_values, @@ -1163,6 +1209,18 @@ The axis to rename. For `Series` this parameter is unused and defaults to 0. copy : bool, default None Also copy underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` inplace : bool, default False Modifies the object directly, instead of creating a new Series or DataFrame. @@ -1394,8 +1452,8 @@ the same location are considered equal. The row/column index do not need to have the same type, as long - as the values are considered equal. Corresponding columns must be of - the same dtype. + as the values are considered equal. Corresponding columns and + index must be of the same dtype. Parameters ---------- @@ -1530,7 +1588,8 @@ .. deprecated:: 2.1.0 - bool is deprecated and will be removed in future version of pandas + bool is deprecated and will be removed in future version of pandas. + For ``Series`` use ``pandas.Series.item``. This must be a boolean scalar value, either True or False. 
It will raise a ValueError if the Series or DataFrame does not have exactly 1 element, or that @@ -1560,6 +1619,14 @@ True >>> pd.DataFrame({'col': [False]}).bool() # doctest: +SKIP False + + This is an alternative method and will only work + for single element objects with a boolean value: + + >>> pd.Series([True]).item() # doctest: +SKIP + True + >>> pd.Series([False]).item() # doctest: +SKIP + False """ warnings.warn( @@ -2079,7 +2146,9 @@ # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented __array_priority__: int = 1000 - def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: + def __array__( + self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None + ) -> np.ndarray: values = self._values arr = np.asarray(values, dtype=dtype) if ( @@ -2191,6 +2260,9 @@ # I/O Methods @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "excel_writer"], name="to_excel" + ) @doc( klass="object", storage_options=_shared_docs["storage_options"], @@ -2354,6 +2426,9 @@ ) @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path_or_buf"], name="to_json" + ) @doc( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buf", @@ -2450,8 +2525,6 @@ {storage_options} - .. versionadded:: 1.2.0 - mode : str, default 'w' (writing) Specify the IO mode for output when supplying a path_or_buf. Accepted args are 'w' (writing) and 'a' (append) only. @@ -2644,6 +2717,9 @@ ) @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path_or_buf"], name="to_hdf" + ) def to_hdf( self, path_or_buf: FilePath | HDFStore, @@ -2842,7 +2918,7 @@ index : bool, default True Write DataFrame index as a column. Uses `index_label` as the column - name in the table. + name in the table. Creates a table index for this column. index_label : str or sequence, default None Column label for index column(s). If None is given (default) and `index` is True, then the index names are used. @@ -2896,6 +2972,9 @@ database. Otherwise, the datetimes will be stored as timezone unaware timestamps local to the original timezone. + Not all datastores support ``method="multi"``. Oracle, for example, + does not support multi-value insert. + References ---------- .. [1] https://docs.sqlalchemy.org @@ -3019,6 +3098,9 @@ ) @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path"], name="to_pickle" + ) @doc( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path", @@ -3050,8 +3132,6 @@ {storage_options} - .. versionadded:: 1.2.0 - See Also -------- read_pickle : Load pickled pandas object (or any object) from file. 
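# --- Editor's sketch (not part of the upstream diff) ---
# The deprecate_nonkeyword_arguments decorators added above (to_excel,
# to_json, to_hdf, to_pickle, ...) deprecate positional arguments beyond the
# target path or buffer. The form that remains valid in pandas 3.0, using a
# hypothetical file name:
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})

df.to_json("frame.json", orient="records")   # keyword form: fine
# df.to_json("frame.json", "records")        # positional form: deprecated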
@@ -3091,6 +3171,9 @@ ) @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self"], name="to_clipboard" + ) def to_clipboard( self, excel: bool_t = True, sep: str | None = None, **kwargs ) -> None: @@ -3198,18 +3281,18 @@ 2 lion mammal 80.5 4 3 monkey mammal NaN 4 - >>> df.to_xarray() + >>> df.to_xarray() # doctest: +SKIP Dimensions: (index: 4) Coordinates: - * index (index) int64 0 1 2 3 + * index (index) int64 32B 0 1 2 3 Data variables: - name (index) object 'falcon' 'parrot' 'lion' 'monkey' - class (index) object 'bird' 'bird' 'mammal' 'mammal' - max_speed (index) float64 389.0 24.0 80.5 nan - num_legs (index) int64 2 2 4 4 + name (index) object 32B 'falcon' 'parrot' 'lion' 'monkey' + class (index) object 32B 'bird' 'bird' 'mammal' 'mammal' + max_speed (index) float64 32B 389.0 24.0 80.5 nan + num_legs (index) int64 32B 2 2 4 4 - >>> df['max_speed'].to_xarray() + >>> df['max_speed'].to_xarray() # doctest: +SKIP array([389. , 24. , 80.5, nan]) Coordinates: @@ -3231,7 +3314,7 @@ 2018-01-02 falcon 361 parrot 15 - >>> df_multiindex.to_xarray() + >>> df_multiindex.to_xarray() # doctest: +SKIP Dimensions: (date: 2, animal: 2) Coordinates: @@ -3252,7 +3335,7 @@ self, buf: None = ..., columns: Sequence[Hashable] | None = ..., - header: bool_t | list[str] = ..., + header: bool_t | SequenceNotStr[str] = ..., index: bool_t = ..., na_rep: str = ..., formatters: FormattersType | None = ..., @@ -3279,7 +3362,7 @@ self, buf: FilePath | WriteBuffer[str], columns: Sequence[Hashable] | None = ..., - header: bool_t | list[str] = ..., + header: bool_t | SequenceNotStr[str] = ..., index: bool_t = ..., na_rep: str = ..., formatters: FormattersType | None = ..., @@ -3302,11 +3385,14 @@ ... @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "buf"], name="to_latex" + ) def to_latex( self, buf: FilePath | WriteBuffer[str] | None = None, columns: Sequence[Hashable] | None = None, - header: bool_t | list[str] = True, + header: bool_t | SequenceNotStr[str] = True, index: bool_t = True, na_rep: str = "NaN", formatters: FormattersType | None = None, @@ -3333,9 +3419,6 @@ into a main LaTeX document or read from an external file with ``\input{{table.tex}}``. - .. versionchanged:: 1.2.0 - Added position argument, changed meaning of caption argument. - .. versionchanged:: 2.0.0 Refactored to use the Styler implementation via jinja2 templating. @@ -3426,10 +3509,6 @@ Tuple (full_caption, short_caption), which results in ``\caption[short_caption]{{full_caption}}``; if a single string is passed, no short caption will be set. - - .. versionchanged:: 1.2.0 - Optionally allow caption to be a tuple ``(full_caption, short_caption)``. - label : str, optional The LaTeX label to be placed inside ``\label{{}}`` in the output. This is used with ``\ref{{}}`` in the main ``.tex`` file. @@ -3438,8 +3517,6 @@ The LaTeX positional argument for tables, to be placed after ``\begin{{}}`` in the output. - .. versionadded:: 1.2.0 - Returns ------- str or None @@ -3718,6 +3795,9 @@ ... @final + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv" + ) @doc( storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buf", @@ -3757,11 +3837,6 @@ returned as a string. If a non-binary file object is passed, it should be opened with `newline=''`, disabling universal newlines. If a binary file object is passed, `mode` might need to contain a `'b'`. - - .. 
versionchanged:: 1.2.0 - - Support for binary file objects was introduced. - sep : str, default ',' String of length 1. Field delimiter for the output file. na_rep : str, default '' @@ -3802,17 +3877,6 @@ Passing compression options as keys in dict is supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'. - - .. versionchanged:: 1.2.0 - - Compression is supported for binary file objects. - - .. versionchanged:: 1.2.0 - - Previous versions forwarded dict entries for 'gzip' to - `gzip.open` instead of `gzip.GzipFile` which prevented - setting `mtime`. - quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC @@ -3848,8 +3912,6 @@ {storage_options} - .. versionadded:: 1.2.0 - Returns ------- None or str @@ -3863,14 +3925,17 @@ Examples -------- + Create 'out.csv' containing 'df' without indices + >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'], ... 'mask': ['red', 'purple'], ... 'weapon': ['sai', 'bo staff']}}) - >>> df.to_csv(index=False) - 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' + >>> df.to_csv('out.csv', index=False) # doctest: +SKIP Create 'out.zip' containing 'out.csv' + >>> df.to_csv(index=False) + 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' >>> compression_opts = dict(method='zip', ... archive_name='out.csv') # doctest: +SKIP >>> df.to_csv('out.zip', index=False, @@ -4360,7 +4425,7 @@ df.iloc[0:5]['group'] = 'a' """ - if using_copy_on_write(): + if using_copy_on_write() or warn_copy_on_write(): return # return early if the check is not needed @@ -4558,6 +4623,18 @@ copy : bool, default True Return a new object, even if the passed indexes are the same. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` limit : int, default None Maximum number of consecutive labels to fill for inexact matches. tolerance : optional @@ -5250,11 +5327,11 @@ new_data = self._mgr.take(indexer, axis=baxis, verify=False) # reconstruct axis if needed - new_data.set_axis(baxis, new_data.axes[baxis]._sort_levels_monotonic()) - - if ignore_index: - axis = 1 if isinstance(self, ABCDataFrame) else 0 - new_data.set_axis(axis, default_index(len(indexer))) + if not ignore_index: + new_axis = new_data.axes[baxis]._sort_levels_monotonic() + else: + new_axis = default_index(len(indexer)) + new_data.set_axis(baxis, new_axis) result = self._constructor_from_mgr(new_data, axes=new_data.axes) @@ -5304,6 +5381,18 @@ copy : bool, default True Return a new object, even if the passed indexes are the same. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. 
+ + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` level : int or name Broadcast across a level, matching Index values on the passed MultiIndex level. @@ -6163,8 +6252,12 @@ stable across pandas releases. """ if isinstance(other, NDFrame): - for name in other.attrs: - self.attrs[name] = other.attrs[name] + if other.attrs: + # We want attrs propagation to have minimal performance + # impact if attrs are not used; i.e. attrs is an empty dict. + # One could make the deepcopy unconditionally, but a deepcopy + # of an empty dict is 50x more expensive than the empty check. + self.attrs = deepcopy(other.attrs) self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels # For subclasses using _metadata. @@ -6173,11 +6266,13 @@ object.__setattr__(self, name, getattr(other, name, None)) if method == "concat": - attrs = other.objs[0].attrs - check_attrs = all(objs.attrs == attrs for objs in other.objs[1:]) - if check_attrs: - for name in attrs: - self.attrs[name] = attrs[name] + # propagate attrs only if all concat arguments have the same attrs + if all(bool(obj.attrs) for obj in other.objs): + # all concatenate arguments have non-empty attrs + attrs = other.objs[0].attrs + have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:]) + if have_same_attrs: + self.attrs = deepcopy(attrs) allows_duplicate_labels = all( x.flags.allows_duplicate_labels for x in other.objs @@ -6384,6 +6479,18 @@ Return a copy when ``copy=True`` (be very careful setting ``copy=False`` as changes to values then may propagate to other pandas objects). + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` errors : {'raise', 'ignore'}, default 'raise' Control raising of exceptions on invalid data for provided dtype. @@ -6527,7 +6634,9 @@ return self.copy(deep=copy) # GH 18099/22869: columnwise conversion to extension dtype # GH 24704: self.items handles duplicate column names - results = [ser.astype(dtype, copy=copy) for _, ser in self.items()] + results = [ + ser.astype(dtype, copy=copy, errors=errors) for _, ser in self.items() + ] else: # else, only a single dtype is given @@ -6565,6 +6674,21 @@ and index are copied). Any changes to the data of the original will be reflected in the shallow copy (and vice versa). + .. note:: + The ``deep=False`` behaviour as described above will change + in pandas 3.0. `Copy-on-Write + `__ + will be enabled by default, which means that the "shallow" copy + is that is returned with ``deep=False`` will still avoid making + an eager copy, but changes to the data of the original will *no* + longer be reflected in the shallow copy (or vice versa). Instead, + it makes use of a lazy (deferred) copy mechanism that will copy + the data only when any changes to the original or shallow copy is + made. 
+ + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + Parameters ---------- deep : bool, default True @@ -6634,7 +6758,8 @@ False Updates to the data shared by shallow copy and original is reflected - in both; deep copy remains unchanged. + in both (NOTE: this will no longer be true for pandas >= 3.0); + deep copy remains unchanged. >>> s.iloc[0] = 3 >>> shallow.iloc[1] = 4 @@ -6667,7 +6792,8 @@ 1 [3, 4] dtype: object - ** Copy-on-Write is set to true: ** + **Copy-on-Write is set to true**, the shallow copy is not modified + when the original data is changed: >>> with pd.option_context("mode.copy_on_write", True): ... s = pd.Series([1, 2], index=["a", "b"]) @@ -6718,6 +6844,18 @@ Whether to make a copy for non-object or non-inferable columns or Series. + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + Returns ------- same type as input object @@ -6778,8 +6916,6 @@ Whether, if possible, conversion can be done to floating extension types. If `convert_integer` is also True, preference will be give to integer dtypes if the floats can be faithfully casted to integers. - - .. versionadded:: 1.2.0 dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' Back-end data type applied to the resultant :class:`DataFrame` (still experimental). Behaviour is as follows: @@ -6823,10 +6959,6 @@ appropriate integer extension type. Otherwise, convert to an appropriate floating extension type. - .. versionchanged:: 1.2 - Starting with pandas 1.2, this method also converts float columns - to the nullable floating extension type. - In the future, as new dtypes are added that support ``pd.NA``, the results of this method will change to support those new dtypes. 
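# --- Editor's sketch (not part of the upstream diff) ---
# The next hunk reroutes convert_dtypes() through the block manager; the
# user-facing behaviour stays as documented above. A small usage sketch:
import pandas as pd

df = pd.DataFrame({"a": [1, 2, None], "b": ["x", "y", None]})

# Nullable extension dtypes are chosen ("Int64", "string") instead of
# float64/object; dtype_backend="pyarrow" is the other documented backend.
print(df.convert_dtypes().dtypes)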
@@ -6896,36 +7028,16 @@ dtype: string """ check_dtype_backend(dtype_backend) - if self.ndim == 1: - return self._convert_dtypes( - infer_objects, - convert_string, - convert_integer, - convert_boolean, - convert_floating, - dtype_backend=dtype_backend, - ) - else: - results = [ - col._convert_dtypes( - infer_objects, - convert_string, - convert_integer, - convert_boolean, - convert_floating, - dtype_backend=dtype_backend, - ) - for col_name, col in self.items() - ] - if len(results) > 0: - result = concat(results, axis=1, copy=False, keys=self.columns) - cons = cast(type["DataFrame"], self._constructor) - result = cons(result) - result = result.__finalize__(self, method="convert_dtypes") - # https://github.com/python/mypy/issues/8354 - return cast(Self, result) - else: - return self.copy(deep=None) + new_mgr = self._mgr.convert_dtypes( # type: ignore[union-attr] + infer_objects=infer_objects, + convert_string=convert_string, + convert_integer=convert_integer, + convert_boolean=convert_boolean, + convert_floating=convert_floating, + dtype_backend=dtype_backend, + ) + res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + return res.__finalize__(self, method="convert_dtypes") # ---------------------------------------------------------------------- # Filling NA's @@ -6954,6 +7066,7 @@ axis: None | Axis = None, inplace: bool_t = False, limit: None | int = None, + limit_area: Literal["inside", "outside"] | None = None, downcast: dict | None = None, ): if axis is None: @@ -6962,9 +7075,14 @@ method = clean_fill_method(method) if not self._mgr.is_single_block and axis == 1: + # e.g. test_align_fill_method + # TODO(3.0): once downcast is removed, we can do the .T + # in all axis=1 cases, and remove axis kward from mgr.pad_or_backfill. if inplace: raise NotImplementedError() - result = self.T._pad_or_backfill(method=method, limit=limit).T + result = self.T._pad_or_backfill( + method=method, limit=limit, limit_area=limit_area + ).T return result @@ -6972,6 +7090,7 @@ method=method, axis=self._get_block_manager_axis(axis), limit=limit, + limit_area=limit_area, inplace=inplace, downcast=downcast, ) @@ -7074,6 +7193,8 @@ or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). + .. deprecated:: 2.2.0 + Returns ------- {klass} or None @@ -7152,6 +7273,22 @@ ChainedAssignmentError, stacklevel=2, ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and _check_cacher(self): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) value, method = validate_fillna_kwargs(value, method) if method is not None: @@ -7315,6 +7452,7 @@ axis: None | Axis = ..., inplace: Literal[False] = ..., limit: None | int = ..., + limit_area: Literal["inside", "outside"] | None = ..., downcast: dict | None | lib.NoDefault = ..., ) -> Self: ... @@ -7326,6 +7464,7 @@ axis: None | Axis = ..., inplace: Literal[True], limit: None | int = ..., + limit_area: Literal["inside", "outside"] | None = ..., downcast: dict | None | lib.NoDefault = ..., ) -> None: ... @@ -7337,6 +7476,7 @@ axis: None | Axis = ..., inplace: bool_t = ..., limit: None | int = ..., + limit_area: Literal["inside", "outside"] | None = ..., downcast: dict | None | lib.NoDefault = ..., ) -> Self | None: ... 
@@ -7352,6 +7492,7 @@ axis: None | Axis = None, inplace: bool_t = False, limit: None | int = None, + limit_area: Literal["inside", "outside"] | None = None, downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ @@ -7373,11 +7514,24 @@ be partially filled. If method is not specified, this is the maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. + limit_area : {{`None`, 'inside', 'outside'}}, default None + If limit is specified, consecutive NaNs will be filled with this + restriction. + + * ``None``: No fill restriction. + * 'inside': Only fill NaNs surrounded by valid values + (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + + .. versionadded:: 2.2.0 + downcast : dict, default is None A dict of item->dtype of what to downcast if possible, or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). + .. deprecated:: 2.2.0 + Returns ------- {klass} or None @@ -7422,12 +7576,29 @@ ChainedAssignmentError, stacklevel=2, ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and _check_cacher(self): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) return self._pad_or_backfill( "ffill", axis=axis, inplace=inplace, limit=limit, + limit_area=limit_area, # error: Argument "downcast" to "_fillna_with_method" of "NDFrame" # has incompatible type "Union[Dict[Any, Any], None, # Literal[_NoDefault.no_default]]"; expected "Optional[Dict[Any, Any]]" @@ -7475,6 +7646,7 @@ axis: None | Axis = ..., inplace: Literal[False] = ..., limit: None | int = ..., + limit_area: Literal["inside", "outside"] | None = ..., downcast: dict | None | lib.NoDefault = ..., ) -> Self: ... @@ -7497,6 +7669,7 @@ axis: None | Axis = ..., inplace: bool_t = ..., limit: None | int = ..., + limit_area: Literal["inside", "outside"] | None = ..., downcast: dict | None | lib.NoDefault = ..., ) -> Self | None: ... @@ -7512,6 +7685,7 @@ axis: None | Axis = None, inplace: bool_t = False, limit: None | int = None, + limit_area: Literal["inside", "outside"] | None = None, downcast: dict | None | lib.NoDefault = lib.no_default, ) -> Self | None: """ @@ -7533,11 +7707,24 @@ be partially filled. If method is not specified, this is the maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. + limit_area : {{`None`, 'inside', 'outside'}}, default None + If limit is specified, consecutive NaNs will be filled with this + restriction. + + * ``None``: No fill restriction. + * 'inside': Only fill NaNs surrounded by valid values + (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + + .. versionadded:: 2.2.0 + downcast : dict, default is None A dict of item->dtype of what to downcast if possible, or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). + .. 
deprecated:: 2.2.0 + Returns ------- {klass} or None @@ -7593,11 +7780,29 @@ ChainedAssignmentError, stacklevel=2, ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and _check_cacher(self): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + return self._pad_or_backfill( "bfill", axis=axis, inplace=inplace, limit=limit, + limit_area=limit_area, # error: Argument "downcast" to "_fillna_with_method" of "NDFrame" # has incompatible type "Union[Dict[Any, Any], None, # Literal[_NoDefault.no_default]]"; expected "Optional[Dict[Any, Any]]" @@ -7746,6 +7951,26 @@ ChainedAssignmentError, stacklevel=2, ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and _check_cacher(self): + # in non-CoW mode, chained Series access will populate the + # `_item_cache` which results in an increased ref count not below + # the threshold, while we still need to warn. We detect this case + # of a Series derived from a DataFrame through the presence of + # checking the `_cacher` + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) if not is_bool(regex) and to_replace is not None: raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") @@ -7792,7 +8017,9 @@ if items: keys, values = zip(*items) else: - keys, values = ([], []) + # error: Incompatible types in assignment (expression has type + # "list[Never]", variable has type "tuple[Any, ...]") + keys, values = ([], []) # type: ignore[assignment] are_mappings = [is_dict_like(v) for v in values] @@ -7807,7 +8034,12 @@ value_dict = {} for k, v in items: - keys, values = list(zip(*v.items())) or ([], []) + # error: Incompatible types in assignment (expression has type + # "list[Never]", variable has type "tuple[Any, ...]") + keys, values = list(zip(*v.items())) or ( # type: ignore[assignment] + [], + [], + ) to_rep_dict[k] = list(keys) value_dict[k] = list(values) @@ -7920,6 +8152,51 @@ else: return result.__finalize__(self, method="replace") + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: Literal[False] = ..., + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> Self: + ... + + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: Literal[True], + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> None: + ... 
+ + @overload + def interpolate( + self, + method: InterpolateOptions = ..., + *, + axis: Axis = ..., + limit: int | None = ..., + inplace: bool_t = ..., + limit_direction: Literal["forward", "backward", "both"] | None = ..., + limit_area: Literal["inside", "outside"] | None = ..., + downcast: Literal["infer"] | None | lib.NoDefault = ..., + **kwargs, + ) -> Self | None: + ... + @final def interpolate( self, @@ -8126,6 +8403,22 @@ ChainedAssignmentError, stacklevel=2, ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and _check_cacher(self): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) axis = self._get_axis_number(axis) @@ -8258,8 +8551,6 @@ * DataFrame : when `self` is a DataFrame and `where` is an array-like - Return scalar, Series, or DataFrame. - See Also -------- merge_asof : Perform an asof merge. Similar to left join. @@ -8590,6 +8881,42 @@ # GH 40420 return self.where(subset, threshold, axis=axis, inplace=inplace) + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: Literal[False] = ..., + **kwargs, + ) -> Self: + ... + + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: Literal[True], + **kwargs, + ) -> None: + ... + + @overload + def clip( + self, + lower=..., + upper=..., + *, + axis: Axis | None = ..., + inplace: bool_t = ..., + **kwargs, + ) -> Self | None: + ... + @final def clip( self, @@ -8660,6 +8987,16 @@ 3 -1 6 4 5 -4 + Clips using specific lower and upper thresholds per column: + + >>> df.clip([-2, -1], [4, 5]) + col_0 col_1 + 0 4 -1 + 1 -2 -1 + 2 0 5 + 3 -1 5 + 4 4 -1 + Clips using specific lower and upper thresholds per column element: >>> t = pd.Series([2, -4, -1, 6, 3]) @@ -8708,6 +9045,22 @@ ChainedAssignmentError, stacklevel=2, ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) axis = nv.validate_clip_with_axis(axis, (), kwargs) if axis is not None: @@ -8827,7 +9180,7 @@ -------- Start by creating a series with 4 one minute timestamps. - >>> index = pd.date_range('1/1/2000', periods=4, freq='T') + >>> index = pd.date_range('1/1/2000', periods=4, freq='min') >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index) >>> df = pd.DataFrame({{'s': series}}) >>> df @@ -8839,7 +9192,7 @@ Upsample the series into 30 second bins. - >>> df.asfreq(freq='30S') + >>> df.asfreq(freq='30s') s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN @@ -8851,7 +9204,7 @@ Upsample again, providing a ``fill value``. - >>> df.asfreq(freq='30S', fill_value=9.0) + >>> df.asfreq(freq='30s', fill_value=9.0) s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 9.0 @@ -8863,7 +9216,7 @@ Upsample again, providing a ``method``. 
- >>> df.asfreq(freq='30S', method='bfill') + >>> df.asfreq(freq='30s', method='bfill') s 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN @@ -8915,7 +9268,7 @@ Examples -------- - >>> i = pd.date_range('2018-04-09', periods=4, freq='12H') + >>> i = pd.date_range('2018-04-09', periods=4, freq='12h') >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) >>> ts A @@ -9034,8 +9387,8 @@ axis: Axis | lib.NoDefault = lib.no_default, closed: Literal["right", "left"] | None = None, label: Literal["right", "left"] | None = None, - convention: Literal["start", "end", "s", "e"] = "start", - kind: Literal["timestamp", "period"] | None = None, + convention: Literal["start", "end", "s", "e"] | lib.NoDefault = lib.no_default, + kind: Literal["timestamp", "period"] | None | lib.NoDefault = lib.no_default, on: Level | None = None, level: Level | None = None, origin: str | TimestampConvertibleTypes = "start_day", @@ -9063,20 +9416,26 @@ Use frame.T.resample(...) instead. closed : {{'right', 'left'}}, default None Which side of bin interval is closed. The default is 'left' - for all frequency offsets except for 'M', 'A', 'Q', 'BM', - 'BA', 'BQ', and 'W' which all have a default of 'right'. + for all frequency offsets except for 'ME', 'YE', 'QE', 'BME', + 'BA', 'BQE', and 'W' which all have a default of 'right'. label : {{'right', 'left'}}, default None Which bin edge label to label bucket with. The default is 'left' - for all frequency offsets except for 'M', 'A', 'Q', 'BM', - 'BA', 'BQ', and 'W' which all have a default of 'right'. + for all frequency offsets except for 'ME', 'YE', 'QE', 'BME', + 'BA', 'BQE', and 'W' which all have a default of 'right'. convention : {{'start', 'end', 's', 'e'}}, default 'start' For `PeriodIndex` only, controls whether to use the start or end of `rule`. + + .. deprecated:: 2.2.0 + Convert PeriodIndex to DatetimeIndex before resampling instead. kind : {{'timestamp', 'period'}}, optional, default None Pass 'timestamp' to convert the resulting index to a `DateTimeIndex` or 'period' to convert it to a `PeriodIndex`. By default the input representation is retained. + .. deprecated:: 2.2.0 + Convert index to desired type explicitly instead. + on : str, optional For a DataFrame, column to use instead of index for resampling. Column must be datetime-like. @@ -9143,7 +9502,7 @@ -------- Start by creating a series with 9 one minute timestamps. - >>> index = pd.date_range('1/1/2000', periods=9, freq='T') + >>> index = pd.date_range('1/1/2000', periods=9, freq='min') >>> series = pd.Series(range(9), index=index) >>> series 2000-01-01 00:00:00 0 @@ -9155,16 +9514,16 @@ 2000-01-01 00:06:00 6 2000-01-01 00:07:00 7 2000-01-01 00:08:00 8 - Freq: T, dtype: int64 + Freq: min, dtype: int64 Downsample the series into 3 minute bins and sum the values of the timestamps falling into a bin. - >>> series.resample('3T').sum() + >>> series.resample('3min').sum() 2000-01-01 00:00:00 3 2000-01-01 00:03:00 12 2000-01-01 00:06:00 21 - Freq: 3T, dtype: int64 + Freq: 3min, dtype: int64 Downsample the series into 3 minute bins as above, but label each bin using the right edge instead of the left. Please note that the @@ -9173,116 +9532,65 @@ bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed value in the resampled bucket with the label ``2000-01-01 00:03:00`` does not include 3 (if it did, the summed value would be 6, not 3). - To include this value close the right side of the bin interval as - illustrated in the example below this one. 
- >>> series.resample('3T', label='right').sum() + >>> series.resample('3min', label='right').sum() 2000-01-01 00:03:00 3 2000-01-01 00:06:00 12 2000-01-01 00:09:00 21 - Freq: 3T, dtype: int64 + Freq: 3min, dtype: int64 - Downsample the series into 3 minute bins as above, but close the right - side of the bin interval. + To include this value close the right side of the bin interval, + as shown below. - >>> series.resample('3T', label='right', closed='right').sum() + >>> series.resample('3min', label='right', closed='right').sum() 2000-01-01 00:00:00 0 2000-01-01 00:03:00 6 2000-01-01 00:06:00 15 2000-01-01 00:09:00 15 - Freq: 3T, dtype: int64 + Freq: 3min, dtype: int64 Upsample the series into 30 second bins. - >>> series.resample('30S').asfreq()[0:5] # Select first 5 rows + >>> series.resample('30s').asfreq()[0:5] # Select first 5 rows 2000-01-01 00:00:00 0.0 2000-01-01 00:00:30 NaN 2000-01-01 00:01:00 1.0 2000-01-01 00:01:30 NaN 2000-01-01 00:02:00 2.0 - Freq: 30S, dtype: float64 + Freq: 30s, dtype: float64 Upsample the series into 30 second bins and fill the ``NaN`` values using the ``ffill`` method. - >>> series.resample('30S').ffill()[0:5] + >>> series.resample('30s').ffill()[0:5] 2000-01-01 00:00:00 0 2000-01-01 00:00:30 0 2000-01-01 00:01:00 1 2000-01-01 00:01:30 1 2000-01-01 00:02:00 2 - Freq: 30S, dtype: int64 + Freq: 30s, dtype: int64 Upsample the series into 30 second bins and fill the ``NaN`` values using the ``bfill`` method. - >>> series.resample('30S').bfill()[0:5] + >>> series.resample('30s').bfill()[0:5] 2000-01-01 00:00:00 0 2000-01-01 00:00:30 1 2000-01-01 00:01:00 1 2000-01-01 00:01:30 2 2000-01-01 00:02:00 2 - Freq: 30S, dtype: int64 + Freq: 30s, dtype: int64 Pass a custom function via ``apply`` >>> def custom_resampler(arraylike): ... return np.sum(arraylike) + 5 ... - >>> series.resample('3T').apply(custom_resampler) + >>> series.resample('3min').apply(custom_resampler) 2000-01-01 00:00:00 8 2000-01-01 00:03:00 17 2000-01-01 00:06:00 26 - Freq: 3T, dtype: int64 - - For a Series with a PeriodIndex, the keyword `convention` can be - used to control whether to use the start or end of `rule`. - - Resample a year by quarter using 'start' `convention`. Values are - assigned to the first quarter of the period. - - >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01', - ... freq='A', - ... periods=2)) - >>> s - 2012 1 - 2013 2 - Freq: A-DEC, dtype: int64 - >>> s.resample('Q', convention='start').asfreq() - 2012Q1 1.0 - 2012Q2 NaN - 2012Q3 NaN - 2012Q4 NaN - 2013Q1 2.0 - 2013Q2 NaN - 2013Q3 NaN - 2013Q4 NaN - Freq: Q-DEC, dtype: float64 - - Resample quarters by month using 'end' `convention`. Values are - assigned to the last month of the period. - - >>> q = pd.Series([1, 2, 3, 4], index=pd.period_range('2018-01-01', - ... freq='Q', - ... periods=4)) - >>> q - 2018Q1 1 - 2018Q2 2 - 2018Q3 3 - 2018Q4 4 - Freq: Q-DEC, dtype: int64 - >>> q.resample('M', convention='end').asfreq() - 2018-03 1.0 - 2018-04 NaN - 2018-05 NaN - 2018-06 2.0 - 2018-07 NaN - 2018-08 NaN - 2018-09 3.0 - 2018-10 NaN - 2018-11 NaN - 2018-12 4.0 - Freq: M, dtype: float64 + Freq: 3min, dtype: int64 For DataFrame objects, the keyword `on` can be used to specify the column instead of the index for resampling. 
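The docstring updates above track the 2.2.0 renaming of offset aliases ('T' to 'min', 'S' to 's', and the month/quarter/year-end spellings gaining an explicit 'E'). A short sketch of the new spellings, assuming a 2.2-level pandas; the data is illustrative only:

import pandas as pd

idx = pd.date_range("2000-01-01", periods=9, freq="min")  # previously freq="T"
ser = pd.Series(range(9), index=idx)

ser.resample("3min").sum()     # previously "3T"
ser.resample("30s").asfreq()   # previously "30S"

# Month-end (and quarter/year-end) aliases now carry the trailing 'E':
pd.date_range("2000-01-01", periods=3, freq="ME")          # previously "M"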
@@ -9303,7 +9611,7 @@ 5 18 100 2018-02-11 6 17 40 2018-02-18 7 19 50 2018-02-25 - >>> df.resample('M', on='week_starting').mean() + >>> df.resample('ME', on='week_starting').mean() price volume week_starting 2018-01-31 10.75 62.5 @@ -9353,7 +9661,7 @@ 2000-10-02 00:12:00 18 2000-10-02 00:19:00 21 2000-10-02 00:26:00 24 - Freq: 7T, dtype: int64 + Freq: 7min, dtype: int64 >>> ts.resample('17min').sum() 2000-10-01 23:14:00 0 @@ -9361,7 +9669,7 @@ 2000-10-01 23:48:00 21 2000-10-02 00:05:00 54 2000-10-02 00:22:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 >>> ts.resample('17min', origin='epoch').sum() 2000-10-01 23:18:00 0 @@ -9369,14 +9677,14 @@ 2000-10-01 23:52:00 27 2000-10-02 00:09:00 39 2000-10-02 00:26:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 >>> ts.resample('17min', origin='2000-01-01').sum() 2000-10-01 23:24:00 3 2000-10-01 23:41:00 15 2000-10-01 23:58:00 45 2000-10-02 00:15:00 45 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: @@ -9386,14 +9694,14 @@ 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 >>> ts.resample('17min', offset='23h30min').sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 If you want to take the largest Timestamp as the end of the bins: @@ -9402,7 +9710,7 @@ 2000-10-01 23:52:00 18 2000-10-02 00:09:00 27 2000-10-02 00:26:00 63 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 In contrast with the `start_day`, you can use `end_day` to take the ceiling midnight of the largest Timestamp as the end of the bins and drop the bins @@ -9413,7 +9721,7 @@ 2000-10-01 23:55:00 15 2000-10-02 00:12:00 45 2000-10-02 00:29:00 45 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 """ from pandas.core.resample import get_resampler @@ -9436,6 +9744,30 @@ else: axis = 0 + if kind is not lib.no_default: + # GH#55895 + warnings.warn( + f"The 'kind' keyword in {type(self).__name__}.resample is " + "deprecated and will be removed in a future version. " + "Explicitly cast the index to the desired type instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + kind = None + + if convention is not lib.no_default: + warnings.warn( + f"The 'convention' keyword in {type(self).__name__}.resample is " + "deprecated and will be removed in a future version. " + "Explicitly cast PeriodIndex to DatetimeIndex before resampling " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + convention = "start" + return get_resampler( cast("Series | DataFrame", self), freq=rule, @@ -9467,7 +9799,7 @@ ---------- offset : str, DateOffset or dateutil.relativedelta The offset length of the data that will be selected. For instance, - '1M' will display all the rows having their index within the first month. + '1ME' will display all the rows having their index within the first month. Returns ------- @@ -9586,7 +9918,7 @@ Get the rows for the last 3 days: - >>> ts.last('3D') # doctest: +SKIP + >>> ts.last('3D') # doctest: +SKIP A 2018-04-13 3 2018-04-15 4 @@ -9893,6 +10225,18 @@ copy : bool, default True Always returns new objects. If copy=False and no reindexing is required then original objects are returned. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. 
+ `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` fill_value : scalar, default np.nan Value to use for missing values. Defaults to NaN, but can be any "compatible" value. @@ -10276,6 +10620,7 @@ inplace: bool_t = False, axis: Axis | None = None, level=None, + warn: bool_t = True, ): """ Equivalent to public method `where`, except that `other` is not @@ -10306,7 +10651,14 @@ # make sure we are boolean fill_value = bool(inplace) - cond = cond.fillna(fill_value) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + cond = cond.fillna(fill_value) + cond = cond.infer_objects(copy=False) msg = "Boolean array expected for the condition, not {dtype}" @@ -10399,7 +10751,7 @@ # we may have different type blocks come out of putmask, so # reconstruct the block manager - new_data = self._mgr.putmask(mask=cond, new=other, align=align) + new_data = self._mgr.putmask(mask=cond, new=other, align=align, warn=warn) result = self._constructor_from_mgr(new_data, axes=new_data.axes) return self._update_inplace(result) @@ -10611,6 +10963,23 @@ ChainedAssignmentError, stacklevel=2, ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) + other = common.apply_if_callable(other, self) return self._where(cond, other, inplace, axis, level) @@ -10677,14 +11046,31 @@ ChainedAssignmentError, stacklevel=2, ) + elif ( + not PYPY + and not using_copy_on_write() + and self._is_view_after_cow_rules() + ): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if isinstance(self, ABCSeries) and hasattr(self, "_cacher"): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) cond = common.apply_if_callable(cond, self) + other = common.apply_if_callable(other, self) # see gh-21891 if not hasattr(cond, "__invert__"): cond = np.array(cond) - return self.where( + return self._where( ~cond, other=other, inplace=inplace, @@ -10863,15 +11249,17 @@ raise ValueError(msg) elif isinstance(freq, str): - freq = to_offset(freq) + is_period = isinstance(index, PeriodIndex) + freq = to_offset(freq, is_period=is_period) if isinstance(index, PeriodIndex): orig_freq = to_offset(index.freq) if freq != orig_freq: assert orig_freq is not None # for mypy raise ValueError( - f"Given freq {freq.rule_code} does not match " - f"PeriodIndex freq {orig_freq.rule_code}" + f"Given freq {freq_to_period_freqstr(freq.n, freq.name)} " + f"does not match PeriodIndex freq " + f"{freq_to_period_freqstr(orig_freq.n, orig_freq.name)}" ) new_ax = index.shift(periods) else: @@ -10906,6 +11294,18 @@ copy : bool, default is True, Return a copy of the truncated section. + .. 
note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + Returns ------- type of caller @@ -11062,6 +11462,18 @@ copy : bool, default True Also make a copy of the underlying data. + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + Returns ------- {klass} @@ -11087,7 +11499,7 @@ Pass None to convert to UTC and get a tz-naive index: >>> s = pd.Series([1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) + ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) >>> s.tz_convert(None) 2018-09-14 23:30:00 1 dtype: int64 @@ -11151,6 +11563,18 @@ must be None. copy : bool, default True Also make a copy of the underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' When clocks moved backward due to DST, ambiguous times may arise. For example in Central European Time (UTC+01), when going from @@ -11205,7 +11629,7 @@ Pass None to convert to tz-naive index and preserve local time: >>> s = pd.Series([1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) + ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00'])) >>> s.tz_localize(None) 2018-09-15 01:30:00 1 dtype: int64 @@ -11259,7 +11683,7 @@ 2015-03-29 01:59:59.999999999+01:00 0 2015-03-29 03:30:00+02:00 1 dtype: int64 - >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H')) + >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h')) 2015-03-29 03:30:00+02:00 0 2015-03-29 03:30:00+02:00 1 dtype: int64 @@ -11448,10 +11872,10 @@ Describing a ``DataFrame``. By default only numeric fields are returned. - >>> df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']), + >>> df = pd.DataFrame({'categorical': pd.Categorical(['d', 'e', 'f']), ... 'numeric': [1, 2, 3], ... 'object': ['a', 'b', 'c'] - ... }) + ... }) >>> df.describe() numeric count 3.0 @@ -11595,7 +12019,7 @@ .. deprecated:: 2.1 freq : DateOffset, timedelta, or str, optional - Increment to use from time series API (e.g. 'M' or BDay()). + Increment to use from time series API (e.g. 'ME' or BDay()). **kwargs Additional keyword arguments are passed into `DataFrame.shift` or `Series.shift`. 
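Several methods above gain the same note about the ``copy`` keyword and Copy-on-Write. A minimal sketch of opting in to the future behavior, assuming a pandas 2.x build where the option is available; the Series mirrors the ``tz_convert`` docstring example:

import pandas as pd

pd.options.mode.copy_on_write = True  # the future default in pandas 3.0

s = pd.Series([1], index=pd.DatetimeIndex(["2018-09-15 01:30:00+02:00"]))
out = s.tz_convert(None)   # no eager copy is made; the `copy` keyword is ignored
out.iloc[0] = 10           # mutating the result triggers the deferred copy
# `s` itself is left unchanged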
@@ -11708,19 +12132,20 @@ if limit is lib.no_default: cols = self.items() if self.ndim == 2 else [(None, self)] for _, col in cols: - mask = col.isna().values - mask = mask[np.argmax(~mask) :] - if mask.any(): - warnings.warn( - "The default fill_method='pad' in " - f"{type(self).__name__}.pct_change is deprecated and will " - "be removed in a future version. Either fill in any " - "non-leading NA values prior to calling pct_change or " - "specify 'fill_method=None' to not fill NA values.", - FutureWarning, - stacklevel=find_stack_level(), - ) - break + if len(col) > 0: + mask = col.isna().values + mask = mask[np.argmax(~mask) :] + if mask.any(): + warnings.warn( + "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and " + "will be removed in a future version. Either fill in " + "any non-leading NA values prior to calling pct_change " + "or specify 'fill_method=None' to not fill NA values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + break fill_method = "pad" if limit is lib.no_default: limit = None @@ -11746,7 +12171,7 @@ self, name: str, func, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool_t = False, skipna: bool_t = True, **kwargs, @@ -11759,7 +12184,10 @@ res = self._logical_func( name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs ) - return res._logical_func(name, func, skipna=skipna, **kwargs) + # error: Item "bool" of "Series | bool" has no attribute "_logical_func" + return res._logical_func( # type: ignore[union-attr] + name, func, skipna=skipna, **kwargs + ) elif axis is None: axis = 0 @@ -11788,7 +12216,7 @@ def any( self, - axis: Axis = 0, + axis: Axis | None = 0, bool_only: bool_t = False, skipna: bool_t = True, **kwargs, @@ -12252,14 +12680,26 @@ """ Wrap arithmetic method to operate inplace. """ + warn = True + if not PYPY and warn_copy_on_write(): + if sys.getrefcount(self) <= REF_COUNT + 2: + # we are probably in an inplace setitem context (e.g. df['a'] += 1) + warn = False + result = op(self, other) - if self.ndim == 1 and result._indexed_same(self) and result.dtype == self.dtype: + if ( + self.ndim == 1 + and result._indexed_same(self) + and result.dtype == self.dtype + and not using_copy_on_write() + and not (warn_copy_on_write() and not warn) + ): # GH#36498 this inplace op can _actually_ be inplace. # Item "ArrayManager" of "Union[ArrayManager, SingleArrayManager, # BlockManager, SingleBlockManager]" has no attribute "setitem_inplace" self._mgr.setitem_inplace( # type: ignore[union-attr] - slice(None), result._values + slice(None), result._values, warn=warn ) return self @@ -12359,11 +12799,6 @@ ------- type of index - Notes - ----- - If all elements are non-NA/null, returns None. - Also returns None for empty {klass}. - Examples -------- For Series: @@ -12374,6 +12809,22 @@ >>> s.last_valid_index() 2 + >>> s = pd.Series([None, None]) + >>> print(s.first_valid_index()) + None + >>> print(s.last_valid_index()) + None + + If all elements in Series are NA/null, returns None. + + >>> s = pd.Series() + >>> print(s.first_valid_index()) + None + >>> print(s.last_valid_index()) + None + + If Series is empty, returns None. 
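The ``pct_change`` hunk above restricts the ``fill_method`` deprecation warning to non-empty columns. As a sketch of the two ways a caller avoids the warning (both taken from the warning text in the diff), assuming a 2.2-level pandas:

import numpy as np
import pandas as pd

ser = pd.Series([1.0, np.nan, 2.0, 4.0])

ser.ffill().pct_change()           # fill non-leading NA values up front, or
ser.pct_change(fill_method=None)   # keep the gap and skip the FutureWarning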
+ For DataFrame: >>> df = pd.DataFrame({{'A': [None, None, 2], 'B': [None, 3, 4]}}) @@ -12386,6 +12837,31 @@ 1 >>> df.last_valid_index() 2 + + >>> df = pd.DataFrame({{'A': [None, None, None], 'B': [None, None, None]}}) + >>> df + A B + 0 None None + 1 None None + 2 None None + >>> print(df.first_valid_index()) + None + >>> print(df.last_valid_index()) + None + + If all elements in DataFrame are NA/null, returns None. + + >>> df = pd.DataFrame() + >>> df + Empty DataFrame + Columns: [] + Index: [] + >>> print(df.first_valid_index()) + None + >>> print(df.last_valid_index()) + None + + If DataFrame is empty, returns None. """ return self._find_valid_index(how="first") @@ -12425,6 +12901,39 @@ {examples} """ +_sum_prod_doc = """ +{desc} + +Parameters +---------- +axis : {axis_descr} + Axis for the function to be applied on. + For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.{name} with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar + To retain the old behavior, pass axis=0 (or do not pass axis). + + .. versionadded:: 2.0.0 + +skipna : bool, default True + Exclude NA/null values when computing the result. +numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. + +{min_count}\ +**kwargs + Additional keyword arguments to be passed to the function. + +Returns +------- +{name1} or scalar\ +{see_also}\ +{examples} +""" + _num_ddof_doc = """ {desc} @@ -12432,6 +12941,13 @@ ---------- axis : {axis_descr} For `Series` this parameter is unused and defaults to 0. + + .. warning:: + + The behavior of DataFrame.{name} with ``axis=None`` is deprecated, + in a future version this will reduce over both axes and return a scalar + To retain the old behavior, pass axis=0 (or do not pass axis). + skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -12490,9 +13006,9 @@ Examples -------- >>> df = pd.DataFrame({'person_id': [0, 1, 2, 3], -... 'age': [21, 25, 62, 43], -... 'height': [1.61, 1.87, 1.49, 2.01]} -... ).set_index('person_id') +... 'age': [21, 25, 62, 43], +... 'height': [1.61, 1.87, 1.49, 2.01]} +... ).set_index('person_id') >>> df age height person_id @@ -13139,7 +13655,7 @@ kwargs = {"min_count": ""} elif name == "sum": - base_doc = _num_doc + base_doc = _sum_prod_doc desc = ( "Return the sum of the values over the requested axis.\n\n" "This is equivalent to the method ``numpy.sum``." @@ -13149,7 +13665,7 @@ kwargs = {"min_count": _min_count_stub} elif name == "prod": - base_doc = _num_doc + base_doc = _sum_prod_doc desc = "Return the product of the values over the requested axis." see_also = _stat_func_see_also examples = _prod_examples @@ -13318,7 +13834,7 @@ With a DataFrame >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [2, 3, 4], 'c': [1, 3, 5]}, - ... index=['tiger', 'zebra', 'cow']) + ... index=['tiger', 'zebra', 'cow']) >>> df a b c tiger 1 2 1 @@ -13342,7 +13858,7 @@ getting an error. >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': ['T', 'Z', 'X']}, - ... index=['tiger', 'zebra', 'cow']) + ... 
index=['tiger', 'zebra', 'cow']) >>> df.skew(numeric_only=True) a 0.0 dtype: float64""" @@ -13433,6 +13949,7 @@ docstr = base_doc.format( desc=desc, + name=name, name1=name1, name2=name2, axis_descr=axis_descr, diff -Nru pandas-2.1.4+dfsg/pandas/core/groupby/generic.py pandas-2.2.2+dfsg/pandas/core/groupby/generic.py --- pandas-2.1.4+dfsg/pandas/core/groupby/generic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/groupby/generic.py 2024-04-10 17:42:52.000000000 +0000 @@ -28,6 +28,7 @@ Interval, lib, ) +from pandas._libs.hashtable import duplicated from pandas.errors import SpecificationError from pandas.util._decorators import ( Appender, @@ -84,6 +85,7 @@ default_index, ) from pandas.core.series import Series +from pandas.core.sorting import get_group_index from pandas.core.util.numba_ import maybe_use_numba from pandas.plotting import boxplot_frame_groupby @@ -158,7 +160,7 @@ def _get_data_to_aggregate( self, *, numeric_only: bool = False, name: str | None = None ) -> SingleManager: - ser = self._selected_obj + ser = self._obj_with_exclusions single = ser._mgr if numeric_only and not is_numeric_dtype(ser.dtype): # GH#41291 match Series behavior @@ -281,11 +283,11 @@ return self.obj._constructor( [], name=self.obj.name, - index=self.grouper.result_index, + index=self._grouper.result_index, dtype=obj.dtype, ) - if self.grouper.nkeys > 1: + if self._grouper.nkeys > 1: return self._python_agg_general(func, *args, **kwargs) try: @@ -307,7 +309,7 @@ ) # result is a dict whose keys are the elements of result_index - result = Series(result, index=self.grouper.result_index) + result = Series(result, index=self._grouper.result_index) result = self._wrap_aggregated_output(result) return result @@ -322,7 +324,7 @@ f = lambda x: func(x, *args, **kwargs) obj = self._obj_with_exclusions - result = self.grouper.agg_series(obj, f) + result = self._grouper.agg_series(obj, f) res = obj._constructor(result, name=obj.name) return self._wrap_aggregated_output(res) @@ -402,7 +404,7 @@ # GH#47787 see test_group_on_empty_multiindex res_index = data.index else: - res_index = self.grouper.result_index + res_index = self._grouper.result_index return self.obj._constructor( [], @@ -414,7 +416,7 @@ if isinstance(values[0], dict): # GH #823 #24880 - index = self.grouper.result_index + index = self._grouper.result_index res_df = self.obj._constructor_expanddim(values, index=index) res_df = self._reindex_output(res_df) # if self.observed is False, @@ -437,7 +439,7 @@ else: # GH #6265 #24880 result = self.obj._constructor( - data=values, index=self.grouper.result_index, name=self.obj.name + data=values, index=self._grouper.result_index, name=self.obj.name ) if not self.as_index: result = self._insert_inaxis_grouper(result) @@ -450,8 +452,8 @@ result = {} initialized = False - for name, group in self.grouper.get_iterator( - self._selected_obj, axis=self.axis + for name, group in self._grouper.get_iterator( + self._obj_with_exclusions, axis=self.axis ): # needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations object.__setattr__(group, "name", name) @@ -468,10 +470,9 @@ __examples_series_doc = dedent( """ - >>> ser = pd.Series( - ... [390.0, 350.0, 30.0, 20.0], - ... index=["Falcon", "Falcon", "Parrot", "Parrot"], - ... name="Max Speed") + >>> ser = pd.Series([390.0, 350.0, 30.0, 20.0], + ... index=["Falcon", "Falcon", "Parrot", "Parrot"], + ... 
name="Max Speed") >>> grouped = ser.groupby([1, 1, 2, 2]) >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) Falcon 0.707107 @@ -522,10 +523,10 @@ ): assert axis == 0 # handled by caller - obj = self._selected_obj + obj = self._obj_with_exclusions try: - result = self.grouper._cython_operation( + result = self._grouper._cython_operation( "transform", obj._values, how, axis, **kwargs ) except NotImplementedError as err: @@ -548,8 +549,8 @@ klass = type(self.obj) results = [] - for name, group in self.grouper.get_iterator( - self._selected_obj, axis=self.axis + for name, group in self._grouper.get_iterator( + self._obj_with_exclusions, axis=self.axis ): # this setattr is needed for test_transform_lambda_with_datetimetz object.__setattr__(group, "name", name) @@ -620,8 +621,8 @@ try: indices = [ self._get_index(name) - for name, group in self.grouper.get_iterator( - self._selected_obj, axis=self.axis + for name, group in self._grouper.get_iterator( + self._obj_with_exclusions, axis=self.axis ) if true_and_notna(group) ] @@ -672,49 +673,33 @@ 2023-02-01 1 Freq: MS, dtype: int64 """ - ids, _, _ = self.grouper.group_info - + ids, _, ngroups = self._grouper.group_info val = self.obj._values + codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False) - codes, _ = algorithms.factorize(val, sort=False) - sorter = np.lexsort((codes, ids)) - codes = codes[sorter] - ids = ids[sorter] - - # group boundaries are where group ids change - # unique observations are where sorted values change - idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] - inc = np.r_[1, codes[1:] != codes[:-1]] + if self._grouper.has_dropped_na: + mask = ids >= 0 + ids = ids[mask] + codes = codes[mask] + + group_index = get_group_index( + labels=[ids, codes], + shape=(ngroups, len(uniques)), + sort=False, + xnull=dropna, + ) - # 1st item of each group is a new unique observation - mask = codes == -1 if dropna: - inc[idx] = 1 - inc[mask] = 0 - else: - inc[mask & np.r_[False, mask[:-1]]] = 0 - inc[idx] = 1 - - out = np.add.reduceat(inc, idx).astype("int64", copy=False) - if len(ids): - # NaN/NaT group exists if the head of ids is -1, - # so remove it from res and exclude its index from idx - if ids[0] == -1: - res = out[1:] - idx = idx[np.flatnonzero(idx)] - else: - res = out - else: - res = out[1:] - ri = self.grouper.result_index - - # we might have duplications among the bins - if len(res) != len(ri): - res, out = np.zeros(len(ri), dtype=out.dtype), res - if len(ids) > 0: - # GH#21334s - res[ids[idx]] = out + mask = group_index >= 0 + if (~mask).any(): + ids = ids[mask] + group_index = group_index[mask] + + mask = duplicated(group_index, "first") + res = np.bincount(ids[~mask], minlength=ngroups) + res = ensure_int64(res) + ri = self._grouper.result_index result: Series | DataFrame = self.obj._constructor( res, index=ri, name=self.obj.name ) @@ -724,8 +709,10 @@ return self._reindex_output(result, fill_value=0) @doc(Series.describe) - def describe(self, **kwargs): - return super().describe(**kwargs) + def describe(self, percentiles=None, include=None, exclude=None) -> Series: + return super().describe( + percentiles=percentiles, include=include, exclude=exclude + ) def value_counts( self, @@ -747,10 +734,10 @@ from pandas.core.reshape.merge import get_join_indexers from pandas.core.reshape.tile import cut - ids, _, _ = self.grouper.group_info + ids, _, _ = self._grouper.group_info val = self.obj._values - index_names = self.grouper.names + [self.obj.name] + index_names = self._grouper.names + 
[self.obj.name] if isinstance(val.dtype, CategoricalDtype) or ( bins is not None and not np.iterable(bins) @@ -773,6 +760,7 @@ mask = ids != -1 ids, val = ids[mask], val[mask] + lab: Index | np.ndarray if bins is None: lab, lev = algorithms.factorize(val, sort=True) llab = lambda lab, inc: lab[inc] @@ -816,9 +804,9 @@ rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - codes = self.grouper.reconstructed_codes + codes = self._grouper.reconstructed_codes codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] - levels = [ping.group_index for ping in self.grouper.groupings] + [lev] + levels = [ping._group_index for ping in self._grouper.groupings] + [lev] if dropna: mask = codes[-1] != -1 @@ -862,7 +850,8 @@ _, idx = get_join_indexers( left, right, sort=False, how="left" # type: ignore[arg-type] ) - out = np.where(idx != -1, out[idx], 0) + if idx is not None: + out = np.where(idx != -1, out[idx], 0) if sort: sorter = np.lexsort((out if ascending else -out, left[0])) @@ -898,6 +887,12 @@ """ Fill NA/NaN values using the specified method within groups. + .. deprecated:: 2.2.0 + This method is deprecated and will be removed in a future version. + Use the :meth:`.SeriesGroupBy.ffill` or :meth:`.SeriesGroupBy.bfill` + for forward or backward filling instead. If you want to fill with a + single value, use :meth:`Series.fillna` instead. + Parameters ---------- value : scalar, dict, Series, or DataFrame @@ -912,17 +907,8 @@ Method to use for filling holes. ``'ffill'`` will propagate the last valid observation forward within a group. ``'bfill'`` will use next valid observation to fill the gap. - - .. deprecated:: 2.1.0 - Use obj.ffill or obj.bfill instead. - axis : {0 or 'index', 1 or 'columns'} Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - inplace : bool, default False Broken. Do not set to True. limit : int, default None @@ -937,8 +923,6 @@ or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). - .. deprecated:: 2.1.0 - Returns ------- Series @@ -970,6 +954,14 @@ mouse 0.0 dtype: float64 """ + warnings.warn( + f"{type(self).__name__}.fillna is deprecated and " + "will be removed in a future version. Use obj.ffill() or obj.bfill() " + "for forward or backward filling instead. If you want to fill with a " + f"single value, use {type(self.obj).__name__}.fillna instead", + FutureWarning, + stacklevel=find_stack_level(), + ) result = self._op_via_apply( "fillna", value=value, @@ -1155,7 +1147,7 @@ @property @doc(Series.plot.__doc__) - def plot(self): + def plot(self) -> GroupByPlot: result = GroupByPlot(self) return result @@ -1164,7 +1156,7 @@ self, n: int = 5, keep: Literal["first", "last", "all"] = "first" ) -> Series: f = partial(Series.nlargest, n=n, keep=keep) - data = self._selected_obj + data = self._obj_with_exclusions # Don't change behavior if result index happens to be the same, i.e. # already ordered and n >= all group sizes. result = self._python_apply_general(f, data, not_indexed_same=True) @@ -1175,7 +1167,7 @@ self, n: int = 5, keep: Literal["first", "last", "all"] = "first" ) -> Series: f = partial(Series.nsmallest, n=n, keep=keep) - data = self._selected_obj + data = self._obj_with_exclusions # Don't change behavior if result index happens to be the same, i.e. # already ordered and n >= all group sizes. 
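``SeriesGroupBy.fillna`` (and its DataFrame counterpart later in this diff) now warns unconditionally. A sketch of the replacements named in the warning message, assuming a 2.2-level pandas; the Series is illustrative only:

import numpy as np
import pandas as pd

ser = pd.Series([np.nan, 1.0, np.nan, 2.0], index=["a", "a", "b", "b"])

ser.groupby(level=0).ffill()   # forward fill within each group
ser.groupby(level=0).bfill()   # backward fill within each group
ser.fillna(0.0)                # a single fill value does not need the groupby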
result = self._python_apply_general(f, data, not_indexed_same=True) @@ -1185,15 +1177,13 @@ def idxmin( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - result = self._op_via_apply("idxmin", axis=axis, skipna=skipna) - return result.astype(self.obj.index.dtype) if result.empty else result + return self._idxmax_idxmin("idxmin", axis=axis, skipna=skipna) @doc(Series.idxmax.__doc__) def idxmax( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True ) -> Series: - result = self._op_via_apply("idxmax", axis=axis, skipna=skipna) - return result.astype(self.obj.index.dtype) if result.empty else result + return self._idxmax_idxmin("idxmax", axis=axis, skipna=skipna) @doc(Series.corr.__doc__) def corr( @@ -1341,14 +1331,10 @@ """ Examples -------- - >>> df = pd.DataFrame( - ... { - ... "A": [1, 1, 2, 2], + >>> data = {"A": [1, 1, 2, 2], ... "B": [1, 2, 3, 4], - ... "C": [0.362838, 0.227877, 1.267767, -0.562860], - ... } - ... ) - + ... "C": [0.362838, 0.227877, 1.267767, -0.562860]} + >>> df = pd.DataFrame(data) >>> df A B C 0 1 1 0.362838 @@ -1403,7 +1389,8 @@ >>> df.groupby("A").agg( ... b_min=pd.NamedAgg(column="B", aggfunc="min"), - ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum") + ... ) b_min c_sum A 1 1 0.590715 @@ -1474,7 +1461,7 @@ func, *args, engine_kwargs=engine_kwargs, **kwargs ) # grouper specific aggregations - if self.grouper.nkeys > 1: + if self._grouper.nkeys > 1: # test_groupby_as_index_series_scalar gets here with 'not self.as_index' return self._python_agg_general(func, *args, **kwargs) elif args or kwargs: @@ -1542,7 +1529,7 @@ output: dict[int, ArrayLike] = {} for idx, (name, ser) in enumerate(obj.items()): - result = self.grouper.agg_series(ser, f) + result = self._grouper.agg_series(ser, f) output[idx] = result res = self.obj._constructor(output) @@ -1550,17 +1537,17 @@ return self._wrap_aggregated_output(res) def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: - if self.grouper.nkeys != 1: + if self._grouper.nkeys != 1: raise AssertionError("Number of keys must be 1") obj = self._obj_with_exclusions result: dict[Hashable, NDFrame | np.ndarray] = {} - for name, grp_df in self.grouper.get_iterator(obj, self.axis): + for name, grp_df in self._grouper.get_iterator(obj, self.axis): fres = func(grp_df, *args, **kwargs) result[name] = fres - result_index = self.grouper.result_index + result_index = self._grouper.result_index other_ax = obj.axes[1 - self.axis] out = self.obj._constructor(result, index=other_ax, columns=result_index) if self.axis == 0: @@ -1580,7 +1567,7 @@ # GH#47787 see test_group_on_empty_multiindex res_index = data.index else: - res_index = self.grouper.result_index + res_index = self._grouper.result_index result = self.obj._constructor(index=res_index, columns=data.columns) result = result.astype(data.dtypes, copy=False) @@ -1600,7 +1587,7 @@ is_transform=is_transform, ) - key_index = self.grouper.result_index if self.as_index else None + key_index = self._grouper.result_index if self.as_index else None if isinstance(first_not_none, (np.ndarray, Index)): # GH#1738: values is list of arrays of unequal lengths @@ -1706,7 +1693,7 @@ ) def arr_func(bvalues: ArrayLike) -> ArrayLike: - return self.grouper._cython_operation( + return self._grouper._cython_operation( "transform", bvalues, how, 1, **kwargs ) @@ -1728,7 +1715,7 @@ applied = [] obj = self._obj_with_exclusions - gen = self.grouper.get_iterator(obj, axis=self.axis) + gen = 
self._grouper.get_iterator(obj, axis=self.axis) fast_path, slow_path = self._define_paths(func, *args, **kwargs) # Determine whether to use slow or fast path by evaluating on the first group. @@ -1922,7 +1909,7 @@ indices = [] obj = self._selected_obj - gen = self.grouper.get_iterator(obj, axis=self.axis) + gen = self._grouper.get_iterator(obj, axis=self.axis) for name, group in gen: # 2023-02-27 no tests are broken this pinning, but it is documented in the @@ -1984,7 +1971,7 @@ self.keys, axis=self.axis, level=self.level, - grouper=self.grouper, + grouper=self._grouper, exclusions=self.exclusions, selection=key, as_index=self.as_index, @@ -2000,7 +1987,7 @@ subset, self.keys, level=self.level, - grouper=self.grouper, + grouper=self._grouper, exclusions=self.exclusions, selection=key, as_index=self.as_index, @@ -2022,7 +2009,7 @@ mgr = obj._mgr if numeric_only: - mgr = mgr.get_numeric_data(copy=False) + mgr = mgr.get_numeric_data() return mgr def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: @@ -2037,7 +2024,7 @@ SeriesGroupBy( obj.iloc[:, i], selection=colname, - grouper=self.grouper, + grouper=self._grouper, exclusions=self.exclusions, observed=self.observed, ) @@ -2047,7 +2034,7 @@ if not len(results): # concat would raise - res_df = DataFrame([], columns=columns, index=self.grouper.result_index) + res_df = DataFrame([], columns=columns, index=self._grouper.result_index) else: res_df = concat(results, keys=columns, axis=1) @@ -2164,7 +2151,7 @@ >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) + ... index=['Pork', 'Wheat Products', 'Beef']) >>> df consumption co2_emissions @@ -2187,22 +2174,9 @@ Beef co2_emissions dtype: object """ - if axis is not lib.no_default: - if axis is None: - axis = self.axis - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "idxmax") - else: - axis = self.axis - - def func(df): - return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only) - - func.__name__ = "idxmax" - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True + return self._idxmax_idxmin( + "idxmax", axis=axis, numeric_only=numeric_only, skipna=skipna ) - return result.astype(self.obj.index.dtype) if result.empty else result def idxmin( self, @@ -2259,7 +2233,7 @@ >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48], ... 'co2_emissions': [37.2, 19.66, 1712]}, - ... index=['Pork', 'Wheat Products', 'Beef']) + ... index=['Pork', 'Wheat Products', 'Beef']) >>> df consumption co2_emissions @@ -2282,22 +2256,9 @@ Beef consumption dtype: object """ - if axis is not lib.no_default: - if axis is None: - axis = self.axis - axis = self.obj._get_axis_number(axis) - self._deprecate_axis(axis, "idxmin") - else: - axis = self.axis - - def func(df): - return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only) - - func.__name__ = "idxmin" - result = self._python_apply_general( - func, self._obj_with_exclusions, not_indexed_same=True + return self._idxmax_idxmin( + "idxmin", axis=axis, numeric_only=numeric_only, skipna=skipna ) - return result.astype(self.obj.index.dtype) if result.empty else result boxplot = boxplot_frame_groupby @@ -2355,9 +2316,9 @@ Examples -------- >>> df = pd.DataFrame({ - ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], - ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], - ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] + ... 
'gender': ['male', 'male', 'female', 'male', 'female', 'male'], + ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], + ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] ... }) >>> df @@ -2426,6 +2387,12 @@ """ Fill NA/NaN values using the specified method within groups. + .. deprecated:: 2.2.0 + This method is deprecated and will be removed in a future version. + Use the :meth:`.DataFrameGroupBy.ffill` or :meth:`.DataFrameGroupBy.bfill` + for forward or backward filling instead. If you want to fill with a + single value, use :meth:`DataFrame.fillna` instead. + Parameters ---------- value : scalar, dict, Series, or DataFrame @@ -2446,11 +2413,6 @@ the same results as :meth:`.DataFrame.fillna`. When the :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0`` or ``axis=1`` here will produce the same results. - - .. deprecated:: 2.1.0 - For axis=1, operate on the underlying object instead. Otherwise - the axis keyword is not necessary. - inplace : bool, default False Broken. Do not set to True. limit : int, default None @@ -2465,8 +2427,6 @@ or the string 'infer' which will try to downcast to an appropriate equal type (e.g. float64 to int64 if possible). - .. deprecated:: 2.1.0 - Returns ------- DataFrame @@ -2541,14 +2501,14 @@ 3 3.0 NaN 2.0 4 3.0 NaN NaN """ - if method is not None: - warnings.warn( - f"{type(self).__name__}.fillna with 'method' is deprecated and " - "will raise in a future version. Use obj.ffill() or obj.bfill() " - "instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) + warnings.warn( + f"{type(self).__name__}.fillna is deprecated and " + "will be removed in a future version. Use obj.ffill() or obj.bfill() " + "for forward or backward filling instead. If you want to fill with a " + f"single value, use {type(self.obj).__name__}.fillna instead", + FutureWarning, + stacklevel=find_stack_level(), + ) result = self._op_via_apply( "fillna", diff -Nru pandas-2.1.4+dfsg/pandas/core/groupby/groupby.py pandas-2.2.2+dfsg/pandas/core/groupby/groupby.py --- pandas-2.1.4+dfsg/pandas/core/groupby/groupby.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/groupby/groupby.py 2024-04-10 17:42:52.000000000 +0000 @@ -86,9 +86,11 @@ is_object_dtype, is_scalar, needs_i8_conversion, + pandas_dtype, ) from pandas.core.dtypes.missing import ( isna, + na_value_for_dtype, notna, ) @@ -147,6 +149,7 @@ if TYPE_CHECKING: from typing import Any + from pandas.core.resample import Resampler from pandas.core.window import ( ExpandingGroupby, ExponentialMovingWindowGroupby, @@ -183,6 +186,19 @@ A callable that takes a {input} as its first argument, and returns a dataframe, a series or a scalar. In addition the callable may take positional and keyword arguments. + include_groups : bool, default True + When True, will attempt to apply ``func`` to the groupings in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded when applying ``func``. + + .. versionadded:: 2.2.0 + + .. deprecated:: 2.2.0 + + Setting include_groups to True is deprecated. Only the value + False will be allowed in a future version of pandas. + args, kwargs : tuple and dict Optional positional and keyword arguments to pass to ``func``. @@ -217,8 +233,8 @@ """, "dataframe_examples": """ >>> df = pd.DataFrame({'A': 'a a b'.split(), - ... 'B': [1,2,3], - ... 'C': [4,6,5]}) + ... 'B': [1, 2, 3], + ... 
'C': [4, 6, 5]}) >>> g1 = df.groupby('A', group_keys=False) >>> g2 = df.groupby('A', group_keys=True) @@ -275,7 +291,7 @@ each group together into a Series, including setting the index as appropriate: - >>> g1.apply(lambda x: x.C.max() - x.B.min()) + >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False) A a 5 b 2 @@ -298,7 +314,7 @@ The resulting dtype will reflect the return value of the passed ``func``. - >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2) + >>> g1.apply(lambda x: x * 2 if x.name == 'a' else x / 2) a 0.0 a 2.0 b 1.0 @@ -307,7 +323,7 @@ In the above, the groups are not part of the index. We can have them included by using ``g2`` where ``group_keys=True``: - >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2) + >>> g2.apply(lambda x: x * 2 if x.name == 'a' else x / 2) a a 0.0 a 2.0 b b 1.0 @@ -406,14 +422,18 @@ functions that expect Series, DataFrames, GroupBy or Resampler objects. Instead of writing ->>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) # doctest: +SKIP +>>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3 +>>> g = lambda x, arg1: x * 5 / arg1 +>>> f = lambda x: x ** 4 +>>> df = pd.DataFrame([["a", 4], ["b", 5]], columns=["group", "value"]) +>>> h(g(f(df.groupby('group')), arg1=1), arg2=2, arg3=3) # doctest: +SKIP You can write >>> (df.groupby('group') ... .pipe(f) -... .pipe(g, arg1=a) -... .pipe(h, arg2=b, arg3=c)) # doctest: +SKIP +... .pipe(g, arg1=1) +... .pipe(h, arg2=2, arg3=3)) # doctest: +SKIP which is much more readable. @@ -757,7 +777,7 @@ } axis: AxisInt - grouper: ops.BaseGrouper + _grouper: ops.BaseGrouper keys: _KeysArgType | None = None level: IndexLabel | None = None group_keys: bool @@ -773,6 +793,17 @@ @final @property + def grouper(self) -> ops.BaseGrouper: + warnings.warn( + f"{type(self).__name__}.grouper is deprecated and will be removed in a " + "future version of pandas.", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._grouper + + @final + @property def groups(self) -> dict[Hashable, np.ndarray]: """ Dict {group name -> group labels}. @@ -817,12 +848,12 @@ >>> ser.resample('MS').groups {Timestamp('2023-01-01 00:00:00'): 2, Timestamp('2023-02-01 00:00:00'): 4} """ - return self.grouper.groups + return self._grouper.groups @final @property def ngroups(self) -> int: - return self.grouper.ngroups + return self._grouper.ngroups @final @property @@ -872,7 +903,7 @@ defaultdict(, {Timestamp('2023-01-01 00:00:00'): [0, 1], Timestamp('2023-02-01 00:00:00'): [2, 3]}) """ - return self.grouper.indices + return self._grouper.indices @final def _get_indices(self, names): @@ -947,7 +978,7 @@ return self.obj[self._selection] # Otherwise _selection is equivalent to _selection_list, so - # _selected_obj matches _obj_with_exclusions, so we can re-use + # _selected_obj matches _obj_with_exclusions, so we can reuse # that and avoid making a copy. 
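The rewritten ``apply`` examples above pass ``include_groups=False``, the keyword added in 2.2.0. A minimal sketch of the difference, assuming a grouping key that is also a column of the frame (the data mirrors the docstring example):

import pandas as pd

df = pd.DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})
g = df.groupby("A", group_keys=False)

# Excludes the grouping column "A" from each group frame; this is the only
# behavior that will remain in a future version.
g.apply(lambda x: x.C.max() - x.B.min(), include_groups=False)

# The default include_groups=True keeps the old behavior and, per the hunk
# further below, emits a DeprecationWarning when grouping columns would be
# handed to ``func``.
g.apply(lambda x: x.C.max() - x.B.min())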
return self._obj_with_exclusions @@ -1038,7 +1069,7 @@ owl 1 2 3 toucan 1 5 6 eagle 7 8 9 - >>> df.groupby(by=["a"]).get_group(1) + >>> df.groupby(by=["a"]).get_group((1,)) a b c owl 1 2 3 toucan 1 5 6 @@ -1058,6 +1089,26 @@ 2023-01-15 2 dtype: int64 """ + keys = self.keys + level = self.level + # mypy doesn't recognize level/keys as being sized when passed to len + if (is_list_like(level) and len(level) == 1) or ( # type: ignore[arg-type] + is_list_like(keys) and len(keys) == 1 # type: ignore[arg-type] + ): + # GH#25971 + if isinstance(name, tuple) and len(name) == 1: + # Allow users to pass tuples of length 1 to silence warning + name = name[0] + elif not isinstance(name, tuple): + warnings.warn( + "When grouping with a length-1 list-like, " + "you will need to pass a length-1 tuple to get_group in a future " + "version of pandas. Pass `(name,)` instead of `name` to silence " + "this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + inds = self._get_index(name) if not len(inds): raise KeyError(name) @@ -1149,7 +1200,7 @@ """ keys = self.keys level = self.level - result = self.grouper.get_iterator(self._selected_obj, axis=self.axis) + result = self._grouper.get_iterator(self._selected_obj, axis=self.axis) # error: Argument 1 to "len" has incompatible type "Hashable"; expected "Sized" if is_list_like(level) and len(level) == 1: # type: ignore[arg-type] # GH 51583 @@ -1239,7 +1290,7 @@ more """ - grouper: ops.BaseGrouper + _grouper: ops.BaseGrouper as_index: bool @final @@ -1300,7 +1351,7 @@ self.obj = obj self.axis = obj._get_axis_number(axis) - self.grouper = grouper + self._grouper = grouper self.exclusions = frozenset(exclusions) if exclusions else frozenset() def __getattr__(self, attr: str): @@ -1378,7 +1429,7 @@ not_indexed_same=not is_transform, ) - if self.grouper.has_dropped_na and is_transform: + if self._grouper.has_dropped_na and is_transform: # result will have dropped rows due to nans, fill with null # and ensure index is ordered same as the input result = self._set_result_index_ordered(result) @@ -1399,9 +1450,9 @@ if self.group_keys and not is_transform: if self.as_index: # possible MI return case - group_keys = self.grouper.result_index - group_levels = self.grouper.levels - group_names = self.grouper.names + group_keys = self._grouper.result_index + group_levels = self._grouper.levels + group_names = self._grouper.names result = concat( values, @@ -1422,7 +1473,7 @@ ax = self._selected_obj._get_axis(self.axis) if self.dropna: - labels = self.grouper.group_info[0] + labels = self._grouper.group_info[0] mask = labels != -1 ax = ax[mask] @@ -1431,7 +1482,7 @@ # when the ax has duplicates # so we resort to this # GH 14776, 30667 - # TODO: can we re-use e.g. _reindex_non_unique? + # TODO: can we reuse e.g. _reindex_non_unique? if ax.has_duplicates and not result.axes[self.axis].equals(ax): # e.g. 
test_category_order_transformer target = algorithms.unique1d(ax._values) @@ -1464,16 +1515,16 @@ obj_axis = self.obj._get_axis(self.axis) - if self.grouper.is_monotonic and not self.grouper.has_dropped_na: + if self._grouper.is_monotonic and not self._grouper.has_dropped_na: # shortcut if we have an already ordered grouper result = result.set_axis(obj_axis, axis=self.axis, copy=False) return result # row order is scrambled => sort the rows by position in original index - original_positions = Index(self.grouper.result_ilocs()) + original_positions = Index(self._grouper.result_ilocs()) result = result.set_axis(original_positions, axis=self.axis, copy=False) result = result.sort_index(axis=self.axis) - if self.grouper.has_dropped_na: + if self._grouper.has_dropped_na: # Add back in any missing rows due to dropna - index here is integral # with values referring to the row of the input so can use RangeIndex result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis) @@ -1489,9 +1540,9 @@ # zip in reverse so we can always insert at loc 0 columns = result.columns for name, lev, in_axis in zip( - reversed(self.grouper.names), - reversed(self.grouper.get_group_levels()), - reversed([grp.in_axis for grp in self.grouper.groupings]), + reversed(self._grouper.names), + reversed(self._grouper.get_group_levels()), + reversed([grp.in_axis for grp in self._grouper.groupings]), ): # GH #28549 # When using .apply(-), name will be in columns already @@ -1549,10 +1600,10 @@ # enforced in __init__ result = self._insert_inaxis_grouper(result) result = result._consolidate() - index = Index(range(self.grouper.ngroups)) + index = Index(range(self._grouper.ngroups)) else: - index = self.grouper.result_index + index = self._grouper.result_index if qs is not None: # We get here with len(qs) != 1 and not self.as_index @@ -1580,20 +1631,20 @@ @final def _numba_prep(self, data: DataFrame): - ids, _, ngroups = self.grouper.group_info - sorted_index = self.grouper._sort_idx - sorted_ids = self.grouper._sorted_ids + ids, _, ngroups = self._grouper.group_info + sorted_index = self._grouper._sort_idx + sorted_ids = self._grouper._sorted_ids sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() # GH 46867 index_data = data.index if isinstance(index_data, MultiIndex): - if len(self.grouper.groupings) > 1: + if len(self._grouper.groupings) > 1: raise NotImplementedError( "Grouping with more than 1 grouping labels and " "a MultiIndex is not supported with engine='numba'" ) - group_key = self.grouper.groupings[0].name + group_key = self._grouper.groupings[0].name index_data = index_data.get_level_values(group_key) sorted_index_data = index_data.take(sorted_index).to_numpy() @@ -1634,13 +1685,13 @@ ) # Pass group ids to kernel directly if it can handle it # (This is faster since it doesn't require a sort) - ids, _, _ = self.grouper.group_info - ngroups = self.grouper.ngroups + ids, _, _ = self._grouper.group_info + ngroups = self._grouper.ngroups res_mgr = df._mgr.apply( aggregator, labels=ids, ngroups=ngroups, **aggregator_kwargs ) - res_mgr.axes[1] = self.grouper.result_index + res_mgr.axes[1] = self._grouper.result_index result = df._constructor_from_mgr(res_mgr, axes=res_mgr.axes) if data.ndim == 1: @@ -1711,7 +1762,7 @@ len(df.columns), *args, ) - index = self.grouper.result_index + index = self._grouper.result_index if data.ndim == 1: result_kwargs = {"name": data.name} result = result.ravel() @@ -1731,7 +1782,7 @@ input="dataframe", examples=_apply_docs["dataframe_examples"] ) ) - def apply(self, func, 
*args, **kwargs) -> NDFrameT: + def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: orig_func = func func = com.is_builtin_func(func) if orig_func != func: @@ -1764,10 +1815,25 @@ else: f = func + if not include_groups: + return self._python_apply_general(f, self._obj_with_exclusions) + # ignore SettingWithCopy here in case the user mutates with option_context("mode.chained_assignment", None): try: result = self._python_apply_general(f, self._selected_obj) + if ( + not isinstance(self.obj, Series) + and self._selection is None + and self._selected_obj.shape != self._obj_with_exclusions.shape + ): + warnings.warn( + message=_apply_groupings_depr.format( + type(self).__name__, "apply" + ), + category=DeprecationWarning, + stacklevel=find_stack_level(), + ) except TypeError: # gh-20949 # try again, with .apply acting as a filtering @@ -1816,7 +1882,7 @@ Series or DataFrame data after applying f """ - values, mutated = self.grouper.apply_groupwise(f, data, self.axis) + values, mutated = self._grouper.apply_groupwise(f, data, self.axis) if not_indexed_same is None: not_indexed_same = mutated @@ -1834,13 +1900,15 @@ min_count: int = -1, *, alias: str, - npfunc: Callable, + npfunc: Callable | None = None, + **kwargs, ): result = self._cython_agg_general( how=alias, alt=npfunc, numeric_only=numeric_only, min_count=min_count, + **kwargs, ) return result.__finalize__(self.obj, method="groupby") @@ -1871,7 +1939,7 @@ # should always be preserved by the implemented aggregations # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype? try: - res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True) + res_values = self._grouper.agg_series(ser, alt, preserve_dtype=True) except Exception as err: msg = f"agg function failed [how->{how},dtype->{ser.dtype}]" # preserve the kind of exception that raised @@ -1890,7 +1958,7 @@ def _cython_agg_general( self, how: str, - alt: Callable, + alt: Callable | None = None, numeric_only: bool = False, min_count: int = -1, **kwargs, @@ -1902,7 +1970,7 @@ def array_func(values: ArrayLike) -> ArrayLike: try: - result = self.grouper._cython_operation( + result = self._grouper._cython_operation( "aggregate", values, how, @@ -1918,16 +1986,19 @@ # TODO: avoid special casing SparseArray here if how in ["any", "all"] and isinstance(values, SparseArray): pass - elif how in ["any", "all", "std", "sem"]: + elif alt is None or how in ["any", "all", "std", "sem"]: raise # TODO: re-raise as TypeError? should not be reached else: return result + assert alt is not None result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt) return result new_mgr = data.grouped_reduce(array_func) res = self._wrap_agged_manager(new_mgr) + if how in ["idxmin", "idxmax"]: + res = self._wrap_idxmax_idxmin(res) out = self._wrap_aggregated_output(res) if self.axis == 1: out = out.infer_objects(copy=False) @@ -1966,11 +2037,13 @@ # If func is a reduction, we need to broadcast the # result to the whole group. Compute func result # and deal with possible broadcasting below. - # Temporarily set observed for dealing with categoricals. 
- with com.temp_setattr(self, "observed", True): - with com.temp_setattr(self, "as_index", True): - # GH#49834 - result needs groups in the index for - # _wrap_transform_fast_result + with com.temp_setattr(self, "as_index", True): + # GH#49834 - result needs groups in the index for + # _wrap_transform_fast_result + if func in ["idxmin", "idxmax"]: + func = cast(Literal["idxmin", "idxmax"], func) + result = self._idxmax_idxmin(func, True, *args, **kwargs) + else: if engine is not None: kwargs["engine"] = engine kwargs["engine_kwargs"] = engine_kwargs @@ -1986,8 +2059,8 @@ obj = self._obj_with_exclusions # for each col, reshape to size of original frame by take operation - ids, _, _ = self.grouper.group_info - result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False) + ids, _, _ = self._grouper.group_info + result = result.reindex(self._grouper.result_index, axis=self.axis, copy=False) if self.obj.ndim == 1: # i.e. SeriesGroupBy @@ -2039,7 +2112,7 @@ this is currently implementing sort=False (though the default is sort=True) for groupby in general """ - ids, _, ngroups = self.grouper.group_info + ids, _, ngroups = self._grouper.group_info sorter = get_group_index_sorter(ids, ngroups) ids, count = ids[sorter], len(ids) @@ -2055,7 +2128,7 @@ else: out = np.repeat(out[np.r_[run[1:], True]], rep) - out - if self.grouper.has_dropped_na: + if self._grouper.has_dropped_na: out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False)) else: out = out.astype(np.int64, copy=False) @@ -2078,7 +2151,7 @@ @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def any(self, skipna: bool = True): + def any(self, skipna: bool = True) -> NDFrameT: """ Return True if any value in the group is truthful, else False. @@ -2127,14 +2200,14 @@ """ return self._cython_agg_general( "any", - alt=lambda x: Series(x).any(skipna=skipna), + alt=lambda x: Series(x, copy=False).any(skipna=skipna), skipna=skipna, ) @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def all(self, skipna: bool = True): + def all(self, skipna: bool = True) -> NDFrameT: """ Return True if all values in the group are truthful, else False. @@ -2184,7 +2257,7 @@ """ return self._cython_agg_general( "all", - alt=lambda x: Series(x).all(skipna=skipna), + alt=lambda x: Series(x, copy=False).all(skipna=skipna), skipna=skipna, ) @@ -2248,7 +2321,7 @@ Freq: MS, dtype: int64 """ data = self._get_data_to_aggregate() - ids, _, ngroups = self.grouper.group_info + ids, _, ngroups = self._grouper.group_info mask = ids != -1 is_series = data.ndim == 1 @@ -2269,7 +2342,8 @@ elif isinstance(bvalues, ArrowExtensionArray) and not isinstance( bvalues.dtype, StringDtype ): - return type(bvalues)._from_sequence(counted[0]) + dtype = pandas_dtype("int64[pyarrow]") + return type(bvalues)._from_sequence(counted[0], dtype=dtype) if is_series: assert counted.ndim == 2 assert counted.shape[0] == 1 @@ -2377,13 +2451,13 @@ else: result = self._cython_agg_general( "mean", - alt=lambda x: Series(x).mean(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only), numeric_only=numeric_only, ) return result.__finalize__(self.obj, method="groupby") @final - def median(self, numeric_only: bool = False): + def median(self, numeric_only: bool = False) -> NDFrameT: """ Compute median of groups, excluding missing values. 
@@ -2457,7 +2531,7 @@ """ result = self._cython_agg_general( "median", - alt=lambda x: Series(x).median(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only), numeric_only=numeric_only, ) return result.__finalize__(self.obj, method="groupby") @@ -2566,7 +2640,7 @@ else: return self._cython_agg_general( "std", - alt=lambda x: Series(x).std(ddof=ddof), + alt=lambda x: Series(x, copy=False).std(ddof=ddof), numeric_only=numeric_only, ddof=ddof, ) @@ -2673,7 +2747,7 @@ else: return self._cython_agg_general( "var", - alt=lambda x: Series(x).var(ddof=ddof), + alt=lambda x: Series(x, copy=False).var(ddof=ddof), numeric_only=numeric_only, ddof=ddof, ) @@ -2703,7 +2777,7 @@ obj = self._obj_with_exclusions in_axis_names = { - grouping.name for grouping in self.grouper.groupings if grouping.in_axis + grouping.name for grouping in self._grouper.groupings if grouping.in_axis } if isinstance(obj, Series): _name = obj.name @@ -2734,7 +2808,7 @@ if _name not in in_axis_names and _name in subsetted ] - groupings = list(self.grouper.groupings) + groupings = list(self._grouper.groupings) for key in keys: grouper, _, _ = get_grouper( df, @@ -2763,18 +2837,34 @@ and not grouping._observed for grouping in groupings ): - levels_list = [ping.result_index for ping in groupings] - multi_index, _ = MultiIndex.from_product( + levels_list = [ping._result_index for ping in groupings] + multi_index = MultiIndex.from_product( levels_list, names=[ping.name for ping in groupings] - ).sortlevel() + ) result_series = result_series.reindex(multi_index, fill_value=0) + if sort: + # Sort by the values + result_series = result_series.sort_values( + ascending=ascending, kind="stable" + ) + if self.sort: + # Sort by the groupings + names = result_series.index.names + # GH#55951 - Temporarily replace names in case they are integers + result_series.index.names = range(len(names)) + index_level = list(range(len(self._grouper.groupings))) + result_series = result_series.sort_index( + level=index_level, sort_remaining=False + ) + result_series.index.names = names + if normalize: # Normalize the results by dividing by the original group sizes. # We are guaranteed to have the first N levels be the # user-requested grouping. 
levels = list( - range(len(self.grouper.groupings), result_series.index.nlevels) + range(len(self._grouper.groupings), result_series.index.nlevels) ) indexed_group_size = result_series.groupby( result_series.index.droplevel(levels), @@ -2788,13 +2878,6 @@ # Handle groups of non-observed categories result_series = result_series.fillna(0.0) - if sort: - # Sort the values and then resort by the main grouping - index_level = range(len(self.grouper.groupings)) - result_series = result_series.sort_values(ascending=ascending).sort_index( - level=index_level, sort_remaining=False - ) - result: Series | DataFrame if self.as_index: result = result_series @@ -2807,14 +2890,14 @@ result_series.name = name result_series.index = index.set_names(range(len(columns))) result_frame = result_series.reset_index() - orig_dtype = self.grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] # noqa: E501 + orig_dtype = self._grouper.groupings[0].obj.columns.dtype # type: ignore[union-attr] cols = Index(columns, dtype=orig_dtype).insert(len(columns), name) result_frame.columns = cols result = result_frame return result.__finalize__(self.obj, method="value_counts") @final - def sem(self, ddof: int = 1, numeric_only: bool = False): + def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: """ Compute standard error of the mean of groups, excluding missing values. @@ -2894,7 +2977,7 @@ ) return self._cython_agg_general( "sem", - alt=lambda x: Series(x).sem(ddof=ddof), + alt=lambda x: Series(x, copy=False).sem(ddof=ddof), numeric_only=numeric_only, ddof=ddof, ) @@ -2957,7 +3040,7 @@ 2023-02-01 1 Freq: MS, dtype: int64 """ - result = self.grouper.size() + result = self._grouper.size() dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None if isinstance(self.obj, Series): if isinstance(self.obj.array, ArrowExtensionArray): @@ -3110,7 +3193,7 @@ 2 30 72""" ), ) - def prod(self, numeric_only: bool = False, min_count: int = 0): + def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) @@ -3252,9 +3335,13 @@ ) @final - def first(self, numeric_only: bool = False, min_count: int = -1): + def first( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the first non-null entry of each column. + Compute the first entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3262,12 +3349,17 @@ Include only float, int, boolean columns. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 2.2.1 Returns ------- Series or DataFrame - First non-null of values within each group. + First values within each group. See Also -------- @@ -3319,12 +3411,17 @@ min_count=min_count, alias="first", npfunc=first_compat, + skipna=skipna, ) @final - def last(self, numeric_only: bool = False, min_count: int = -1): + def last( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the last non-null entry of each column. + Compute the last entry of each column within each group. + + Defaults to skipping NA elements. 
Parameters ---------- @@ -3333,12 +3430,17 @@ everything, then use only numeric data. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 2.2.1 Returns ------- Series or DataFrame - Last non-null of values within each group. + Last of values within each group. See Also -------- @@ -3378,6 +3480,7 @@ min_count=min_count, alias="last", npfunc=last_compat, + skipna=skipna, ) @final @@ -3457,13 +3560,13 @@ if not is_numeric: raise DataError("No numeric types to aggregate") - res_values = self.grouper._cython_operation( + res_values = self._grouper._cython_operation( "aggregate", obj._values, "ohlc", axis=0, min_count=-1 ) agg_names = ["open", "high", "low", "close"] result = self.obj._constructor_expanddim( - res_values, index=self.grouper.result_index, columns=agg_names + res_values, index=self._grouper.result_index, columns=agg_names ) return self._reindex_output(result) @@ -3509,7 +3612,7 @@ return result @final - def resample(self, rule, *args, **kwargs): + def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: """ Provide resampling when using a TimeGrouper. @@ -3523,7 +3626,23 @@ ---------- rule : str or DateOffset The offset string or object representing target grouper conversion. - *args, **kwargs + *args + Possible arguments are `how`, `fill_method`, `limit`, `kind` and + `on`, and other arguments of `TimeGrouper`. + include_groups : bool, default True + When True, will attempt to include the groupings in the operation in + the case that they are columns of the DataFrame. If this raises a + TypeError, the result will be computed with the groupings excluded. + When False, the groupings will be excluded when applying ``func``. + + .. versionadded:: 2.2.0 + + .. deprecated:: 2.2.0 + + Setting include_groups to True is deprecated. Only the value + False will be allowed in a future version of pandas. + + **kwargs Possible arguments are `how`, `fill_method`, `limit`, `kind` and `on`, and other arguments of `TimeGrouper`. @@ -3544,7 +3663,7 @@ Examples -------- - >>> idx = pd.date_range('1/1/2000', periods=4, freq='T') + >>> idx = pd.date_range('1/1/2000', periods=4, freq='min') >>> df = pd.DataFrame(data=4 * [range(2)], ... index=idx, ... columns=['a', 'b']) @@ -3559,59 +3678,71 @@ Downsample the DataFrame into 3 minute bins and sum the values of the timestamps falling into a bin. - >>> df.groupby('a').resample('3T').sum() - a b + >>> df.groupby('a').resample('3min', include_groups=False).sum() + b a - 0 2000-01-01 00:00:00 0 2 - 2000-01-01 00:03:00 0 1 - 5 2000-01-01 00:00:00 5 1 + 0 2000-01-01 00:00:00 2 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:00:00 1 Upsample the series into 30 second bins. - >>> df.groupby('a').resample('30S').sum() - a b + >>> df.groupby('a').resample('30s', include_groups=False).sum() + b a - 0 2000-01-01 00:00:00 0 1 - 2000-01-01 00:00:30 0 0 - 2000-01-01 00:01:00 0 1 - 2000-01-01 00:01:30 0 0 - 2000-01-01 00:02:00 0 0 - 2000-01-01 00:02:30 0 0 - 2000-01-01 00:03:00 0 1 - 5 2000-01-01 00:02:00 5 1 + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:00:30 0 + 2000-01-01 00:01:00 1 + 2000-01-01 00:01:30 0 + 2000-01-01 00:02:00 0 + 2000-01-01 00:02:30 0 + 2000-01-01 00:03:00 1 + 5 2000-01-01 00:02:00 1 Resample by month. 
Values are assigned to the month of the period. - >>> df.groupby('a').resample('M').sum() - a b + >>> df.groupby('a').resample('ME', include_groups=False).sum() + b a - 0 2000-01-31 0 3 - 5 2000-01-31 5 1 + 0 2000-01-31 3 + 5 2000-01-31 1 Downsample the series into 3 minute bins as above, but close the right side of the bin interval. - >>> df.groupby('a').resample('3T', closed='right').sum() - a b + >>> ( + ... df.groupby('a') + ... .resample('3min', closed='right', include_groups=False) + ... .sum() + ... ) + b a - 0 1999-12-31 23:57:00 0 1 - 2000-01-01 00:00:00 0 2 - 5 2000-01-01 00:00:00 5 1 + 0 1999-12-31 23:57:00 1 + 2000-01-01 00:00:00 2 + 5 2000-01-01 00:00:00 1 Downsample the series into 3 minute bins and close the right side of the bin interval, but label each bin using the right edge instead of the left. - >>> df.groupby('a').resample('3T', closed='right', label='right').sum() - a b + >>> ( + ... df.groupby('a') + ... .resample('3min', closed='right', label='right', include_groups=False) + ... .sum() + ... ) + b a - 0 2000-01-01 00:00:00 0 1 - 2000-01-01 00:03:00 0 2 - 5 2000-01-01 00:03:00 5 1 + 0 2000-01-01 00:00:00 1 + 2000-01-01 00:03:00 2 + 5 2000-01-01 00:03:00 1 """ from pandas.core.resample import get_resampler_for_grouping - return get_resampler_for_grouping(self, rule, *args, **kwargs) + # mypy flags that include_groups could be specified via `*args` or `**kwargs` + # GH#54961 would resolve. + return get_resampler_for_grouping( # type: ignore[misc] + self, rule, *args, include_groups=include_groups, **kwargs + ) @final def rolling(self, *args, **kwargs) -> RollingGroupby: @@ -3748,7 +3879,7 @@ return RollingGroupby( self._selected_obj, *args, - _grouper=self.grouper, + _grouper=self._grouper, _as_index=self.as_index, **kwargs, ) @@ -3770,7 +3901,7 @@ return ExpandingGroupby( self._selected_obj, *args, - _grouper=self.grouper, + _grouper=self._grouper, **kwargs, ) @@ -3790,7 +3921,7 @@ return ExponentialMovingWindowGroupby( self._selected_obj, *args, - _grouper=self.grouper, + _grouper=self._grouper, **kwargs, ) @@ -3822,7 +3953,7 @@ if limit is None: limit = -1 - ids, _, _ = self.grouper.group_info + ids, _, _ = self._grouper.group_info sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False) if direction == "bfill": sorted_labels = sorted_labels[::-1] @@ -3847,7 +3978,7 @@ # np.take_along_axis if isinstance(values, np.ndarray): dtype = values.dtype - if self.grouper.has_dropped_na: + if self._grouper.has_dropped_na: # dropped null groups give rise to nan in the result dtype = ensure_dtype_can_hold_na(values.dtype) out = np.empty(values.shape, dtype=dtype) @@ -4153,7 +4284,7 @@ if not dropna: mask = self._make_mask_from_positional_indexer(n) - ids, _, _ = self.grouper.group_info + ids, _, _ = self._grouper.group_info # Drop NA values in grouping mask = mask & (ids != -1) @@ -4182,14 +4313,14 @@ grouper: np.ndarray | Index | ops.BaseGrouper if len(dropped) == len(self._selected_obj): # Nothing was dropped, can use the same grouper - grouper = self.grouper + grouper = self._grouper else: # we don't have the grouper info available # (e.g. 
we have selected out # a column that is not in the current object) - axis = self.grouper.axis - grouper = self.grouper.codes_info[axis.isin(dropped.index)] - if self.grouper.has_dropped_na: + axis = self._grouper.axis + grouper = self._grouper.codes_info[axis.isin(dropped.index)] + if self._grouper.has_dropped_na: # Null groups need to still be encoded as -1 when passed to groupby nulls = grouper == -1 # error: No overload variant of "where" matches argument types @@ -4254,10 +4385,10 @@ mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile") obj = self._wrap_agged_manager(mgr) if self.axis == 1: - splitter = self.grouper._get_splitter(obj.T, axis=self.axis) + splitter = self._grouper._get_splitter(obj.T, axis=self.axis) sdata = splitter._sorted_data.T else: - splitter = self.grouper._get_splitter(obj, axis=self.axis) + splitter = self._grouper._get_splitter(obj, axis=self.axis) sdata = splitter._sorted_data starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups) @@ -4364,7 +4495,7 @@ qs = np.array([q], dtype=np.float64) pass_qs = None - ids, _, ngroups = self.grouper.group_info + ids, _, ngroups = self._grouper.group_info nqs = len(qs) func = partial( @@ -4497,16 +4628,16 @@ """ obj = self._obj_with_exclusions index = obj._get_axis(self.axis) - comp_ids = self.grouper.group_info[0] + comp_ids = self._grouper.group_info[0] dtype: type - if self.grouper.has_dropped_na: + if self._grouper.has_dropped_na: comp_ids = np.where(comp_ids == -1, np.nan, comp_ids) dtype = np.float64 else: dtype = np.int64 - if any(ping._passed_categorical for ping in self.grouper.groupings): + if any(ping._passed_categorical for ping in self._grouper.groupings): # comp_ids reflect non-observed groups, we need only observed comp_ids = rank_1d(comp_ids, ties_method="dense") - 1 @@ -5084,7 +5215,7 @@ else: if fill_value is lib.no_default: fill_value = None - ids, _, ngroups = self.grouper.group_info + ids, _, ngroups = self._grouper.group_info res_indexer = np.zeros(len(ids), dtype=np.int64) libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period) @@ -5319,9 +5450,9 @@ limit = 0 filled = getattr(self, fill_method)(limit=limit) if self.axis == 0: - fill_grp = filled.groupby(self.grouper.codes, group_keys=self.group_keys) + fill_grp = filled.groupby(self._grouper.codes, group_keys=self.group_keys) else: - fill_grp = filled.T.groupby(self.grouper.codes, group_keys=self.group_keys) + fill_grp = filled.T.groupby(self._grouper.codes, group_keys=self.group_keys) shifted = fill_grp.shift(periods=periods, freq=freq) if self.axis == 1: shifted = shifted.T @@ -5423,7 +5554,7 @@ Series or DataFrame Filtered _selected_obj. """ - ids = self.grouper.group_info[0] + ids = self._grouper.group_info[0] mask = mask & (ids != -1) if self.axis == 0: @@ -5463,7 +5594,7 @@ Series or DataFrame Object (potentially) re-indexed to include all possible groups. """ - groupings = self.grouper.groupings + groupings = self._grouper.groupings if len(groupings) == 1: return output @@ -5479,8 +5610,8 @@ ): return output - levels_list = [ping.group_index for ping in groupings] - names = self.grouper.names + levels_list = [ping._group_index for ping in groupings] + names = self._grouper.names if qs is not None: # error: Argument 1 to "append" of "list" has incompatible type # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" @@ -5502,7 +5633,7 @@ # GH 13204 # Here, the categorical in-axis groupers, which need to be fully # expanded, are columns in `output`. 
An idea is to do: - # output = output.set_index(self.grouper.names) + # output = output.set_index(self._grouper.names) # .reindex(index).reset_index() # but special care has to be taken because of possible not-in-axis # groupers. @@ -5518,7 +5649,7 @@ output = output.drop(labels=list(g_names), axis=1) # Set a temp index and reindex (possibly expanding) - output = output.set_index(self.grouper.result_index).reindex( + output = output.set_index(self._grouper.result_index).reindex( index, copy=False, fill_value=fill_value ) @@ -5634,7 +5765,7 @@ random_state = com.random_state(random_state) - group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis) + group_iterator = self._grouper.get_iterator(self._selected_obj, self.axis) sampled_indices = [] for labels, obj in group_iterator: @@ -5658,6 +5789,140 @@ sampled_indices = np.concatenate(sampled_indices) return self._selected_obj.take(sampled_indices, axis=self.axis) + def _idxmax_idxmin( + self, + how: Literal["idxmax", "idxmin"], + ignore_unobserved: bool = False, + axis: Axis | None | lib.NoDefault = lib.no_default, + skipna: bool = True, + numeric_only: bool = False, + ) -> NDFrameT: + """Compute idxmax/idxmin. + + Parameters + ---------- + how : {'idxmin', 'idxmax'} + Whether to compute idxmin or idxmax. + axis : {{0 or 'index', 1 or 'columns'}}, default None + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + If axis is not provided, grouper's axis is used. + numeric_only : bool, default False + Include only float, int, boolean columns. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ignore_unobserved : bool, default False + When True and an unobserved group is encountered, do not raise. This used + for transform where unobserved groups do not play an impact on the result. + + Returns + ------- + Series or DataFrame + idxmax or idxmin for the groupby operation. + """ + if axis is not lib.no_default: + if axis is None: + axis = self.axis + axis = self.obj._get_axis_number(axis) + self._deprecate_axis(axis, how) + else: + axis = self.axis + + if not self.observed and any( + ping._passed_categorical for ping in self._grouper.groupings + ): + expected_len = np.prod( + [len(ping._group_index) for ping in self._grouper.groupings] + ) + if len(self._grouper.groupings) == 1: + result_len = len(self._grouper.groupings[0].grouping_vector.unique()) + else: + # result_index only contains observed groups in this case + result_len = len(self._grouper.result_index) + assert result_len <= expected_len + has_unobserved = result_len < expected_len + + raise_err: bool | np.bool_ = not ignore_unobserved and has_unobserved + # Only raise an error if there are columns to compute; otherwise we return + # an empty DataFrame with an index (possibly including unobserved) but no + # columns + data = self._obj_with_exclusions + if raise_err and isinstance(data, DataFrame): + if numeric_only: + data = data._get_numeric_data() + raise_err = len(data.columns) > 0 + + if raise_err: + raise ValueError( + f"Can't get {how} of an empty group due to unobserved categories. " + "Specify observed=True in groupby instead." + ) + elif not skipna: + if self._obj_with_exclusions.isna().any(axis=None): + warnings.warn( + f"The behavior of {type(self).__name__}.{how} with all-NA " + "values, or any-NA and skipna=False, is deprecated. 
In a future " + "version this will raise ValueError", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if axis == 1: + try: + + def func(df): + method = getattr(df, how) + return method(axis=axis, skipna=skipna, numeric_only=numeric_only) + + func.__name__ = how + result = self._python_apply_general( + func, self._obj_with_exclusions, not_indexed_same=True + ) + except ValueError as err: + name = "argmax" if how == "idxmax" else "argmin" + if f"attempt to get {name} of an empty sequence" in str(err): + raise ValueError( + f"Can't get {how} of an empty group due to unobserved " + "categories. Specify observed=True in groupby instead." + ) from None + raise + return result + + result = self._agg_general( + numeric_only=numeric_only, + min_count=1, + alias=how, + skipna=skipna, + ) + return result + + def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT: + index = self.obj._get_axis(self.axis) + if res.size == 0: + result = res.astype(index.dtype) + else: + if isinstance(index, MultiIndex): + index = index.to_flat_index() + values = res._values + assert isinstance(values, np.ndarray) + na_value = na_value_for_dtype(index.dtype, compat=False) + if isinstance(res, Series): + # mypy: expression has type "Series", variable has type "NDFrameT" + result = res._constructor( # type: ignore[assignment] + index.array.take(values, allow_fill=True, fill_value=na_value), + index=res.index, + name=res.name, + ) + else: + data = {} + for k, column_values in enumerate(values.T): + data[k] = index.array.take( + column_values, allow_fill=True, fill_value=na_value + ) + result = self.obj._constructor(data, index=res.index) + result.columns = res.columns + return result + @doc(GroupBy) def get_groupby( @@ -5720,3 +5985,13 @@ mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) return mi + + +# GH#7155 +_apply_groupings_depr = ( + "{}.{} operated on the grouping columns. This behavior is deprecated, " + "and in a future version of pandas the grouping columns will be excluded " + "from the operation. Either pass `include_groups=False` to exclude the " + "groupings or explicitly select the grouping columns after groupby to silence " + "this warning." +) diff -Nru pandas-2.1.4+dfsg/pandas/core/groupby/grouper.py pandas-2.2.2+dfsg/pandas/core/groupby/grouper.py --- pandas-2.1.4+dfsg/pandas/core/groupby/grouper.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/groupby/grouper.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,7 +12,10 @@ import numpy as np -from pandas._config import using_copy_on_write +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) from pandas._libs import lib from pandas._libs.tslibs import OutOfBoundsDatetime @@ -116,8 +119,6 @@ row/column will be dropped. If False, NA values will also be treated as the key in groups. - .. 
versionadded:: 1.2.0 - Returns ------- Grouper or pandas.api.typing.TimeGrouper @@ -189,7 +190,7 @@ 2000-10-02 00:12:00 18 2000-10-02 00:19:00 21 2000-10-02 00:26:00 24 - Freq: 7T, dtype: int64 + Freq: 7min, dtype: int64 >>> ts.groupby(pd.Grouper(freq='17min')).sum() 2000-10-01 23:14:00 0 @@ -197,7 +198,7 @@ 2000-10-01 23:48:00 21 2000-10-02 00:05:00 54 2000-10-02 00:22:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum() 2000-10-01 23:18:00 0 @@ -205,14 +206,14 @@ 2000-10-01 23:52:00 27 2000-10-02 00:09:00 39 2000-10-02 00:26:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum() 2000-10-01 23:24:00 3 2000-10-01 23:41:00 15 2000-10-01 23:58:00 45 2000-10-02 00:15:00 45 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 If you want to adjust the start of the bins with an `offset` Timedelta, the two following lines are equivalent: @@ -222,14 +223,14 @@ 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum() 2000-10-01 23:30:00 9 2000-10-01 23:47:00 21 2000-10-02 00:04:00 54 2000-10-02 00:21:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 To replace the use of the deprecated `base` argument, you can now use `offset`, in this example it is equivalent to have `base=2`: @@ -240,7 +241,7 @@ 2000-10-01 23:50:00 36 2000-10-02 00:07:00 39 2000-10-02 00:24:00 24 - Freq: 17T, dtype: int64 + Freq: 17min, dtype: int64 """ sort: bool @@ -289,12 +290,12 @@ self.dropna = dropna self._grouper_deprecated = None - self._indexer_deprecated = None + self._indexer_deprecated: npt.NDArray[np.intp] | None = None self._obj_deprecated = None self._gpr_index = None self.binner = None self._grouper = None - self._indexer = None + self._indexer: npt.NDArray[np.intp] | None = None def _get_grouper( self, obj: NDFrameT, validate: bool = True @@ -327,10 +328,9 @@ return grouper, obj - @final def _set_grouper( - self, obj: NDFrame, sort: bool = False, *, gpr_index: Index | None = None - ): + self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None + ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: """ given an object and the specifications, setup the internal grouper for this particular specification @@ -350,8 +350,6 @@ """ assert obj is not None - indexer = None - if self.key is not None and self.level is not None: raise ValueError("The Grouper cannot specify both a key and a level!") @@ -398,6 +396,7 @@ raise ValueError(f"The level {level} is not valid") # possibly sort + indexer: npt.NDArray[np.intp] | None = None if (self.sort or sort) and not ax.is_monotonic_increasing: # use stable sort to support first, last, nth # TODO: why does putting na_position="first" fix datetimelike cases? @@ -441,6 +440,8 @@ @final @property def obj(self): + # TODO(3.0): enforcing these deprecations on Grouper should close + # GH#25564, GH#41930 warnings.warn( f"{type(self).__name__}.obj is deprecated and will be removed " "in a future version. 
Use GroupBy.indexer instead.", @@ -519,7 +520,6 @@ """ _codes: npt.NDArray[np.signedinteger] | None = None - _group_index: Index | None = None _all_grouper: Categorical | None _orig_cats: Index | None _index: Index @@ -675,7 +675,7 @@ @property def ngroups(self) -> int: - return len(self.group_index) + return len(self._group_index) @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: @@ -691,34 +691,58 @@ return self._codes_and_uniques[0] @cache_readonly - def group_arraylike(self) -> ArrayLike: + def _group_arraylike(self) -> ArrayLike: """ Analogous to result_index, but holding an ArrayLike to ensure we can retain ExtensionDtypes. """ if self._all_grouper is not None: # retain dtype for categories, including unobserved ones - return self.result_index._values + return self._result_index._values elif self._passed_categorical: - return self.group_index._values + return self._group_index._values return self._codes_and_uniques[1] + @property + def group_arraylike(self) -> ArrayLike: + """ + Analogous to result_index, but holding an ArrayLike to ensure + we can retain ExtensionDtypes. + """ + warnings.warn( + "group_arraylike is deprecated and will be removed in a future " + "version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._group_arraylike + @cache_readonly - def result_index(self) -> Index: + def _result_index(self) -> Index: # result_index retains dtype for categories, including unobserved ones, # which group_index does not if self._all_grouper is not None: - group_idx = self.group_index + group_idx = self._group_index assert isinstance(group_idx, CategoricalIndex) cats = self._orig_cats # set_categories is dynamically added return group_idx.set_categories(cats) # type: ignore[attr-defined] - return self.group_index + return self._group_index + + @property + def result_index(self) -> Index: + warnings.warn( + "result_index is deprecated and will be removed in a future " + "version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._result_index @cache_readonly - def group_index(self) -> Index: + def _group_index(self) -> Index: codes, uniques = self._codes_and_uniques if not self._dropna and self._passed_categorical: assert isinstance(uniques, Categorical) @@ -740,6 +764,16 @@ ) return Index._with_infer(uniques, name=self.name) + @property + def group_index(self) -> Index: + warnings.warn( + "group_index is deprecated and will be removed in a future " + "version of pandas", + category=FutureWarning, + stacklevel=find_stack_level(), + ) + return self._group_index + @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: uniques: ArrayLike @@ -805,7 +839,7 @@ @cache_readonly def groups(self) -> dict[Hashable, np.ndarray]: - cats = Categorical.from_codes(self.codes, self.group_index, validate=False) + cats = Categorical.from_codes(self.codes, self._group_index, validate=False) return self._index.groupby(cats) @@ -965,7 +999,7 @@ def is_in_obj(gpr) -> bool: if not hasattr(gpr, "name"): return False - if using_copy_on_write(): + if using_copy_on_write() or warn_copy_on_write(): # For the CoW case, we check the references to determine if the # series is part of the object try: diff -Nru pandas-2.1.4+dfsg/pandas/core/groupby/ops.py pandas-2.2.2+dfsg/pandas/core/groupby/ops.py --- pandas-2.1.4+dfsg/pandas/core/groupby/ops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/groupby/ops.py 2024-04-10 17:42:52.000000000 +0000 @@ -33,7 
+33,6 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( maybe_cast_pointwise_result, maybe_downcast_to_dtype, @@ -78,7 +77,7 @@ from pandas.core.generic import NDFrame -def check_result_array(obj, dtype): +def check_result_array(obj, dtype) -> None: # Our operation is supposed to be an aggregation/reduction. If # it returns an ndarray, this likely means an invalid operation has # been passed. See test_apply_without_aggregation, test_agg_must_agg @@ -134,6 +133,8 @@ "all": functools.partial(libgroupby.group_any_all, val_test="all"), "sum": "group_sum", "prod": "group_prod", + "idxmin": functools.partial(libgroupby.group_idxmin_idxmax, name="idxmin"), + "idxmax": functools.partial(libgroupby.group_idxmin_idxmax, name="idxmax"), "min": "group_min", "max": "group_max", "mean": "group_mean", @@ -188,7 +189,7 @@ f"function is not implemented for this dtype: " f"[how->{how},dtype->{dtype_str}]" ) - elif how in ["std", "sem"]: + elif how in ["std", "sem", "idxmin", "idxmax"]: # We have a partial object that does not have __signatures__ return f elif how == "skew": @@ -269,6 +270,10 @@ if how == "rank": out_dtype = "float64" + elif how in ["idxmin", "idxmax"]: + # The Cython implementation only produces the row number; we'll take + # from the index using this in post processing + out_dtype = "intp" else: if dtype.kind in "iufcb": out_dtype = f"{dtype.kind}{dtype.itemsize}" @@ -400,7 +405,16 @@ result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) - if self.how in ["min", "max", "mean", "last", "first", "sum"]: + if self.how in [ + "idxmin", + "idxmax", + "min", + "max", + "mean", + "last", + "first", + "sum", + ]: func( out=result, counts=counts, @@ -410,6 +424,7 @@ mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, + **kwargs, ) elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: if self.how in ["std", "sem"]: @@ -464,10 +479,10 @@ **kwargs, ) - if self.kind == "aggregate": + if self.kind == "aggregate" and self.how not in ["idxmin", "idxmax"]: # i.e. counts is defined. 
Locations where count list[Index]: - return [ping.group_index for ping in self.groupings] + return [ping._group_index for ping in self.groupings] @property def names(self) -> list[Hashable]: @@ -693,7 +708,7 @@ out = np.bincount(ids[ids != -1], minlength=ngroups) else: out = [] - return Series(out, index=self.result_index, dtype="int64") + return Series(out, index=self.result_index, dtype="int64", copy=False) @cache_readonly def groups(self) -> dict[Hashable, np.ndarray]: @@ -751,7 +766,7 @@ # FIXME: compress_group_index's second return value is int64, not intp ping = self.groupings[0] - return ping.codes, np.arange(len(ping.group_index), dtype=np.intp) + return ping.codes, np.arange(len(ping._group_index), dtype=np.intp) @final @cache_readonly @@ -767,10 +782,10 @@ @cache_readonly def result_index(self) -> Index: if len(self.groupings) == 1: - return self.groupings[0].result_index.rename(self.names[0]) + return self.groupings[0]._result_index.rename(self.names[0]) codes = self.reconstructed_codes - levels = [ping.result_index for ping in self.groupings] + levels = [ping._result_index for ping in self.groupings] return MultiIndex( levels=levels, codes=codes, verify_integrity=False, names=self.names ) @@ -780,12 +795,12 @@ # Note: only called from _insert_inaxis_grouper, which # is only called for BaseGrouper, never for BinGrouper if len(self.groupings) == 1: - return [self.groupings[0].group_arraylike] + return [self.groupings[0]._group_arraylike] name_list = [] for ping, codes in zip(self.groupings, self.reconstructed_codes): codes = ensure_platform_int(codes) - levels = ping.group_arraylike.take(codes) + levels = ping._group_arraylike.take(codes) name_list.append(levels) @@ -848,18 +863,11 @@ result = self._aggregate_series_pure_python(obj, func) - if len(obj) == 0 and len(result) == 0 and isinstance(obj.dtype, ExtensionDtype): - cls = obj.dtype.construct_array_type() - out = cls._from_sequence(result) - + npvalues = lib.maybe_convert_objects(result, try_float=False) + if preserve_dtype: + out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) else: - npvalues = lib.maybe_convert_objects(result, try_float=False) - if preserve_dtype: - out = maybe_cast_pointwise_result( - npvalues, obj.dtype, numeric_only=True - ) - else: - out = npvalues + out = npvalues return out @final @@ -1002,9 +1010,6 @@ } return result - def __iter__(self) -> Iterator[Hashable]: - return iter(self.groupings[0].grouping_vector) - @property def nkeys(self) -> int: # still matches len(self.groupings), but we can hard-code diff -Nru pandas-2.1.4+dfsg/pandas/core/indexers/objects.py pandas-2.2.2+dfsg/pandas/core/indexers/objects.py --- pandas-2.1.4+dfsg/pandas/core/indexers/objects.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexers/objects.py 2024-04-10 17:42:52.000000000 +0000 @@ -262,7 +262,9 @@ # end bound is previous end # or current index end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign - if end_diff <= zero: + if end_diff == zero and not right_closed: + end[i] = end[i - 1] + 1 + elif end_diff <= zero: end[i] = i + 1 else: end[i] = end[i - 1] diff -Nru pandas-2.1.4+dfsg/pandas/core/indexes/accessors.py pandas-2.2.2+dfsg/pandas/core/indexes/accessors.py --- pandas-2.1.4+dfsg/pandas/core/indexes/accessors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexes/accessors.py 2024-04-10 17:42:52.000000000 +0000 @@ -85,9 +85,7 @@ f"cannot convert an object of type {type(data)} to a datetimelike index" ) - # error: Signature of 
"_delegate_property_get" incompatible with supertype - # "PandasDelegate" - def _delegate_property_get(self, name: str): # type: ignore[override] + def _delegate_property_get(self, name: str): from pandas import Series values = self._get_values() @@ -152,6 +150,20 @@ @delegate_names( delegate=ArrowExtensionArray, + accessors=TimedeltaArray._datetimelike_ops, + typ="property", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) +@delegate_names( + delegate=ArrowExtensionArray, + accessors=TimedeltaArray._datetimelike_methods, + typ="method", + accessor_mapping=lambda x: f"_dt_{x}", + raise_on_missing=False, +) +@delegate_names( + delegate=ArrowExtensionArray, accessors=DatetimeArray._datetimelike_ops, typ="property", accessor_mapping=lambda x: f"_dt_{x}", @@ -175,7 +187,7 @@ self._orig = orig self._freeze() - def _delegate_property_get(self, name: str): # type: ignore[override] + def _delegate_property_get(self, name: str): if not hasattr(self._parent.array, f"_dt_{name}"): raise NotImplementedError( f"dt.{name} is not supported for {self._parent.dtype}" @@ -215,6 +227,9 @@ return result + def to_pytimedelta(self): + return cast(ArrowExtensionArray, self._parent.array)._dt_to_pytimedelta() + def to_pydatetime(self): # GH#20306 warnings.warn( @@ -227,7 +242,7 @@ ) return cast(ArrowExtensionArray, self._parent.array)._dt_to_pydatetime() - def isocalendar(self): + def isocalendar(self) -> DataFrame: from pandas import DataFrame result = ( @@ -243,6 +258,26 @@ ) return iso_calendar_df + @property + def components(self) -> DataFrame: + from pandas import DataFrame + + components_df = DataFrame( + { + col: getattr(self._parent.array, f"_dt_{col}") + for col in [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ] + } + ) + return components_df + @delegate_names( delegate=DatetimeArray, @@ -284,7 +319,7 @@ 2 2 dtype: int32 - >>> quarters_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="q")) + >>> quarters_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="QE")) >>> quarters_series 0 2000-03-31 1 2000-06-30 @@ -414,7 +449,7 @@ Examples -------- >>> seconds_series = pd.Series( - ... pd.timedelta_range(start="1 second", periods=3, freq="S") + ... pd.timedelta_range(start="1 second", periods=3, freq="s") ... 
) >>> seconds_series 0 0 days 00:00:01 @@ -528,7 +563,7 @@ 1 2000-01-01 00:00:01 2 2000-01-01 00:00:02 3 2000-01-01 00:00:03 - dtype: period[S] + dtype: period[s] >>> seconds_series.dt.second 0 0 1 1 @@ -544,7 +579,7 @@ 1 2000-01-01 01:00 2 2000-01-01 02:00 3 2000-01-01 03:00 - dtype: period[H] + dtype: period[h] >>> hours_series.dt.hour 0 0 1 1 @@ -594,7 +629,7 @@ index=orig.index, ) - if isinstance(data.dtype, ArrowDtype) and data.dtype.kind == "M": + if isinstance(data.dtype, ArrowDtype) and data.dtype.kind in "Mm": return ArrowTemporalProperties(data, orig) if lib.is_np_dtype(data.dtype, "M"): return DatetimeProperties(data, orig) diff -Nru pandas-2.1.4+dfsg/pandas/core/indexes/api.py pandas-2.2.2+dfsg/pandas/core/indexes/api.py --- pandas-2.1.4+dfsg/pandas/core/indexes/api.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexes/api.py 2024-04-10 17:42:52.000000000 +0000 @@ -220,7 +220,10 @@ if len(indexes) == 1: result = indexes[0] if isinstance(result, list): - result = Index(sorted(result)) + if not sort: + result = Index(result) + else: + result = Index(sorted(result)) return result indexes, kind = _sanitize_and_check(indexes) @@ -239,8 +242,12 @@ Index """ if all(isinstance(ind, Index) for ind in inds): - result = inds[0].append(inds[1:]).unique() - result = result.astype(dtype, copy=False) + inds = [ind.astype(dtype, copy=False) for ind in inds] + result = inds[0].unique() + other = inds[1].append(inds[2:]) + diff = other[result.get_indexer_for(other) == -1] + if len(diff): + result = result.append(diff.unique()) if sort: result = result.sort_values() return result @@ -377,5 +384,5 @@ def default_index(n: int) -> RangeIndex: - rng = range(0, n) + rng = range(n) return RangeIndex._simple_new(rng, name=None) diff -Nru pandas-2.1.4+dfsg/pandas/core/indexes/base.py pandas-2.2.2+dfsg/pandas/core/indexes/base.py --- pandas-2.1.4+dfsg/pandas/core/indexes/base.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexes/base.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,6 +31,7 @@ algos as libalgos, index as libindex, lib, + writers, ) from pandas._libs.internals import BlockValuesRefs import pandas._libs.join as libjoin @@ -38,7 +39,6 @@ is_datetime_array, no_default, ) -from pandas._libs.missing import is_float_nan from pandas._libs.tslibs import ( IncompatibleFrequency, OutOfBoundsDatetime, @@ -71,6 +71,7 @@ from pandas.util._decorators import ( Appender, cache_readonly, + deprecate_nonkeyword_arguments, doc, ) from pandas.util._exceptions import ( @@ -99,7 +100,6 @@ is_bool_dtype, is_ea_or_datetimelike_dtype, is_float, - is_float_dtype, is_hashable, is_integer, is_iterator, @@ -121,12 +121,16 @@ ExtensionDtype, IntervalDtype, PeriodDtype, + SparseDtype, ) from pandas.core.dtypes.generic import ( + ABCCategoricalIndex, ABCDataFrame, ABCDatetimeIndex, + ABCIntervalIndex, ABCMultiIndex, ABCPeriodIndex, + ABCRangeIndex, ABCSeries, ABCTimedeltaIndex, ) @@ -152,9 +156,14 @@ ArrowExtensionArray, BaseMaskedArray, Categorical, + DatetimeArray, ExtensionArray, + TimedeltaArray, +) +from pandas.core.arrays.string_ import ( + StringArray, + StringDtype, ) -from pandas.core.arrays.string_ import StringArray from pandas.core.base import ( IndexOpsMixin, PandasObject, @@ -200,7 +209,10 @@ MultiIndex, Series, ) - from pandas.core.arrays import PeriodArray + from pandas.core.arrays import ( + IntervalArray, + PeriodArray, + ) __all__ = ["Index"] @@ -319,12 +331,12 @@ Parameters ---------- data : array-like (1-dimensional) - dtype : NumPy dtype (default: 
object) - If dtype is None, we find the dtype that best fits the data. - If an actual dtype is provided, we coerce to that dtype if it's safe. - Otherwise, an error will be raised. - copy : bool - Make a copy of input ndarray. + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Index. If not specified, this will be + inferred from `data`. + See the :ref:`user guide ` for more usages. + copy : bool, default False + Copy input data. name : object Name to be stored in the index. tupleize_cols : bool (default: True) @@ -357,9 +369,6 @@ Index([1, 2, 3], dtype='uint8') """ - # To hand over control to subclasses - _join_precedence = 1 - # similar to __array_priority__, positions Index after Series and DataFrame # but before ExtensionArray. Should NOT be overridden by subclasses. __pandas_priority__ = 2000 @@ -374,9 +383,6 @@ # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() - # can_use_libjoin assures sv and ov are ndarrays - sv = cast(np.ndarray, sv) - ov = cast(np.ndarray, ov) # similar but not identical to ov.searchsorted(sv) return libjoin.left_join_indexer_unique(sv, ov) @@ -387,9 +393,6 @@ # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() - # can_use_libjoin assures sv and ov are ndarrays - sv = cast(np.ndarray, sv) - ov = cast(np.ndarray, ov) joined_ndarray, lidx, ridx = libjoin.left_join_indexer(sv, ov) joined = self._from_join_target(joined_ndarray) return joined, lidx, ridx @@ -401,9 +404,6 @@ # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() - # can_use_libjoin assures sv and ov are ndarrays - sv = cast(np.ndarray, sv) - ov = cast(np.ndarray, ov) joined_ndarray, lidx, ridx = libjoin.inner_join_indexer(sv, ov) joined = self._from_join_target(joined_ndarray) return joined, lidx, ridx @@ -415,9 +415,6 @@ # Caller is responsible for ensuring other.dtype == self.dtype sv = self._get_join_target() ov = other._get_join_target() - # can_use_libjoin assures sv and ov are ndarrays - sv = cast(np.ndarray, sv) - ov = cast(np.ndarray, ov) joined_ndarray, lidx, ridx = libjoin.outer_join_indexer(sv, ov) joined = self._from_join_target(joined_ndarray) return joined, lidx, ridx @@ -459,7 +456,7 @@ @property def _engine_type( self, - ) -> type[libindex.IndexEngine] | type[libindex.ExtensionEngine]: + ) -> type[libindex.IndexEngine | libindex.ExtensionEngine]: return self._engine_types.get(self.dtype, libindex.ObjectEngine) # whether we support partial string indexing. 
Overridden @@ -482,7 +479,7 @@ copy: bool = False, name=None, tupleize_cols: bool = True, - ) -> Index: + ) -> Self: from pandas.core.indexes.range import RangeIndex name = maybe_extract_name(name, data, cls) @@ -496,12 +493,16 @@ if not copy and isinstance(data, (ABCSeries, Index)): refs = data._references + is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray)) + # range if isinstance(data, (range, RangeIndex)): result = RangeIndex(start=data, copy=copy, name=name) if dtype is not None: return result.astype(dtype, copy=False) - return result + # error: Incompatible return value type (got "MultiIndex", + # expected "Self") + return result # type: ignore[return-value] elif is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here @@ -524,7 +525,7 @@ elif is_scalar(data): raise cls._raise_scalar_data_error(data) elif hasattr(data, "__array__"): - return Index(np.asarray(data), dtype=dtype, copy=copy, name=name) + return cls(np.asarray(data), dtype=dtype, copy=copy, name=name) elif not is_list_like(data) and not isinstance(data, memoryview): # 2022-11-16 the memoryview check is only necessary on some CI # builds, not clear why @@ -541,7 +542,11 @@ # 10697 from pandas.core.indexes.multi import MultiIndex - return MultiIndex.from_tuples(data, names=name) + # error: Incompatible return value type (got "MultiIndex", + # expected "Self") + return MultiIndex.from_tuples( # type: ignore[return-value] + data, names=name + ) # other iterable of some kind if not isinstance(data, (list, tuple)): @@ -569,7 +574,19 @@ klass = cls._dtype_to_subclass(arr.dtype) arr = klass._ensure_array(arr, arr.dtype, copy=False) - return klass._simple_new(arr, name, refs=refs) + result = klass._simple_new(arr, name, refs=refs) + if dtype is None and is_pandas_object and data_dtype == np.object_: + if result.dtype != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The Index " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old " + "behavior.", + FutureWarning, + stacklevel=2, + ) + return result # type: ignore[return-value] @classmethod def _ensure_array(cls, data, dtype, copy: bool): @@ -595,24 +612,7 @@ # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 if isinstance(dtype, ExtensionDtype): - if isinstance(dtype, DatetimeTZDtype): - from pandas import DatetimeIndex - - return DatetimeIndex - elif isinstance(dtype, CategoricalDtype): - from pandas import CategoricalIndex - - return CategoricalIndex - elif isinstance(dtype, IntervalDtype): - from pandas import IntervalIndex - - return IntervalIndex - elif isinstance(dtype, PeriodDtype): - from pandas import PeriodIndex - - return PeriodIndex - - return Index + return dtype.index_class if dtype.kind == "M": from pandas import DatetimeIndex @@ -737,7 +737,7 @@ assert len(duplicates) out = ( - Series(np.arange(len(self))) + Series(np.arange(len(self)), copy=False) .groupby(self, observed=False) .agg(list)[duplicates] ) @@ -912,7 +912,7 @@ """ return len(self._data) - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: """ The array interface, return my values. """ @@ -956,7 +956,7 @@ return self.__array_wrap__(result) @final - def __array_wrap__(self, result, context=None): + def __array_wrap__(self, result, context=None, return_scalar=False): """ Gets called after a ufunc and other functions e.g. np.split. 
""" @@ -1016,20 +1016,27 @@ dtype = pandas_dtype(cls) if needs_i8_conversion(dtype): - if dtype.kind == "m" and dtype != "m8[ns]": - # e.g. m8[s] - return self._data.view(cls) - idx_cls = self._dtype_to_subclass(dtype) - # NB: we only get here for subclasses that override - # _data_cls such that it is a type and not a tuple - # of types. - arr_cls = idx_cls._data_cls - arr = arr_cls(self._data.view("i8"), dtype=dtype) - return idx_cls._simple_new(arr, name=self.name, refs=self._references) + arr = self.array.view(dtype) + if isinstance(arr, ExtensionArray): + # here we exclude non-supported dt64/td64 dtypes + return idx_cls._simple_new( + arr, name=self.name, refs=self._references + ) + return arr result = self._data.view(cls) else: + if cls is not None: + warnings.warn( + # GH#55709 + f"Passing a type in {type(self).__name__}.view is deprecated " + "and will raise in a future version. " + "Call view without any argument to retain the old behavior.", + FutureWarning, + stacklevel=find_stack_level(), + ) + result = self._view() if isinstance(result, Index): result._id = self._id @@ -1144,7 +1151,7 @@ allow_fill: bool = True, fill_value=None, **kwargs, - ): + ) -> Self: if kwargs: nv.validate_take((), kwargs) if is_scalar(indices): @@ -1229,7 +1236,7 @@ """ @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) - def repeat(self, repeats, axis=None): + def repeat(self, repeats, axis: None = None) -> Self: repeats = ensure_platform_int(repeats) nv.validate_repeat((), {"axis": axis}) res_values = self._values.repeat(repeats) @@ -1307,25 +1314,11 @@ klass_name = type(self).__name__ data = self._format_data() attrs = self._format_attrs() - space = self._format_space() attrs_str = [f"{k}={v}" for k, v in attrs] - prepr = f",{space}".join(attrs_str) - - # no data provided, just attributes - if data is None: - data = "" + prepr = ", ".join(attrs_str) return f"{klass_name}({data}{prepr})" - def _format_space(self) -> str_t: - # using space here controls if the attributes - # are line separated or not (the default) - - # max_seq_items = get_option('display.max_seq_items') - # if len(self) > max_seq_items: - # space = "\n%s" % (' ' * (len(klass) + 1)) - return " " - @property def _formatter_func(self): """ @@ -1333,6 +1326,7 @@ """ return default_pprint + @final def _format_data(self, name=None) -> str_t: """ Return the formatted data as a unicode string. @@ -1342,10 +1336,13 @@ if self.inferred_type == "string": is_justify = False - elif self.inferred_type == "categorical": + elif isinstance(self.dtype, CategoricalDtype): self = cast("CategoricalIndex", self) if is_object_dtype(self.categories.dtype): is_justify = False + elif isinstance(self, ABCRangeIndex): + # We will do the relevant formatting via attrs + return "" return format_object_summary( self, @@ -1402,6 +1399,14 @@ """ Render a string representation of the Index. """ + warnings.warn( + # GH#55413 + f"{type(self).__name__}.format is deprecated and will be removed " + "in a future version. 
Convert using index.astype(str) or " + "index.map(formatter) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) header = [] if name: header.append( @@ -1413,30 +1418,55 @@ if formatter is not None: return header + list(self.map(formatter)) - return self._format_with_header(header, na_rep=na_rep) + return self._format_with_header(header=header, na_rep=na_rep) + + _default_na_rep = "NaN" - def _format_with_header(self, header: list[str_t], na_rep: str_t) -> list[str_t]: + @final + def _format_flat( + self, + *, + include_name: bool, + formatter: Callable | None = None, + ) -> list[str_t]: + """ + Render a string representation of the Index. + """ + header = [] + if include_name: + header.append( + pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) + + if formatter is not None: + return header + list(self.map(formatter)) + + return self._format_with_header(header=header, na_rep=self._default_na_rep) + + def _format_with_header(self, *, header: list[str_t], na_rep: str_t) -> list[str_t]: from pandas.io.formats.format import format_array values = self._values - if is_object_dtype(values.dtype) or is_string_dtype(values.dtype): - values = np.asarray(values) - values = lib.maybe_convert_objects(values, safe=True) - - result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values] - - # could have nans - mask = is_float_nan(values) - if mask.any(): - result_arr = np.array(result) - result_arr[mask] = na_rep - result = result_arr.tolist() + if ( + is_object_dtype(values.dtype) + or is_string_dtype(values.dtype) + or isinstance(self.dtype, (IntervalDtype, CategoricalDtype)) + ): + # TODO: why do we need different justify for these cases? + justify = "all" else: - result = trim_front(format_array(values, None, justify="left")) + justify = "left" + # passing leading_space=False breaks test_format_missing, + # test_index_repr_in_frame_with_nan, but would otherwise make + # trim_front unnecessary + formatted = format_array(values, None, justify=justify) + result = trim_front(formatted) return header + result - def _format_native_types( + def _get_values_for_csv( self, *, na_rep: str_t = "", @@ -1445,30 +1475,14 @@ date_format=None, quoting=None, ) -> npt.NDArray[np.object_]: - """ - Actually format specific types of the index. - """ - from pandas.io.formats.format import FloatArrayFormatter - - if is_float_dtype(self.dtype) and not isinstance(self.dtype, ExtensionDtype): - formatter = FloatArrayFormatter( - self._values, - na_rep=na_rep, - float_format=float_format, - decimal=decimal, - quoting=quoting, - fixed_width=False, - ) - return formatter.get_result_as_array() - - mask = isna(self) - if self.dtype != object and not quoting: - values = np.asarray(self).astype(str) - else: - values = np.array(self, dtype=object, copy=True) - - values[mask] = na_rep - return values + return get_values_for_csv( + self._values, + na_rep=na_rep, + decimal=decimal, + float_format=float_format, + date_format=date_format, + quoting=quoting, + ) def _summary(self, name=None) -> str_t: """ @@ -1897,7 +1911,18 @@ return idx return None - def rename(self, name, inplace: bool = False): + @overload + def rename(self, name, *, inplace: Literal[False] = ...) -> Self: + ... + + @overload + def rename(self, name, *, inplace: Literal[True]) -> None: + ... + + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self", "name"], name="rename" + ) + def rename(self, name, inplace: bool = False) -> Self | None: """ Alter Index or MultiIndex name. 
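A hedged sketch of the workarounds suggested by the two Index deprecations added above (GH#55709 for view-with-a-type, GH#55413 for Index.format), using a throwaway index invented for illustration:

import pandas as pd

idx = pd.Index(["a", None, "c"])  # throwaway example index

# GH#55413: Index.format is deprecated; the warning text points to
# astype(str) or map(formatter) as replacements.
labels = idx.astype(str)
labels = idx.map(lambda x: "" if x is None else str(x))

# GH#55709: passing a type to Index.view is deprecated; calling view()
# with no argument keeps the old behavior.
same = idx.view()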
@@ -2196,11 +2221,6 @@ @final def _can_hold_na(self) -> bool: if isinstance(self.dtype, ExtensionDtype): - if isinstance(self.dtype, IntervalDtype): - # FIXME(GH#45720): this is inaccurate for integer-backed - # IntervalArray, but without it other.categories.take raises - # in IntervalArray._cmp_method - return True return self.dtype._can_hold_na if self.dtype.kind in "iub": return False @@ -3366,6 +3386,7 @@ and other.is_monotonic_increasing and not (self.has_duplicates and other.has_duplicates) and self._can_use_libjoin + and other._can_use_libjoin ): # Both are monotonic and at least one is unique, so can use outer join # (actually don't need either unique, but without this restriction @@ -3464,7 +3485,7 @@ self, other = self._dti_setop_align_tzs(other, "intersection") if self.equals(other): - if self.has_duplicates: + if not self.is_unique: result = self.unique()._get_reconciled_name_object(other) else: result = self._get_reconciled_name_object(other) @@ -3519,7 +3540,7 @@ self.is_monotonic_increasing and other.is_monotonic_increasing and self._can_use_libjoin - and not isinstance(self, ABCMultiIndex) + and other._can_use_libjoin ): try: res_indexer, indexer, _ = self._inner_indexer(other) @@ -3528,7 +3549,7 @@ pass else: # TODO: algos.unique1d should preserve DTA/TDA - if is_numeric_dtype(self): + if is_numeric_dtype(self.dtype): # This is faster, because Index.unique() checks for uniqueness # before calculating the unique values. res = algos.unique1d(res_indexer) @@ -3554,7 +3575,7 @@ Returns ------- - np.ndarray or ExtensionArray + np.ndarray or ExtensionArray or MultiIndex The returned array will be unique. """ left_unique = self.unique() @@ -3571,6 +3592,7 @@ # unnecessary in the case with sort=None bc we will sort later taker = np.sort(taker) + result: MultiIndex | ExtensionArray | np.ndarray if isinstance(left_unique, ABCMultiIndex): result = left_unique.take(taker) else: @@ -3624,14 +3646,14 @@ if len(other) == 0: # Note: we do not (yet) sort even if sort=None GH#24959 - result = self.rename(result_name) + result = self.unique().rename(result_name) if sort is True: return result.sort_values() return result if not self._should_compare(other): # Nothing matches -> difference is everything - result = self.rename(result_name) + result = self.unique().rename(result_name) if sort is True: return result.sort_values() return result @@ -3641,21 +3663,13 @@ def _difference(self, other, sort): # overridden by RangeIndex - - this = self.unique() - - indexer = this.get_indexer_for(other) - indexer = indexer.take((indexer != -1).nonzero()[0]) - - label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - - the_diff: MultiIndex | ArrayLike - if isinstance(this, ABCMultiIndex): - the_diff = this.take(label_diff) - else: - the_diff = this._values.take(label_diff) + this = self + if isinstance(self, ABCCategoricalIndex) and self.hasnans and other.hasnans: + this = this.dropna() + other = other.unique() + the_diff = this[other.get_indexer_for(this) == -1] + the_diff = the_diff if this.is_unique else the_diff.unique() the_diff = _maybe_try_sort(the_diff, sort) - return the_diff def _wrap_difference_result(self, other, result): @@ -3803,9 +3817,15 @@ self._check_indexing_error(key) raise - _index_shared_docs[ - "get_indexer" - ] = """ + @final + def get_indexer( + self, + target, + method: ReindexMethod | None = None, + limit: int | None = None, + tolerance=None, + ) -> npt.NDArray[np.intp]: + """ Compute indexer and mask for new index given the current index. 
The indexer should be then used as an input to ndarray.take to align the @@ -3813,7 +3833,7 @@ Parameters ---------- - target : %(target_klass)s + target : Index method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. @@ -3840,7 +3860,7 @@ Integers from 0 to n - 1 indicating that the index at these positions matches the corresponding target values. Missing values in the target are marked by -1. - %(raises_section)s + Notes ----- Returns -1 for unmatched values, for further explanation see the @@ -3855,16 +3875,6 @@ Notice that the return value is an array of locations in ``index`` and ``x`` is marked by -1, as it is not in ``index``. """ - - @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) - @final - def get_indexer( - self, - target, - method: ReindexMethod | None = None, - limit: int | None = None, - tolerance=None, - ) -> npt.NDArray[np.intp]: method = clean_reindex_fill_method(method) orig_target = target target = self._maybe_cast_listlike_indexer(target) @@ -3919,7 +3929,7 @@ return ensure_platform_int(indexer) - pself, ptarget = self._maybe_promote(target) + pself, ptarget = self._maybe_downcast_for_indexing(target) if pself is not self or ptarget is not target: return pself.get_indexer( ptarget, method=method, limit=limit, tolerance=tolerance @@ -3979,12 +3989,8 @@ if isinstance(self.dtype, IntervalDtype): if isinstance(target.dtype, IntervalDtype): return False - # See https://github.com/pandas-dev/pandas/issues/47772 the commented - # out code can be restored (instead of hardcoding `return True`) - # once that issue is fixed # "Index" has no attribute "left" - # return self.left._should_compare(target) # type: ignore[attr-defined] - return True + return self.left._should_compare(target) # type: ignore[attr-defined] return False @final @@ -4056,19 +4062,14 @@ self, target: Index, method: str_t, limit: int | None = None, tolerance=None ) -> npt.NDArray[np.intp]: if self._is_multi: - # TODO: get_indexer_with_fill docstring says values must be _sorted_ - # but that doesn't appear to be enforced - # error: "IndexEngine" has no attribute "get_indexer_with_fill" - engine = self._engine - with warnings.catch_warnings(): - # TODO: We need to fix this. Casting to int64 in cython - warnings.filterwarnings("ignore", category=RuntimeWarning) - return engine.get_indexer_with_fill( # type: ignore[union-attr] - target=target._values, - values=self._values, - method=method, - limit=limit, - ) + if not (self.is_monotonic_increasing or self.is_monotonic_decreasing): + raise ValueError("index must be monotonic increasing or decreasing") + encoded = self.append(target)._engine.values # type: ignore[union-attr] + self_encoded = Index(encoded[: len(self)]) + target_encoded = Index(encoded[len(self) :]) + return self_encoded._get_fill_indexer( + target_encoded, method, limit, tolerance + ) if self.is_monotonic_increasing and target.is_monotonic_increasing: target_values = target._get_engine_target() @@ -4249,15 +4250,9 @@ self._validate_indexer("slice", key.step, "getitem") return key - # convert the slice to an indexer here - - # special case for interval_dtype bc we do not do partial-indexing - # on integer Intervals when slicing - # TODO: write this in terms of e.g. should_partial_index? 
- ints_are_positional = self._should_fallback_to_positional or isinstance( - self.dtype, IntervalDtype - ) - is_positional = is_index_slice and ints_are_positional + # convert the slice to an indexer here; checking that the user didn't + # pass a positional slice to loc + is_positional = is_index_slice and self._should_fallback_to_positional # if we are mixed and have integers if is_positional: @@ -4476,7 +4471,7 @@ indexer, missing = self.get_indexer_non_unique(target) check = indexer != -1 - new_labels = self.take(indexer[check]) + new_labels: Index | np.ndarray = self.take(indexer[check]) new_indexer = None if len(missing): @@ -4595,6 +4590,7 @@ Index([1, 2, 3, 4, 5, 6], dtype='int64') """ other = ensure_index(other) + sort = sort or how == "outer" if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex): if (self.tz is None) ^ (other.tz is None): @@ -4603,15 +4599,12 @@ if not self._is_multi and not other._is_multi: # We have specific handling for MultiIndex below - pself, pother = self._maybe_promote(other) + pself, pother = self._maybe_downcast_for_indexing(other) if pself is not self or pother is not other: return pself.join( pother, how=how, level=level, return_indexers=True, sort=sort ) - lindexer: np.ndarray | None - rindexer: np.ndarray | None - # try to figure out the join level # GH3662 if level is None and (self._is_multi or other._is_multi): @@ -4625,73 +4618,74 @@ if level is not None and (self._is_multi or other._is_multi): return self._join_level(other, level, how=how) - if len(other) == 0: - if how in ("left", "outer"): - join_index = self._view() - rindexer = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, None, rindexer - elif how in ("right", "inner", "cross"): - join_index = other._view() - lindexer = np.array([]) - return join_index, lindexer, None - - if len(self) == 0: - if how in ("right", "outer"): - join_index = other._view() - lindexer = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lindexer, None - elif how in ("left", "inner", "cross"): - join_index = self._view() - rindexer = np.array([]) - return join_index, None, rindexer - - if self._join_precedence < other._join_precedence: - flip: dict[JoinHow, JoinHow] = {"right": "left", "left": "right"} - how = flip.get(how, how) - join_index, lidx, ridx = other.join( - self, how=how, level=level, return_indexers=True - ) - lidx, ridx = ridx, lidx - return join_index, lidx, ridx + if len(self) == 0 or len(other) == 0: + try: + return self._join_empty(other, how, sort) + except TypeError: + # object dtype; non-comparable objects + pass if self.dtype != other.dtype: dtype = self._find_common_type_compat(other) this = self.astype(dtype, copy=False) other = other.astype(dtype, copy=False) return this.join(other, how=how, return_indexers=True) + elif ( + isinstance(self, ABCCategoricalIndex) + and isinstance(other, ABCCategoricalIndex) + and not self.ordered + and not self.categories.equals(other.categories) + ): + # dtypes are "equal" but categories are in different order + other = Index(other._values.reorder_categories(self.categories)) _validate_join_method(how) - if not self.is_unique and not other.is_unique: - return self._join_non_unique(other, how=how) - elif not self.is_unique or not other.is_unique: - if self.is_monotonic_increasing and other.is_monotonic_increasing: - if not isinstance(self.dtype, IntervalDtype): - # otherwise we will fall through to _join_via_get_indexer - # GH#39133 - # go through object dtype for ea till engine is supported 
properly - return self._join_monotonic(other, how=how) - else: - return self._join_non_unique(other, how=how) - elif ( - # GH48504: exclude MultiIndex to avoid going through MultiIndex._values + if ( self.is_monotonic_increasing and other.is_monotonic_increasing and self._can_use_libjoin - and not isinstance(self, ABCMultiIndex) - and not isinstance(self.dtype, CategoricalDtype) + and other._can_use_libjoin + and (self.is_unique or other.is_unique) ): - # Categorical is monotonic if data are ordered as categories, but join can - # not handle this in case of not lexicographically monotonic GH#38502 try: return self._join_monotonic(other, how=how) except TypeError: # object dtype; non-comparable objects pass + elif not self.is_unique or not other.is_unique: + return self._join_non_unique(other, how=how, sort=sort) return self._join_via_get_indexer(other, how, sort) @final + def _join_empty( + self, other: Index, how: JoinHow, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + assert len(self) == 0 or len(other) == 0 + _validate_join_method(how) + + lidx: np.ndarray | None + ridx: np.ndarray | None + + if len(other): + how = cast(JoinHow, {"left": "right", "right": "left"}.get(how, how)) + join_index, ridx, lidx = other._join_empty(self, how, sort) + elif how in ["left", "outer"]: + if sort and not self.is_monotonic_increasing: + lidx = self.argsort() + join_index = self.take(lidx) + else: + lidx = None + join_index = self._view() + ridx = np.broadcast_to(np.intp(-1), len(join_index)) + else: + join_index = other._view() + lidx = np.array([], dtype=np.intp) + ridx = None + return join_index, lidx, ridx + + @final def _join_via_get_indexer( self, other: Index, how: JoinHow, sort: bool ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: @@ -4701,20 +4695,20 @@ # Note: at this point we have checked matching dtypes if how == "left": - join_index = self + join_index = self.sort_values() if sort else self elif how == "right": - join_index = other + join_index = other.sort_values() if sort else other elif how == "inner": - # TODO: sort=False here for backwards compat. It may - # be better to use the sort parameter passed into join - join_index = self.intersection(other, sort=False) + join_index = self.intersection(other, sort=sort) elif how == "outer": - # TODO: sort=True here for backwards compat. 
It may - # be better to use the sort parameter passed into join - join_index = self.union(other) - - if sort: - join_index = join_index.sort_values() + try: + join_index = self.union(other, sort=sort) + except TypeError: + join_index = self.union(other) + try: + join_index = _maybe_try_sort(join_index, sort) + except TypeError: + pass if join_index is self: lindexer = None @@ -4787,6 +4781,13 @@ multi_join_idx = multi_join_idx.remove_unused_levels() + # maintain the order of the index levels + if how == "right": + level_order = other_names_list + ldrop_names + else: + level_order = self_names_list + rdrop_names + multi_join_idx = multi_join_idx.reorder_levels(level_order) + return multi_join_idx, lidx, ridx jl = next(iter(overlap)) @@ -4810,21 +4811,24 @@ @final def _join_non_unique( - self, other: Index, how: JoinHow = "left" + self, other: Index, how: JoinHow = "left", sort: bool = False ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp]]: - from pandas.core.reshape.merge import get_join_indexers + from pandas.core.reshape.merge import get_join_indexers_non_unique # We only get here if dtypes match assert self.dtype == other.dtype - left_idx, right_idx = get_join_indexers( - [self._values], [other._values], how=how, sort=True + left_idx, right_idx = get_join_indexers_non_unique( + self._values, other._values, how=how, sort=sort ) mask = left_idx == -1 join_idx = self.take(left_idx) right = other.take(right_idx) join_index = join_idx.putmask(mask, right) + if isinstance(join_index, ABCMultiIndex) and how == "outer": + # test_join_index_levels + join_index = join_index._sort_levels_monotonic() return join_index, left_idx, right_idx @final @@ -4982,6 +4986,7 @@ ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: # We only get here with matching dtypes and both monotonic increasing assert other.dtype == self.dtype + assert self._can_use_libjoin and other._can_use_libjoin if self.equals(other): # This is a convenient place for this check, but its correctness @@ -5043,27 +5048,39 @@ # expected "Self") mask = lidx == -1 join_idx = self.take(lidx) - right = other.take(ridx) + right = cast("MultiIndex", other.take(ridx)) join_index = join_idx.putmask(mask, right)._sort_levels_monotonic() return join_index.set_names(name) # type: ignore[return-value] else: name = get_op_result_name(self, other) return self._constructor._with_infer(joined, name=name, dtype=self.dtype) + @final @cache_readonly def _can_use_libjoin(self) -> bool: """ - Whether we can use the fastpaths implement in _libs.join + Whether we can use the fastpaths implemented in _libs.join. + + This is driven by whether (in monotonic increasing cases that are + guaranteed not to have NAs) we can convert to a np.ndarray without + making a copy. If we cannot, this negates the performance benefit + of using libjoin. """ if type(self) is Index: # excludes EAs, but include masks, we get here with monotonic # values only, meaning no NA return ( isinstance(self.dtype, np.dtype) - or isinstance(self.values, BaseMaskedArray) - or isinstance(self._values, ArrowExtensionArray) + or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray)) + or self.dtype == "string[python]" ) - return not isinstance(self.dtype, IntervalDtype) + # Exclude index types where the conversion to numpy converts to object dtype, + # which negates the performance benefit of libjoin + # Subclasses should override to return False if _get_join_target is + # not zero-copy. + # TODO: exclude RangeIndex (which allocates memory)? 
+ # Doing so seems to break test_concat_datetime_timezone + return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex)) # -------------------------------------------------------------------- # Uncategorized Methods @@ -5184,7 +5201,8 @@ return self._values.astype(object) return vals - def _get_join_target(self) -> ArrayLike: + @final + def _get_join_target(self) -> np.ndarray: """ Get the ndarray or ExtensionArray that we can pass to the join functions. @@ -5196,17 +5214,22 @@ # This is only used if our array is monotonic, so no missing values # present return self._values.to_numpy() - return self._get_engine_target() + + # TODO: exclude ABCRangeIndex case here as it copies + target = self._get_engine_target() + if not isinstance(target, np.ndarray): + raise ValueError("_can_use_libjoin should return False.") + return target def _from_join_target(self, result: np.ndarray) -> ArrayLike: """ Cast the ndarray returned from one of the libjoin.foo_indexer functions - back to type(self)._data. + back to type(self._data). """ if isinstance(self.values, BaseMaskedArray): return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_)) elif isinstance(self.values, (ArrowExtensionArray, StringArray)): - return type(self.values)._from_sequence(result) + return type(self.values)._from_sequence(result, dtype=self.dtype) return result @doc(IndexOpsMixin._memory_usage) @@ -5380,6 +5403,16 @@ else: key = np.asarray(key, dtype=bool) + if not isinstance(self.dtype, ExtensionDtype): + if len(key) == 0 and len(key) != len(self): + warnings.warn( + "Using a boolean indexer with length 0 on an Index with " + "length greater than 0 is deprecated and will raise in a " + "future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + result = getitem(key) # Because we ruled out integer above, we always get an arraylike here if result.ndim > 1: @@ -5397,7 +5430,7 @@ result = type(self)._simple_new(res, name=self._name, refs=self._references) if "_engine" in self._cache: reverse = slobj.step is not None and slobj.step < 0 - result._engine._update_from_sliced(self._engine, reverse=reverse) # type: ignore[union-attr] # noqa: E501 + result._engine._update_from_sliced(self._engine, reverse=reverse) # type: ignore[union-attr] return result @@ -5585,6 +5618,14 @@ # quickly return if the lengths are different return False + if ( + isinstance(self.dtype, StringDtype) + and self.dtype.storage == "pyarrow_numpy" + and other.dtype != self.dtype + ): + # special case for object behavior + return other.equals(self.astype(object)) + if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype): # if other is not object, use other's logic for coercion return other.equals(self) @@ -5781,13 +5822,49 @@ return result + @overload + def sort_values( + self, + *, + return_indexer: Literal[False] = ..., + ascending: bool = ..., + na_position: NaPosition = ..., + key: Callable | None = ..., + ) -> Self: + ... + + @overload + def sort_values( + self, + *, + return_indexer: Literal[True], + ascending: bool = ..., + na_position: NaPosition = ..., + key: Callable | None = ..., + ) -> tuple[Self, np.ndarray]: + ... + + @overload + def sort_values( + self, + *, + return_indexer: bool = ..., + ascending: bool = ..., + na_position: NaPosition = ..., + key: Callable | None = ..., + ) -> Self | tuple[Self, np.ndarray]: + ... 
+ + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self"], name="sort_values" + ) def sort_values( self, return_indexer: bool = False, ascending: bool = True, na_position: NaPosition = "last", key: Callable | None = None, - ): + ) -> Self | tuple[Self, np.ndarray]: """ Return a sorted copy of the index. @@ -5803,9 +5880,6 @@ na_position : {'first' or 'last'}, default 'last' Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. - - .. versionadded:: 1.2.0 - key : callable, optional If not None, apply the key function to the index values before sorting. This is similar to the `key` argument in the @@ -5842,6 +5916,16 @@ >>> idx.sort_values(ascending=False, return_indexer=True) (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) """ + if key is None and ( + (ascending and self.is_monotonic_increasing) + or (not ascending and self.is_monotonic_decreasing) + ): + if return_indexer: + indexer = np.arange(len(self), dtype=np.intp) + return self.copy(), indexer + else: + return self.copy() + # GH 35584. Sort missing values according to na_position kwarg # ignore na_position for MultiIndex if not isinstance(self, ABCMultiIndex): @@ -6046,7 +6130,7 @@ # that can be matched to Interval scalars. return self._get_indexer_non_comparable(target, method=None, unique=False) - pself, ptarget = self._maybe_promote(target) + pself, ptarget = self._maybe_downcast_for_indexing(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) @@ -6062,14 +6146,15 @@ # TODO: get_indexer has fastpaths for both Categorical-self and # Categorical-target. Can we do something similar here? - # Note: _maybe_promote ensures we never get here with MultiIndex - # self and non-Multi target - tgt_values = target._get_engine_target() + # Note: _maybe_downcast_for_indexing ensures we never get here + # with MultiIndex self and non-Multi target if self._is_multi and target._is_multi: engine = self._engine # Item "IndexEngine" of "Union[IndexEngine, ExtensionEngine]" has # no attribute "_extract_level_codes" tgt_values = engine._extract_level_codes(target) # type: ignore[union-attr] + else: + tgt_values = target._get_engine_target() indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), ensure_platform_int(missing) @@ -6160,19 +6245,7 @@ nmissing = missing_mask.sum() if nmissing: - # TODO: remove special-case; this is just to keep exception - # message tests from raising while debugging - use_interval_msg = isinstance(self.dtype, IntervalDtype) or ( - isinstance(self.dtype, CategoricalDtype) - # "Index" has no attribute "categories" [attr-defined] - and isinstance( - self.categories.dtype, IntervalDtype # type: ignore[attr-defined] - ) - ) - if nmissing == len(indexer): - if use_interval_msg: - key = list(key) raise KeyError(f"None of [{key}] are in the [{axis_name}]") not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique()) @@ -6224,8 +6297,8 @@ If doing an inequality check, i.e. method is not None. 
""" if method is not None: - other = _unpack_nested_dtype(target) - raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}") + other_dtype = _unpack_nested_dtype(target) + raise TypeError(f"Cannot compare dtypes {self.dtype} and {other_dtype}") no_matches = -1 * np.ones(target.shape, dtype=np.intp) if unique: @@ -6249,7 +6322,7 @@ _requires_unique_msg = "Reindexing only valid with uniquely valued Index objects" @final - def _maybe_promote(self, other: Index) -> tuple[Index, Index]: + def _maybe_downcast_for_indexing(self, other: Index) -> tuple[Index, Index]: """ When dealing with an object-dtype Index and a non-object Index, see if we can upcast the object-dtype one to improve performance. @@ -6290,7 +6363,7 @@ if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype): # Reverse op so we dont need to re-implement on the subclasses - other, self = other._maybe_promote(self) + other, self = other._maybe_downcast_for_indexing(self) return self, other @@ -6336,8 +6409,7 @@ # respectively. return False - other = _unpack_nested_dtype(other) - dtype = other.dtype + dtype = _unpack_nested_dtype(other) return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: @@ -6533,18 +6605,6 @@ >>> midx.isin([(1, 'red'), (3, 'red')]) array([ True, False, False]) - - For a DatetimeIndex, string values in `values` are converted to - Timestamps. - - >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] - >>> dti = pd.to_datetime(dates) - >>> dti - DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], - dtype='datetime64[ns]', freq=None) - - >>> dti.isin(['2000-03-11']) - array([ True, False, False]) """ if level is not None: self._validate_index_level(level) @@ -6949,14 +7009,24 @@ loc = loc if loc >= 0 else loc - 1 new_values[loc] = item - idx = Index._with_infer(new_values, name=self.name) + out = Index._with_infer(new_values, name=self.name) if ( using_pyarrow_string_dtype() - and is_string_dtype(idx.dtype) + and is_string_dtype(out.dtype) and new_values.dtype == object ): - idx = idx.astype(new_values.dtype) - return idx + out = out.astype(new_values.dtype) + if self.dtype == object and out.dtype != object: + # GH#51363 + warnings.warn( + "The behavior of Index.insert with object-dtype is deprecated, " + "in a future version this will return an object-dtype Index " + "instead of inferring a non-object dtype. To retain the old " + "behavior, do `idx.insert(loc, item).infer_objects(copy=False)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + return out def drop( self, @@ -7001,6 +7071,7 @@ indexer = indexer[~mask] return self.delete(indexer) + @final def infer_objects(self, copy: bool = True) -> Index: """ If we have an object dtype, try to infer a non-object dtype. @@ -7061,7 +7132,8 @@ """ return Index(self.to_series().diff(periods)) - def round(self, decimals: int = 0): + @final + def round(self, decimals: int = 0) -> Self: """ Round each value in the Index to the given number of decimals. 
@@ -7562,7 +7634,7 @@ index_like = list(index_like) if isinstance(index_like, list): - if type(index_like) is not list: + if type(index_like) is not list: # noqa: E721 # must check for exactly list here because of strict type # check in clean_index_list index_like = list(index_like) @@ -7648,7 +7720,7 @@ return names -def _unpack_nested_dtype(other: Index) -> Index: +def _unpack_nested_dtype(other: Index) -> DtypeObj: """ When checking if our dtype is comparable with another, we need to unpack CategoricalDtype to look at its categories.dtype. @@ -7659,20 +7731,20 @@ Returns ------- - Index + np.dtype or ExtensionDtype """ dtype = other.dtype if isinstance(dtype, CategoricalDtype): # If there is ever a SparseIndex, this could get dispatched # here too. - return dtype.categories + return dtype.categories.dtype elif isinstance(dtype, ArrowDtype): # GH 53617 import pyarrow as pa if pa.types.is_dictionary(dtype.pyarrow_dtype): - other = other.astype(ArrowDtype(dtype.pyarrow_dtype.value_type)) - return other + other = other[:0].astype(ArrowDtype(dtype.pyarrow_dtype.value_type)) + return other.dtype def _maybe_try_sort(result: Index | ArrayLike, sort: bool | None): @@ -7693,3 +7765,113 @@ stacklevel=find_stack_level(), ) return result + + +def get_values_for_csv( + values: ArrayLike, + *, + date_format, + na_rep: str = "nan", + quoting=None, + float_format=None, + decimal: str = ".", +) -> npt.NDArray[np.object_]: + """ + Convert to types which can be consumed by the standard library's + csv.writer.writerows. + """ + if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm": + # GH#40754 Convert categorical datetimes to datetime array + values = algos.take_nd( + values.categories._values, + ensure_platform_int(values._codes), + fill_value=na_rep, + ) + + values = ensure_wrapped_if_datetimelike(values) + + if isinstance(values, (DatetimeArray, TimedeltaArray)): + if values.ndim == 1: + result = values._format_native_types(na_rep=na_rep, date_format=date_format) + result = result.astype(object, copy=False) + return result + + # GH#21734 Process every column separately, they might have different formats + results_converted = [] + for i in range(len(values)): + result = values[i, :]._format_native_types( + na_rep=na_rep, date_format=date_format + ) + results_converted.append(result.astype(object, copy=False)) + return np.vstack(results_converted) + + elif isinstance(values.dtype, PeriodDtype): + # TODO: tests that get here in column path + values = cast("PeriodArray", values) + res = values._format_native_types(na_rep=na_rep, date_format=date_format) + return res + + elif isinstance(values.dtype, IntervalDtype): + # TODO: tests that get here in column path + values = cast("IntervalArray", values) + mask = values.isna() + if not quoting: + result = np.asarray(values).astype(str) + else: + result = np.array(values, dtype=object, copy=True) + + result[mask] = na_rep + return result + + elif values.dtype.kind == "f" and not isinstance(values.dtype, SparseDtype): + # see GH#13418: no special formatting is desired at the + # output (important for appropriate 'quoting' behaviour), + # so do not pass it through the FloatArrayFormatter + if float_format is None and decimal == ".": + mask = isna(values) + + if not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype="object") + + values[mask] = na_rep + values = values.astype(object, copy=False) + return values + + from pandas.io.formats.format import FloatArrayFormatter + + formatter = FloatArrayFormatter( + 
values, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + quoting=quoting, + fixed_width=False, + ) + res = formatter.get_result_as_array() + res = res.astype(object, copy=False) + return res + + elif isinstance(values, ExtensionArray): + mask = isna(values) + + new_values = np.asarray(values.astype(object)) + new_values[mask] = na_rep + return new_values + + else: + mask = isna(values) + itemsize = writers.word_len(na_rep) + + if values.dtype != _dtype_obj and not quoting and itemsize: + values = values.astype(str) + if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize: + # enlarge for the na_rep + values = values.astype(f" CategoricalIndex: + ) -> Self: name = maybe_extract_name(name, data, cls) if is_scalar(data): @@ -355,13 +353,6 @@ extra = super()._format_attrs() return attrs + extra - def _format_with_header(self, header: list[str], na_rep: str) -> list[str]: - result = [ - pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep - for x in self._values - ] - return header + result - # -------------------------------------------------------------------- @property diff -Nru pandas-2.1.4+dfsg/pandas/core/indexes/datetimelike.py pandas-2.2.2+dfsg/pandas/core/indexes/datetimelike.py --- pandas-2.1.4+dfsg/pandas/core/indexes/datetimelike.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexes/datetimelike.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,6 +14,7 @@ cast, final, ) +import warnings import numpy as np @@ -31,6 +32,7 @@ parsing, to_offset, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas.compat.numpy import function as nv from pandas.errors import ( InvalidIndexError, @@ -41,6 +43,7 @@ cache_readonly, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_integer, @@ -108,8 +111,16 @@ @property @doc(DatetimeLikeArrayMixin.freqstr) - def freqstr(self) -> str | None: - return self._data.freqstr + def freqstr(self) -> str: + from pandas import PeriodIndex + + if self._data.freqstr is not None and isinstance( + self._data, (PeriodArray, PeriodIndex) + ): + freq = freq_to_period_freqstr(self._data.freq.n, self._data.freq.name) + return freq + else: + return self._data.freqstr # type: ignore[return-value] @cache_readonly @abstractmethod @@ -178,6 +189,7 @@ # -------------------------------------------------------------------- # Rendering Methods + _default_na_rep = "NaT" def format( self, @@ -189,6 +201,14 @@ """ Render a string representation of the Index. """ + warnings.warn( + # GH#55413 + f"{type(self).__name__}.format is deprecated and will be removed " + "in a future version. 
Convert using index.astype(str) or " + "index.map(formatter) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) header = [] if name: header.append( @@ -200,14 +220,17 @@ if formatter is not None: return header + list(self.map(formatter)) - return self._format_with_header(header, na_rep=na_rep, date_format=date_format) + return self._format_with_header( + header=header, na_rep=na_rep, date_format=date_format + ) def _format_with_header( - self, header: list[str], na_rep: str = "NaT", date_format: str | None = None + self, *, header: list[str], na_rep: str, date_format: str | None = None ) -> list[str]: + # TODO: not reached in tests 2023-10-11 # matches base class except for whitespace padding and date_format return header + list( - self._format_native_types(na_rep=na_rep, date_format=date_format) + self._get_values_for_csv(na_rep=na_rep, date_format=date_format) ) @property @@ -419,8 +442,6 @@ _is_monotonic_decreasing = Index.is_monotonic_decreasing _is_unique = Index.is_unique - _join_precedence = 10 - @property def unit(self) -> str: return self._data.unit @@ -495,7 +516,7 @@ # appropriate timezone from `start` and `end`, so tz does not need # to be passed explicitly. result = self._data._generate_range( - start=start, end=end, periods=None, freq=self.freq + start=start, end=end, periods=None, freq=self.freq, unit=self.unit ) return type(self)._simple_new(result, name=self.name) @@ -512,7 +533,7 @@ # Convert our i8 representations to RangeIndex # Caller is responsible for checking isinstance(self.freq, Tick) freq = cast(Tick, self.freq) - tick = freq.delta._value + tick = Timedelta(freq).as_unit("ns")._value rng = range(self[0]._value, self[-1]._value + tick, tick) return RangeIndex(rng) @@ -676,7 +697,10 @@ dates = concat_compat([left._values, right_chunk]) # The can_fast_union check ensures that the result.freq # should match self.freq - dates = type(self._data)(dates, freq=self.freq) + assert isinstance(dates, type(self._data)) + # error: Item "ExtensionArray" of "ExtensionArray | + # ndarray[Any, Any]" has no attribute "_freq" + assert dates._freq == self.freq # type: ignore[union-attr] result = type(self)._simple_new(dates) return result else: diff -Nru pandas-2.1.4+dfsg/pandas/core/indexes/datetimes.py pandas-2.2.2+dfsg/pandas/core/indexes/datetimes.py --- pandas-2.1.4+dfsg/pandas/core/indexes/datetimes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexes/datetimes.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,6 +17,8 @@ ) from pandas._libs.tslibs import ( Resolution, + Tick, + Timedelta, periods_per_day, timezones, to_offset, @@ -65,6 +67,8 @@ PeriodIndex, ) +from pandas._libs.tslibs.dtypes import OFFSET_TO_PERIOD_FREQSTR + def _new_DatetimeIndex(cls, d): """ @@ -116,7 +120,6 @@ "tzinfo", "dtype", "to_pydatetime", - "_format_native_types", "date", "time", "timetz", @@ -198,8 +201,6 @@ timetz dayofyear day_of_year - weekofyear - week dayofweek day_of_week weekday @@ -266,6 +267,7 @@ return libindex.DatetimeEngine _data: DatetimeArray + _values: DatetimeArray tz: dt.tzinfo | None # -------------------------------------------------------------------- @@ -393,19 +395,13 @@ ------- bool """ + if isinstance(self.freq, Tick): + delta = Timedelta(self.freq) - from pandas.io.formats.format import is_dates_only - - delta = getattr(self.freq, "delta", None) - - if delta and delta % dt.timedelta(days=1) != dt.timedelta(days=0): - return False + if delta % dt.timedelta(days=1) != dt.timedelta(days=0): + return False - # error: Argument 1 to 
"is_dates_only" has incompatible type - # "Union[ExtensionArray, ndarray]"; expected "Union[ndarray, - # DatetimeArray, Index, DatetimeIndex]" - - return self.tz is None and is_dates_only(self._values) # type: ignore[arg-type] + return self._values._is_dates_only def __reduce__(self): d = {"data": self._data, "name": self.name} @@ -424,11 +420,13 @@ # -------------------------------------------------------------------- # Rendering Methods - @property + @cache_readonly def _formatter_func(self): + # Note this is equivalent to the DatetimeIndexOpsMixin method but + # uses the maybe-cached self._is_dates_only instead of re-computing it. from pandas.io.formats.format import get_format_datetime64 - formatter = get_format_datetime64(is_dates_only_=self._is_dates_only) + formatter = get_format_datetime64(is_dates_only=self._is_dates_only) return lambda x: f"'{formatter(x)}'" # -------------------------------------------------------------------- @@ -535,7 +533,8 @@ ------- lower, upper: pd.Timestamp """ - per = Period(parsed, freq=reso.attr_abbrev) + freq = OFFSET_TO_PERIOD_FREQSTR.get(reso.attr_abbrev, reso.attr_abbrev) + per = Period(parsed, freq=freq) start, end = per.start_time, per.end_time # GH 24076 @@ -784,11 +783,11 @@ Examples -------- - >>> idx = pd.date_range("2023-01-01", periods=4, freq="H") + >>> idx = pd.date_range("2023-01-01", periods=4, freq="h") >>> idx DatetimeIndex(['2023-01-01 00:00:00', '2023-01-01 01:00:00', '2023-01-01 02:00:00', '2023-01-01 03:00:00'], - dtype='datetime64[ns]', freq='H') + dtype='datetime64[ns]', freq='h') >>> idx.indexer_between_time("00:00", "2:00", include_end=False) array([0, 1]) """ @@ -853,7 +852,7 @@ periods : int, optional Number of periods to generate. freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'D' - Frequency strings can have multiples, e.g. '5H'. See + Frequency strings can have multiples, e.g. '5h'. See :ref:`here ` for a list of frequency aliases. tz : str or tzinfo, optional @@ -946,26 +945,26 @@ **Other Parameters** - Changed the `freq` (frequency) to ``'M'`` (month end frequency). + Changed the `freq` (frequency) to ``'ME'`` (month end frequency). - >>> pd.date_range(start='1/1/2018', periods=5, freq='M') + >>> pd.date_range(start='1/1/2018', periods=5, freq='ME') DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', '2018-05-31'], - dtype='datetime64[ns]', freq='M') + dtype='datetime64[ns]', freq='ME') Multiples are allowed - >>> pd.date_range(start='1/1/2018', periods=5, freq='3M') + >>> pd.date_range(start='1/1/2018', periods=5, freq='3ME') DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', '2019-01-31'], - dtype='datetime64[ns]', freq='3M') + dtype='datetime64[ns]', freq='3ME') `freq` can also be specified as an Offset object. >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3)) DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', '2019-01-31'], - dtype='datetime64[ns]', freq='3M') + dtype='datetime64[ns]', freq='3ME') Specify `tz` to set the timezone. 
@@ -997,11 +996,11 @@ **Specify a unit** - >>> pd.date_range(start="2017-01-01", periods=10, freq="100AS", unit="s") + >>> pd.date_range(start="2017-01-01", periods=10, freq="100YS", unit="s") DatetimeIndex(['2017-01-01', '2117-01-01', '2217-01-01', '2317-01-01', '2417-01-01', '2517-01-01', '2617-01-01', '2717-01-01', '2817-01-01', '2917-01-01'], - dtype='datetime64[s]', freq='100AS-JAN') + dtype='datetime64[s]', freq='100YS-JAN') """ if freq is None and com.any_none(periods, start, end): freq = "D" @@ -1045,7 +1044,7 @@ periods : int, default None Number of periods to generate. freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'B' - Frequency strings can have multiples, e.g. '5H'. The default is + Frequency strings can have multiples, e.g. '5h'. The default is business daily ('B'). tz : str or None Time zone name for returning localized DatetimeIndex, for example diff -Nru pandas-2.1.4+dfsg/pandas/core/indexes/frozen.py pandas-2.2.2+dfsg/pandas/core/indexes/frozen.py --- pandas-2.1.4+dfsg/pandas/core/indexes/frozen.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexes/frozen.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,7 +9,7 @@ from __future__ import annotations from typing import ( - Any, + TYPE_CHECKING, NoReturn, ) @@ -17,6 +17,9 @@ from pandas.io.formats.printing import pprint_thing +if TYPE_CHECKING: + from pandas._typing import Self + class FrozenList(PandasObject, list): """ @@ -75,19 +78,19 @@ return type(self)(super().__getitem__(n)) return super().__getitem__(n) - def __radd__(self, other): + def __radd__(self, other) -> Self: if isinstance(other, tuple): other = list(other) return type(self)(other + list(self)) - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: object) -> bool: if isinstance(other, (tuple, FrozenList)): other = list(other) return super().__eq__(other) __req__ = __eq__ - def __mul__(self, other): + def __mul__(self, other) -> Self: return type(self)(super().__mul__(other)) __imul__ = __mul__ diff -Nru pandas-2.1.4+dfsg/pandas/core/indexes/interval.py pandas-2.2.2+dfsg/pandas/core/indexes/interval.py --- pandas-2.1.4+dfsg/pandas/core/indexes/interval.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexes/interval.py 2024-04-10 17:42:52.000000000 +0000 @@ -22,6 +22,7 @@ ) from pandas._libs.tslibs import ( BaseOffset, + Period, Timedelta, Timestamp, to_offset, @@ -42,7 +43,6 @@ ) from pandas.core.dtypes.common import ( ensure_platform_int, - is_float, is_float_dtype, is_integer, is_integer_dtype, @@ -59,6 +59,7 @@ from pandas.core.dtypes.missing import is_valid_na_for_dtype from pandas.core.algorithms import unique +from pandas.core.arrays.datetimelike import validate_periods from pandas.core.arrays.interval import ( IntervalArray, _interval_shared_docs, @@ -93,6 +94,7 @@ Dtype, DtypeObj, IntervalClosedType, + Self, npt, ) _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -225,7 +227,7 @@ copy: bool = False, name: Hashable | None = None, verify_integrity: bool = True, - ) -> IntervalIndex: + ) -> Self: name = maybe_extract_name(name, data, cls) with rewrite_exception("IntervalArray", cls.__name__): @@ -560,7 +562,7 @@ if scalar: # Timestamp/Timedelta key_dtype, key_i8 = infer_dtype_from_scalar(key) - if lib.is_period(key): + if isinstance(key, Period): key_i8 = key.ordinal elif isinstance(key_i8, Timestamp): key_i8 = key_i8._value @@ -842,25 +844,6 @@ return Index(self._data.length, copy=False) # -------------------------------------------------------------------- - # Rendering 
Methods - # __repr__ associated methods are based on MultiIndex - - def _format_with_header(self, header: list[str], na_rep: str) -> list[str]: - # matches base class except for whitespace padding - return header + list(self._format_native_types(na_rep=na_rep)) - - def _format_native_types( - self, *, na_rep: str = "NaN", quoting=None, **kwargs - ) -> npt.NDArray[np.object_]: - # GH 28210: use base method but with different default na_rep - return super()._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) - - def _format_data(self, name=None) -> str: - # TODO: integrate with categorical and make generic - # name argument is unused here; just for compat with base / categorical - return f"{self._data._format_data()},{self._format_space()}" - - # -------------------------------------------------------------------- # Set Operations def _intersection(self, other, sort): @@ -1038,8 +1021,9 @@ >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), ... end=pd.Timestamp('2017-01-04')) - IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03], - (2017-01-03, 2017-01-04]], + IntervalIndex([(2017-01-01 00:00:00, 2017-01-02 00:00:00], + (2017-01-02 00:00:00, 2017-01-03 00:00:00], + (2017-01-03 00:00:00, 2017-01-04 00:00:00]], dtype='interval[datetime64[ns], right]') The ``freq`` parameter specifies the frequency between the left and right. @@ -1055,8 +1039,9 @@ >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), ... periods=3, freq='MS') - IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01], - (2017-03-01, 2017-04-01]], + IntervalIndex([(2017-01-01 00:00:00, 2017-02-01 00:00:00], + (2017-02-01 00:00:00, 2017-03-01 00:00:00], + (2017-03-01 00:00:00, 2017-04-01 00:00:00]], dtype='interval[datetime64[ns], right]') Specify ``start``, ``end``, and ``periods``; the frequency is generated @@ -1091,10 +1076,7 @@ if not _is_valid_endpoint(end): raise ValueError(f"end must be numeric or datetime-like, got {end}") - if is_float(periods): - periods = int(periods) - elif not is_integer(periods) and periods is not None: - raise TypeError(f"periods must be a number, got {periods}") + periods = validate_periods(periods) if freq is not None and not is_number(freq): try: diff -Nru pandas-2.1.4+dfsg/pandas/core/indexes/multi.py pandas-2.2.2+dfsg/pandas/core/indexes/multi.py --- pandas-2.1.4+dfsg/pandas/core/indexes/multi.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexes/multi.py 2024-04-10 17:42:52.000000000 +0000 @@ -38,6 +38,7 @@ IgnoreRaise, IndexLabel, Scalar, + Self, Shape, npt, ) @@ -72,9 +73,7 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCDatetimeIndex, ABCSeries, - ABCTimedeltaIndex, ) from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import ( @@ -108,7 +107,10 @@ lexsort_indexer, ) -from pandas.io.formats.printing import pprint_thing +from pandas.io.formats.printing import ( + get_adjustment, + pprint_thing, +) if TYPE_CHECKING: from pandas import ( @@ -330,7 +332,7 @@ copy: bool = False, name=None, verify_integrity: bool = True, - ) -> MultiIndex: + ) -> Self: # compat with Index if name is not None: names = name @@ -767,12 +769,12 @@ vals = cast("CategoricalIndex", vals) vals = vals._data._internal_get_values() - if isinstance(vals.dtype, ExtensionDtype) or isinstance( - vals, (ABCDatetimeIndex, ABCTimedeltaIndex) + if isinstance(vals.dtype, ExtensionDtype) or lib.is_np_dtype( + vals.dtype, "mM" ): vals = vals.astype(object) - vals = np.array(vals, copy=False) + vals = 
np.asarray(vals) vals = algos.take_nd(vals, codes, fill_value=index._na_value) values.append(vals) @@ -841,6 +843,54 @@ @cache_readonly def levels(self) -> FrozenList: + """ + Levels of the MultiIndex. + + Levels refer to the different hierarchical levels or layers in a MultiIndex. + In a MultiIndex, each level represents a distinct dimension or category of + the index. + + To access the levels, you can use the levels attribute of the MultiIndex, + which returns a tuple of Index objects. Each Index object represents a + level in the MultiIndex and contains the unique values found in that + specific level. + + If a MultiIndex is created with levels A, B, C, and the DataFrame using + it filters out all rows of the level C, MultiIndex.levels will still + return A, B, C. + + Examples + -------- + >>> index = pd.MultiIndex.from_product([['mammal'], + ... ('goat', 'human', 'cat', 'dog')], + ... names=['Category', 'Animals']) + >>> leg_num = pd.DataFrame(data=(4, 2, 4, 4), index=index, columns=['Legs']) + >>> leg_num + Legs + Category Animals + mammal goat 4 + human 2 + cat 4 + dog 4 + + >>> leg_num.index.levels + FrozenList([['mammal'], ['cat', 'dog', 'goat', 'human']]) + + MultiIndex levels will not change even if the DataFrame using the MultiIndex + does not contain all them anymore. + See how "human" is not in the DataFrame, but it is still in levels: + + >>> large_leg_num = leg_num[leg_num.Legs > 2] + >>> large_leg_num + Legs + Category Animals + mammal goat 4 + cat 4 + dog 4 + + >>> large_leg_num.index.levels + FrozenList([['mammal'], ['cat', 'dog', 'goat', 'human']]) + """ # Use cache_readonly to ensure that self.get_locs doesn't repeatedly # create new IndexEngine # https://github.com/pandas-dev/pandas/issues/31648 @@ -1029,7 +1079,7 @@ # Codes Methods @property - def codes(self): + def codes(self) -> FrozenList: return self._codes def _set_codes( @@ -1073,7 +1123,9 @@ self._reset_cache() - def set_codes(self, codes, *, level=None, verify_integrity: bool = True): + def set_codes( + self, codes, *, level=None, verify_integrity: bool = True + ) -> MultiIndex: """ Set new codes on MultiIndex. Defaults to returning new index. @@ -1198,7 +1250,7 @@ names=None, deep: bool = False, name=None, - ): + ) -> Self: """ Make a copy of this object. 
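
The new MultiIndex.levels docstring above stresses that levels are not narrowed when rows are filtered out. A small sketch of that behavior, reusing the docstring's own example, plus MultiIndex.remove_unused_levels, an existing API that is not part of this diff, as the way to rebuild levels from the codes actually in use:

    import pandas as pd

    mi = pd.MultiIndex.from_product(
        [["mammal"], ("goat", "human", "cat", "dog")],
        names=["Category", "Animals"],
    )
    df = pd.DataFrame({"Legs": [4, 2, 4, 4]}, index=mi)

    # Filtering rows does not shrink the levels; 'human' is still listed.
    filtered = df[df.Legs > 2]
    print(filtered.index.levels)

    # remove_unused_levels() recomputes the levels from the remaining codes.
    print(filtered.index.remove_unused_levels().levels)
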
@@ -1257,11 +1309,11 @@ new_index._id = self._id return new_index - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" return self.values - def view(self, cls=None): + def view(self, cls=None) -> Self: """this is defined as a copy with the same identity""" result = self.copy() result._id = self._id @@ -1333,7 +1385,7 @@ formatter_funcs = [level._formatter_func for level in self.levels] return tuple(func(val) for func, val in zip(formatter_funcs, tup)) - def _format_native_types( + def _get_values_for_csv( self, *, na_rep: str = "nan", **kwargs ) -> npt.NDArray[np.object_]: new_levels = [] @@ -1341,7 +1393,7 @@ # go through the levels and format them for level, level_codes in zip(self.levels, self.codes): - level_strs = level._format_native_types(na_rep=na_rep, **kwargs) + level_strs = level._get_values_for_csv(na_rep=na_rep, **kwargs) # add nan values, if there are any mask = level_codes == -1 if mask.any(): @@ -1357,7 +1409,7 @@ if len(new_levels) == 1: # a single-level multi-index - return Index(new_levels[0].take(new_codes[0]))._format_native_types() + return Index(new_levels[0].take(new_codes[0]))._get_values_for_csv() else: # reconstruct the multi-index mi = MultiIndex( @@ -1379,6 +1431,15 @@ sparsify=None, adjoin: bool = True, ) -> list: + warnings.warn( + # GH#55413 + f"{type(self).__name__}.format is deprecated and will be removed " + "in a future version. Convert using index.astype(str) or " + "index.map(formatter) instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if name is not None: names = name @@ -1436,13 +1497,74 @@ ) if adjoin: - from pandas.io.formats.format import get_adjustment - adj = get_adjustment() return adj.adjoin(space, *result_levels).split("\n") else: return result_levels + def _format_multi( + self, + *, + include_names: bool, + sparsify: bool | None | lib.NoDefault, + formatter: Callable | None = None, + ) -> list: + if len(self) == 0: + return [] + + stringified_levels = [] + for lev, level_codes in zip(self.levels, self.codes): + na = _get_na_rep(lev.dtype) + + if len(lev) > 0: + taken = formatted = lev.take(level_codes) + formatted = taken._format_flat(include_name=False, formatter=formatter) + + # we have some NA + mask = level_codes == -1 + if mask.any(): + formatted = np.array(formatted, dtype=object) + formatted[mask] = na + formatted = formatted.tolist() + + else: + # weird all NA case + formatted = [ + pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n")) + for x in algos.take_nd(lev._values, level_codes) + ] + stringified_levels.append(formatted) + + result_levels = [] + for lev, lev_name in zip(stringified_levels, self.names): + level = [] + + if include_names: + level.append( + pprint_thing(lev_name, escape_chars=("\t", "\r", "\n")) + if lev_name is not None + else "" + ) + + level.extend(np.array(lev, dtype=object)) + result_levels.append(level) + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + if sparsify: + sentinel: Literal[""] | bool | lib.NoDefault = "" + # GH3547 use value of sparsify as sentinel if it's "Falsey" + assert isinstance(sparsify, bool) or sparsify is lib.no_default + if sparsify is lib.no_default: + sentinel = sparsify + # little bit of a kludge job for #1217 + result_levels = sparsify_labels( + result_levels, start=int(include_names), sentinel=sentinel + ) + + return result_levels + # -------------------------------------------------------------------- # Names Methods @@ 
-1658,7 +1780,8 @@ filled = algos.take_nd(lev._values, level_codes, fill_value=lev._na_value) return lev._shallow_copy(filled, name=name) - def get_level_values(self, level): + # error: Signature of "get_level_values" incompatible with supertype "Index" + def get_level_values(self, level) -> Index: # type: ignore[override] """ Return vector of label values for requested level. @@ -1944,7 +2067,7 @@ # indexer to reorder the level codes indexer = ensure_platform_int(indexer) ri = lib.get_reverse_indexer(indexer, len(indexer)) - level_codes = algos.take_nd(ri, level_codes) + level_codes = algos.take_nd(ri, level_codes, fill_value=-1) new_levels.append(lev) new_codes.append(level_codes) @@ -2207,17 +2330,9 @@ def argsort( self, *args, na_position: str = "last", **kwargs ) -> npt.NDArray[np.intp]: - if len(args) == 0 and len(kwargs) == 0: - # lexsort is significantly faster than self._values.argsort() - target = self._sort_levels_monotonic(raise_if_incomparable=True) - return lexsort_indexer( - # error: Argument 1 to "lexsort_indexer" has incompatible type - # "List[Categorical]"; expected "Union[List[Union[ExtensionArray, - # ndarray[Any, Any]]], List[Series]]" - target._get_codes_for_sorting(), # type: ignore[arg-type] - na_position=na_position, - ) - return self._values.argsort(*args, **kwargs) + target = self._sort_levels_monotonic(raise_if_incomparable=True) + keys = [lev.codes for lev in target._get_codes_for_sorting()] + return lexsort_indexer(keys, na_position=na_position, codes_given=True) @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats: int, axis=None) -> MultiIndex: @@ -3282,7 +3397,7 @@ locs = (level_codes >= idx.start) & (level_codes < idx.stop) return locs - locs = np.array(level_codes == idx, dtype=bool, copy=False) + locs = np.asarray(level_codes == idx, dtype=bool) if not locs.any(): # The label is present in self.levels[level] but unused: @@ -3303,7 +3418,7 @@ raise KeyError(key) return slice(start, end) - def get_locs(self, seq): + def get_locs(self, seq) -> npt.NDArray[np.intp]: """ Get location for a sequence of labels. 
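
The hunk above tightens MultiIndex.get_locs to advertise an integer-position result (npt.NDArray[np.intp]). A short usage sketch with a toy two-level index; the labels are illustrative and the behavior follows the method's documented semantics rather than anything specific to this diff:

    import numpy as np
    import pandas as pd

    mi = pd.MultiIndex.from_arrays([list("abb"), list("def")])

    # A single label indexes the first level and returns positional locations.
    locs = mi.get_locs("b")

    # A sequence gives one selector per level; slice(None) means "all labels".
    locs2 = mi.get_locs([slice(None), ["e", "f"]])

    print(locs, locs2)                  # both select positions 1 and 2 here
    print(np.array_equal(locs, locs2))  # True for this toy index
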
@@ -3373,6 +3488,8 @@ "is not the same length as the index" ) lvl_indexer = np.asarray(k) + if indexer is None: + lvl_indexer = lvl_indexer.copy() elif is_list_like(k): # a collection of labels to include from this level (these are or'd) @@ -3939,14 +4056,18 @@ for i, (p, t) in enumerate(zip(prev, cur)): if i == k - 1: sparse_cur.append(t) - result.append(sparse_cur) + # error: Argument 1 to "append" of "list" has incompatible + # type "list[Any]"; expected "tuple[Any, ...]" + result.append(sparse_cur) # type: ignore[arg-type] break if p == t: sparse_cur.append(sentinel) else: sparse_cur.extend(cur[i:]) - result.append(sparse_cur) + # error: Argument 1 to "append" of "list" has incompatible + # type "list[Any]"; expected "tuple[Any, ...]" + result.append(sparse_cur) # type: ignore[arg-type] break prev = cur diff -Nru pandas-2.1.4+dfsg/pandas/core/indexes/period.py pandas-2.2.2+dfsg/pandas/core/indexes/period.py --- pandas-2.1.4+dfsg/pandas/core/indexes/period.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexes/period.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,7 @@ timedelta, ) from typing import TYPE_CHECKING +import warnings import numpy as np @@ -16,10 +17,12 @@ Resolution, Tick, ) +from pandas._libs.tslibs.dtypes import OFFSET_TO_PERIOD_FREQSTR from pandas.util._decorators import ( cache_readonly, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import is_integer from pandas.core.dtypes.dtypes import PeriodDtype @@ -79,7 +82,7 @@ PeriodArray, wrap=True, ) -@inherit_names(["is_leap_year", "_format_native_types"], PeriodArray) +@inherit_names(["is_leap_year"], PeriodArray) class PeriodIndex(DatetimeIndexOpsMixin): """ Immutable ndarray holding ordinal values indicating regular periods in time. @@ -96,12 +99,33 @@ freq : str or period object, optional One of pandas period strings or corresponding objects. year : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. month : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. quarter : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. day : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. hour : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. minute : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. second : int, array, or Series, default None + + .. deprecated:: 2.2.0 + Use PeriodIndex.from_fields instead. dtype : str or PeriodDtype, default None Attributes @@ -134,6 +158,8 @@ asfreq strftime to_timestamp + from_fields + from_ordinals See Also -------- @@ -145,7 +171,7 @@ Examples -------- - >>> idx = pd.PeriodIndex(year=[2000, 2002], quarter=[1, 3]) + >>> idx = pd.PeriodIndex.from_fields(year=[2000, 2002], quarter=[1, 3]) >>> idx PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]') """ @@ -232,6 +258,24 @@ if not set(fields).issubset(valid_field_set): argument = next(iter(set(fields) - valid_field_set)) raise TypeError(f"__new__() got an unexpected keyword argument {argument}") + elif len(fields): + # GH#55960 + warnings.warn( + "Constructing PeriodIndex from fields is deprecated. 
Use " + "PeriodIndex.from_fields instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + if ordinal is not None: + # GH#55960 + warnings.warn( + "The 'ordinal' keyword in PeriodIndex is deprecated and will " + "be removed in a future version. Use PeriodIndex.from_ordinals " + "instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) name = maybe_extract_name(name, data, cls) @@ -240,14 +284,14 @@ if not fields: # test_pickle_compat_construction cls._raise_scalar_data_error(None) + data = cls.from_fields(**fields, freq=freq)._data + copy = False - data, freq2 = PeriodArray._generate_range(None, None, None, freq, fields) - # PeriodArray._generate range does validation that fields is - # empty when really using the range-based constructor. - freq = freq2 + elif fields: + if data is not None: + raise ValueError("Cannot pass both data and fields") + raise ValueError("Cannot pass both ordinal and fields") - dtype = PeriodDtype(freq) - data = PeriodArray(data, dtype=dtype) else: freq = validate_dtype_freq(dtype, freq) @@ -260,10 +304,11 @@ data = data.asfreq(freq) if data is None and ordinal is not None: - # we strangely ignore `ordinal` if data is passed. ordinal = np.asarray(ordinal, dtype=np.int64) dtype = PeriodDtype(freq) data = PeriodArray(ordinal, dtype=dtype) + elif data is not None and ordinal is not None: + raise ValueError("Cannot pass both data and ordinal") else: # don't pass copy here, since we copy later. data = period_array(data=data, freq=freq) @@ -273,6 +318,39 @@ return cls._simple_new(data, name=name, refs=refs) + @classmethod + def from_fields( + cls, + *, + year=None, + quarter=None, + month=None, + day=None, + hour=None, + minute=None, + second=None, + freq=None, + ) -> Self: + fields = { + "year": year, + "quarter": quarter, + "month": month, + "day": day, + "hour": hour, + "minute": minute, + "second": second, + } + fields = {key: value for key, value in fields.items() if value is not None} + arr = PeriodArray._from_fields(fields=fields, freq=freq) + return cls._simple_new(arr) + + @classmethod + def from_ordinals(cls, ordinals, *, freq, name=None) -> Self: + ordinals = np.asarray(ordinals, dtype=np.int64) + dtype = PeriodDtype(freq) + data = PeriodArray._simple_new(ordinals, dtype=dtype) + return cls._simple_new(data, name=name) + # ------------------------------------------------------------------------ # Data @@ -453,7 +531,8 @@ return super()._maybe_cast_slice_bound(label, side) def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): - iv = Period(parsed, freq=reso.attr_abbrev) + freq = OFFSET_TO_PERIOD_FREQSTR.get(reso.attr_abbrev, reso.attr_abbrev) + iv = Period(parsed, freq=freq) return (iv.asfreq(self.freq, how="start"), iv.asfreq(self.freq, how="end")) @doc(DatetimeIndexOpsMixin.shift) @@ -529,7 +608,7 @@ if freq is None and (not isinstance(start, Period) and not isinstance(end, Period)): freq = "D" - data, freq = PeriodArray._generate_range(start, end, periods, freq, fields={}) + data, freq = PeriodArray._generate_range(start, end, periods, freq) dtype = PeriodDtype(freq) data = PeriodArray(data, dtype=dtype) return PeriodIndex(data, name=name) diff -Nru pandas-2.1.4+dfsg/pandas/core/indexes/range.py pandas-2.2.2+dfsg/pandas/core/indexes/range.py --- pandas-2.1.4+dfsg/pandas/core/indexes/range.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexes/range.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,7 +11,9 @@ TYPE_CHECKING, Any, Callable, + Literal, cast, + overload, ) import numpy as np @@ -25,6 
+27,7 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import ( cache_readonly, + deprecate_nonkeyword_arguments, doc, ) @@ -139,12 +142,12 @@ dtype: Dtype | None = None, copy: bool = False, name: Hashable | None = None, - ) -> RangeIndex: + ) -> Self: cls._validate_dtype(dtype) name = maybe_extract_name(name, start, cls) # RangeIndex - if isinstance(start, RangeIndex): + if isinstance(start, cls): return start.copy(name=name) elif isinstance(start, range): return cls._simple_new(start, name=name) @@ -240,7 +243,7 @@ """ return np.arange(self.start, self.stop, self.step, dtype=np.int64) - def _get_data_as_items(self): + def _get_data_as_items(self) -> list[tuple[str, int]]: """return a list of tuples of start, stop, step""" rng = self._range return [("start", rng.start), ("stop", rng.stop), ("step", rng.step)] @@ -257,16 +260,12 @@ """ Return a list of tuples of the (attr, formatted_value) """ - attrs = self._get_data_as_items() + attrs = cast("list[tuple[str, str | int]]", self._get_data_as_items()) if self._name is not None: attrs.append(("name", ibase.default_pprint(self._name))) return attrs - def _format_data(self, name=None): - # we are formatting thru the attributes - return None - - def _format_with_header(self, header: list[str], na_rep: str) -> list[str]: + def _format_with_header(self, *, header: list[str], na_rep: str) -> list[str]: # Equivalent to Index implementation, but faster if not len(self._range): return header @@ -407,7 +406,7 @@ # Indexing Methods @doc(Index.get_loc) - def get_loc(self, key): + def get_loc(self, key) -> int: if is_integer(key) or (is_float(key) and key.is_integer()): new_key = int(key) try: @@ -559,13 +558,50 @@ return self._range == other._range return super().equals(other) + # error: Signature of "sort_values" incompatible with supertype "Index" + @overload # type: ignore[override] + def sort_values( + self, + *, + return_indexer: Literal[False] = ..., + ascending: bool = ..., + na_position: NaPosition = ..., + key: Callable | None = ..., + ) -> Self: + ... + + @overload + def sort_values( + self, + *, + return_indexer: Literal[True], + ascending: bool = ..., + na_position: NaPosition = ..., + key: Callable | None = ..., + ) -> tuple[Self, np.ndarray | RangeIndex]: + ... + + @overload + def sort_values( + self, + *, + return_indexer: bool = ..., + ascending: bool = ..., + na_position: NaPosition = ..., + key: Callable | None = ..., + ) -> Self | tuple[Self, np.ndarray | RangeIndex]: + ... 
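# A rough usage sketch for the RangeIndex.sort_values signature change in this
# hunk: the deprecate_nonkeyword_arguments decorator below makes everything
# after `self` keyword-only as of pandas 3.0, so positional calls warn in 2.2.
import pandas as pd

idx = pd.RangeIndex(5, 0, -1)
idx.sort_values(ascending=True)                       # keyword call, no warning
res, indexer = idx.sort_values(return_indexer=True)   # returns (sorted index, indexer)
# idx.sort_values(True)   # positional: FutureWarning, disallowed in pandas 3.0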
+ + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self"], name="sort_values" + ) def sort_values( self, return_indexer: bool = False, ascending: bool = True, na_position: NaPosition = "last", key: Callable | None = None, - ): + ) -> Self | tuple[Self, np.ndarray | RangeIndex]: if key is not None: return super().sort_values( return_indexer=return_indexer, @@ -1107,14 +1143,16 @@ # test_arithmetic_explicit_conversions return super()._arith_method(other, op) - def take( + # error: Return type "Index" of "take" incompatible with return type + # "RangeIndex" in supertype "Index" + def take( # type: ignore[override] self, indices, axis: Axis = 0, allow_fill: bool = True, fill_value=None, **kwargs, - ): + ) -> Index: if kwargs: nv.validate_take((), kwargs) if is_scalar(indices): diff -Nru pandas-2.1.4+dfsg/pandas/core/indexes/timedeltas.py pandas-2.2.2+dfsg/pandas/core/indexes/timedeltas.py --- pandas-2.1.4+dfsg/pandas/core/indexes/timedeltas.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexes/timedeltas.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,6 +13,7 @@ Timedelta, to_offset, ) +from pandas._libs.tslibs.timedeltas import disallow_ambiguous_unit from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( @@ -21,7 +22,6 @@ ) from pandas.core.dtypes.generic import ABCSeries -from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays.timedeltas import TimedeltaArray import pandas.core.common as com from pandas.core.indexes.base import ( @@ -48,7 +48,6 @@ "sum", "std", "median", - "_format_native_types", ], TimedeltaArray, ) @@ -64,6 +63,10 @@ Optional timedelta-like data to construct index with. unit : {'D', 'h', 'm', 's', 'ms', 'us', 'ns'}, optional The unit of ``data``. + + .. deprecated:: 2.2.0 + Use ``pd.to_timedelta`` instead. + freq : str or pandas offset object, optional One of pandas date offset strings or corresponding objects. The string ``'infer'`` can be passed in order to set the frequency of the index as @@ -114,13 +117,9 @@ TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) - >>> pd.TimedeltaIndex([1, 2, 4, 8], unit='D') - TimedeltaIndex(['1 days', '2 days', '4 days', '8 days'], - dtype='timedelta64[ns]', freq=None) - We can also let pandas infer the frequency when possible. - >>> pd.TimedeltaIndex(range(5), unit='D', freq='infer') + >>> pd.TimedeltaIndex(np.arange(5) * 24 * 3600 * 1e9, freq='infer') TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq='D') """ @@ -150,7 +149,7 @@ def __new__( cls, data=None, - unit=None, + unit=lib.no_default, freq=lib.no_default, closed=lib.no_default, dtype=None, @@ -166,16 +165,24 @@ stacklevel=find_stack_level(), ) + if unit is not lib.no_default: + # GH#55499 + warnings.warn( + f"The 'unit' keyword in {cls.__name__} construction is " + "deprecated and will be removed in a future version. " + "Use pd.to_timedelta instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + unit = None + name = maybe_extract_name(name, data, cls) if is_scalar(data): cls._raise_scalar_data_error(data) - if unit in {"Y", "y", "M"}: - raise ValueError( - "Units 'M', 'Y', and 'y' are no longer supported, as they do not " - "represent unambiguous timedelta values durations." - ) + disallow_ambiguous_unit(unit) if dtype is not None: dtype = pandas_dtype(dtype) @@ -278,7 +285,7 @@ periods : int, default None Number of periods to generate. 
freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'D' - Frequency strings can have multiples, e.g. '5H'. + Frequency strings can have multiples, e.g. '5h'. name : str, default None Name of the resulting TimedeltaIndex. closed : str, default None @@ -320,10 +327,10 @@ Only fixed frequencies can be passed, non-fixed frequencies such as 'M' (month end) will raise. - >>> pd.timedelta_range(start='1 day', end='2 days', freq='6H') + >>> pd.timedelta_range(start='1 day', end='2 days', freq='6h') TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00', '1 days 18:00:00', '2 days 00:00:00'], - dtype='timedelta64[ns]', freq='6H') + dtype='timedelta64[ns]', freq='6h') Specify ``start``, ``end``, and ``periods``; the frequency is generated automatically (linearly spaced). @@ -336,14 +343,13 @@ **Specify a unit** >>> pd.timedelta_range("1 Day", periods=3, freq="100000D", unit="s") - TimedeltaIndex(['1 days 00:00:00', '100001 days 00:00:00', - '200001 days 00:00:00'], + TimedeltaIndex(['1 days', '100001 days', '200001 days'], dtype='timedelta64[s]', freq='100000D') """ if freq is None and com.any_none(periods, start, end): freq = "D" - freq, _ = dtl.maybe_infer_freq(freq) + freq = to_offset(freq) tdarr = TimedeltaArray._generate_range( start, end, periods, freq, closed=closed, unit=unit ) diff -Nru pandas-2.1.4+dfsg/pandas/core/indexing.py pandas-2.2.2+dfsg/pandas/core/indexing.py --- pandas-2.1.4+dfsg/pandas/core/indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,6 +4,8 @@ import sys from typing import ( TYPE_CHECKING, + Any, + TypeVar, cast, final, ) @@ -11,7 +13,10 @@ import numpy as np -from pandas._config import using_copy_on_write +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) from pandas._libs.indexing import NDFrameIndexerBase from pandas._libs.lib import item_from_zerodim @@ -23,8 +28,11 @@ InvalidIndexError, LossySetitemError, _chained_assignment_msg, + _chained_assignment_warning_msg, + _check_cacher, ) from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( can_hold_element, @@ -49,6 +57,7 @@ ABCSeries, ) from pandas.core.dtypes.missing import ( + construct_1d_array_from_inferred_fill_value, infer_fill_value, is_valid_na_for_dtype, isna, @@ -82,6 +91,7 @@ Axis, AxisInt, Self, + npt, ) from pandas import ( @@ -89,6 +99,7 @@ Series, ) +T = TypeVar("T") # "null slice" _NS = slice(None, None) _one_ellipsis_message = "indexer may only contain one '...' entry" @@ -152,6 +163,10 @@ """ Purely integer-location based indexing for selection by position. + .. deprecated:: 2.2.0 + + Returning a tuple from a callable is deprecated. + ``.iloc[]`` is primarily integer position based (from ``0`` to ``length-1`` of the axis), but may also be used with a boolean array. @@ -165,7 +180,8 @@ - A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above). This is useful in method chains, when you don't have a reference to the - calling object, but would like to base your selection on some value. + calling object, but would like to base your selection on + some value. - A tuple of row and column indexes. The tuple elements consist of one of the above inputs, e.g. ``(0, 1)``. @@ -186,7 +202,7 @@ -------- >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, - ... 
{'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }] + ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000}] >>> df = pd.DataFrame(mydict) >>> df a b c d @@ -327,7 +343,7 @@ DataFrame.at : Access a single value for a row/column label pair. DataFrame.iloc : Access group of rows and columns by integer position(s). DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the - Series/DataFrame. + Series/DataFrame. Series.loc : Access group of values using labels. Examples @@ -335,8 +351,8 @@ **Getting values** >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=['cobra', 'viper', 'sidewinder'], - ... columns=['max_speed', 'shield']) + ... index=['cobra', 'viper', 'sidewinder'], + ... columns=['max_speed', 'shield']) >>> df max_speed shield cobra 1 2 @@ -379,8 +395,8 @@ Alignable boolean Series: >>> df.loc[pd.Series([False, True, False], - ... index=['viper', 'sidewinder', 'cobra'])] - max_speed shield + ... index=['viper', 'sidewinder', 'cobra'])] + max_speed shield sidewinder 7 8 Index (same behavior as ``df.reindex``) @@ -406,7 +422,7 @@ Multiple conditional using ``&`` that returns a boolean Series >>> df.loc[(df['max_speed'] > 1) & (df['shield'] < 8)] - max_speed shield + max_speed shield viper 4 5 Multiple conditional using ``|`` that returns a boolean Series @@ -495,7 +511,7 @@ Another example using integers for the index >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=[7, 8, 9], columns=['max_speed', 'shield']) + ... index=[7, 8, 9], columns=['max_speed', 'shield']) >>> df max_speed shield 7 1 2 @@ -516,13 +532,13 @@ A number of examples using a DataFrame with a MultiIndex >>> tuples = [ - ... ('cobra', 'mark i'), ('cobra', 'mark ii'), - ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), - ... ('viper', 'mark ii'), ('viper', 'mark iii') + ... ('cobra', 'mark i'), ('cobra', 'mark ii'), + ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), + ... ('viper', 'mark ii'), ('viper', 'mark iii') ... ] >>> index = pd.MultiIndex.from_tuples(tuples) >>> values = [[12, 2], [0, 4], [10, 20], - ... [1, 4], [7, 1], [16, 36]] + ... [1, 4], [7, 1], [16, 36]] >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) >>> df max_speed shield @@ -604,12 +620,12 @@ Raises ------ KeyError - * If getting a value and 'label' does not exist in a DataFrame or - Series. + If getting a value and 'label' does not exist in a DataFrame or Series. + ValueError - * If row/column label pair is not a tuple or if any label from - the pair is not a scalar for DataFrame. - * If label is list-like (*excluding* NamedTuple) for Series. + If row/column label pair is not a tuple or if any label + from the pair is not a scalar for DataFrame. + If label is list-like (*excluding* NamedTuple) for Series. See Also -------- @@ -828,7 +844,6 @@ if self.ndim != 2: return - orig_key = key if isinstance(key, tuple) and len(key) > 1: # key may be a tuple if we are .loc # if length of key is > 1 set key to column part @@ -846,7 +861,7 @@ keys = self.obj.columns.union(key, sort=False) diff = Index(key).difference(self.obj.columns, sort=False) - if len(diff) and com.is_null_slice(orig_key[0]): + if len(diff): # e.g. 
if we are doing df.loc[:, ["A", "B"]] = 7 and "B" # is a new column, add the new columns with dtype=np.void # so that later when we go through setitem_single_column @@ -871,13 +886,24 @@ warnings.warn( _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self.obj) + ref_count = 2 + if not warn_copy_on_write() and _check_cacher(self.obj): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_msg, FutureWarning, stacklevel=2 + ) check_dict_or_set_indexers(key) if isinstance(key, tuple): key = tuple(list(x) if is_iterator(x) else x for x in key) key = tuple(com.apply_if_callable(x, self.obj) for x in key) else: - key = com.apply_if_callable(key, self.obj) + maybe_callable = com.apply_if_callable(key, self.obj) + key = self._check_deprecated_callable_usage(key, maybe_callable) indexer = self._get_setitem_indexer(key) self._has_valid_setitem_indexer(key) @@ -984,7 +1010,7 @@ This is only called after a failed call to _getitem_lowerdim. """ retval = self.obj - # Selecting columns before rows is signficiantly faster + # Selecting columns before rows is significantly faster start_val = (self.ndim - len(tup)) + 1 for i, key in enumerate(reversed(tup)): i = self.ndim - i - start_val @@ -1136,6 +1162,17 @@ def _convert_to_indexer(self, key, axis: AxisInt): raise AbstractMethodError(self) + def _check_deprecated_callable_usage(self, key: Any, maybe_callable: T) -> T: + # GH53533 + if self.name == "iloc" and callable(key) and isinstance(maybe_callable, tuple): + warnings.warn( + "Returning a tuple from a callable with iloc " + "is deprecated and will be removed in a future version", + FutureWarning, + stacklevel=find_stack_level(), + ) + return maybe_callable + @final def __getitem__(self, key): check_dict_or_set_indexers(key) @@ -1150,6 +1187,7 @@ axis = self.axis or 0 maybe_callable = com.apply_if_callable(key, self.obj) + maybe_callable = self._check_deprecated_callable_usage(key, maybe_callable) return self._getitem_axis(maybe_callable, axis=axis) def _is_scalar_access(self, key: tuple): @@ -1384,7 +1422,7 @@ # nested tuple slicing if is_nested_tuple(key, labels): locs = labels.get_locs(key) - indexer = [slice(None)] * self.ndim + indexer: list[slice | npt.NDArray[np.intp]] = [slice(None)] * self.ndim indexer[axis] = locs return self.obj.iloc[tuple(indexer)] @@ -1838,7 +1876,10 @@ return self.obj[key] = empty_value - + elif not is_list_like(value): + self.obj[key] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) else: # FIXME: GH#42099#issuecomment-864326014 self.obj[key] = infer_fill_value(value) @@ -1855,7 +1896,15 @@ # just replacing the block manager here # so the object is the same index = self.obj._get_axis(i) - labels = index.insert(len(index), key) + with warnings.catch_warnings(): + # TODO: re-issue this with setitem-specific message? + warnings.filterwarnings( + "ignore", + "The behavior of Index.insert with object-dtype " + "is deprecated", + category=FutureWarning, + ) + labels = index.insert(len(index), key) # We are expanding the Series/DataFrame values to match # the length of thenew index `labels`. 
GH#40096 ensure @@ -2088,10 +2137,41 @@ # If we're setting an entire column and we can't do it inplace, # then we can use value's dtype (or inferred dtype) # instead of object + dtype = self.obj.dtypes.iloc[loc] + if dtype not in (np.void, object) and not self.obj.empty: + # - Exclude np.void, as that is a special case for expansion. + # We want to warn for + # df = pd.DataFrame({'a': [1, 2]}) + # df.loc[:, 'a'] = .3 + # but not for + # df = pd.DataFrame({'a': [1, 2]}) + # df.loc[:, 'b'] = .3 + # - Exclude `object`, as then no upcasting happens. + # - Exclude empty initial object with enlargement, + # as then there's nothing to be inconsistent with. + warnings.warn( + f"Setting an item of incompatible dtype is deprecated " + "and will raise in a future error of pandas. " + f"Value '{value}' has dtype incompatible with {dtype}, " + "please explicitly cast to a compatible dtype first.", + FutureWarning, + stacklevel=find_stack_level(), + ) self.obj.isetitem(loc, value) else: # set value into the column (first attempting to operate inplace, then # falling back to casting if necessary) + dtype = self.obj.dtypes.iloc[loc] + if dtype == np.void: + # This means we're expanding, with multiple columns, e.g. + # df = pd.DataFrame({'A': [1,2,3], 'B': [4,5,6]}) + # df.loc[df.index <= 2, ['F', 'G']] = (1, 'abc') + # Columns F and G will initially be set to np.void. + # Here, we replace those temporary `np.void` columns with + # columns of the appropriate dtype, based on `value`. + self.obj.iloc[:, loc] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) self.obj._mgr.column_setitem(loc, plane_indexer, value) self.obj._clear_item_cache() @@ -2102,6 +2182,12 @@ """ from pandas import Series + if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): + # TODO(EA): ExtensionBlock.setitem this causes issues with + # setting for extensionarrays that store dicts. Need to decide + # if it's worth supporting that. + value = self._align_series(indexer, Series(value)) + info_axis = self.obj._info_axis_number item_labels = self.obj._get_axis(info_axis) if isinstance(indexer, tuple): @@ -2122,13 +2208,7 @@ indexer = maybe_convert_ix(*indexer) # e.g. test_setitem_frame_align - if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): - # TODO(EA): ExtensionBlock.setitem this causes issues with - # setting for extensionarrays that store dicts. Need to decide - # if it's worth supporting that. - value = self._align_series(indexer, Series(value)) - - elif isinstance(value, ABCDataFrame) and name != "iloc": + if isinstance(value, ABCDataFrame) and name != "iloc": value = self._align_frame(indexer, value)._values # check for chained assignment @@ -2148,7 +2228,14 @@ # and set inplace if self.ndim == 1: index = self.obj.index - new_index = index.insert(len(index), indexer) + with warnings.catch_warnings(): + # TODO: re-issue this with setitem-specific message? + warnings.filterwarnings( + "ignore", + "The behavior of Index.insert with object-dtype is deprecated", + category=FutureWarning, + ) + new_index = index.insert(len(index), indexer) # we have a coerced indexer, e.g. 
a float # that matches in an int64 Index, so diff -Nru pandas-2.1.4+dfsg/pandas/core/interchange/buffer.py pandas-2.2.2+dfsg/pandas/core/interchange/buffer.py --- pandas-2.1.4+dfsg/pandas/core/interchange/buffer.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/interchange/buffer.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,16 +1,18 @@ from __future__ import annotations -from typing import Any - -import numpy as np +from typing import ( + TYPE_CHECKING, + Any, +) from pandas.core.interchange.dataframe_protocol import ( Buffer, DlpackDeviceType, ) -from pandas.util.version import Version -_NUMPY_HAS_DLPACK = Version(np.__version__) >= Version("1.22.0") +if TYPE_CHECKING: + import numpy as np + import pyarrow as pa class PandasBuffer(Buffer): @@ -22,7 +24,7 @@ """ Handle only regular columns (= numpy arrays) for now. """ - if not x.strides == (x.dtype.itemsize,): + if x.strides[0] and not x.strides == (x.dtype.itemsize,): # The protocol does not support strided buffers, so a copy is # necessary. If that's not allowed, we need to raise an exception. if allow_copy: @@ -55,9 +57,7 @@ """ Represent this structure as DLPack interface. """ - if _NUMPY_HAS_DLPACK: - return self._x.__dlpack__() - raise NotImplementedError("__dlpack__") + return self._x.__dlpack__() def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: """ @@ -76,4 +76,61 @@ } ) + ")" + ) + + +class PandasBufferPyarrow(Buffer): + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__( + self, + buffer: pa.Buffer, + *, + length: int, + ) -> None: + """ + Handle pyarrow chunked arrays. + """ + self._buffer = buffer + self._length = length + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._buffer.size + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._buffer.address + + def __dlpack__(self) -> Any: + """ + Represent this structure as DLPack interface. + """ + raise NotImplementedError() + + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. 
+ """ + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + return ( + "PandasBuffer[pyarrow](" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": "CPU", + } + ) + + ")" ) diff -Nru pandas-2.1.4+dfsg/pandas/core/interchange/column.py pandas-2.2.2+dfsg/pandas/core/interchange/column.py --- pandas-2.1.4+dfsg/pandas/core/interchange/column.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/interchange/column.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import Any +from typing import ( + TYPE_CHECKING, + Any, +) import numpy as np @@ -9,14 +12,18 @@ from pandas.errors import NoBufferPresent from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.dtypes import ( +from pandas.core.dtypes.dtypes import BaseMaskedDtype + +import pandas as pd +from pandas import ( ArrowDtype, DatetimeTZDtype, ) - -import pandas as pd from pandas.api.types import is_string_dtype -from pandas.core.interchange.buffer import PandasBuffer +from pandas.core.interchange.buffer import ( + PandasBuffer, + PandasBufferPyarrow, +) from pandas.core.interchange.dataframe_protocol import ( Column, ColumnBuffers, @@ -29,6 +36,9 @@ dtype_to_arrow_c_fmt, ) +if TYPE_CHECKING: + from pandas.core.interchange.dataframe_protocol import Buffer + _NP_KINDS = { "i": DtypeKind.INT, "u": DtypeKind.UINT, @@ -76,6 +86,14 @@ Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. """ + if isinstance(column, pd.DataFrame): + raise TypeError( + "Expected a Series, got a DataFrame. This likely happened " + "because you called __dataframe__ on a DataFrame which, " + "after converting column names to string, resulted in duplicated " + f"names: {column.columns}. Please rename these columns before " + "using the interchange protocol." + ) if not isinstance(column, pd.Series): raise NotImplementedError(f"Columns of type {type(column)} not handled yet") @@ -116,7 +134,7 @@ Endianness.NATIVE, ) elif is_string_dtype(dtype): - if infer_dtype(self._col) == "string": + if infer_dtype(self._col) in ("string", "empty"): return ( DtypeKind.STRING, 8, @@ -143,9 +161,21 @@ byteorder = dtype.numpy_dtype.byteorder elif isinstance(dtype, DatetimeTZDtype): byteorder = dtype.base.byteorder # type: ignore[union-attr] + elif isinstance(dtype, BaseMaskedDtype): + byteorder = dtype.numpy_dtype.byteorder else: byteorder = dtype.byteorder + if dtype == "bool[pyarrow]": + # return early to avoid the `* 8` below, as this is a bitmask + # rather than a bytemask + return ( + kind, + dtype.itemsize, # pyright: ignore[reportGeneralTypeIssues] + ArrowCTypes.BOOL, + byteorder, + ) + return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder @property @@ -179,6 +209,16 @@ @property def describe_null(self): + if isinstance(self._col.dtype, BaseMaskedDtype): + column_null_dtype = ColumnNullType.USE_BYTEMASK + null_value = 1 + return column_null_dtype, null_value + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so this + # is already single-chunk by the time we get here. 
+ if self._col.array._pa_array.chunks[0].buffers()[0] is None: # type: ignore[attr-defined] + return ColumnNullType.NON_NULLABLE, None + return ColumnNullType.USE_BITMASK, 0 kind = self.dtype[0] try: null, value = _NULL_DESCRIPTION[kind] @@ -263,10 +303,11 @@ def _get_data_buffer( self, - ) -> tuple[PandasBuffer, Any]: # Any is for self.dtype tuple + ) -> tuple[Buffer, tuple[DtypeKind, int, str, str]]: """ Return the buffer containing the data and the buffer's associated dtype. """ + buffer: Buffer if self.dtype[0] in ( DtypeKind.INT, DtypeKind.UINT, @@ -276,12 +317,25 @@ ): # self.dtype[2] is an ArrowCTypes.TIMESTAMP where the tz will make # it longer than 4 characters + dtype = self.dtype if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4: np_arr = self._col.dt.tz_convert(None).to_numpy() else: - np_arr = self._col.to_numpy() + arr = self._col.array + if isinstance(self._col.dtype, BaseMaskedDtype): + np_arr = arr._data # type: ignore[attr-defined] + elif isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, + # so this is already single-chunk by the time we get here. + arr = arr._pa_array.chunks[0] # type: ignore[attr-defined] + buffer = PandasBufferPyarrow( + arr.buffers()[1], # type: ignore[attr-defined] + length=len(arr), + ) + return buffer, dtype + else: + np_arr = arr._ndarray # type: ignore[attr-defined] buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) - dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: codes = self._col.values._codes buffer = PandasBuffer(codes, allow_copy=self._allow_copy) @@ -301,24 +355,40 @@ buffer = PandasBuffer(np.frombuffer(b, dtype="uint8")) # Define the dtype for the returned buffer - dtype = ( - DtypeKind.STRING, - 8, - ArrowCTypes.STRING, - Endianness.NATIVE, - ) # note: currently only support native endianness + # TODO: this will need correcting + # https://github.com/pandas-dev/pandas/issues/54781 + dtype = self.dtype else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") return buffer, dtype - def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]: + def _get_validity_buffer(self) -> tuple[Buffer, Any] | None: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. Raises NoBufferPresent if null representation is not a bit or byte mask. """ null, invalid = self.describe_null + buffer: Buffer + if isinstance(self._col.dtype, ArrowDtype): + # We already rechunk (if necessary / allowed) upon initialization, so this + # is already single-chunk by the time we get here. + arr = self._col.array._pa_array.chunks[0] # type: ignore[attr-defined] + dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE) + if arr.buffers()[0] is None: + return None + buffer = PandasBufferPyarrow( + arr.buffers()[0], + length=len(arr), + ) + return buffer, dtype + + if isinstance(self._col.dtype, BaseMaskedDtype): + mask = self._col.array._mask # type: ignore[attr-defined] + buffer = PandasBuffer(mask) + dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE) + return buffer, dtype if self.dtype[0] == DtypeKind.STRING: # For now, use byte array as the mask. 
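# A rough sketch of what the describe_null and buffer changes above mean for a
# consumer of the interchange protocol: nullable (masked) columns now expose a
# byte validity mask, and pyarrow-backed columns expose their bitmask directly.
import pandas as pd

df = pd.DataFrame({"a": pd.array([1, None, 3], dtype="Int64")})
col = df.__dataframe__().get_column_by_name("a")
null_kind, null_value = col.describe_null
# expected: (ColumnNullType.USE_BYTEMASK, 1) for a masked Int64 column, with the
# mask itself available via col.get_buffers()["validity"]
buffers = col.get_buffers()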
diff -Nru pandas-2.1.4+dfsg/pandas/core/interchange/dataframe.py pandas-2.2.2+dfsg/pandas/core/interchange/dataframe.py --- pandas-2.1.4+dfsg/pandas/core/interchange/dataframe.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/interchange/dataframe.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,7 @@ from pandas.core.interchange.column import PandasColumn from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg +from pandas.core.interchange.utils import maybe_rechunk if TYPE_CHECKING: from collections.abc import ( @@ -27,25 +28,24 @@ attributes defined on this class. """ - def __init__( - self, df: DataFrame, nan_as_null: bool = False, allow_copy: bool = True - ) -> None: + def __init__(self, df: DataFrame, allow_copy: bool = True) -> None: """ Constructor - an instance of this (private) class is returned from `pd.DataFrame.__dataframe__`. """ - self._df = df - # ``nan_as_null`` is a keyword intended for the consumer to tell the - # producer to overwrite null values in the data with ``NaN`` (or ``NaT``). - # This currently has no effect; once support for nullable extension - # dtypes is added, this value should be propagated to columns. - self._nan_as_null = nan_as_null + self._df = df.rename(columns=str, copy=False) self._allow_copy = allow_copy + for i, _col in enumerate(self._df.columns): + rechunked = maybe_rechunk(self._df.iloc[:, i], allow_copy=allow_copy) + if rechunked is not None: + self._df.isetitem(i, rechunked) def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True ) -> PandasDataFrameXchg: - return PandasDataFrameXchg(self._df, nan_as_null, allow_copy) + # `nan_as_null` can be removed here once it's removed from + # Dataframe.__dataframe__ + return PandasDataFrameXchg(self._df, allow_copy) @property def metadata(self) -> dict[str, Index]: @@ -84,18 +84,16 @@ indices = list(indices) return PandasDataFrameXchg( - self._df.iloc[:, indices], self._nan_as_null, self._allow_copy + self._df.iloc[:, indices], allow_copy=self._allow_copy ) - def select_columns_by_name(self, names: list[str]) -> PandasDataFrameXchg: # type: ignore[override] # noqa: E501 + def select_columns_by_name(self, names: list[str]) -> PandasDataFrameXchg: # type: ignore[override] if not isinstance(names, abc.Sequence): raise ValueError("`names` is not a sequence") if not isinstance(names, list): names = list(names) - return PandasDataFrameXchg( - self._df.loc[:, names], self._nan_as_null, self._allow_copy - ) + return PandasDataFrameXchg(self._df.loc[:, names], allow_copy=self._allow_copy) def get_chunks(self, n_chunks: int | None = None) -> Iterable[PandasDataFrameXchg]: """ @@ -109,8 +107,7 @@ for start in range(0, step * n_chunks, step): yield PandasDataFrameXchg( self._df.iloc[start : start + step, :], - self._nan_as_null, - self._allow_copy, + allow_copy=self._allow_copy, ) else: yield self diff -Nru pandas-2.1.4+dfsg/pandas/core/interchange/from_dataframe.py pandas-2.2.2+dfsg/pandas/core/interchange/from_dataframe.py --- pandas-2.1.4+dfsg/pandas/core/interchange/from_dataframe.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/interchange/from_dataframe.py 2024-04-10 17:42:52.000000000 +0000 @@ -295,13 +295,14 @@ null_pos = None if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): - assert buffers["validity"], "Validity buffers cannot be empty for masks" - valid_buff, valid_dtype = buffers["validity"] - null_pos = buffer_to_ndarray( - valid_buff, valid_dtype, offset=col.offset, length=col.size() 
- ) - if sentinel_val == 0: - null_pos = ~null_pos + validity = buffers["validity"] + if validity is not None: + valid_buff, valid_dtype = validity + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) + if sentinel_val == 0: + null_pos = ~null_pos # Assemble the strings from the code units str_list: list[None | float | str] = [None] * col.size() @@ -486,6 +487,8 @@ np.ndarray or pd.Series Data with the nulls being set. """ + if validity is None: + return data null_kind, sentinel_val = col.describe_null null_pos = None diff -Nru pandas-2.1.4+dfsg/pandas/core/interchange/utils.py pandas-2.2.2+dfsg/pandas/core/interchange/utils.py --- pandas-2.1.4+dfsg/pandas/core/interchange/utils.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/interchange/utils.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,6 +16,8 @@ DatetimeTZDtype, ) +import pandas as pd + if typing.TYPE_CHECKING: from pandas._typing import DtypeObj @@ -37,6 +39,7 @@ "float": "f", # float32 "double": "g", # float64 "string": "u", + "large_string": "U", "binary": "z", "time32[s]": "tts", "time32[ms]": "ttm", @@ -141,6 +144,35 @@ elif isinstance(dtype, DatetimeTZDtype): return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz) + elif isinstance(dtype, pd.BooleanDtype): + return ArrowCTypes.BOOL + raise NotImplementedError( f"Conversion of {dtype} to Arrow C format string is not implemented." ) + + +def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None: + """ + Rechunk a multi-chunk pyarrow array into a single-chunk array, if necessary. + + - Returns `None` if the input series is not backed by a multi-chunk pyarrow array + (and so doesn't need rechunking) + - Returns a single-chunk-backed-Series if the input is backed by a multi-chunk + pyarrow array and `allow_copy` is `True`. + - Raises a `RuntimeError` if `allow_copy` is `False` and input is a + based by a multi-chunk pyarrow array. + """ + if not isinstance(series.dtype, pd.ArrowDtype): + return None + chunked_array = series.array._pa_array # type: ignore[attr-defined] + if len(chunked_array.chunks) == 1: + return None + if not allow_copy: + raise RuntimeError( + "Found multi-chunk pyarrow array, but `allow_copy` is False. " + "Please rechunk the array before calling this function, or set " + "`allow_copy=True`." 
+ ) + arr = chunked_array.combine_chunks() + return pd.Series(arr, dtype=series.dtype, name=series.name, index=series.index) diff -Nru pandas-2.1.4+dfsg/pandas/core/internals/__init__.py pandas-2.2.2+dfsg/pandas/core/internals/__init__.py --- pandas-2.1.4+dfsg/pandas/core/internals/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/internals/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,4 @@ -from pandas.core.internals.api import make_block +from pandas.core.internals.api import make_block # 2023-09-18 pyarrow uses this from pandas.core.internals.array_manager import ( ArrayManager, SingleArrayManager, @@ -7,22 +7,16 @@ DataManager, SingleDataManager, ) -from pandas.core.internals.blocks import ( # io.pytables, io.packers - Block, - DatetimeTZBlock, - ExtensionBlock, -) from pandas.core.internals.concat import concatenate_managers from pandas.core.internals.managers import ( BlockManager, SingleBlockManager, - create_block_manager_from_blocks, ) __all__ = [ - "Block", - "DatetimeTZBlock", - "ExtensionBlock", + "Block", # pylint: disable=undefined-all-variable + "DatetimeTZBlock", # pylint: disable=undefined-all-variable + "ExtensionBlock", # pylint: disable=undefined-all-variable "make_block", "DataManager", "ArrayManager", @@ -31,27 +25,58 @@ "SingleBlockManager", "SingleArrayManager", "concatenate_managers", - # this is preserved here for downstream compatibility (GH-33892) - "create_block_manager_from_blocks", ] def __getattr__(name: str): + # GH#55139 import warnings - from pandas.util._exceptions import find_stack_level + if name == "create_block_manager_from_blocks": + # GH#33892 + warnings.warn( + f"{name} is deprecated and will be removed in a future version. " + "Use public APIs instead.", + DeprecationWarning, + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, + ) + from pandas.core.internals.managers import create_block_manager_from_blocks + + return create_block_manager_from_blocks - if name in ["NumericBlock", "ObjectBlock"]: + if name in [ + "NumericBlock", + "ObjectBlock", + "Block", + "ExtensionBlock", + "DatetimeTZBlock", + ]: warnings.warn( f"{name} is deprecated and will be removed in a future version. 
" "Use public APIs instead.", DeprecationWarning, - stacklevel=find_stack_level(), + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, ) if name == "NumericBlock": from pandas.core.internals.blocks import NumericBlock return NumericBlock + elif name == "DatetimeTZBlock": + from pandas.core.internals.blocks import DatetimeTZBlock + + return DatetimeTZBlock + elif name == "ExtensionBlock": + from pandas.core.internals.blocks import ExtensionBlock + + return ExtensionBlock + elif name == "Block": + from pandas.core.internals.blocks import Block + + return Block else: from pandas.core.internals.blocks import ObjectBlock diff -Nru pandas-2.1.4+dfsg/pandas/core/internals/api.py pandas-2.2.2+dfsg/pandas/core/internals/api.py --- pandas-2.1.4+dfsg/pandas/core/internals/api.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/internals/api.py 2024-04-10 17:42:52.000000000 +0000 @@ -23,9 +23,6 @@ from pandas.core.arrays import DatetimeArray from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( - Block, - DatetimeTZBlock, - ExtensionBlock, check_ndim, ensure_block_shape, extract_pandas_array, @@ -36,6 +33,8 @@ if TYPE_CHECKING: from pandas._typing import Dtype + from pandas.core.internals.blocks import Block + def make_block( values, placement, klass=None, ndim=None, dtype: Dtype | None = None @@ -56,6 +55,11 @@ values, dtype = extract_pandas_array(values, dtype, ndim) + from pandas.core.internals.blocks import ( + DatetimeTZBlock, + ExtensionBlock, + ) + if klass is ExtensionBlock and isinstance(values.dtype, PeriodDtype): # GH-44681 changed PeriodArray to be stored in the 2D # NDArrayBackedExtensionBlock instead of ExtensionBlock @@ -105,3 +109,48 @@ else: ndim = values.ndim return ndim + + +def __getattr__(name: str): + # GH#55139 + import warnings + + if name in [ + "Block", + "ExtensionBlock", + "DatetimeTZBlock", + "create_block_manager_from_blocks", + ]: + # GH#33892 + warnings.warn( + f"{name} is deprecated and will be removed in a future version. 
" + "Use public APIs instead.", + DeprecationWarning, + # https://github.com/pandas-dev/pandas/pull/55139#pullrequestreview-1720690758 + # on hard-coding stacklevel + stacklevel=2, + ) + + if name == "create_block_manager_from_blocks": + from pandas.core.internals.managers import create_block_manager_from_blocks + + return create_block_manager_from_blocks + + elif name == "Block": + from pandas.core.internals.blocks import Block + + return Block + + elif name == "DatetimeTZBlock": + from pandas.core.internals.blocks import DatetimeTZBlock + + return DatetimeTZBlock + + elif name == "ExtensionBlock": + from pandas.core.internals.blocks import ExtensionBlock + + return ExtensionBlock + + raise AttributeError( + f"module 'pandas.core.internals.api' has no attribute '{name}'" + ) diff -Nru pandas-2.1.4+dfsg/pandas/core/internals/array_manager.py pandas-2.2.2+dfsg/pandas/core/internals/array_manager.py --- pandas-2.1.4+dfsg/pandas/core/internals/array_manager.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/internals/array_manager.py 2024-04-10 17:42:52.000000000 +0000 @@ -68,6 +68,7 @@ Index, ensure_index, ) +from pandas.core.indexes.base import get_values_for_csv from pandas.core.internals.base import ( DataManager, SingleDataManager, @@ -81,7 +82,6 @@ extract_pandas_array, maybe_coerce_values, new_block, - to_native_types, ) from pandas.core.internals.managers import make_na_array @@ -309,7 +309,7 @@ return type(self)(result_arrays, self._axes) - def setitem(self, indexer, value) -> Self: + def setitem(self, indexer, value, warn: bool = True) -> Self: return self.apply_with_block("setitem", indexer=indexer, value=value) def diff(self, n: int) -> Self: @@ -343,8 +343,17 @@ return self.apply(_convert) - def to_native_types(self, **kwargs) -> Self: - return self.apply(to_native_types, **kwargs) + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Self: + return self.apply( + get_values_for_csv, + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, + ) @property def any_extension_types(self) -> bool: @@ -1103,7 +1112,7 @@ def _normalize_axis(axis): return axis - def make_empty(self, axes=None) -> SingleArrayManager: + def make_empty(self, axes=None) -> Self: """Return an empty ArrayManager with index/array of length 0""" if axes is None: axes = [Index([], dtype=object)] @@ -1178,7 +1187,7 @@ new_array = getattr(self.array, func)(**kwargs) return type(self)([new_array], self._axes) - def setitem(self, indexer, value) -> SingleArrayManager: + def setitem(self, indexer, value, warn: bool = True) -> SingleArrayManager: """ Set values with indexer. diff -Nru pandas-2.1.4+dfsg/pandas/core/internals/base.py pandas-2.2.2+dfsg/pandas/core/internals/base.py --- pandas-2.1.4+dfsg/pandas/core/internals/base.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/internals/base.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,7 +14,10 @@ import numpy as np -from pandas._config import using_copy_on_write +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) from pandas._libs import ( algos as libalgos, @@ -49,6 +52,16 @@ ) +class _AlreadyWarned: + def __init__(self): + # This class is used on the manager level to the block level to + # ensure that we warn only once. The block method can update the + # warned_already option without returning a value to keep the + # interface consistent. 
This is only a temporary solution for + # CoW warnings. + self.warned_already = False + + class DataManager(PandasObject): # TODO share more methods/attributes @@ -133,7 +146,7 @@ """ Implementation for DataFrame.equals """ - if not isinstance(other, DataManager): + if not isinstance(other, type(self)): return False self_axes, other_axes = self.axes, other.axes @@ -177,6 +190,7 @@ inplace=inplace, downcast=downcast, using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), ) @final @@ -196,19 +210,26 @@ ) @final - def putmask(self, mask, new, align: bool = True) -> Self: + def putmask(self, mask, new, align: bool = True, warn: bool = True) -> Self: if align: align_keys = ["new", "mask"] else: align_keys = ["mask"] new = extract_array(new, extract_numpy=True) + already_warned = None + if warn_copy_on_write(): + already_warned = _AlreadyWarned() + if not warn: + already_warned.warned_already = True + return self.apply_with_block( "putmask", align_keys=align_keys, mask=mask, new=new, using_cow=using_copy_on_write(), + already_warned=already_warned, ) @final @@ -231,12 +252,16 @@ value=value, inplace=inplace, using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), ) @final def replace_regex(self, **kwargs) -> Self: return self.apply_with_block( - "_replace_regex", **kwargs, using_cow=using_copy_on_write() + "_replace_regex", + **kwargs, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), ) @final @@ -257,13 +282,18 @@ inplace=inplace, regex=regex, using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), ) bm._consolidate_inplace() return bm def interpolate(self, inplace: bool, **kwargs) -> Self: return self.apply_with_block( - "interpolate", inplace=inplace, **kwargs, using_cow=using_copy_on_write() + "interpolate", + inplace=inplace, + **kwargs, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), ) def pad_or_backfill(self, inplace: bool, **kwargs) -> Self: @@ -272,6 +302,7 @@ inplace=inplace, **kwargs, using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), ) def shift(self, periods: int, fill_value) -> Self: @@ -307,7 +338,7 @@ # error: "SingleDataManager" has no attribute "arrays"; maybe "array" return self.arrays[0] # type: ignore[attr-defined] - def setitem_inplace(self, indexer, value) -> None: + def setitem_inplace(self, indexer, value, warn: bool = True) -> None: """ Set values with indexer. 
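# A small sketch of the lazy deprecation shims added to
# pandas/core/internals/__init__.py and pandas/core/internals/api.py above:
# the old block classes still resolve, but only through module __getattr__,
# which now emits a DeprecationWarning pointing at public APIs.
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    from pandas.core.internals import Block  # no longer a module-level import

assert any(issubclass(w.category, DeprecationWarning) for w in caught)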
diff -Nru pandas-2.1.4+dfsg/pandas/core/internals/blocks.py pandas-2.2.2+dfsg/pandas/core/internals/blocks.py --- pandas-2.1.4+dfsg/pandas/core/internals/blocks.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/internals/blocks.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,7 @@ from __future__ import annotations from functools import wraps +import inspect import re from typing import ( TYPE_CHECKING, @@ -15,23 +16,26 @@ import numpy as np -from pandas._config import using_copy_on_write +from pandas._config import ( + get_option, + using_copy_on_write, + warn_copy_on_write, +) from pandas._libs import ( NaT, internals as libinternals, lib, - writers, ) from pandas._libs.internals import ( BlockPlacement, BlockValuesRefs, ) from pandas._libs.missing import NA -from pandas._libs.tslibs import IncompatibleFrequency from pandas._typing import ( ArrayLike, AxisInt, + DtypeBackend, DtypeObj, F, FillnaOptions, @@ -54,12 +58,12 @@ from pandas.core.dtypes.cast import ( LossySetitemError, can_hold_element, + convert_dtypes, find_result_type, maybe_downcast_to_dtype, np_can_hold_element, ) from pandas.core.dtypes.common import ( - ensure_platform_int, is_1d_only_ea_dtype, is_float_dtype, is_integer_dtype, @@ -73,7 +77,6 @@ IntervalDtype, NumpyEADtype, PeriodDtype, - SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -120,6 +123,7 @@ extract_array, ) from pandas.core.indexers import check_setitem_lengths +from pandas.core.indexes.base import get_values_for_csv if TYPE_CHECKING: from collections.abc import ( @@ -134,6 +138,29 @@ _dtype_obj = np.dtype("object") +COW_WARNING_GENERAL_MSG = """\ +Setting a value on a view: behaviour will change in pandas 3.0. +You are mutating a Series or DataFrame object, and currently this mutation will +also have effect on other Series or DataFrame objects that share data with this +object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object +will never modify another. +""" + + +COW_WARNING_SETITEM_MSG = """\ +Setting a value on a view: behaviour will change in pandas 3.0. +Currently, the mutation will also have effect on the object that shares data +with this object. For example, when setting a value in a Series that was +extracted from a column of a DataFrame, that DataFrame will also be updated: + + ser = df["col"] + ser[0] = 0 <--- in pandas 2, this also updates `df` + +In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never +modify another, and thus in the example above, `df` will not be changed. +""" + + def maybe_split(meth: F) -> F: """ If we have a multi-column block, split and operate block-wise. Otherwise @@ -151,7 +178,7 @@ return cast(F, newfunc) -class Block(PandasObject): +class Block(PandasObject, libinternals.Block): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas data structure @@ -458,6 +485,12 @@ and will receive the same block """ new_dtype = find_result_type(self.values.dtype, other) + if new_dtype == self.dtype: + # GH#52927 avoid RecursionError + raise AssertionError( + "Something has gone wrong, please report a bug at " + "https://github.com/pandas-dev/pandas/issues" + ) # In a future version of pandas, the default will be that # setting `nan` into an integer series won't raise. 
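# A rough sketch of the situation COW_WARNING_SETITEM_MSG above describes,
# assuming the pandas 2.2 "warn" copy-on-write mode; the option name used here
# is an assumption, it is not part of this hunk.
import pandas as pd

pd.set_option("mode.copy_on_write", "warn")

df = pd.DataFrame({"col": [1, 2, 3]})
ser = df["col"]   # shares data with df
ser[0] = 0        # FutureWarning: in pandas 2.x this also updates df,
                  # under copy-on-write (pandas 3.0) it no longer will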
@@ -466,6 +499,9 @@ and is_integer_dtype(self.values.dtype) and isna(other) and other is not NaT + and not ( + isinstance(other, (np.datetime64, np.timedelta64)) and np.isnat(other) + ) ): warn_on_upcast = False elif ( @@ -480,7 +516,7 @@ if warn_on_upcast: warnings.warn( f"Setting an item of incompatible dtype is deprecated " - "and will raise in a future error of pandas. " + "and will raise an error in a future version of pandas. " f"Value '{other}' has dtype incompatible with {self.values.dtype}, " "please explicitly cast to a compatible dtype first.", FutureWarning, @@ -496,7 +532,11 @@ @final def _maybe_downcast( - self, blocks: list[Block], downcast=None, using_cow: bool = False + self, + blocks: list[Block], + downcast, + using_cow: bool, + caller: str, ) -> list[Block]: if downcast is False: return blocks @@ -508,14 +548,63 @@ # but ATM it breaks too much existing code. # split and convert the blocks - return extend_blocks( + if caller == "fillna" and get_option("future.no_silent_downcasting"): + return blocks + + nbs = extend_blocks( [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] ) + if caller == "fillna": + if len(nbs) != len(blocks) or not all( + x.dtype == y.dtype for x, y in zip(nbs, blocks) + ): + # GH#54261 + warnings.warn( + "Downcasting object dtype arrays on .fillna, .ffill, .bfill " + "is deprecated and will change in a future version. " + "Call result.infer_objects(copy=False) instead. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return nbs - if downcast is None: + elif downcast is None: + return blocks + elif caller == "where" and get_option("future.no_silent_downcasting") is True: return blocks + else: + nbs = extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks]) + + # When _maybe_downcast is called with caller="where", it is either + # a) with downcast=False, which is a no-op (the desired future behavior) + # b) with downcast="infer", which is _not_ passed by the user. + # In the latter case the future behavior is to stop doing inference, + # so we issue a warning if and only if some inference occurred. + if caller == "where": + # GH#53656 + if len(blocks) != len(nbs) or any( + left.dtype != right.dtype for left, right in zip(blocks, nbs) + ): + # In this case _maybe_downcast was _not_ a no-op, so the behavior + # will change, so we issue a warning. + warnings.warn( + "Downcasting behavior in Series and DataFrame methods 'where', " + "'mask', and 'clip' is deprecated. In a future " + "version this will not infer object dtypes or cast all-round " + "floats to integers. Instead call " + "result.infer_objects(copy=False) for object inference, " + "or cast round floats explicitly. 
To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) - return extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks]) + return nbs @final @maybe_split @@ -577,6 +666,52 @@ res_values = maybe_coerce_values(res_values) return [self.make_block(res_values, refs=refs)] + def convert_dtypes( + self, + copy: bool, + using_cow: bool, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + convert_floating: bool = True, + dtype_backend: DtypeBackend = "numpy_nullable", + ) -> list[Block]: + if infer_objects and self.is_object: + blks = self.convert(copy=False, using_cow=using_cow) + else: + blks = [self] + + if not any( + [convert_floating, convert_integer, convert_boolean, convert_string] + ): + return [b.copy(deep=copy) for b in blks] + + rbs = [] + for blk in blks: + # Determine dtype column by column + sub_blks = [blk] if blk.ndim == 1 or self.shape[0] == 1 else blk._split() + dtypes = [ + convert_dtypes( + b.values, + convert_string, + convert_integer, + convert_boolean, + convert_floating, + infer_objects, + dtype_backend, + ) + for b in sub_blks + ] + if all(dtype == self.dtype for dtype in dtypes): + # Avoid block splitting if no dtype changes + rbs.append(blk.copy(deep=copy)) + continue + + for dtype, b in zip(dtypes, sub_blks): + rbs.append(b.astype(dtype=dtype, copy=copy, squeeze=b.ndim != 1)) + return rbs + # --------------------------------------------------------------------- # Array-Like Methods @@ -592,6 +727,7 @@ copy: bool = False, errors: IgnoreRaise = "raise", using_cow: bool = False, + squeeze: bool = False, ) -> Block: """ Coerce to the new dtype. @@ -606,12 +742,18 @@ - ``ignore`` : suppress exceptions. On error return original object using_cow: bool, default False Signaling if copy on write copy logic is used. + squeeze : bool, default False + squeeze values to ndim=1 if only one column is given Returns ------- Block """ values = self.values + if squeeze and values.ndim == 2 and is_1d_only_ea_dtype(dtype): + if values.shape[0] != 1: + raise ValueError("Can not squeeze with more than one column.") + values = values[0, :] # type: ignore[call-overload] new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) @@ -631,9 +773,18 @@ return newb @final - def to_native_types(self, na_rep: str = "nan", quoting=None, **kwargs) -> Block: + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Block: """convert to our native types format""" - result = to_native_types(self.values, na_rep=na_rep, quoting=quoting, **kwargs) + result = get_values_for_csv( + self.values, + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, + ) return self.make_block(result) @final @@ -683,6 +834,7 @@ # mask may be pre-computed if we're called from replace_list mask: npt.NDArray[np.bool_] | None = None, using_cow: bool = False, + already_warned=None, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -727,10 +879,40 @@ # and rest? 
blk = self._maybe_copy(using_cow, inplace) putmask_inplace(blk.values, mask, value) + if ( + inplace + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + if not (self.is_object and value is None): # if the user *explicitly* gave None, we keep None, otherwise # may downcast to NaN - blocks = blk.convert(copy=False, using_cow=using_cow) + if get_option("future.no_silent_downcasting") is True: + blocks = [blk] + else: + blocks = blk.convert(copy=False, using_cow=using_cow) + if len(blocks) > 1 or blocks[0].dtype != blk.dtype: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated and " + "will be removed in a future version. To retain the old " + "behavior, explicitly call " + "`result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) else: blocks = [blk] return blocks @@ -771,6 +953,7 @@ inplace: bool = False, mask=None, using_cow: bool = False, + already_warned=None, ) -> list[Block]: """ Replace elements by the given value. @@ -805,7 +988,35 @@ replace_regex(block.values, rx, value, mask) - return block.convert(copy=False, using_cow=using_cow) + if ( + inplace + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + + nbs = block.convert(copy=False, using_cow=using_cow) + opt = get_option("future.no_silent_downcasting") + if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated and " + "will be removed in a future version. To retain the old " + "behavior, explicitly call `result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + return nbs @final def replace_list( @@ -815,6 +1026,7 @@ inplace: bool = False, regex: bool = False, using_cow: bool = False, + already_warned=None, ) -> list[Block]: """ See BlockManager.replace_list docstring. 
@@ -871,6 +1083,21 @@ else: rb = [self if inplace else self.copy()] + if ( + inplace + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + + opt = get_option("future.no_silent_downcasting") for i, ((src, dest), mask) in enumerate(zip(pairs, masks)): convert = i == src_len # only convert once at the end new_rb: list[Block] = [] @@ -908,14 +1135,33 @@ b.refs.referenced_blocks.index(ref) ) - if convert and blk.is_object and not all(x is None for x in dest_list): + if ( + not opt + and convert + and blk.is_object + and not all(x is None for x in dest_list) + ): # GH#44498 avoid unwanted cast-back - result = extend_blocks( - [ - b.convert(copy=True and not using_cow, using_cow=using_cow) - for b in result - ] - ) + nbs = [] + for res_blk in result: + converted = res_blk.convert( + copy=True and not using_cow, using_cow=using_cow + ) + if len(converted) > 1 or converted[0].dtype != res_blk.dtype: + warnings.warn( + # GH#54710 + "Downcasting behavior in `replace` is deprecated " + "and will be removed in a future version. To " + "retain the old behavior, explicitly call " + "`result.infer_objects(copy=False)`. " + "To opt-in to the future " + "behavior, set " + "`pd.set_option('future.no_silent_downcasting', True)`", + FutureWarning, + stacklevel=find_stack_level(), + ) + nbs.extend(converted) + result = nbs new_rb.extend(result) rb = new_rb return rb @@ -1179,10 +1425,19 @@ if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1: # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 casted = casted[0, ...] - values[indexer] = casted + try: + values[indexer] = casted + except (TypeError, ValueError) as err: + if is_list_like(casted): + raise ValueError( + "setting an array element with a sequence." + ) from err + raise return self - def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: + def putmask( + self, mask, new, using_cow: bool = False, already_warned=None + ) -> list[Block]: """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -1215,6 +1470,19 @@ return [self.copy(deep=False)] return [self] + if ( + warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + try: casted = np_can_hold_element(values.dtype, new) @@ -1297,7 +1565,7 @@ try: # try/except here is equivalent to a self._can_hold_element check, - # but this gets us back 'casted' which we will re-use below; + # but this gets us back 'casted' which we will reuse below; # without using 'casted', expressions.where may do unwanted upcasts. casted = np_can_hold_element(values.dtype, other) except (ValueError, TypeError, LossySetitemError): @@ -1309,7 +1577,7 @@ block = self.coerce_to_target_dtype(other) blocks = block.where(orig_other, cond, using_cow=using_cow) return self._maybe_downcast( - blocks, downcast=_downcast, using_cow=using_cow + blocks, downcast=_downcast, using_cow=using_cow, caller="where" ) else: @@ -1379,6 +1647,7 @@ inplace: bool = False, downcast=None, using_cow: bool = False, + already_warned=None, ) -> list[Block]: """ fillna on the block with the value. 
If we fail, then convert to @@ -1405,14 +1674,18 @@ else: # GH#45423 consistent downcasting on no-ops. nb = self.copy(deep=not using_cow) - nbs = nb._maybe_downcast([nb], downcast=downcast, using_cow=using_cow) + nbs = nb._maybe_downcast( + [nb], downcast=downcast, using_cow=using_cow, caller="fillna" + ) return nbs if limit is not None: mask[mask.cumsum(self.ndim - 1) > limit] = False if inplace: - nbs = self.putmask(mask.T, value, using_cow=using_cow) + nbs = self.putmask( + mask.T, value, using_cow=using_cow, already_warned=already_warned + ) else: # without _downcast, we would break # test_fillna_dtype_conversion_equiv_replace @@ -1423,7 +1696,9 @@ # different behavior in _maybe_downcast. return extend_blocks( [ - blk._maybe_downcast([blk], downcast=downcast, using_cow=using_cow) + blk._maybe_downcast( + [blk], downcast=downcast, using_cow=using_cow, caller="fillna" + ) for blk in nbs ] ) @@ -1438,6 +1713,7 @@ limit_area: Literal["inside", "outside"] | None = None, downcast: Literal["infer"] | None = None, using_cow: bool = False, + already_warned=None, ) -> list[Block]: if not self._can_hold_na: # If there are no NAs, then interpolate is a no-op @@ -1458,13 +1734,26 @@ limit_area=limit_area, copy=copy, ) + if ( + not copy + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True if axis == 1: new_values = new_values.T data = extract_array(new_values, extract_numpy=True) nb = self.make_block_same_class(data, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow) + return nb._maybe_downcast([nb], downcast, using_cow, caller="fillna") @final def interpolate( @@ -1478,6 +1767,7 @@ limit_area: Literal["inside", "outside"] | None = None, downcast: Literal["infer"] | None = None, using_cow: bool = False, + already_warned=None, **kwargs, ) -> list[Block]: inplace = validate_bool_kwarg(inplace, "inplace") @@ -1516,8 +1806,22 @@ ) data = extract_array(new_values, extract_numpy=True) + if ( + not copy + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + nb = self.make_block_same_class(data, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow) + return nb._maybe_downcast([nb], downcast, using_cow, caller="interpolate") @final def diff(self, n: int) -> list[Block]: @@ -1615,13 +1919,14 @@ # has no attribute "round" values = self.values.round(decimals) # type: ignore[union-attr] if values is self.values: - refs = self.refs if not using_cow: # Normally would need to do this before, but # numpy only returns same array when round operation # is no-op # https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636 values = values.copy() + else: + refs = self.refs return self.make_block_same_class(values, refs=refs) # --------------------------------------------------------------------- @@ -1664,7 +1969,7 @@ else: # No overload variant of "__getitem__" of "ExtensionArray" matches # argument type "Tuple[slice, slice]" - values = self.values[previous_loc + 1 : idx, :] # type: ignore[call-overload] # noqa: E501 + values = self.values[previous_loc + 1 : idx, :] # type: 
ignore[call-overload] locs = mgr_locs_arr[previous_loc + 1 : idx] nb = type(self)( values, placement=BlockPlacement(locs), ndim=self.ndim, refs=refs @@ -1756,9 +2061,7 @@ try: values[indexer] = value - except (ValueError, TypeError) as err: - _catch_deprecated_value_error(err) - + except (ValueError, TypeError): if isinstance(self.dtype, IntervalDtype): # see TestSetitemFloatIntervalWithIntIntervalValues nb = self.coerce_to_target_dtype(orig_value, warn_on_upcast=True) @@ -1801,16 +2104,14 @@ try: res_values = arr._where(cond, other).T - except (ValueError, TypeError) as err: - _catch_deprecated_value_error(err) - + except (ValueError, TypeError): if self.ndim == 1 or self.shape[0] == 1: if isinstance(self.dtype, IntervalDtype): # TestSetitemFloatIntervalWithIntIntervalValues blk = self.coerce_to_target_dtype(orig_other) nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) return self._maybe_downcast( - nbs, downcast=_downcast, using_cow=using_cow + nbs, downcast=_downcast, using_cow=using_cow, caller="where" ) elif isinstance(self, NDArrayBackedExtensionBlock): @@ -1819,7 +2120,7 @@ blk = self.coerce_to_target_dtype(orig_other) nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) return self._maybe_downcast( - nbs, downcast=_downcast, using_cow=using_cow + nbs, downcast=_downcast, using_cow=using_cow, caller="where" ) else: @@ -1846,7 +2147,9 @@ return [nb] @final - def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: + def putmask( + self, mask, new, using_cow: bool = False, already_warned=None + ) -> list[Block]: """ See Block.putmask.__doc__ """ @@ -1864,6 +2167,19 @@ return [self.copy(deep=False)] return [self] + if ( + warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + self = self._maybe_copy(using_cow, inplace=True) values = self.values if values.ndim == 2: @@ -1872,9 +2188,7 @@ try: # Caller is responsible for ensuring matching lengths values._putmask(mask, new) - except (TypeError, ValueError) as err: - _catch_deprecated_value_error(err) - + except (TypeError, ValueError): if self.ndim == 1 or self.shape[0] == 1: if isinstance(self.dtype, IntervalDtype): # Discussion about what we want to support in the general @@ -1949,19 +2263,29 @@ limit_area: Literal["inside", "outside"] | None = None, downcast: Literal["infer"] | None = None, using_cow: bool = False, + already_warned=None, ) -> list[Block]: values = self.values - copy, refs = self._get_refs_and_copy(using_cow, inplace) + + kwargs: dict[str, Any] = {"method": method, "limit": limit} + if "limit_area" in inspect.signature(values._pad_or_backfill).parameters: + kwargs["limit_area"] = limit_area + elif limit_area is not None: + raise NotImplementedError( + f"{type(values).__name__} does not implement limit_area " + "(added in pandas 2.2). 3rd-party ExtnsionArray authors " + "need to add this argument to _pad_or_backfill." 
+ ) if values.ndim == 2 and axis == 1: # NDArrayBackedExtensionArray.fillna assumes axis=0 - new_values = values.T._pad_or_backfill(method=method, limit=limit).T + new_values = values.T._pad_or_backfill(**kwargs).T else: - new_values = values._pad_or_backfill(method=method, limit=limit) + new_values = values._pad_or_backfill(**kwargs) return [self.make_block_same_class(new_values)] -class ExtensionBlock(libinternals.Block, EABackedBlock): +class ExtensionBlock(EABackedBlock): """ Block for holding extension types. @@ -1982,6 +2306,7 @@ inplace: bool = False, downcast=None, using_cow: bool = False, + already_warned=None, ) -> list[Block]: if isinstance(self.dtype, IntervalDtype): # Block.fillna handles coercion (test_fillna_interval) @@ -1991,6 +2316,7 @@ inplace=inplace, downcast=downcast, using_cow=using_cow, + already_warned=already_warned, ) if using_cow and self._can_hold_na and not self.values._hasna: refs = self.refs @@ -2018,9 +2344,23 @@ DeprecationWarning, stacklevel=find_stack_level(), ) + else: + if ( + not copy + and warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True nb = self.make_block_same_class(new_values, refs=refs) - return nb._maybe_downcast([nb], downcast, using_cow=using_cow) + return nb._maybe_downcast([nb], downcast, using_cow=using_cow, caller="fillna") @cache_readonly def shape(self) -> Shape: @@ -2133,8 +2473,9 @@ """Extension arrays are never treated as views.""" return False + # error: Cannot override writeable attribute with read-only property @cache_readonly - def is_numeric(self): + def is_numeric(self) -> bool: # type: ignore[override] return self.values.dtype._is_numeric def _slice( @@ -2229,7 +2570,7 @@ return blocks, mask -class NumpyBlock(libinternals.NumpyBlock, Block): +class NumpyBlock(Block): values: np.ndarray __slots__ = () @@ -2267,7 +2608,7 @@ __slots__ = () -class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock): +class NDArrayBackedExtensionBlock(EABackedBlock): """ Block backed by an NDArrayBackedExtensionArray """ @@ -2281,19 +2622,6 @@ return self.values._ndarray.base is not None -def _catch_deprecated_value_error(err: Exception) -> None: - """ - We catch ValueError for now, but only a specific one raised by DatetimeArray - which will no longer be raised in version 2.0. 
- """ - if isinstance(err, ValueError): - if isinstance(err, IncompatibleFrequency): - pass - elif "'value.closed' is" in str(err): - # IntervalDtype mismatched 'closed' - pass - - class DatetimeLikeBlock(NDArrayBackedExtensionBlock): """Block for datetime64[ns], timedelta64[ns].""" @@ -2496,93 +2824,6 @@ return values -def to_native_types( - values: ArrayLike, - *, - na_rep: str = "nan", - quoting=None, - float_format=None, - decimal: str = ".", - **kwargs, -) -> npt.NDArray[np.object_]: - """convert to our native types format""" - if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm": - # GH#40754 Convert categorical datetimes to datetime array - values = algos.take_nd( - values.categories._values, - ensure_platform_int(values._codes), - fill_value=na_rep, - ) - - values = ensure_wrapped_if_datetimelike(values) - - if isinstance(values, (DatetimeArray, TimedeltaArray)): - if values.ndim == 1: - result = values._format_native_types(na_rep=na_rep, **kwargs) - result = result.astype(object, copy=False) - return result - - # GH#21734 Process every column separately, they might have different formats - results_converted = [] - for i in range(len(values)): - result = values[i, :]._format_native_types(na_rep=na_rep, **kwargs) - results_converted.append(result.astype(object, copy=False)) - return np.vstack(results_converted) - - elif values.dtype.kind == "f" and not isinstance(values.dtype, SparseDtype): - # see GH#13418: no special formatting is desired at the - # output (important for appropriate 'quoting' behaviour), - # so do not pass it through the FloatArrayFormatter - if float_format is None and decimal == ".": - mask = isna(values) - - if not quoting: - values = values.astype(str) - else: - values = np.array(values, dtype="object") - - values[mask] = na_rep - values = values.astype(object, copy=False) - return values - - from pandas.io.formats.format import FloatArrayFormatter - - formatter = FloatArrayFormatter( - values, - na_rep=na_rep, - float_format=float_format, - decimal=decimal, - quoting=quoting, - fixed_width=False, - ) - res = formatter.get_result_as_array() - res = res.astype(object, copy=False) - return res - - elif isinstance(values, ExtensionArray): - mask = isna(values) - - new_values = np.asarray(values.astype(object)) - new_values[mask] = na_rep - return new_values - - else: - mask = isna(values) - itemsize = writers.word_len(na_rep) - - if values.dtype != _dtype_obj and not quoting and itemsize: - values = values.astype(str) - if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize: - # enlarge for the na_rep - values = values.astype(f" ArrayLike: """ The array that Series.values returns (public attribute). diff -Nru pandas-2.1.4+dfsg/pandas/core/internals/construction.py pandas-2.2.2+dfsg/pandas/core/internals/construction.py --- pandas-2.1.4+dfsg/pandas/core/internals/construction.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/internals/construction.py 2024-04-10 17:42:52.000000000 +0000 @@ -193,7 +193,7 @@ return mgr -def mgr_to_mgr(mgr, typ: str, copy: bool = True): +def mgr_to_mgr(mgr, typ: str, copy: bool = True) -> Manager: """ Convert to specific type of Manager. Does not copy if the type is already correct. Does not guarantee a copy otherwise. `copy` keyword only controls @@ -550,7 +550,7 @@ if len(values) == 0: # TODO: check for length-zero range, in which case return int64 dtype? - # TODO: re-use anything in try_cast? + # TODO: reuse anything in try_cast? 
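The limit_area argument threaded into _pad_or_backfill above (and implemented in the missing.py hunks later in this patch) surfaces in Series/DataFrame ffill and bfill in 2.2; a minimal sketch with illustrative values:

    import numpy as np
    import pandas as pd

    ser = pd.Series([np.nan, 1.0, np.nan, np.nan, 3.0, np.nan])

    # Fill only gaps that are surrounded by valid values.
    ser.ffill(limit_area="inside")   # [NaN, 1.0, 1.0, 1.0, 3.0, NaN]

    # Fill only outside the outermost valid values.
    ser.ffill(limit_area="outside")  # [NaN, 1.0, NaN, NaN, 3.0, 3.0]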
return np.empty((0, 0), dtype=object) elif isinstance(values, range): arr = range_to_ndarray(values) @@ -919,7 +919,7 @@ # assure that they are of the base dict class and not of derived # classes - data = [d if type(d) is dict else dict(d) for d in data] + data = [d if type(d) is dict else dict(d) for d in data] # noqa: E721 content = lib.dicts_to_array(data, list(columns)) return content, columns @@ -1044,7 +1044,9 @@ # i.e. maybe_convert_objects didn't convert arr = maybe_infer_to_datetimelike(arr) if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): - arr = StringDtype().construct_array_type()._from_sequence(arr) + new_dtype = StringDtype() + arr_cls = new_dtype.construct_array_type() + arr = arr_cls._from_sequence(arr, dtype=new_dtype) elif dtype_backend != "numpy" and isinstance(arr, np.ndarray): if arr.dtype.kind in "iufb": arr = pd_array(arr, copy=False) diff -Nru pandas-2.1.4+dfsg/pandas/core/internals/managers.py pandas-2.2.2+dfsg/pandas/core/internals/managers.py --- pandas-2.1.4+dfsg/pandas/core/internals/managers.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/internals/managers.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,11 +12,13 @@ cast, ) import warnings -import weakref import numpy as np -from pandas._config import using_copy_on_write +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) from pandas._libs import ( internals as libinternals, @@ -26,6 +28,7 @@ BlockPlacement, BlockValuesRefs, ) +from pandas._libs.tslibs import Timestamp from pandas.errors import PerformanceWarning from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level @@ -50,7 +53,11 @@ ) import pandas.core.algorithms as algos -from pandas.core.arrays import DatetimeArray +from pandas.core.arrays import ( + ArrowExtensionArray, + ArrowStringArray, + DatetimeArray, +) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.construction import ( ensure_wrapped_if_datetimelike, @@ -68,6 +75,8 @@ interleaved_dtype, ) from pandas.core.internals.blocks import ( + COW_WARNING_GENERAL_MSG, + COW_WARNING_SETITEM_MSG, Block, NumpyBlock, ensure_block_shape, @@ -93,6 +102,8 @@ npt, ) + from pandas.api.extensions import ExtensionArray + class BaseBlockManager(DataManager): """ @@ -263,17 +274,15 @@ return for i, blk in enumerate(self.blocks): blk.refs = mgr.blocks[i].refs - # Argument 1 to "add_reference" of "BlockValuesRefs" has incompatible type - # "Block"; expected "SharedBlock" - blk.refs.add_reference(blk) # type: ignore[arg-type] + blk.refs.add_reference(blk) def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: """ Checks if two blocks from two different block managers reference the same underlying values. """ - ref = weakref.ref(self.blocks[blkno]) - return ref in mgr.blocks[blkno].refs.referenced_blocks + blk = self.blocks[blkno] + return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) def get_dtypes(self) -> npt.NDArray[np.object_]: dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) @@ -360,7 +369,7 @@ # Alias so we can share code with ArrayManager apply_with_block = apply - def setitem(self, indexer, value) -> Self: + def setitem(self, indexer, value, warn: bool = True) -> Self: """ Set values with indexer. 
@@ -369,7 +378,14 @@ if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: raise ValueError(f"Cannot set values with ndim > {self.ndim}") - if using_copy_on_write() and not self._has_no_reference(0): + if warn and warn_copy_on_write() and not self._has_no_reference(0): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + + elif using_copy_on_write() and not self._has_no_reference(0): # this method is only called if there is a single block -> hardcoded 0 # Split blocks to only copy the columns we want to modify if self.ndim == 2 and isinstance(indexer, tuple): @@ -430,12 +446,31 @@ return self.apply("convert", copy=copy, using_cow=using_copy_on_write()) - def to_native_types(self, **kwargs) -> Self: + def convert_dtypes(self, **kwargs): + if using_copy_on_write(): + copy = False + else: + copy = True + + return self.apply( + "convert_dtypes", copy=copy, using_cow=using_copy_on_write(), **kwargs + ) + + def get_values_for_csv( + self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None + ) -> Self: """ Convert values to native types (strings / python objects) that are used in formatting (repr / csv). """ - return self.apply("to_native_types", **kwargs) + return self.apply( + "get_values_for_csv", + na_rep=na_rep, + quoting=quoting, + float_format=float_format, + date_format=date_format, + decimal=decimal, + ) @property def any_extension_types(self) -> bool: @@ -459,17 +494,12 @@ def _get_data_subset(self, predicate: Callable) -> Self: blocks = [blk for blk in self.blocks if predicate(blk.values)] - return self._combine(blocks, copy=False) + return self._combine(blocks) - def get_bool_data(self, copy: bool = False) -> Self: + def get_bool_data(self) -> Self: """ Select blocks that are bool-dtype and columns from object-dtype blocks that are all-bool. - - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks """ new_blocks = [] @@ -482,26 +512,16 @@ nbs = blk._split() new_blocks.extend(nb for nb in nbs if nb.is_bool) - return self._combine(new_blocks, copy) + return self._combine(new_blocks) - def get_numeric_data(self, copy: bool = False) -> Self: - """ - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks - """ + def get_numeric_data(self) -> Self: numeric_blocks = [blk for blk in self.blocks if blk.is_numeric] if len(numeric_blocks) == len(self.blocks): # Avoid somewhat expensive _combine - if copy: - return self.copy(deep=True) return self - return self._combine(numeric_blocks, copy) + return self._combine(numeric_blocks) - def _combine( - self, blocks: list[Block], copy: bool = True, index: Index | None = None - ) -> Self: + def _combine(self, blocks: list[Block], index: Index | None = None) -> Self: """return a new manager with the blocks""" if len(blocks) == 0: if self.ndim == 2: @@ -518,11 +538,8 @@ inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) new_blocks: list[Block] = [] - # TODO(CoW) we could optimize here if we know that the passed blocks - # are fully "owned" (eg created from an operation, not coming from - # an existing manager) for b in blocks: - nb = b.copy(deep=copy) + nb = b.copy(deep=False) nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer]) new_blocks.append(nb) @@ -969,6 +986,10 @@ n = len(self) if isinstance(dtype, ExtensionDtype): + # TODO: use object dtype as workaround for non-performant + # EA.__setitem__ methods. 
(primarily ArrowExtensionArray.__setitem__ + # when iteratively setting individual values) + # https://github.com/pandas-dev/pandas/pull/54508#issuecomment-1675827918 result = np.empty(n, dtype=object) else: result = np.empty(n, dtype=dtype) @@ -1048,7 +1069,7 @@ value: ArrayLike, inplace: bool = False, refs: BlockValuesRefs | None = None, - ): + ) -> None: """ Set new item in-place. Does not consolidate. Adds new Block if not contained in the current set of items @@ -1156,8 +1177,6 @@ unfit_count = len(unfit_idxr) new_blocks: list[Block] = [] - # TODO(CoW) is this always correct to assume that the new_blocks - # are not referencing anything else? if value_is_extension_type: # This code (ab-)uses the fact that EA blocks contain only # one item. @@ -1285,7 +1304,17 @@ This is a method on the BlockManager level, to avoid creating an intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) """ - if using_copy_on_write() and not self._has_no_reference(loc): + needs_to_warn = False + if warn_copy_on_write() and not self._has_no_reference(loc): + if not isinstance( + self.blocks[self.blknos[loc]].values, + (ArrowExtensionArray, ArrowStringArray), + ): + # We might raise if we are in an expansion case, so defer + # warning till we actually updated + needs_to_warn = True + + elif using_copy_on_write() and not self._has_no_reference(loc): blkno = self.blknos[loc] # Split blocks to only copy the column we want to modify blk_loc = self.blklocs[loc] @@ -1308,6 +1337,13 @@ new_mgr = col_mgr.setitem((idx,), value) self.iset(loc, new_mgr._block.values, inplace=True) + if needs_to_warn: + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: """ Insert item at selected position. @@ -1319,8 +1355,14 @@ value : np.ndarray or ExtensionArray refs : The reference tracking object of the value to set. """ - # insert to the axis; this could possibly raise a TypeError - new_axis = self.items.insert(loc, item) + with warnings.catch_warnings(): + # TODO: re-issue this with setitem-specific message? + warnings.filterwarnings( + "ignore", + "The behavior of Index.insert with object-dtype is deprecated", + category=FutureWarning, + ) + new_axis = self.items.insert(loc, item) if value.ndim == 2: value = value.T @@ -1332,7 +1374,6 @@ value = ensure_block_shape(value, ndim=self.ndim) bp = BlockPlacement(slice(loc, loc + 1)) - # TODO(CoW) do we always "own" the passed `value`? 
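The warn_copy_on_write checks added throughout the manager code correspond to an opt-in warning mode ahead of Copy-on-Write becoming the default; a minimal sketch, assuming pandas 2.2:

    import pandas as pd

    # Warn (rather than change behavior) wherever an operation would behave
    # differently once Copy-on-Write is enabled.
    pd.set_option("mode.copy_on_write", "warn")

    df = pd.DataFrame({"a": [1, 2, 3]})
    col = df["a"]        # shares its values with df
    col.iloc[0] = 100    # FutureWarning: will no longer update df under CoW

    # Enable Copy-on-Write itself (the future default behavior).
    pd.set_option("mode.copy_on_write", True)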
block = new_block_2d(values=value, placement=bp, refs=refs) if not len(self.blocks): @@ -1573,14 +1614,10 @@ bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) return bm - def to_dict(self, copy: bool = True) -> dict[str, Self]: + def to_dict(self) -> dict[str, Self]: """ Return a dict of str(dtype) -> BlockManager - Parameters - ---------- - copy : bool, default True - Returns ------- values : a dict of dtype -> BlockManager @@ -1591,7 +1628,7 @@ bd.setdefault(str(b.dtype), []).append(b) # TODO(EA2D): the combine will be unnecessary with 2D EAs - return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()} + return {dtype: self._combine(blocks) for dtype, blocks in bd.items()} def as_array( self, @@ -1619,7 +1656,6 @@ """ passed_nan = lib.is_float(na_value) and isna(na_value) - # TODO(CoW) handle case where resulting array is a view if len(self.blocks) == 0: arr = np.empty(self.shape, dtype=float) return arr.transpose() @@ -1646,6 +1682,8 @@ na_value=na_value, copy=copy, ).reshape(blk.shape) + elif not copy: + arr = np.asarray(blk.values, dtype=dtype) else: arr = np.array(blk.values, dtype=dtype, copy=copy) @@ -1872,7 +1910,7 @@ # compatibility with 0.13.1. return axes_array, block_values, block_items, extra_state - def __setstate__(self, state): + def __setstate__(self, state) -> None: def unpickle_block(values, mgr_locs, ndim: int) -> Block: # TODO(EA2D): ndim would be unnecessary with 2D EAs # older pickles may store e.g. DatetimeIndex instead of DatetimeArray @@ -1920,9 +1958,15 @@ return type(self)(blk.copy(deep=False), self.index) array = blk.values[indexer] + if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "b": + # boolean indexing always gives a copy with numpy + refs = None + else: + # TODO(CoW) in theory only need to track reference if new_array is a view + refs = blk.refs + bp = BlockPlacement(slice(0, len(array))) - # TODO(CoW) in theory only need to track reference if new_array is a view - block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs) + block = type(blk)(array, placement=bp, ndim=1, refs=refs) new_idx = self.index[indexer] return type(self)(block, new_idx) @@ -1961,20 +2005,20 @@ """The array that Series._values returns""" return self._block.values - def array_values(self): + def array_values(self) -> ExtensionArray: """The array that Series.array returns""" return self._block.array_values - def get_numeric_data(self, copy: bool = False) -> Self: + def get_numeric_data(self) -> Self: if self._block.is_numeric: - return self.copy(deep=copy) + return self.copy(deep=False) return self.make_empty() @property def _can_hold_na(self) -> bool: return self._block._can_hold_na - def setitem_inplace(self, indexer, value) -> None: + def setitem_inplace(self, indexer, value, warn: bool = True) -> None: """ Set values with indexer. @@ -1984,9 +2028,18 @@ in place, not returning a new Manager (and Block), and thus never changing the dtype. """ - if using_copy_on_write() and not self._has_no_reference(0): - self.blocks = (self._block.copy(),) - self._cache.clear() + using_cow = using_copy_on_write() + warn_cow = warn_copy_on_write() + if (using_cow or warn_cow) and not self._has_no_reference(0): + if using_cow: + self.blocks = (self._block.copy(),) + self._cache.clear() + elif warn_cow and warn: + warnings.warn( + COW_WARNING_SETITEM_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) super().setitem_inplace(indexer, value) @@ -2014,11 +2067,11 @@ Set the values of the single block in place. 
Use at your own risk! This does not check if the passed values are - valid for the current Block/SingleBlockManager (length, dtype, etc). + valid for the current Block/SingleBlockManager (length, dtype, etc), + and this does not properly keep track of references. """ - # TODO(CoW) do we need to handle copy on write here? Currently this is - # only used for FrameColumnApply.series_generator (what if apply is - # mutating inplace?) + # NOTE(CoW) Currently this is only used for FrameColumnApply.series_generator + # which handles CoW by setting the refs manually if necessary self.blocks[0].values = values self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values))) @@ -2142,7 +2195,6 @@ # when consolidating, we can ignore refs (either stacking always copies, # or the EA is already copied in the calling dict_to_mgr) - # TODO(CoW) check if this is also valid for rec_array_to_mgr # group by dtype grouper = itertools.groupby(tuples, _grouping_func) @@ -2291,8 +2343,10 @@ def make_na_array(dtype: DtypeObj, shape: Shape, fill_value) -> ArrayLike: if isinstance(dtype, DatetimeTZDtype): # NB: exclude e.g. pyarrow[dt64tz] dtypes - i8values = np.full(shape, fill_value._value) - return DatetimeArray(i8values, dtype=dtype) + ts = Timestamp(fill_value).as_unit(dtype.unit) + i8values = np.full(shape, ts._value) + dt64values = i8values.view(f"M8[{dtype.unit}]") + return DatetimeArray._simple_new(dt64values, dtype=dtype) elif is_1d_only_ea_dtype(dtype): dtype = cast(ExtensionDtype, dtype) diff -Nru pandas-2.1.4+dfsg/pandas/core/methods/describe.py pandas-2.2.2+dfsg/pandas/core/methods/describe.py --- pandas-2.1.4+dfsg/pandas/core/methods/describe.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/methods/describe.py 2024-04-10 17:42:52.000000000 +0000 @@ -146,6 +146,8 @@ A black list of data types to omit from the result. """ + obj: DataFrame + def __init__( self, obj: DataFrame, @@ -196,7 +198,7 @@ include=self.include, exclude=self.exclude, ) - return data # pyright: ignore[reportGeneralTypeIssues] + return data def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]: @@ -301,7 +303,7 @@ names = ["count", "unique"] objcounts = data.value_counts() count_unique = len(objcounts[objcounts != 0]) - result = [data.count(), count_unique] + result: list[float | Timestamp] = [data.count(), count_unique] dtype = None if count_unique > 0: top, freq = objcounts.index[0], objcounts.iloc[0] diff -Nru pandas-2.1.4+dfsg/pandas/core/methods/selectn.py pandas-2.2.2+dfsg/pandas/core/methods/selectn.py --- pandas-2.1.4+dfsg/pandas/core/methods/selectn.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/methods/selectn.py 2024-04-10 17:42:52.000000000 +0000 @@ -139,7 +139,11 @@ # arr passed into kth_smallest must be contiguous. 
We copy # here because kth_smallest will modify its input - kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1) + # avoid OOB access with kth_smallest_c when n <= 0 + if len(arr) > 0: + kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1) + else: + kth_val = np.nan (ns,) = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind="mergesort")] diff -Nru pandas-2.1.4+dfsg/pandas/core/methods/to_dict.py pandas-2.2.2+dfsg/pandas/core/methods/to_dict.py --- pandas-2.1.4+dfsg/pandas/core/methods/to_dict.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/methods/to_dict.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,30 +3,87 @@ from typing import ( TYPE_CHECKING, Literal, + overload, ) import warnings import numpy as np +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import maybe_box_native -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + BaseMaskedDtype, + ExtensionDtype, +) from pandas.core import common as com if TYPE_CHECKING: + from pandas._typing import MutableMappingT + from pandas import DataFrame +@overload +def to_dict( + df: DataFrame, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., +) -> MutableMappingT: + ... + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["records"], + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., +) -> list[MutableMappingT]: + ... + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[dict] = ..., + index: bool = ..., +) -> dict: + ... + + +@overload +def to_dict( + df: DataFrame, + orient: Literal["records"], + *, + into: type[dict] = ..., + index: bool = ..., +) -> list[dict]: + ... + + +# error: Incompatible default for argument "into" (default has type "type[dict +# [Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") def to_dict( df: DataFrame, orient: Literal[ "dict", "list", "series", "split", "tight", "records", "index" ] = "dict", - into: type[dict] = dict, + *, + into: type[MutableMappingT] | MutableMappingT = dict, # type: ignore[assignment] index: bool = True, -) -> dict | list[dict]: +) -> MutableMappingT | list[MutableMappingT]: """ Convert the DataFrame to a dictionary. @@ -54,7 +111,7 @@ 'tight' as an allowed value for the ``orient`` argument into : class, default dict - The collections.abc.Mapping subclass used for all Mappings + The collections.abc.MutableMapping subclass used for all Mappings in the return value. Can be the actual class or an empty instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. @@ -69,8 +126,8 @@ Returns ------- dict, list or collections.abc.Mapping - Return a collections.abc.Mapping object representing the DataFrame. - The resulting transformation depends on the `orient` parameter. + Return a collections.abc.MutableMapping object representing the + DataFrame. The resulting transformation depends on the `orient` parameter. 
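The to_dict overloads above narrow the typing of orient and make into keyword-only in this helper (any MutableMapping subclass is accepted); a short usage sketch with illustrative data:

    from collections import OrderedDict
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [0.5, 0.75]})

    # Column -> {index -> value} mappings, materialized into OrderedDict.
    df.to_dict(orient="dict", into=OrderedDict)

    # One mapping per row.
    df.to_dict(orient="records")   # [{'a': 1, 'b': 0.5}, {'a': 2, 'b': 0.75}]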
""" if not df.columns.is_unique: warnings.warn( @@ -100,19 +157,23 @@ for i, col_dtype in enumerate(df.dtypes.values) if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype) ] + box_na_values = [ + lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA + for i, col_dtype in enumerate(df.dtypes.values) + ] are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes) if orient == "dict": - return into_c((k, v.to_dict(into)) for k, v in df.items()) + return into_c((k, v.to_dict(into=into)) for k, v in df.items()) elif orient == "list": - object_dtype_indices_as_set = set(box_native_indices) + object_dtype_indices_as_set: set[int] = set(box_native_indices) return into_c( ( k, - list(map(maybe_box_native, v.tolist())) + list(map(maybe_box_native, v.to_numpy(na_value=box_na_values[i]))) if i in object_dtype_indices_as_set - else v.tolist(), + else list(map(maybe_box_native, v.to_numpy())), ) for i, (k, v) in enumerate(df.items()) ) diff -Nru pandas-2.1.4+dfsg/pandas/core/missing.py pandas-2.2.2+dfsg/pandas/core/missing.py --- pandas-2.1.4+dfsg/pandas/core/missing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/missing.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,15 +3,13 @@ """ from __future__ import annotations -from functools import ( - partial, - wraps, -) +from functools import wraps from typing import ( TYPE_CHECKING, Any, Literal, cast, + overload, ) import numpy as np @@ -33,6 +31,7 @@ from pandas.core.dtypes.cast import infer_dtype_from from pandas.core.dtypes.common import ( is_array_like, + is_bool_dtype, is_numeric_dtype, is_numeric_v_string_like, is_object_dtype, @@ -102,21 +101,34 @@ # GH 21977 mask = np.zeros(arr.shape, dtype=bool) - for x in nonna: - if is_numeric_v_string_like(arr, x): - # GH#29553 prevent numpy deprecation warnings - pass - else: - if potential_na: - new_mask = np.zeros(arr.shape, dtype=np.bool_) - new_mask[arr_mask] = arr[arr_mask] == x + if ( + is_numeric_dtype(arr.dtype) + and not is_bool_dtype(arr.dtype) + and is_bool_dtype(nonna.dtype) + ): + pass + elif ( + is_bool_dtype(arr.dtype) + and is_numeric_dtype(nonna.dtype) + and not is_bool_dtype(nonna.dtype) + ): + pass + else: + for x in nonna: + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + pass else: - new_mask = arr == x - - if not isinstance(new_mask, np.ndarray): - # usually BooleanArray - new_mask = new_mask.to_numpy(dtype=bool, na_value=False) - mask |= new_mask + if potential_na: + new_mask = np.zeros(arr.shape, dtype=np.bool_) + new_mask[arr_mask] = arr[arr_mask] == x + else: + new_mask = arr == x + + if not isinstance(new_mask, np.ndarray): + # usually BooleanArray + new_mask = new_mask.to_numpy(dtype=bool, na_value=False) + mask |= new_mask if na_mask.any(): mask |= isna(arr) @@ -124,9 +136,33 @@ return mask -def clean_fill_method(method: str, allow_nearest: bool = False): +@overload +def clean_fill_method( + method: Literal["ffill", "pad", "bfill", "backfill"], + *, + allow_nearest: Literal[False] = ..., +) -> Literal["pad", "backfill"]: + ... + + +@overload +def clean_fill_method( + method: Literal["ffill", "pad", "bfill", "backfill", "nearest"], + *, + allow_nearest: Literal[True], +) -> Literal["pad", "backfill", "nearest"]: + ... 
+ + +def clean_fill_method( + method: Literal["ffill", "pad", "bfill", "backfill", "nearest"], + *, + allow_nearest: bool = False, +) -> Literal["pad", "backfill", "nearest"]: if isinstance(method, str): - method = method.lower() + # error: Incompatible types in assignment (expression has type "str", variable + # has type "Literal['ffill', 'pad', 'bfill', 'backfill', 'nearest']") + method = method.lower() # type: ignore[assignment] if method == "ffill": method = "pad" elif method == "bfill": @@ -252,7 +288,9 @@ return limit_area # type: ignore[return-value] -def infer_limit_direction(limit_direction, method): +def infer_limit_direction( + limit_direction: Literal["backward", "forward", "both"] | None, method: str +) -> Literal["backward", "forward", "both"]: # Set `limit_direction` depending on `method` if limit_direction is None: if method in ("backfill", "bfill"): @@ -311,6 +349,7 @@ limit_direction: str = "forward", limit_area: str | None = None, fill_value: Any | None = None, + mask=None, **kwargs, ) -> None: """ @@ -358,6 +397,7 @@ limit_area=limit_area_validated, fill_value=fill_value, bounds_error=False, + mask=mask, **kwargs, ) @@ -402,6 +442,7 @@ fill_value: Any | None = None, bounds_error: bool = False, order: int | None = None, + mask=None, **kwargs, ) -> None: """ @@ -415,8 +456,10 @@ ----- Fills 'yvalues' in-place. """ - - invalid = isna(yvalues) + if mask is not None: + invalid = mask + else: + invalid = isna(yvalues) valid = ~invalid if not valid.any(): @@ -493,7 +536,10 @@ **kwargs, ) - if is_datetimelike: + if mask is not None: + mask[:] = False + mask[preserve_nans] = True + elif is_datetimelike: yvalues[preserve_nans] = NaT.value else: yvalues[preserve_nans] = np.nan @@ -796,6 +842,7 @@ values, method=method, limit=limit, + limit_area=limit_area, ) if limit_area == "inside": @@ -836,27 +883,6 @@ ----- Modifies values in-place. 
""" - if limit_area is not None: - np.apply_along_axis( - # error: Argument 1 to "apply_along_axis" has incompatible type - # "partial[None]"; expected - # "Callable[..., Union[_SupportsArray[dtype[]], - # Sequence[_SupportsArray[dtype[]]], - # Sequence[Sequence[_SupportsArray[dtype[]]]], - # Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]], - # Sequence[Sequence[Sequence[Sequence[_ - # SupportsArray[dtype[]]]]]]]]" - partial( # type: ignore[arg-type] - _interpolate_with_limit_area, - method=method, - limit=limit, - limit_area=limit_area, - ), - axis, - values, - ) - return - transf = (lambda x: x) if axis == 0 else (lambda x: x.T) # reshape a 1 dim if needed @@ -870,8 +896,7 @@ func = get_fill_func(method, ndim=2) # _pad_2d and _backfill_2d both modify tvalues inplace - func(tvalues, limit=limit) - return + func(tvalues, limit=limit, limit_area=limit_area) def _fillna_prep( @@ -882,7 +907,6 @@ if mask is None: mask = isna(values) - mask = mask.view(np.uint8) return mask @@ -892,16 +916,23 @@ """ @wraps(func) - def new_func(values, limit: int | None = None, mask=None): + def new_func( + values, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask=None, + ): if needs_i8_conversion(values.dtype): if mask is None: # This needs to occur before casting to int64 mask = isna(values) - result, mask = func(values.view("i8"), limit=limit, mask=mask) + result, mask = func( + values.view("i8"), limit=limit, limit_area=limit_area, mask=mask + ) return result.view(values.dtype), mask - return func(values, limit=limit, mask=mask) + return func(values, limit=limit, limit_area=limit_area, mask=mask) return cast(F, new_func) @@ -910,9 +941,12 @@ def _pad_1d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: mask = _fillna_prep(values, mask) + if limit_area is not None and not mask.all(): + _fill_limit_area_1d(mask, limit_area) algos.pad_inplace(values, mask, limit=limit) return values, mask @@ -921,9 +955,12 @@ def _backfill_1d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ) -> tuple[np.ndarray, npt.NDArray[np.bool_]]: mask = _fillna_prep(values, mask) + if limit_area is not None and not mask.all(): + _fill_limit_area_1d(mask, limit_area) algos.backfill_inplace(values, mask, limit=limit) return values, mask @@ -932,9 +969,12 @@ def _pad_2d( values: np.ndarray, limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, mask: npt.NDArray[np.bool_] | None = None, ): mask = _fillna_prep(values, mask) + if limit_area is not None: + _fill_limit_area_2d(mask, limit_area) if values.size: algos.pad_2d_inplace(values, mask, limit=limit) @@ -946,9 +986,14 @@ @_datetimelike_compat def _backfill_2d( - values, limit: int | None = None, mask: npt.NDArray[np.bool_] | None = None + values, + limit: int | None = None, + limit_area: Literal["inside", "outside"] | None = None, + mask: npt.NDArray[np.bool_] | None = None, ): mask = _fillna_prep(values, mask) + if limit_area is not None: + _fill_limit_area_2d(mask, limit_area) if values.size: algos.backfill_2d_inplace(values, mask, limit=limit) @@ -958,6 +1003,63 @@ return values, mask +def _fill_limit_area_1d( + mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"] +) -> None: + """Prepare 1d mask for ffill/bfill with limit_area. 
+ + Caller is responsible for checking at least one value of mask is False. + When called, mask will no longer faithfully represent when + the corresponding are NA or not. + + Parameters + ---------- + mask : np.ndarray[bool, ndim=1] + Mask representing NA values when filling. + limit_area : { "outside", "inside" } + Whether to limit filling to outside or inside the outer most non-NA value. + """ + neg_mask = ~mask + first = neg_mask.argmax() + last = len(neg_mask) - neg_mask[::-1].argmax() - 1 + if limit_area == "inside": + mask[:first] = False + mask[last + 1 :] = False + elif limit_area == "outside": + mask[first + 1 : last] = False + + +def _fill_limit_area_2d( + mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"] +) -> None: + """Prepare 2d mask for ffill/bfill with limit_area. + + When called, mask will no longer faithfully represent when + the corresponding are NA or not. + + Parameters + ---------- + mask : np.ndarray[bool, ndim=1] + Mask representing NA values when filling. + limit_area : { "outside", "inside" } + Whether to limit filling to outside or inside the outer most non-NA value. + """ + neg_mask = ~mask.T + if limit_area == "outside": + # Identify inside + la_mask = ( + np.maximum.accumulate(neg_mask, axis=0) + & np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1] + ) + else: + # Identify outside + la_mask = ( + ~np.maximum.accumulate(neg_mask, axis=0) + | ~np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1] + ) + mask[la_mask.T] = False + + _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d} diff -Nru pandas-2.1.4+dfsg/pandas/core/nanops.py pandas-2.2.2+dfsg/pandas/core/nanops.py --- pandas-2.1.4+dfsg/pandas/core/nanops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/nanops.py 2024-04-10 17:42:52.000000000 +0000 @@ -757,6 +757,10 @@ >>> nanops.nanmedian(s.values) 2.0 """ + # for floats without mask, the data already uses NaN as missing value + # indicator, and `mask` will be calculated from that below -> in those + # cases we never need to set NaN to the masked values + using_nan_sentinel = values.dtype.kind == "f" and mask is None def get_median(x, _mask=None): if _mask is None: @@ -774,7 +778,7 @@ return res dtype = values.dtype - values, mask = _get_values(values, skipna, mask=mask, fill_value=0) + values, mask = _get_values(values, skipna, mask=mask, fill_value=None) if values.dtype.kind != "f": if values.dtype == object: # GH#34671 avoid casting strings to numeric @@ -786,7 +790,9 @@ except ValueError as err: # e.g. 
"could not convert string to float: 'a'" raise TypeError(str(err)) from err - if mask is not None: + if not using_nan_sentinel and mask is not None: + if not values.flags.writeable: + values = values.copy() values[mask] = np.nan notempty = values.size @@ -1139,9 +1145,10 @@ array([2, 2, 1, 1]) """ values, mask = _get_values(values, True, fill_value_typ="-inf", mask=mask) - # error: Need type annotation for 'result' - result = values.argmax(axis) # type: ignore[var-annotated] - result = _maybe_arg_null_out(result, axis, mask, skipna) + result = values.argmax(axis) + # error: Argument 1 to "_maybe_arg_null_out" has incompatible type "Any | + # signedinteger[Any]"; expected "ndarray[Any, Any]" + result = _maybe_arg_null_out(result, axis, mask, skipna) # type: ignore[arg-type] return result @@ -1184,9 +1191,10 @@ array([0, 0, 1, 1]) """ values, mask = _get_values(values, True, fill_value_typ="+inf", mask=mask) - # error: Need type annotation for 'result' - result = values.argmin(axis) # type: ignore[var-annotated] - result = _maybe_arg_null_out(result, axis, mask, skipna) + result = values.argmin(axis) + # error: Argument 1 to "_maybe_arg_null_out" has incompatible type "Any | + # signedinteger[Any]"; expected "ndarray[Any, Any]" + result = _maybe_arg_null_out(result, axis, mask, skipna) # type: ignore[arg-type] return result diff -Nru pandas-2.1.4+dfsg/pandas/core/ops/array_ops.py pandas-2.2.2+dfsg/pandas/core/ops/array_ops.py --- pandas-2.1.4+dfsg/pandas/core/ops/array_ops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/ops/array_ops.py 2024-04-10 17:42:52.000000000 +0000 @@ -24,11 +24,9 @@ ) from pandas._libs.tslibs import ( BaseOffset, - get_supported_reso, - get_unit_from_dtype, - is_supported_unit, + get_supported_dtype, + is_supported_dtype, is_unitless, - npy_unit_to_abbrev, ) from pandas.util._exceptions import find_stack_level @@ -543,12 +541,11 @@ # GH 52295 if is_unitless(obj.dtype): obj = obj.astype("datetime64[ns]") - elif not is_supported_unit(get_unit_from_dtype(obj.dtype)): - unit = get_unit_from_dtype(obj.dtype) - closest_unit = npy_unit_to_abbrev(get_supported_reso(unit)) - obj = obj.astype(f"datetime64[{closest_unit}]") + elif not is_supported_dtype(obj.dtype): + new_dtype = get_supported_dtype(obj.dtype) + obj = obj.astype(new_dtype) right = np.broadcast_to(obj, shape) - return DatetimeArray(right) + return DatetimeArray._simple_new(right, dtype=right.dtype) return Timestamp(obj) @@ -562,18 +559,25 @@ # GH 52295 if is_unitless(obj.dtype): obj = obj.astype("timedelta64[ns]") - elif not is_supported_unit(get_unit_from_dtype(obj.dtype)): - unit = get_unit_from_dtype(obj.dtype) - closest_unit = npy_unit_to_abbrev(get_supported_reso(unit)) - obj = obj.astype(f"timedelta64[{closest_unit}]") + elif not is_supported_dtype(obj.dtype): + new_dtype = get_supported_dtype(obj.dtype) + obj = obj.astype(new_dtype) right = np.broadcast_to(obj, shape) - return TimedeltaArray(right) + return TimedeltaArray._simple_new(right, dtype=right.dtype) # In particular non-nanosecond timedelta64 needs to be cast to # nanoseconds, or else we get undesired behavior like # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') return Timedelta(obj) + # We want NumPy numeric scalars to behave like Python scalars + # post NEP 50 + elif isinstance(obj, np.integer): + return int(obj) + + elif isinstance(obj, np.floating): + return float(obj) + return obj diff -Nru pandas-2.1.4+dfsg/pandas/core/resample.py pandas-2.2.2+dfsg/pandas/core/resample.py --- 
pandas-2.1.4+dfsg/pandas/core/resample.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/resample.py 2024-04-10 17:42:52.000000000 +0000 @@ -24,6 +24,7 @@ Timestamp, to_offset, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas._typing import NDFrameT from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -32,8 +33,12 @@ Substitution, doc, ) -from pandas.util._exceptions import find_stack_level +from pandas.util._exceptions import ( + find_stack_level, + rewrite_warning, +) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -44,6 +49,7 @@ ResamplerWindowApply, warn_alias_replacement, ) +from pandas.core.arrays import ArrowExtensionArray from pandas.core.base import ( PandasObject, SelectionMixin, @@ -57,12 +63,14 @@ from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, + _apply_groupings_depr, _pipe_template, get_groupby, ) from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.api import MultiIndex +from pandas.core.indexes.base import Index from pandas.core.indexes.datetimes import ( DatetimeIndex, date_range, @@ -104,7 +112,6 @@ from pandas import ( DataFrame, - Index, Series, ) @@ -135,7 +142,7 @@ After resampling, see aggregate, apply, and transform functions. """ - grouper: BinGrouper + _grouper: BinGrouper _timegrouper: TimeGrouper binner: DatetimeIndex | TimedeltaIndex | PeriodIndex # depends on subclass exclusions: frozenset[Hashable] = frozenset() # for SelectionMixin compat @@ -163,6 +170,7 @@ gpr_index: Index, group_keys: bool = False, selection=None, + include_groups: bool = True, ) -> None: self._timegrouper = timegrouper self.keys = None @@ -171,17 +179,19 @@ self.kind = kind self.group_keys = group_keys self.as_index = True + self.include_groups = include_groups self.obj, self.ax, self._indexer = self._timegrouper._set_grouper( self._convert_obj(obj), sort=True, gpr_index=gpr_index ) - self.binner, self.grouper = self._get_binner() + self.binner, self._grouper = self._get_binner() self._selection = selection if self._timegrouper.key is not None: self.exclusions = frozenset([self._timegrouper.key]) else: self.exclusions = frozenset() + @final def __str__(self) -> str: """ Provide a nice str repr of our rolling object. @@ -193,6 +203,7 @@ ) return f"{type(self).__name__} [{', '.join(attrs)}]" + @final def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) @@ -203,6 +214,7 @@ return object.__getattribute__(self, attr) + @final @property def _from_selection(self) -> bool: """ @@ -242,6 +254,7 @@ bin_grouper = BinGrouper(bins, binlabels, indexer=self._indexer) return binner, bin_grouper + @final @Substitution( klass="Resampler", examples=""" @@ -296,7 +309,7 @@ 2013-01-01 00:00:02 3 2013-01-01 00:00:03 4 2013-01-01 00:00:04 5 - Freq: S, dtype: int64 + Freq: s, dtype: int64 >>> r = s.resample('2s') @@ -304,7 +317,7 @@ 2013-01-01 00:00:00 3 2013-01-01 00:00:02 7 2013-01-01 00:00:04 5 - Freq: 2S, dtype: int64 + Freq: 2s, dtype: int64 >>> r.agg(['sum', 'mean', 'max']) sum mean max @@ -327,6 +340,7 @@ """ ) + @final @doc( _shared_docs["aggregate"], see_also=_agg_see_also_doc, @@ -345,6 +359,7 @@ agg = aggregate apply = aggregate + @final def transform(self, arg, *args, **kwargs): """ Call function producing a like-indexed Series on each group. 
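The _apply_groupings_depr import above ties into the 2.2 deprecation of passing grouping columns to the function given to apply; a hedged sketch of the user-facing pattern (data is illustrative):

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})

    # Operating on the grouping column inside apply is deprecated; pass
    # include_groups=False so only the non-grouping columns reach the function.
    df.groupby("key").apply(lambda g: g["val"].sum(), include_groups=False)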
@@ -369,13 +384,13 @@ >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 - Freq: H, dtype: int64 + Freq: h, dtype: int64 >>> resampled = s.resample('15min') >>> resampled.transform(lambda x: (x - x.mean()) / x.std()) 2018-01-01 00:00:00 NaN 2018-01-01 01:00:00 NaN - Freq: H, dtype: float64 + Freq: h, dtype: float64 """ return self._selected_obj.groupby(self._timegrouper).transform( arg, *args, **kwargs @@ -399,7 +414,7 @@ subset : object, default None subset to act on """ - grouper = self.grouper + grouper = self._grouper if subset is None: subset = self.obj if key is not None: @@ -419,7 +434,7 @@ """ Re-evaluate the obj with a groupby aggregation. """ - grouper = self.grouper + grouper = self._grouper # Excludes `on` column when provided obj = self._obj_with_exclusions @@ -444,7 +459,9 @@ # a DataFrame column, but aggregate_item_by_item operates column-wise # on Series, raising AttributeError or KeyError # (depending on whether the column lookup uses getattr/__getitem__) - result = grouped.apply(how, *args, **kwargs) + result = _apply( + grouped, how, *args, include_groups=self.include_groups, **kwargs + ) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -456,15 +473,22 @@ # we have a non-reducing function # try to evaluate - result = grouped.apply(how, *args, **kwargs) + result = _apply( + grouped, how, *args, include_groups=self.include_groups, **kwargs + ) return self._wrap_result(result) - def _get_resampler_for_grouping(self, groupby: GroupBy, key): + @final + def _get_resampler_for_grouping( + self, groupby: GroupBy, key, include_groups: bool = True + ): """ Return the correct class for resampling with groupby. """ - return self._resampler_for_grouping(groupby=groupby, key=key, parent=self) + return self._resampler_for_grouping( + groupby=groupby, key=key, parent=self, include_groups=include_groups + ) def _wrap_result(self, result): """ @@ -489,8 +513,12 @@ result.index = _asfreq_compat(obj.index[:0], freq=self.freq) result.name = getattr(obj, "name", None) + if self._timegrouper._arrow_dtype is not None: + result.index = result.index.astype(self._timegrouper._arrow_dtype) + return result + @final def ffill(self, limit: int | None = None): """ Forward fill the values. @@ -559,6 +587,7 @@ """ return self._upsample("ffill", limit=limit) + @final def nearest(self, limit: int | None = None): """ Resample by using the nearest value. @@ -597,7 +626,7 @@ >>> s 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 - Freq: H, dtype: int64 + Freq: h, dtype: int64 >>> s.resample('15min').nearest() 2018-01-01 00:00:00 1 @@ -605,7 +634,7 @@ 2018-01-01 00:30:00 2 2018-01-01 00:45:00 2 2018-01-01 01:00:00 2 - Freq: 15T, dtype: int64 + Freq: 15min, dtype: int64 Limit the number of upsampled values imputed by the nearest: @@ -615,10 +644,11 @@ 2018-01-01 00:30:00 NaN 2018-01-01 00:45:00 2.0 2018-01-01 01:00:00 2.0 - Freq: 15T, dtype: float64 + Freq: 15min, dtype: float64 """ return self._upsample("nearest", limit=limit) + @final def bfill(self, limit: int | None = None): """ Backward fill the new missing values in the resampled data. 
@@ -666,7 +696,7 @@ 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 2018-01-01 02:00:00 3 - Freq: H, dtype: int64 + Freq: h, dtype: int64 >>> s.resample('30min').bfill() 2018-01-01 00:00:00 1 @@ -674,7 +704,7 @@ 2018-01-01 01:00:00 2 2018-01-01 01:30:00 3 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 + Freq: 30min, dtype: int64 >>> s.resample('15min').bfill(limit=2) 2018-01-01 00:00:00 1.0 @@ -686,7 +716,7 @@ 2018-01-01 01:30:00 3.0 2018-01-01 01:45:00 3.0 2018-01-01 02:00:00 3.0 - Freq: 15T, dtype: float64 + Freq: 15min, dtype: float64 Resampling a DataFrame that has missing values: @@ -721,6 +751,7 @@ """ return self._upsample("bfill", limit=limit) + @final def fillna(self, method, limit: int | None = None): """ Fill missing values introduced by upsampling. @@ -777,7 +808,7 @@ 2018-01-01 00:00:00 1 2018-01-01 01:00:00 2 2018-01-01 02:00:00 3 - Freq: H, dtype: int64 + Freq: h, dtype: int64 Without filling the missing values you get: @@ -787,7 +818,7 @@ 2018-01-01 01:00:00 2.0 2018-01-01 01:30:00 NaN 2018-01-01 02:00:00 3.0 - Freq: 30T, dtype: float64 + Freq: 30min, dtype: float64 >>> s.resample('30min').fillna("backfill") 2018-01-01 00:00:00 1 @@ -795,7 +826,7 @@ 2018-01-01 01:00:00 2 2018-01-01 01:30:00 3 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 + Freq: 30min, dtype: int64 >>> s.resample('15min').fillna("backfill", limit=2) 2018-01-01 00:00:00 1.0 @@ -807,7 +838,7 @@ 2018-01-01 01:30:00 3.0 2018-01-01 01:45:00 3.0 2018-01-01 02:00:00 3.0 - Freq: 15T, dtype: float64 + Freq: 15min, dtype: float64 >>> s.resample('30min').fillna("pad") 2018-01-01 00:00:00 1 @@ -815,7 +846,7 @@ 2018-01-01 01:00:00 2 2018-01-01 01:30:00 2 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 + Freq: 30min, dtype: int64 >>> s.resample('30min').fillna("nearest") 2018-01-01 00:00:00 1 @@ -823,17 +854,17 @@ 2018-01-01 01:00:00 2 2018-01-01 01:30:00 3 2018-01-01 02:00:00 3 - Freq: 30T, dtype: int64 + Freq: 30min, dtype: int64 Missing values present before the upsampling are not affected. >>> sm = pd.Series([1, None, 3], - ... index=pd.date_range('20180101', periods=3, freq='h')) + ... index=pd.date_range('20180101', periods=3, freq='h')) >>> sm 2018-01-01 00:00:00 1.0 2018-01-01 01:00:00 NaN 2018-01-01 02:00:00 3.0 - Freq: H, dtype: float64 + Freq: h, dtype: float64 >>> sm.resample('30min').fillna('backfill') 2018-01-01 00:00:00 1.0 @@ -841,7 +872,7 @@ 2018-01-01 01:00:00 NaN 2018-01-01 01:30:00 3.0 2018-01-01 02:00:00 3.0 - Freq: 30T, dtype: float64 + Freq: 30min, dtype: float64 >>> sm.resample('30min').fillna('pad') 2018-01-01 00:00:00 1.0 @@ -849,7 +880,7 @@ 2018-01-01 01:00:00 NaN 2018-01-01 01:30:00 NaN 2018-01-01 02:00:00 3.0 - Freq: 30T, dtype: float64 + Freq: 30min, dtype: float64 >>> sm.resample('30min').fillna('nearest') 2018-01-01 00:00:00 1.0 @@ -857,7 +888,7 @@ 2018-01-01 01:00:00 NaN 2018-01-01 01:30:00 3.0 2018-01-01 02:00:00 3.0 - Freq: 30T, dtype: float64 + Freq: 30min, dtype: float64 DataFrame resampling is done column-wise. All the same options are available. @@ -888,6 +919,7 @@ ) return self._upsample(method, limit=limit) + @final def interpolate( self, method: InterpolateOptions = "linear", @@ -905,7 +937,7 @@ The original index is first reindexed to target timestamps (see :meth:`core.resample.Resampler.asfreq`), - then the interpolation of ``NaN`` values via :meth`DataFrame.interpolate` + then the interpolation of ``NaN`` values via :meth:`DataFrame.interpolate` happens. Parameters @@ -971,7 +1003,7 @@ downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. - .. 
deprecated::2.1.0 + .. deprecated:: 2.1.0 ``**kwargs`` : optional Keyword arguments to pass on to the interpolating function. @@ -996,13 +1028,8 @@ Examples -------- - >>> import datetime as dt - >>> timesteps = [ - ... dt.datetime(2023, 3, 1, 7, 0, 0), - ... dt.datetime(2023, 3, 1, 7, 0, 1), - ... dt.datetime(2023, 3, 1, 7, 0, 2), - ... dt.datetime(2023, 3, 1, 7, 0, 3), - ... dt.datetime(2023, 3, 1, 7, 0, 4)] + >>> start = "2023-03-01T07:00:00" + >>> timesteps = pd.date_range(start, periods=5, freq="s") >>> series = pd.Series(data=[1, -1, 2, 1, 3], index=timesteps) >>> series 2023-03-01 07:00:00 1 @@ -1010,7 +1037,7 @@ 2023-03-01 07:00:02 2 2023-03-01 07:00:03 1 2023-03-01 07:00:04 3 - dtype: int64 + Freq: s, dtype: int64 Upsample the dataframe to 0.5Hz by providing the period time of 2s. @@ -1018,7 +1045,7 @@ 2023-03-01 07:00:00 1 2023-03-01 07:00:02 2 2023-03-01 07:00:04 3 - Freq: 2S, dtype: int64 + Freq: 2s, dtype: int64 Downsample the dataframe to 2Hz by providing the period time of 500ms. @@ -1032,9 +1059,9 @@ 2023-03-01 07:00:03.000 1.0 2023-03-01 07:00:03.500 2.0 2023-03-01 07:00:04.000 3.0 - Freq: 500L, dtype: float64 + Freq: 500ms, dtype: float64 - Internal reindexing with ``as_freq()`` prior to interpolation leads to + Internal reindexing with ``asfreq()`` prior to interpolation leads to an interpolated timeseries on the basis the reindexed timestamps (anchors). Since not all datapoints from original series become anchors, it can lead to misleading interpolation results as in the following example: @@ -1051,7 +1078,7 @@ 2023-03-01 07:00:03.200 2.6 2023-03-01 07:00:03.600 2.8 2023-03-01 07:00:04.000 3.0 - Freq: 400L, dtype: float64 + Freq: 400ms, dtype: float64 Note that the series erroneously increases between two anchors ``07:00:00`` and ``07:00:02``. @@ -1069,6 +1096,7 @@ **kwargs, ) + @final def asfreq(self, fill_value=None): """ Return the values at the new freq, essentially a reindex. 
@@ -1107,6 +1135,7 @@ """ return self._upsample("asfreq", fill_value=fill_value) + @final def sum( self, numeric_only: bool = False, @@ -1154,6 +1183,7 @@ nv.validate_resampler_func("sum", args, kwargs) return self._downsample("sum", numeric_only=numeric_only, min_count=min_count) + @final def prod( self, numeric_only: bool = False, @@ -1201,6 +1231,7 @@ nv.validate_resampler_func("prod", args, kwargs) return self._downsample("prod", numeric_only=numeric_only, min_count=min_count) + @final def min( self, numeric_only: bool = False, @@ -1235,6 +1266,7 @@ nv.validate_resampler_func("min", args, kwargs) return self._downsample("min", numeric_only=numeric_only, min_count=min_count) + @final def max( self, numeric_only: bool = False, @@ -1268,36 +1300,46 @@ nv.validate_resampler_func("max", args, kwargs) return self._downsample("max", numeric_only=numeric_only, min_count=min_count) + @final @doc(GroupBy.first) def first( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, *args, **kwargs, ): maybe_warn_args_and_kwargs(type(self), "first", args, kwargs) nv.validate_resampler_func("first", args, kwargs) - return self._downsample("first", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "first", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) + @final @doc(GroupBy.last) def last( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, *args, **kwargs, ): maybe_warn_args_and_kwargs(type(self), "last", args, kwargs) nv.validate_resampler_func("last", args, kwargs) - return self._downsample("last", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "last", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) + @final @doc(GroupBy.median) def median(self, numeric_only: bool = False, *args, **kwargs): maybe_warn_args_and_kwargs(type(self), "median", args, kwargs) nv.validate_resampler_func("median", args, kwargs) return self._downsample("median", numeric_only=numeric_only) + @final def mean( self, numeric_only: bool = False, @@ -1341,6 +1383,7 @@ nv.validate_resampler_func("mean", args, kwargs) return self._downsample("mean", numeric_only=numeric_only) + @final def std( self, ddof: int = 1, @@ -1388,6 +1431,7 @@ nv.validate_resampler_func("std", args, kwargs) return self._downsample("std", ddof=ddof, numeric_only=numeric_only) + @final def var( self, ddof: int = 1, @@ -1441,6 +1485,7 @@ nv.validate_resampler_func("var", args, kwargs) return self._downsample("var", ddof=ddof, numeric_only=numeric_only) + @final @doc(GroupBy.sem) def sem( self, @@ -1453,6 +1498,7 @@ nv.validate_resampler_func("sem", args, kwargs) return self._downsample("sem", ddof=ddof, numeric_only=numeric_only) + @final @doc(GroupBy.ohlc) def ohlc( self, @@ -1480,6 +1526,7 @@ return self._downsample("ohlc") + @final @doc(SeriesGroupBy.nunique) def nunique( self, @@ -1490,6 +1537,7 @@ nv.validate_resampler_func("nunique", args, kwargs) return self._downsample("nunique") + @final @doc(GroupBy.size) def size(self): result = self._downsample("size") @@ -1509,6 +1557,7 @@ result = Series([], index=result.index, dtype="int64", name=name) return result + @final @doc(GroupBy.count) def count(self): result = self._downsample("count") @@ -1526,7 +1575,8 @@ return result - def quantile(self, q: float | AnyArrayLike = 0.5, **kwargs): + @final + def quantile(self, q: float | list[float] | AnyArrayLike = 0.5, **kwargs): """ Return value at the given quantile. 
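The hunks above add a ``skipna`` argument to ``Resampler.first`` and ``Resampler.last``. A minimal sketch with invented data, assuming pandas >= 2.2:

>>> s = pd.Series([float("nan"), 2.0, 3.0],
...               index=pd.date_range("2023-01-01", periods=3, freq="h"))
>>> s.resample("D").first()          # NaN is skipped by default
2023-01-01    2.0
Freq: D, dtype: float64
>>> s.resample("D").first(skipna=False)   # keep the leading NaN
2023-01-01   NaN
Freq: D, dtype: float64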
@@ -1590,6 +1640,7 @@ groupby: GroupBy, key=None, selection: IndexLabel | None = None, + include_groups: bool = False, ) -> None: # reached via ._gotitem and _get_resampler_for_grouping @@ -1612,6 +1663,7 @@ self.ax = parent.ax self.obj = parent.obj + self.include_groups = include_groups @no_type_check def _apply(self, f, *args, **kwargs): @@ -1628,7 +1680,7 @@ return x.apply(f, *args, **kwargs) - result = self._groupby.apply(func) + result = _apply(self._groupby, func, include_groups=self.include_groups) return self._wrap_result(result) _upsample = _apply @@ -1676,6 +1728,8 @@ class DatetimeIndexResampler(Resampler): + ax: DatetimeIndex + @property def _resampler_for_grouping(self): return DatetimeIndexResamplerGroupby @@ -1716,7 +1770,7 @@ # error: Item "None" of "Optional[Any]" has no attribute "binlabels" if ( (ax.freq is not None or ax.inferred_freq is not None) - and len(self.grouper.binlabels) > len(ax) + and len(self._grouper.binlabels) > len(ax) and how is None ): # let's do an asfreq @@ -1725,10 +1779,10 @@ # we are downsampling # we want to call the actual grouper method here if self.axis == 0: - result = obj.groupby(self.grouper).aggregate(how, **kwargs) + result = obj.groupby(self._grouper).aggregate(how, **kwargs) else: # test_resample_axis1 - result = obj.T.groupby(self.grouper).aggregate(how, **kwargs).T + result = obj.T.groupby(self._grouper).aggregate(how, **kwargs).T return self._wrap_result(result) @@ -1807,7 +1861,11 @@ return result -class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler): +# error: Definition of "ax" in base class "_GroupByMixin" is incompatible +# with definition in base class "DatetimeIndexResampler" +class DatetimeIndexResamplerGroupby( # type: ignore[misc] + _GroupByMixin, DatetimeIndexResampler +): """ Provides a resample of a groupby implementation """ @@ -1818,8 +1876,18 @@ class PeriodIndexResampler(DatetimeIndexResampler): + # error: Incompatible types in assignment (expression has type "PeriodIndex", base + # class "DatetimeIndexResampler" defined the type as "DatetimeIndex") + ax: PeriodIndex # type: ignore[assignment] + @property def _resampler_for_grouping(self): + warnings.warn( + "Resampling a groupby with a PeriodIndex is deprecated. " + "Cast to DatetimeIndex before resampling instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return PeriodIndexResamplerGroupby def _get_binner_for_time(self): @@ -1924,7 +1992,11 @@ return self._wrap_result(new_obj) -class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler): +# error: Definition of "ax" in base class "_GroupByMixin" is incompatible with +# definition in base class "PeriodIndexResampler" +class PeriodIndexResamplerGroupby( # type: ignore[misc] + _GroupByMixin, PeriodIndexResampler +): """ Provides a resample of a groupby implementation. 
""" @@ -1935,6 +2007,10 @@ class TimedeltaIndexResampler(DatetimeIndexResampler): + # error: Incompatible types in assignment (expression has type "TimedeltaIndex", + # base class "DatetimeIndexResampler" defined the type as "DatetimeIndex") + ax: TimedeltaIndex # type: ignore[assignment] + @property def _resampler_for_grouping(self): return TimedeltaIndexResamplerGroupby @@ -1952,7 +2028,11 @@ return binner -class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler): +# error: Definition of "ax" in base class "_GroupByMixin" is incompatible with +# definition in base class "DatetimeIndexResampler" +class TimedeltaIndexResamplerGroupby( # type: ignore[misc] + _GroupByMixin, TimedeltaIndexResampler +): """ Provides a resample of a groupby implementation. """ @@ -1966,7 +2046,7 @@ """ Create a TimeGrouper and return our resampler. """ - tg = TimeGrouper(**kwds) + tg = TimeGrouper(obj, **kwds) # type: ignore[arg-type] return tg._get_resampler(obj, kind=kind) @@ -1981,6 +2061,7 @@ limit: int | None = None, kind=None, on=None, + include_groups: bool = True, **kwargs, ) -> Resampler: """ @@ -1989,7 +2070,9 @@ # .resample uses 'on' similar to how .groupby uses 'key' tg = TimeGrouper(freq=rule, key=on, **kwargs) resampler = tg._get_resampler(groupby.obj, kind=kind) - return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key) + return resampler._get_resampler_for_grouping( + groupby=groupby, include_groups=include_groups, key=tg.key + ) class TimeGrouper(Grouper): @@ -2019,7 +2102,9 @@ def __init__( self, + obj: Grouper | None = None, freq: Frequency = "Min", + key: str | None = None, closed: Literal["left", "right"] | None = None, label: Literal["left", "right"] | None = None, how: str = "mean", @@ -2043,9 +2128,21 @@ if convention not in {None, "start", "end", "e", "s"}: raise ValueError(f"Unsupported value {convention} for `convention`") - freq = to_offset(freq) + if ( + key is None + and obj is not None + and isinstance(obj.index, PeriodIndex) # type: ignore[attr-defined] + or ( + key is not None + and obj is not None + and getattr(obj[key], "dtype", None) == "period" # type: ignore[index] + ) + ): + freq = to_offset(freq, is_period=True) + else: + freq = to_offset(freq) - end_types = {"M", "A", "Q", "BM", "BA", "BQ", "W"} + end_types = {"ME", "YE", "QE", "BME", "BYE", "BQE", "W"} rule = freq.rule_code if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): if closed is None: @@ -2078,6 +2175,7 @@ self.fill_method = fill_method self.limit = limit self.group_keys = group_keys + self._arrow_dtype: ArrowDtype | None = None if origin in ("epoch", "start", "start_day", "end", "end_day"): # error: Incompatible types in assignment (expression has type "Union[Union[ @@ -2107,7 +2205,7 @@ # always sort time groupers kwargs["sort"] = True - super().__init__(freq=freq, axis=axis, **kwargs) + super().__init__(freq=freq, key=key, axis=axis, **kwargs) def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: """ @@ -2128,8 +2226,7 @@ TypeError if incompatible axis """ - _, ax, indexer = self._set_grouper(obj, gpr_index=None) - + _, ax, _ = self._set_grouper(obj, gpr_index=None) if isinstance(ax, DatetimeIndex): return DatetimeIndexResampler( obj, @@ -2140,6 +2237,21 @@ gpr_index=ax, ) elif isinstance(ax, PeriodIndex) or kind == "period": + if isinstance(ax, PeriodIndex): + # GH#53481 + warnings.warn( + "Resampling with a PeriodIndex is deprecated. 
" + "Cast index to DatetimeIndex before resampling instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + warnings.warn( + "Resampling with kind='period' is deprecated. " + "Use datetime paths instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) return PeriodIndexResampler( obj, timegrouper=self, @@ -2168,7 +2280,7 @@ ) -> tuple[BinGrouper, NDFrameT]: # create the resampler and return our binner r = self._get_resampler(obj) - return r.grouper, cast(NDFrameT, r.obj) + return r._grouper, cast(NDFrameT, r.obj) def _get_time_bins(self, ax: DatetimeIndex): if not isinstance(ax, DatetimeIndex): @@ -2242,7 +2354,17 @@ ) -> tuple[DatetimeIndex, npt.NDArray[np.int64]]: # Some hacks for > daily data, see #1471, #1458, #1483 - if self.freq != "D" and is_superperiod(self.freq, "D"): + if self.freq.name in ("BME", "ME", "W") or self.freq.name.split("-")[0] in ( + "BQE", + "BYE", + "QE", + "YE", + "W", + ): + # If the right end-point is on the last day of the month, roll forwards + # until the last moment of that day. Note that we only do this for offsets + # which correspond to the end of a super-daily period - "month start", for + # example, is excluded. if self.closed == "right": # GH 21459, GH 9119: Adjust the bins relative to the wall time edges_dti = binner.tz_localize(None) @@ -2274,7 +2396,7 @@ # GH#51896 raise ValueError( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - f"e.g. '24H' or '3D', not {self.freq}" + f"e.g. '24h' or '3D', not {self.freq}" ) if not len(ax): @@ -2401,6 +2523,17 @@ return binner, bins, labels + def _set_grouper( + self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None + ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: + obj, ax, indexer = super()._set_grouper(obj, sort, gpr_index=gpr_index) + if isinstance(ax.dtype, ArrowDtype) and ax.dtype.kind in "Mm": + self._arrow_dtype = ax.dtype + ax = Index( + cast(ArrowExtensionArray, ax.array)._maybe_convert_datelike_array() + ) + return obj, ax, indexer + def _take_new_index( obj: NDFrameT, indexer: npt.NDArray[np.intp], new_index: Index, axis: AxisInt = 0 @@ -2415,7 +2548,8 @@ if axis == 1: raise NotImplementedError("axis 1 is not supported") new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) - return obj._constructor_from_mgr(new_mgr, axes=new_mgr.axes) + # error: Incompatible return value type (got "DataFrame", expected "NDFrameT") + return obj._constructor_from_mgr(new_mgr, axes=new_mgr.axes) # type: ignore[return-value] else: raise ValueError("'obj' should be either a Series or a DataFrame") @@ -2471,8 +2605,8 @@ origin = Timestamp("1970-01-01", tz=index_tz) if isinstance(freq, Day): - # _adjust_dates_anchored assumes 'D' means 24H, but first/last - # might contain a DST transition (23H, 24H, or 25H). + # _adjust_dates_anchored assumes 'D' means 24h, but first/last + # might contain a DST transition (23h, 24h, or 25h). # So "pretend" the dates are naive when adjusting the endpoints first = first.tz_localize(None) last = last.tz_localize(None) @@ -2677,6 +2811,15 @@ if how is None: how = "E" + if isinstance(freq, BaseOffset): + if hasattr(freq, "_period_dtype_code"): + freq = freq_to_period_freqstr(freq.n, freq.name) + else: + raise ValueError( + f"Invalid offset: '{freq.base}' for converting time series " + f"with PeriodIndex." 
+ ) + new_obj = obj.copy() new_obj.index = obj.index.asfreq(freq, how=how) @@ -2685,7 +2828,11 @@ new_obj.index = _asfreq_compat(obj.index, freq) else: - dti = date_range(obj.index.min(), obj.index.max(), freq=freq) + unit = None + if isinstance(obj.index, DatetimeIndex): + # TODO: should we disallow non-DatetimeIndex? + unit = obj.index.unit + dti = date_range(obj.index.min(), obj.index.max(), freq=freq, unit=unit) dti.name = obj.index.name new_obj = obj.reindex(dti, method=method, fill_value=fill_value) if normalize: @@ -2756,3 +2903,18 @@ category=FutureWarning, stacklevel=find_stack_level(), ) + + +def _apply( + grouped: GroupBy, how: Callable, *args, include_groups: bool, **kwargs +) -> DataFrame: + # GH#7155 - rewrite warning to appear as if it came from `.resample` + target_message = "DataFrameGroupBy.apply operated on the grouping columns" + new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample") + with rewrite_warning( + target_message=target_message, + target_category=DeprecationWarning, + new_message=new_message, + ): + result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) + return result diff -Nru pandas-2.1.4+dfsg/pandas/core/reshape/concat.py pandas-2.2.2+dfsg/pandas/core/reshape/concat.py --- pandas-2.1.4+dfsg/pandas/core/reshape/concat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/reshape/concat.py 2024-04-10 17:42:52.000000000 +0000 @@ -76,7 +76,7 @@ axis: Literal[0, "index"] = ..., join: str = ..., ignore_index: bool = ..., - keys=..., + keys: Iterable[Hashable] | None = ..., levels=..., names: list[HashableT] | None = ..., verify_integrity: bool = ..., @@ -93,7 +93,7 @@ axis: Literal[0, "index"] = ..., join: str = ..., ignore_index: bool = ..., - keys=..., + keys: Iterable[Hashable] | None = ..., levels=..., names: list[HashableT] | None = ..., verify_integrity: bool = ..., @@ -110,7 +110,7 @@ axis: Literal[0, "index"] = ..., join: str = ..., ignore_index: bool = ..., - keys=..., + keys: Iterable[Hashable] | None = ..., levels=..., names: list[HashableT] | None = ..., verify_integrity: bool = ..., @@ -127,7 +127,7 @@ axis: Literal[1, "columns"], join: str = ..., ignore_index: bool = ..., - keys=..., + keys: Iterable[Hashable] | None = ..., levels=..., names: list[HashableT] | None = ..., verify_integrity: bool = ..., @@ -144,7 +144,7 @@ axis: Axis = ..., join: str = ..., ignore_index: bool = ..., - keys=..., + keys: Iterable[Hashable] | None = ..., levels=..., names: list[HashableT] | None = ..., verify_integrity: bool = ..., @@ -160,7 +160,7 @@ axis: Axis = 0, join: str = "outer", ignore_index: bool = False, - keys=None, + keys: Iterable[Hashable] | None = None, levels=None, names: list[HashableT] | None = None, verify_integrity: bool = False, @@ -205,8 +205,10 @@ Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation. sort : bool, default False - Sort non-concatenation axis if it is not already aligned. - + Sort non-concatenation axis if it is not already aligned. One exception to + this is when the non-concatentation axis is a DatetimeIndex and join='outer' + and the axis is not already aligned. In that case, the non-concatenation + axis is always sorted lexicographically. copy : bool, default True If False, do not copy data unnecessarily. 
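A small example (invented frames, assuming pandas >= 2.2) of the behaviour the amended ``sort`` note describes: an unaligned DatetimeIndex on the non-concatenation axis is sorted even with ``sort=False`` when ``join='outer'``:

>>> a = pd.Series([1], index=pd.to_datetime(["2024-01-02"]))
>>> b = pd.Series([2], index=pd.to_datetime(["2024-01-01"]))
>>> pd.concat([a, b], axis=1, join="outer", sort=False).index
DatetimeIndex(['2024-01-01', '2024-01-02'], dtype='datetime64[ns]', freq=None)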
@@ -405,7 +407,7 @@ objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame], axis: Axis = 0, join: str = "outer", - keys=None, + keys: Iterable[Hashable] | None = None, levels=None, names: list[HashableT] | None = None, ignore_index: bool = False, @@ -464,7 +466,7 @@ # if we have mixed ndims, then convert to highest ndim # creating column numbers as needed if len(ndims) > 1: - objs, sample = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) + objs = self._sanitize_mixed_ndim(objs, sample, ignore_index, axis) self.objs = objs @@ -580,7 +582,7 @@ sample: Series | DataFrame, ignore_index: bool, axis: AxisInt, - ) -> tuple[list[Series | DataFrame], Series | DataFrame]: + ) -> list[Series | DataFrame]: # if we have mixed ndims, then convert to highest ndim # creating column numbers as needed @@ -601,19 +603,21 @@ else: name = getattr(obj, "name", None) if ignore_index or name is None: - name = current_column - current_column += 1 - - # doing a row-wise concatenation so need everything - # to line up - if self._is_frame and axis == 1: - name = 0 + if axis == 1: + # doing a row-wise concatenation so need everything + # to line up + name = 0 + else: + # doing a column-wise concatenation so need series + # to have unique names + name = current_column + current_column += 1 obj = sample._constructor({name: obj}, copy=False) new_objs.append(obj) - return new_objs, sample + return new_objs def get_result(self): cons: Callable[..., DataFrame | Series] @@ -863,12 +867,14 @@ # do something a bit more speedy for hlevel, level in zip(zipped, levels): - hlevel = ensure_index(hlevel) - mapped = level.get_indexer(hlevel) + hlevel_index = ensure_index(hlevel) + mapped = level.get_indexer(hlevel_index) mask = mapped == -1 if mask.any(): - raise ValueError(f"Values not found in passed level: {hlevel[mask]!s}") + raise ValueError( + f"Values not found in passed level: {hlevel_index[mask]!s}" + ) new_codes.append(np.repeat(mapped, n)) diff -Nru pandas-2.1.4+dfsg/pandas/core/reshape/encoding.py pandas-2.2.2+dfsg/pandas/core/reshape/encoding.py --- pandas-2.1.4+dfsg/pandas/core/reshape/encoding.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/reshape/encoding.py 2024-04-10 17:42:52.000000000 +0000 @@ -21,9 +21,14 @@ is_object_dtype, pandas_dtype, ) +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + CategoricalDtype, +) from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import factorize_from_iterable +from pandas.core.arrays.string_ import StringDtype from pandas.core.frame import DataFrame from pandas.core.indexes.api import ( Index, @@ -244,8 +249,25 @@ # Series avoids inconsistent NaN handling codes, levels = factorize_from_iterable(Series(data, copy=False)) - if dtype is None: + if dtype is None and hasattr(data, "dtype"): + input_dtype = data.dtype + if isinstance(input_dtype, CategoricalDtype): + input_dtype = input_dtype.categories.dtype + + if isinstance(input_dtype, ArrowDtype): + import pyarrow as pa + + dtype = ArrowDtype(pa.bool_()) # type: ignore[assignment] + elif ( + isinstance(input_dtype, StringDtype) + and input_dtype.storage != "pyarrow_numpy" + ): + dtype = pandas_dtype("boolean") # type: ignore[assignment] + else: + dtype = np.dtype(bool) + elif dtype is None: dtype = np.dtype(bool) + _dtype = pandas_dtype(dtype) if is_object_dtype(_dtype): @@ -321,13 +343,15 @@ return concat(sparse_series, axis=1, copy=False) else: - # take on axis=1 + transpose to ensure ndarray layout is column-major - eye_dtype: NpDtype + # 
ensure ndarray layout is column-major + shape = len(codes), number_of_cols + dummy_dtype: NpDtype if isinstance(_dtype, np.dtype): - eye_dtype = _dtype + dummy_dtype = _dtype else: - eye_dtype = np.bool_ - dummy_mat = np.eye(number_of_cols, dtype=eye_dtype).take(codes, axis=1).T + dummy_dtype = np.bool_ + dummy_mat = np.zeros(shape=shape, dtype=dummy_dtype, order="F") + dummy_mat[np.arange(len(codes)), codes] = 1 if not dummy_na: # reset NaN GH4446 diff -Nru pandas-2.1.4+dfsg/pandas/core/reshape/melt.py pandas-2.2.2+dfsg/pandas/core/reshape/melt.py --- pandas-2.1.4+dfsg/pandas/core/reshape/melt.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/reshape/melt.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,12 +12,7 @@ from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos -from pandas.core.arrays import Categorical -import pandas.core.common as com -from pandas.core.indexes.api import ( - Index, - MultiIndex, -) +from pandas.core.indexes.api import MultiIndex from pandas.core.reshape.concat import concat from pandas.core.reshape.util import tile_compat from pandas.core.shared_docs import _shared_docs @@ -31,6 +26,20 @@ from pandas import DataFrame +def ensure_list_vars(arg_vars, variable: str, columns) -> list: + if arg_vars is not None: + if not is_list_like(arg_vars): + return [arg_vars] + elif isinstance(columns, MultiIndex) and not isinstance(arg_vars, list): + raise ValueError( + f"{variable} must be a list of tuples when columns are a MultiIndex" + ) + else: + return list(arg_vars) + else: + return [] + + @Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"}) def melt( frame: DataFrame, @@ -41,61 +50,35 @@ col_level=None, ignore_index: bool = True, ) -> DataFrame: - # If multiindex, gather names of columns on all level for checking presence - # of `id_vars` and `value_vars` - if isinstance(frame.columns, MultiIndex): - cols = [x for c in frame.columns for x in c] - else: - cols = list(frame.columns) - if value_name in frame.columns: raise ValueError( f"value_name ({value_name}) cannot match an element in " "the DataFrame columns." 
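A hedged sketch (invented data, assuming pandas >= 2.2) of the dtype inference added to ``get_dummies`` in encoding.py above: masked string input now yields the nullable ``boolean`` dtype, and Arrow-backed input likewise gets an Arrow boolean dtype when pyarrow is installed:

>>> pd.get_dummies(pd.Series(["a", "b", "a"], dtype="string")).dtypes
a    boolean
b    boolean
dtype: object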
) + id_vars = ensure_list_vars(id_vars, "id_vars", frame.columns) + value_vars_was_not_none = value_vars is not None + value_vars = ensure_list_vars(value_vars, "value_vars", frame.columns) - if id_vars is not None: - if not is_list_like(id_vars): - id_vars = [id_vars] - elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list): - raise ValueError( - "id_vars must be a list of tuples when columns are a MultiIndex" - ) - else: - # Check that `id_vars` are in frame - id_vars = list(id_vars) - missing = Index(com.flatten(id_vars)).difference(cols) - if not missing.empty: - raise KeyError( - "The following 'id_vars' are not present " - f"in the DataFrame: {list(missing)}" - ) - else: - id_vars = [] - - if value_vars is not None: - if not is_list_like(value_vars): - value_vars = [value_vars] - elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list): - raise ValueError( - "value_vars must be a list of tuples when columns are a MultiIndex" - ) - else: - value_vars = list(value_vars) - # Check that `value_vars` are in frame - missing = Index(com.flatten(value_vars)).difference(cols) - if not missing.empty: - raise KeyError( - "The following 'value_vars' are not present in " - f"the DataFrame: {list(missing)}" - ) + if id_vars or value_vars: if col_level is not None: - idx = frame.columns.get_level_values(col_level).get_indexer( - id_vars + value_vars + level = frame.columns.get_level_values(col_level) + else: + level = frame.columns + labels = id_vars + value_vars + idx = level.get_indexer_for(labels) + missing = idx == -1 + if missing.any(): + missing_labels = [ + lab for lab, not_found in zip(labels, missing) if not_found + ] + raise KeyError( + "The following id_vars or value_vars are not present in " + f"the DataFrame: {missing_labels}" ) + if value_vars_was_not_none: + frame = frame.iloc[:, algos.unique(idx)] else: - idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars)) - frame = frame.iloc[:, idx] + frame = frame.copy() else: frame = frame.copy() @@ -113,45 +96,49 @@ var_name = [ frame.columns.name if frame.columns.name is not None else "variable" ] - if isinstance(var_name, str): + elif is_list_like(var_name): + raise ValueError(f"{var_name=} must be a scalar.") + else: var_name = [var_name] - N, K = frame.shape - K -= len(id_vars) + num_rows, K = frame.shape + num_cols_adjusted = K - len(id_vars) mdata: dict[Hashable, AnyArrayLike] = {} for col in id_vars: id_data = frame.pop(col) if not isinstance(id_data.dtype, np.dtype): # i.e. ExtensionDtype - if K > 0: - mdata[col] = concat([id_data] * K, ignore_index=True) + if num_cols_adjusted > 0: + mdata[col] = concat([id_data] * num_cols_adjusted, ignore_index=True) else: # We can't concat empty list. 
(GH 46044) mdata[col] = type(id_data)([], name=id_data.name, dtype=id_data.dtype) else: - mdata[col] = np.tile(id_data._values, K) + mdata[col] = np.tile(id_data._values, num_cols_adjusted) mcolumns = id_vars + var_name + [value_name] - if frame.shape[1] > 0: + if frame.shape[1] > 0 and not any( + not isinstance(dt, np.dtype) and dt._supports_2d for dt in frame.dtypes + ): mdata[value_name] = concat( [frame.iloc[:, i] for i in range(frame.shape[1])] ).values else: mdata[value_name] = frame._values.ravel("F") for i, col in enumerate(var_name): - mdata[col] = frame.columns._get_level_values(i).repeat(N) + mdata[col] = frame.columns._get_level_values(i).repeat(num_rows) result = frame._constructor(mdata, columns=mcolumns) if not ignore_index: - result.index = tile_compat(frame.index, K) + result.index = tile_compat(frame.index, num_cols_adjusted) return result -def lreshape(data: DataFrame, groups, dropna: bool = True) -> DataFrame: +def lreshape(data: DataFrame, groups: dict, dropna: bool = True) -> DataFrame: """ Reshape wide-format data to long. Generalized inverse of DataFrame.pivot. @@ -204,30 +191,20 @@ 2 Red Sox 2008 545 3 Yankees 2008 526 """ - if isinstance(groups, dict): - keys = list(groups.keys()) - values = list(groups.values()) - else: - keys, values = zip(*groups) - - all_cols = list(set.union(*(set(x) for x in values))) - id_cols = list(data.columns.difference(all_cols)) - - K = len(values[0]) - - for seq in values: - if len(seq) != K: - raise ValueError("All column lists must be same length") - mdata = {} pivot_cols = [] - - for target, names in zip(keys, values): + all_cols: set[Hashable] = set() + K = len(next(iter(groups.values()))) + for target, names in groups.items(): + if len(names) != K: + raise ValueError("All column lists must be same length") to_concat = [data[col]._values for col in names] mdata[target] = concat_compat(to_concat) pivot_cols.append(target) + all_cols = all_cols.union(names) + id_cols = list(data.columns.difference(all_cols)) for col in id_cols: mdata[col] = np.tile(data[col]._values, K) @@ -479,10 +456,9 @@ two 2.9 """ - def get_var_names(df, stub: str, sep: str, suffix: str) -> list[str]: + def get_var_names(df, stub: str, sep: str, suffix: str): regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$" - pattern = re.compile(regex) - return [col for col in df.columns if pattern.match(col)] + return df.columns[df.columns.str.match(regex)] def melt_stub(df, stub: str, i, j, value_vars, sep: str): newdf = melt( @@ -492,11 +468,14 @@ value_name=stub.rstrip(sep), var_name=j, ) - newdf[j] = Categorical(newdf[j]) newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True) # GH17627 Cast numerics suffixes to int/float - newdf[j] = to_numeric(newdf[j], errors="ignore") + try: + newdf[j] = to_numeric(newdf[j]) + except (TypeError, ValueError, OverflowError): + # TODO: anything else to catch? 
+ pass return newdf.set_index(i + [j]) @@ -505,7 +484,7 @@ else: stubnames = list(stubnames) - if any(col in stubnames for col in df.columns): + if df.columns.isin(stubnames).any(): raise ValueError("stubname can't be identical to a column name") if not is_list_like(i): @@ -516,18 +495,18 @@ if df[i].duplicated().any(): raise ValueError("the id variables need to uniquely identify each row") - value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames] - - value_vars_flattened = [e for sublist in value_vars for e in sublist] - id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) - - _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)] - melted = _melted[0].join(_melted[1:], how="outer") + _melted = [] + value_vars_flattened = [] + for stub in stubnames: + value_var = get_var_names(df, stub, sep, suffix) + value_vars_flattened.extend(value_var) + _melted.append(melt_stub(df, stub, i, j, value_var, sep)) + + melted = concat(_melted, axis=1) + id_vars = df.columns.difference(value_vars_flattened) + new = df[id_vars] if len(i) == 1: - new = df[id_vars].set_index(i).join(melted) - return new - - new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j]) - - return new + return new.set_index(i).join(melted) + else: + return new.merge(melted.reset_index(), on=i).set_index(i + [j]) diff -Nru pandas-2.1.4+dfsg/pandas/core/reshape/merge.py pandas-2.2.2+dfsg/pandas/core/reshape/merge.py --- pandas-2.1.4+dfsg/pandas/core/reshape/merge.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/reshape/merge.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,7 +9,6 @@ ) import datetime from functools import partial -import string from typing import ( TYPE_CHECKING, Literal, @@ -53,7 +52,6 @@ ensure_object, is_bool, is_bool_dtype, - is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, @@ -90,7 +88,6 @@ BaseMaskedArray, ExtensionArray, ) -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com from pandas.core.construction import ( @@ -99,12 +96,16 @@ ) from pandas.core.frame import _merge_doc from pandas.core.indexes.api import default_index -from pandas.core.sorting import is_int64_overflow_possible +from pandas.core.sorting import ( + get_group_index, + is_int64_overflow_possible, +) if TYPE_CHECKING: from pandas import DataFrame from pandas.core import groupby from pandas.core.arrays import DatetimeArray + from pandas.core.indexes.frozen import FrozenList _factorizers = { np.int64: libhashtable.Int64Factorizer, @@ -137,9 +138,9 @@ left: DataFrame | Series, right: DataFrame | Series, how: MergeHow = "inner", - on: IndexLabel | None = None, - left_on: IndexLabel | None = None, - right_on: IndexLabel | None = None, + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, left_index: bool = False, right_index: bool = False, sort: bool = False, @@ -186,9 +187,9 @@ def _cross_merge( left: DataFrame, right: DataFrame, - on: IndexLabel | None = None, - left_on: IndexLabel | None = None, - right_on: IndexLabel | None = None, + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, left_index: bool = False, right_index: bool = False, sort: bool = False, @@ -238,7 +239,9 @@ return res -def _groupby_and_merge(by, left: DataFrame, 
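An illustration (invented frame, assuming pandas >= 2.2) of the consolidated ``id_vars``/``value_vars`` validation added to ``melt`` above, which now reports all missing labels in a single KeyError:

>>> df = pd.DataFrame({"id": [1, 2], "x": [3, 4], "y": [5, 6]})
>>> df.melt(id_vars="id", value_vars=["x", "z"])
Traceback (most recent call last):
    ...
KeyError: "The following id_vars or value_vars are not present in the DataFrame: ['z']"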
right: DataFrame, merge_pieces): +def _groupby_and_merge( + by, left: DataFrame | Series, right: DataFrame | Series, merge_pieces +): """ groupby & merge; we are always performing a left-by type operation @@ -254,14 +257,14 @@ by = [by] lby = left.groupby(by, sort=False) - rby: groupby.DataFrameGroupBy | None = None + rby: groupby.DataFrameGroupBy | groupby.SeriesGroupBy | None = None # if we can groupby the rhs # then we can get vastly better perf if all(item in right.columns for item in by): rby = right.groupby(by, sort=False) - for key, lhs in lby.grouper.get_iterator(lby._selected_obj, axis=lby.axis): + for key, lhs in lby._grouper.get_iterator(lby._selected_obj, axis=lby.axis): if rby is None: rhs = right else: @@ -294,8 +297,8 @@ def merge_ordered( - left: DataFrame, - right: DataFrame, + left: DataFrame | Series, + right: DataFrame | Series, on: IndexLabel | None = None, left_on: IndexLabel | None = None, right_on: IndexLabel | None = None, @@ -714,7 +717,7 @@ """ _merge_type = "merge" - how: MergeHow | Literal["asof"] + how: JoinHow | Literal["asof"] on: IndexLabel | None # left_on/right_on may be None when passed, but in validate_specification # get replaced with non-None. @@ -735,10 +738,10 @@ self, left: DataFrame | Series, right: DataFrame | Series, - how: MergeHow | Literal["asof"] = "inner", - on: IndexLabel | None = None, - left_on: IndexLabel | None = None, - right_on: IndexLabel | None = None, + how: JoinHow | Literal["asof"] = "inner", + on: IndexLabel | AnyArrayLike | None = None, + left_on: IndexLabel | AnyArrayLike | None = None, + right_on: IndexLabel | AnyArrayLike | None = None, left_index: bool = False, right_index: bool = False, sort: bool = True, @@ -755,7 +758,7 @@ self.on = com.maybe_make_list(on) self.suffixes = suffixes - self.sort = sort + self.sort = sort or how == "outer" self.left_index = left_index self.right_index = right_index @@ -1016,10 +1019,14 @@ take_left, take_right = None, None if name in result: - if left_indexer is not None and right_indexer is not None: + if left_indexer is not None or right_indexer is not None: if name in self.left: if left_has_missing is None: - left_has_missing = (left_indexer == -1).any() + left_has_missing = ( + False + if left_indexer is None + else (left_indexer == -1).any() + ) if left_has_missing: take_right = self.right_join_keys[i] @@ -1029,7 +1036,11 @@ elif name in self.right: if right_has_missing is None: - right_has_missing = (right_indexer == -1).any() + right_has_missing = ( + False + if right_indexer is None + else (right_indexer == -1).any() + ) if right_has_missing: take_left = self.left_join_keys[i] @@ -1037,13 +1048,15 @@ if result[name].dtype != self.right[name].dtype: take_right = self.right[name]._values - elif left_indexer is not None: + else: take_left = self.left_join_keys[i] take_right = self.right_join_keys[i] if take_left is not None or take_right is not None: if take_left is None: lvals = result[name]._values + elif left_indexer is None: + lvals = take_left else: # TODO: can we pin down take_left's type earlier? take_left = extract_array(take_left, extract_numpy=True) @@ -1052,6 +1065,8 @@ if take_right is None: rvals = result[name]._values + elif right_indexer is None: + rvals = take_right else: # TODO: can we pin down take_right's type earlier? 
taker = extract_array(take_right, extract_numpy=True) @@ -1060,16 +1075,17 @@ # if we have an all missing left_indexer # make sure to just use the right values or vice-versa - mask_left = left_indexer == -1 - # error: Item "bool" of "Union[Any, bool]" has no attribute "all" - if mask_left.all(): # type: ignore[union-attr] + if left_indexer is not None and (left_indexer == -1).all(): key_col = Index(rvals) result_dtype = rvals.dtype elif right_indexer is not None and (right_indexer == -1).all(): key_col = Index(lvals) result_dtype = lvals.dtype else: - key_col = Index(lvals).where(~mask_left, rvals) + key_col = Index(lvals) + if left_indexer is not None: + mask_left = left_indexer == -1 + key_col = key_col.where(~mask_left, rvals) result_dtype = find_common_type([lvals.dtype, rvals.dtype]) if ( lvals.dtype.kind == "M" @@ -1100,8 +1116,12 @@ else: result.insert(i, name or f"key_{i}", key_col) - def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + def _get_join_indexers( + self, + ) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: """return the join indexers""" + # make mypy happy + assert self.how != "asof" return get_join_indexers( self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how ) @@ -1110,8 +1130,6 @@ def _get_join_info( self, ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: - # make mypy happy - assert self.how != "cross" left_ax = self.left.index right_ax = self.right.index @@ -1140,6 +1158,8 @@ left_indexer, how="right", ) + elif right_indexer is None: + join_index = right_ax.copy() else: join_index = right_ax.take(right_indexer) elif self.left_index: @@ -1159,10 +1179,13 @@ right_indexer, how="left", ) + elif left_indexer is None: + join_index = left_ax.copy() else: join_index = left_ax.take(left_indexer) else: - join_index = default_index(len(left_indexer)) + n = len(left_ax) if left_indexer is None else len(left_indexer) + join_index = default_index(n) return join_index, left_indexer, right_indexer @@ -1171,7 +1194,7 @@ self, index: Index, other_index: Index, - indexer: npt.NDArray[np.intp], + indexer: npt.NDArray[np.intp] | None, how: JoinHow = "left", ) -> Index: """ @@ -1179,9 +1202,12 @@ Parameters ---------- - index : Index being rearranged - other_index : Index used to supply values not found in index - indexer : np.ndarray[np.intp] how to rearrange index + index : Index + index being rearranged + other_index : Index + used to supply values not found in index + indexer : np.ndarray[np.intp] or None + how to rearrange index how : str Replacement is only necessary if indexer based on other_index. @@ -1199,6 +1225,8 @@ if np.any(mask): fill_value = na_value_for_dtype(index.dtype, compat=False) index = index.append(Index([fill_value])) + if indexer is None: + return index.copy() return index.take(indexer) @final @@ -1271,12 +1299,7 @@ # work-around for merge_asof(right_index=True) right_keys.append(right.index._values) if lk is not None and lk == rk: # FIXME: what about other NAs? 
- # avoid key upcast in corner case (length-0) - lk = cast(Hashable, lk) - if len(left) > 0: - right_drop.append(rk) - else: - left_drop.append(lk) + right_drop.append(rk) else: rk = cast(ArrayLike, rk) right_keys.append(rk) @@ -1356,8 +1379,12 @@ lk_is_cat = isinstance(lk.dtype, CategoricalDtype) rk_is_cat = isinstance(rk.dtype, CategoricalDtype) - lk_is_object = is_object_dtype(lk.dtype) - rk_is_object = is_object_dtype(rk.dtype) + lk_is_object_or_string = is_object_dtype(lk.dtype) or is_string_dtype( + lk.dtype + ) + rk_is_object_or_string = is_object_dtype(rk.dtype) or is_string_dtype( + rk.dtype + ) # if either left or right is a categorical # then the must match exactly in categories & ordered @@ -1386,20 +1413,22 @@ if lk.dtype.kind == rk.dtype.kind: continue - if is_extension_array_dtype(lk.dtype) and not is_extension_array_dtype( - rk.dtype + if isinstance(lk.dtype, ExtensionDtype) and not isinstance( + rk.dtype, ExtensionDtype ): ct = find_common_type([lk.dtype, rk.dtype]) - if is_extension_array_dtype(ct): - rk = ct.construct_array_type()._from_sequence(rk) # type: ignore[union-attr] # noqa: E501 + if isinstance(ct, ExtensionDtype): + com_cls = ct.construct_array_type() + rk = com_cls._from_sequence(rk, dtype=ct, copy=False) else: - rk = rk.astype(ct) # type: ignore[arg-type] - elif is_extension_array_dtype(rk.dtype): + rk = rk.astype(ct) + elif isinstance(rk.dtype, ExtensionDtype): ct = find_common_type([lk.dtype, rk.dtype]) - if is_extension_array_dtype(ct): - lk = ct.construct_array_type()._from_sequence(lk) # type: ignore[union-attr] # noqa: E501 + if isinstance(ct, ExtensionDtype): + com_cls = ct.construct_array_type() + lk = com_cls._from_sequence(lk, dtype=ct, copy=False) else: - lk = lk.astype(ct) # type: ignore[arg-type] + lk = lk.astype(ct) # check whether ints and floats if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype): @@ -1452,14 +1481,14 @@ # incompatible dtypes GH 9780, GH 15800 # bool values are coerced to object - elif (lk_is_object and is_bool_dtype(rk.dtype)) or ( - is_bool_dtype(lk.dtype) and rk_is_object + elif (lk_is_object_or_string and is_bool_dtype(rk.dtype)) or ( + is_bool_dtype(lk.dtype) and rk_is_object_or_string ): pass # object values are allowed to be merged - elif (lk_is_object and is_numeric_dtype(rk.dtype)) or ( - is_numeric_dtype(lk.dtype) and rk_is_object + elif (lk_is_object_or_string and is_numeric_dtype(rk.dtype)) or ( + is_numeric_dtype(lk.dtype) and rk_is_object_or_string ): inferred_left = lib.infer_dtype(lk, skipna=False) inferred_right = lib.infer_dtype(rk, skipna=False) @@ -1497,8 +1526,13 @@ ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"): # allows datetime with different resolutions continue + # datetime and timedelta not allowed + elif lk.dtype.kind == "M" and rk.dtype.kind == "m": + raise ValueError(msg) + elif lk.dtype.kind == "m" and rk.dtype.kind == "M": + raise ValueError(msg) - elif lk_is_object and rk_is_object: + elif is_object_dtype(lk.dtype) and is_object_dtype(rk.dtype): continue # Houston, we have a problem! @@ -1659,8 +1693,8 @@ left_keys: list[ArrayLike], right_keys: list[ArrayLike], sort: bool = False, - how: MergeHow | Literal["asof"] = "inner", -) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + how: JoinHow = "inner", +) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: """ Parameters @@ -1672,9 +1706,9 @@ Returns ------- - np.ndarray[np.intp] + np.ndarray[np.intp] or None Indexer into the left_keys. 
- np.ndarray[np.intp] + np.ndarray[np.intp] or None Indexer into the right_keys. """ assert len(left_keys) == len( @@ -1685,47 +1719,87 @@ left_n = len(left_keys[0]) right_n = len(right_keys[0]) if left_n == 0: - if how in ["left", "inner", "cross"]: + if how in ["left", "inner"]: return _get_empty_indexer() elif not sort and how in ["right", "outer"]: return _get_no_sort_one_missing_indexer(right_n, True) elif right_n == 0: - if how in ["right", "inner", "cross"]: + if how in ["right", "inner"]: return _get_empty_indexer() elif not sort and how in ["left", "outer"]: return _get_no_sort_one_missing_indexer(left_n, False) - # get left & right join labels and num. of levels at each location - mapped = ( - _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how) - for n in range(len(left_keys)) - ) - zipped = zip(*mapped) - llab, rlab, shape = (list(x) for x in zipped) + lkey: ArrayLike + rkey: ArrayLike + if len(left_keys) > 1: + # get left & right join labels and num. of levels at each location + mapped = ( + _factorize_keys(left_keys[n], right_keys[n], sort=sort) + for n in range(len(left_keys)) + ) + zipped = zip(*mapped) + llab, rlab, shape = (list(x) for x in zipped) + + # get flat i8 keys from label lists + lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort) + else: + lkey = left_keys[0] + rkey = right_keys[0] + + left = Index(lkey) + right = Index(rkey) - # get flat i8 keys from label lists - lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort) + if ( + left.is_monotonic_increasing + and right.is_monotonic_increasing + and (left.is_unique or right.is_unique) + ): + _, lidx, ridx = left.join(right, how=how, return_indexers=True, sort=sort) + else: + lidx, ridx = get_join_indexers_non_unique( + left._values, right._values, sort, how + ) + + if lidx is not None and is_range_indexer(lidx, len(left)): + lidx = None + if ridx is not None and is_range_indexer(ridx, len(right)): + ridx = None + return lidx, ridx - # factorize keys to a dense i8 space - # `count` is the num. of unique keys - # set(lkey) | set(rkey) == range(count) - - lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) - # preserve left frame order if how == 'left' and sort == False - kwargs = {} - if how in ("left", "right"): - kwargs["sort"] = sort - join_func = { - "inner": libjoin.inner_join, - "left": libjoin.left_outer_join, - "right": lambda x, y, count, **kwargs: libjoin.left_outer_join( - y, x, count, **kwargs - )[::-1], - "outer": libjoin.full_outer_join, - }[how] - # error: Cannot call function of unknown type - return join_func(lkey, rkey, count, **kwargs) # type: ignore[operator] +def get_join_indexers_non_unique( + left: ArrayLike, + right: ArrayLike, + sort: bool = False, + how: JoinHow = "inner", +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: + """ + Get join indexers for left and right. + + Parameters + ---------- + left : ArrayLike + right : ArrayLike + sort : bool, default False + how : {'inner', 'outer', 'left', 'right'}, default 'inner' + + Returns + ------- + np.ndarray[np.intp] + Indexer into left. + np.ndarray[np.intp] + Indexer into right. 
+ """ + lkey, rkey, count = _factorize_keys(left, right, sort=sort) + if how == "left": + lidx, ridx = libjoin.left_outer_join(lkey, rkey, count, sort=sort) + elif how == "right": + ridx, lidx = libjoin.left_outer_join(rkey, lkey, count, sort=sort) + elif how == "inner": + lidx, ridx = libjoin.inner_join(lkey, rkey, count, sort=sort) + elif how == "outer": + lidx, ridx = libjoin.full_outer_join(lkey, rkey, count) + return lidx, ridx def restore_dropped_levels_multijoin( @@ -1735,7 +1809,7 @@ join_index: Index, lindexer: npt.NDArray[np.intp], rindexer: npt.NDArray[np.intp], -) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]: +) -> tuple[FrozenList, FrozenList, FrozenList]: """ *this is an internal non-public method* @@ -1811,7 +1885,7 @@ # error: Cannot determine type of "__add__" join_levels = join_levels + [restore_levels] # type: ignore[has-type] - join_codes = join_codes + [restore_codes] + join_codes = join_codes + [restore_codes] # type: ignore[has-type] join_names = join_names + [dropped_level_name] return join_levels, join_codes, join_names @@ -1856,14 +1930,18 @@ if self.fill_method == "ffill": if left_indexer is None: - raise TypeError("left_indexer cannot be None") - left_indexer = cast("npt.NDArray[np.intp]", left_indexer) - right_indexer = cast("npt.NDArray[np.intp]", right_indexer) - left_join_indexer = libjoin.ffill_indexer(left_indexer) - right_join_indexer = libjoin.ffill_indexer(right_indexer) - else: + left_join_indexer = None + else: + left_join_indexer = libjoin.ffill_indexer(left_indexer) + if right_indexer is None: + right_join_indexer = None + else: + right_join_indexer = libjoin.ffill_indexer(right_indexer) + elif self.fill_method is None: left_join_indexer = left_indexer right_join_indexer = right_indexer + else: + raise ValueError("fill_method must be 'ffill' or None") result = self._reindex_and_concat( join_index, left_join_indexer, right_join_indexer, copy=copy @@ -1985,7 +2063,12 @@ else: ro_dtype = self.right.index.dtype - if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype): + if ( + is_object_dtype(lo_dtype) + or is_object_dtype(ro_dtype) + or is_string_dtype(lo_dtype) + or is_string_dtype(ro_dtype) + ): raise MergeError( f"Incompatible merge dtype, {repr(ro_dtype)} and " f"{repr(lo_dtype)}, both sides must have numeric dtype" @@ -2065,7 +2148,9 @@ f"with type {repr(lt.dtype)}" ) - if needs_i8_conversion(lt.dtype): + if needs_i8_conversion(lt.dtype) or ( + isinstance(lt, ArrowExtensionArray) and lt.dtype.kind in "mM" + ): if not isinstance(self.tolerance, datetime.timedelta): raise MergeError(msg) if self.tolerance < Timedelta(0): @@ -2116,34 +2201,6 @@ def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: """return the join indexers""" - def flip(xs: list[ArrayLike]) -> np.ndarray: - """unlike np.transpose, this returns an array of tuples""" - - def injection(obj: ArrayLike): - if not isinstance(obj.dtype, ExtensionDtype): - # ndarray - return obj - obj = extract_array(obj) - if isinstance(obj, NDArrayBackedExtensionArray): - # fastpath for e.g. dt64tz, categorical - return obj._ndarray - # FIXME: returning obj._values_for_argsort() here doesn't - # break in any existing test cases, but i (@jbrockmendel) - # am pretty sure it should! - # e.g. - # arr = pd.array([0, pd.NA, 255], dtype="UInt8") - # will have values_for_argsort (before GH#45434) - # np.array([0, 255, 255], dtype=np.uint8) - # and the non-injectivity should make a difference somehow - # shouldn't it? 
- return np.asarray(obj) - - xs = [injection(x) for x in xs] - labels = list(string.ascii_lowercase[: len(xs)]) - dtypes = [x.dtype for x in xs] - labeled_dtypes = list(zip(labels, dtypes)) - return np.array(list(zip(*xs)), labeled_dtypes) - # values to compare left_values = ( self.left.index._values if self.left_index else self.left_join_keys[-1] @@ -2159,15 +2216,21 @@ if tolerance is not None: # TODO: can we reuse a tolerance-conversion function from # e.g. TimedeltaIndex? - if needs_i8_conversion(left_values.dtype): + if needs_i8_conversion(left_values.dtype) or ( + isinstance(left_values, ArrowExtensionArray) + and left_values.dtype.kind in "mM" + ): tolerance = Timedelta(tolerance) # TODO: we have no test cases with PeriodDtype here; probably # need to adjust tolerance for that case. if left_values.dtype.kind in "mM": # Make sure the i8 representation for tolerance # matches that for left_values/right_values. - lvs = ensure_wrapped_if_datetimelike(left_values) - tolerance = tolerance.as_unit(lvs.unit) + if isinstance(left_values, ArrowExtensionArray): + unit = left_values.dtype.pyarrow_dtype.unit + else: + unit = ensure_wrapped_if_datetimelike(left_values).unit + tolerance = tolerance.as_unit(unit) tolerance = tolerance._value @@ -2179,42 +2242,36 @@ if self.left_by is not None: # remove 'on' parameter from values if one existed if self.left_index and self.right_index: - left_by_values = self.left_join_keys - right_by_values = self.right_join_keys + left_join_keys = self.left_join_keys + right_join_keys = self.right_join_keys else: - left_by_values = self.left_join_keys[0:-1] - right_by_values = self.right_join_keys[0:-1] + left_join_keys = self.left_join_keys[0:-1] + right_join_keys = self.right_join_keys[0:-1] + + mapped = [ + _factorize_keys( + left_join_keys[n], + right_join_keys[n], + sort=False, + ) + for n in range(len(left_join_keys)) + ] - # get tuple representation of values if more than one - if len(left_by_values) == 1: - lbv = left_by_values[0] - rbv = right_by_values[0] - - # TODO: conversions for EAs that can be no-copy. 
- lbv = np.asarray(lbv) - rbv = np.asarray(rbv) - if needs_i8_conversion(lbv.dtype): - lbv = lbv.view("i8") - if needs_i8_conversion(rbv.dtype): - rbv = rbv.view("i8") + if len(left_join_keys) == 1: + left_by_values = mapped[0][0] + right_by_values = mapped[0][1] else: - # We get here with non-ndarrays in test_merge_by_col_tz_aware - # and test_merge_groupby_multiple_column_with_categorical_column - lbv = flip(left_by_values) - rbv = flip(right_by_values) - lbv = ensure_object(lbv) - rbv = ensure_object(rbv) - - # error: Incompatible types in assignment (expression has type - # "Union[ndarray[Any, dtype[Any]], ndarray[Any, dtype[object_]]]", - # variable has type "List[Union[Union[ExtensionArray, - # ndarray[Any, Any]], Index, Series]]") - right_by_values = rbv # type: ignore[assignment] - # error: Incompatible types in assignment (expression has type - # "Union[ndarray[Any, dtype[Any]], ndarray[Any, dtype[object_]]]", - # variable has type "List[Union[Union[ExtensionArray, - # ndarray[Any, Any]], Index, Series]]") - left_by_values = lbv # type: ignore[assignment] + arrs = [np.concatenate(m[:2]) for m in mapped] + shape = tuple(m[2] for m in mapped) + group_index = get_group_index( + arrs, shape=shape, sort=False, xnull=False + ) + left_len = len(left_join_keys[0]) + left_by_values = group_index[:left_len] + right_by_values = group_index[left_len:] + + left_by_values = ensure_int64(left_by_values) + right_by_values = ensure_int64(right_by_values) # choose appropriate function by type func = _asof_by_function(self.direction) @@ -2339,10 +2396,7 @@ def _factorize_keys( - lk: ArrayLike, - rk: ArrayLike, - sort: bool = True, - how: MergeHow | Literal["asof"] = "inner", + lk: ArrayLike, rk: ArrayLike, sort: bool = True ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: """ Encode left and right keys as enumerated types. @@ -2358,8 +2412,6 @@ sort : bool, defaults to True If True, the encoding is done such that the unique elements in the keys are sorted. - how : {'left', 'right', 'outer', 'inner'}, default 'inner' - Type of merge. 
Returns ------- @@ -2435,21 +2487,31 @@ .combine_chunks() .dictionary_encode() ) - length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length) + pc.fill_null(dc.indices[slice(len_lk)], -1) .to_numpy() .astype(np.intp, copy=False), - pc.fill_null(dc.indices[slice(len_lk, None)], length) + pc.fill_null(dc.indices[slice(len_lk, None)], -1) .to_numpy() .astype(np.intp, copy=False), len(dc.dictionary), ) + + if sort: + uniques = dc.dictionary.to_numpy(zero_copy_only=False) + llab, rlab = _sort_labels(uniques, llab, rlab) + if dc.null_count > 0: + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) count += 1 - if how == "right": - return rlab, llab, count return llab, rlab, count if not isinstance(lk, BaseMaskedArray) and not ( @@ -2520,8 +2582,6 @@ np.putmask(rlab, rmask, count) count += 1 - if how == "right": - return rlab, llab, count return llab, rlab, count @@ -2537,15 +2597,15 @@ if not isinstance(lk, ExtensionArray): lk = cls._from_sequence(lk, dtype=dtype, copy=False) else: - lk = lk.astype(dtype) + lk = lk.astype(dtype, copy=False) if not isinstance(rk, ExtensionArray): rk = cls._from_sequence(rk, dtype=dtype, copy=False) else: - rk = rk.astype(dtype) + rk = rk.astype(dtype, copy=False) else: - lk = lk.astype(dtype) - rk = rk.astype(dtype) + lk = lk.astype(dtype, copy=False) + rk = rk.astype(dtype, copy=False) if isinstance(lk, BaseMaskedArray): # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]"; # expected type "Type[object]" diff -Nru pandas-2.1.4+dfsg/pandas/core/reshape/pivot.py pandas-2.2.2+dfsg/pandas/core/reshape/pivot.py --- pandas-2.1.4+dfsg/pandas/core/reshape/pivot.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/reshape/pivot.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,8 +7,10 @@ from typing import ( TYPE_CHECKING, Callable, + Literal, cast, ) +import warnings import numpy as np @@ -17,6 +19,7 @@ Appender, Substitution, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( @@ -67,7 +70,7 @@ margins: bool = False, dropna: bool = True, margins_name: Hashable = "All", - observed: bool = False, + observed: bool | lib.NoDefault = lib.no_default, sort: bool = True, ) -> DataFrame: index = _convert_by(index) @@ -122,7 +125,7 @@ margins: bool, dropna: bool, margins_name: Hashable, - observed: bool, + observed: bool | lib.NoDefault, sort: bool, ) -> DataFrame: """ @@ -165,7 +168,18 @@ pass values = list(values) - grouped = data.groupby(keys, observed=observed, sort=sort, dropna=dropna) + observed_bool = False if observed is lib.no_default else observed + grouped = data.groupby(keys, observed=observed_bool, sort=sort, dropna=dropna) + if observed is lib.no_default and any( + ping._passed_categorical for ping in grouped._grouper.groupings + ): + warnings.warn( + "The default value of observed=False is deprecated and will change " + "to observed=True in a future version of pandas. 
Specify " + "observed=False to silence this warning and retain the current behavior", + category=FutureWarning, + stacklevel=find_stack_level(), + ) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): @@ -308,6 +322,7 @@ row_names = result.index.names # check the result column and leave floats + for dtype in set(result.dtypes): if isinstance(dtype, ExtensionDtype): # Can hold NA already @@ -420,9 +435,10 @@ row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) row_margin = row_margin.stack(future_stack=True) - # slight hack - new_order = [len(cols)] + list(range(len(cols))) - row_margin.index = row_margin.index.reorder_levels(new_order) + # GH#26568. Use names instead of indices in case of numeric names + new_order_indices = [len(cols)] + list(range(len(cols))) + new_order_names = [row_margin.index.names[i] for i in new_order_indices] + row_margin.index = row_margin.index.reorder_levels(new_order_names) else: row_margin = data._constructor_sliced(np.nan, index=result.columns) @@ -449,7 +465,7 @@ return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows].groupby(rows, observed=observed).apply(aggfunc) + margin = data.groupby(rows, observed=observed)[rows].apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table @@ -467,7 +483,7 @@ margin_keys = table.columns if len(cols): - row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc) + row_margin = data.groupby(cols, observed=observed)[cols].apply(aggfunc) else: row_margin = Series(np.nan, index=result.columns) @@ -522,6 +538,7 @@ cols + columns_listlike, append=append # type: ignore[operator] ) else: + index_list: list[Index] | list[Series] if index is lib.no_default: if isinstance(data.index, MultiIndex): # GH 23955 @@ -568,7 +585,7 @@ margins: bool = False, margins_name: Hashable = "All", dropna: bool = True, - normalize: bool = False, + normalize: bool | Literal[0, 1, "all", "index", "columns"] = False, ) -> DataFrame: """ Compute a simple cross tabulation of two (or more) factors. 
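A short illustration of the pivot_table deprecation wired up above (data and column names are invented); passing observed explicitly avoids the warning:

import pandas as pd

df = pd.DataFrame(
    {
        "cat": pd.Categorical(["x", "x", "y"], categories=["x", "y", "z"]),
        "val": [1, 2, 3],
    }
)

# Warns on 2.2: a categorical key is grouped while observed is left at its default
pd.pivot_table(df, index="cat", values="val", aggfunc="sum")

# Explicit value: no warning; observed=True also drops the unused "z" category
pd.pivot_table(df, index="cat", values="val", aggfunc="sum", observed=True)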
@@ -715,6 +732,7 @@ margins=margins, margins_name=margins_name, dropna=dropna, + observed=False, **kwargs, # type: ignore[arg-type] ) diff -Nru pandas-2.1.4+dfsg/pandas/core/reshape/reshape.py pandas-2.2.2+dfsg/pandas/core/reshape/reshape.py --- pandas-2.1.4+dfsg/pandas/core/reshape/reshape.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/reshape/reshape.py 2024-04-10 17:42:52.000000000 +0000 @@ -222,7 +222,7 @@ @cache_readonly def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]: - # We cache this for re-use in ExtensionBlock._unstack + # We cache this for reuse in ExtensionBlock._unstack dummy_arr = np.arange(len(self.index), dtype=np.intp) new_values, mask = self.get_new_values(dummy_arr, fill_value=-1) return new_values, mask.any(0) @@ -572,7 +572,7 @@ # equiv: result.droplevel(level=0, axis=1) # but this avoids an extra copy - result.columns = result.columns.droplevel(0) + result.columns = result.columns._drop_level_numbers([0]) return result @@ -953,8 +953,8 @@ index_levels = frame.index.levels index_codes = list(np.tile(frame.index.codes, (1, ratio))) else: - index_levels = [frame.index.unique()] - codes = factorize(frame.index)[0] + codes, uniques = factorize(frame.index, use_na_sentinel=False) + index_levels = [uniques] index_codes = list(np.tile(codes, (1, ratio))) if isinstance(stack_cols, MultiIndex): column_levels = ordered_stack_cols.levels diff -Nru pandas-2.1.4+dfsg/pandas/core/reshape/tile.py pandas-2.2.2+dfsg/pandas/core/reshape/tile.py --- pandas-2.1.4+dfsg/pandas/core/reshape/tile.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/reshape/tile.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,10 +17,8 @@ Timestamp, lib, ) -from pandas._libs.lib import infer_dtype from pandas.core.dtypes.common import ( - DT64NS_DTYPE, ensure_platform_int, is_bool_dtype, is_integer, @@ -40,11 +38,9 @@ Categorical, Index, IntervalIndex, - to_datetime, - to_timedelta, ) -from pandas.core import nanops import pandas.core.algorithms as algos +from pandas.core.arrays.datetimelike import dtype_to_unit if TYPE_CHECKING: from pandas._typing import ( @@ -243,66 +239,33 @@ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 original = x - x = _preprocess_for_cut(x) - x, dtype = _coerce_to_type(x) + x_idx = _preprocess_for_cut(x) + x_idx, _ = _coerce_to_type(x_idx) if not np.iterable(bins): - if is_scalar(bins) and bins < 1: - raise ValueError("`bins` should be a positive integer.") - - sz = x.size - - if sz == 0: - raise ValueError("Cannot cut empty array") - - rng = (nanops.nanmin(x), nanops.nanmax(x)) - mn, mx = (mi + 0.0 for mi in rng) - - if np.isinf(mn) or np.isinf(mx): - # GH 24314 - raise ValueError( - "cannot specify integer `bins` when input data contains infinity" - ) - if mn == mx: # adjust end points before binning - mn -= 0.001 * abs(mn) if mn != 0 else 0.001 - mx += 0.001 * abs(mx) if mx != 0 else 0.001 - bins = np.linspace(mn, mx, bins + 1, endpoint=True) - else: # adjust end points after binning - bins = np.linspace(mn, mx, bins + 1, endpoint=True) - adj = (mx - mn) * 0.001 # 0.1% of the range - if right: - bins[0] -= adj - else: - bins[-1] += adj + bins = _nbins_to_bins(x_idx, bins, right) elif isinstance(bins, IntervalIndex): if bins.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") else: - if isinstance(getattr(bins, "dtype", None), DatetimeTZDtype): - bins = np.asarray(bins, dtype=DT64NS_DTYPE) - else: - bins = np.asarray(bins) - bins = 
_convert_bin_to_numeric_type(bins, dtype) - - # GH 26045: cast to float64 to avoid an overflow - if (np.diff(bins.astype("float64")) < 0).any(): + bins = Index(bins) + if not bins.is_monotonic_increasing: raise ValueError("bins must increase monotonically.") fac, bins = _bins_to_cuts( - x, + x_idx, bins, right=right, labels=labels, precision=precision, include_lowest=include_lowest, - dtype=dtype, duplicates=duplicates, ordered=ordered, ) - return _postprocess_for_cut(fac, bins, retbins, dtype, original) + return _postprocess_for_cut(fac, bins, retbins, original) def qcut( @@ -367,36 +330,93 @@ array([0, 0, 1, 2, 3]) """ original = x - x = _preprocess_for_cut(x) - x, dtype = _coerce_to_type(x) + x_idx = _preprocess_for_cut(x) + x_idx, _ = _coerce_to_type(x_idx) quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q - x_np = np.asarray(x) - x_np = x_np[~np.isnan(x_np)] - bins = np.quantile(x_np, quantiles) + bins = x_idx.to_series().dropna().quantile(quantiles) fac, bins = _bins_to_cuts( - x, - bins, + x_idx, + Index(bins), labels=labels, precision=precision, include_lowest=True, - dtype=dtype, duplicates=duplicates, ) - return _postprocess_for_cut(fac, bins, retbins, dtype, original) + return _postprocess_for_cut(fac, bins, retbins, original) + + +def _nbins_to_bins(x_idx: Index, nbins: int, right: bool) -> Index: + """ + If a user passed an integer N for bins, convert this to a sequence of N + equal(ish)-sized bins. + """ + if is_scalar(nbins) and nbins < 1: + raise ValueError("`bins` should be a positive integer.") + + if x_idx.size == 0: + raise ValueError("Cannot cut empty array") + + rng = (x_idx.min(), x_idx.max()) + mn, mx = rng + + if is_numeric_dtype(x_idx.dtype) and (np.isinf(mn) or np.isinf(mx)): + # GH#24314 + raise ValueError( + "cannot specify integer `bins` when input data contains infinity" + ) + + if mn == mx: # adjust end points before binning + if _is_dt_or_td(x_idx.dtype): + # using seconds=1 is pretty arbitrary here + # error: Argument 1 to "dtype_to_unit" has incompatible type + # "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]" + unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type] + td = Timedelta(seconds=1).as_unit(unit) + # Use DatetimeArray/TimedeltaArray method instead of linspace + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "_generate_range" + bins = x_idx._values._generate_range( # type: ignore[union-attr] + start=mn - td, end=mx + td, periods=nbins + 1, freq=None, unit=unit + ) + else: + mn -= 0.001 * abs(mn) if mn != 0 else 0.001 + mx += 0.001 * abs(mx) if mx != 0 else 0.001 + + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + else: # adjust end points after binning + if _is_dt_or_td(x_idx.dtype): + # Use DatetimeArray/TimedeltaArray method instead of linspace + + # error: Argument 1 to "dtype_to_unit" has incompatible type + # "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]" + unit = dtype_to_unit(x_idx.dtype) # type: ignore[arg-type] + # error: Item "ExtensionArray" of "ExtensionArray | ndarray[Any, Any]" + # has no attribute "_generate_range" + bins = x_idx._values._generate_range( # type: ignore[union-attr] + start=mn, end=mx, periods=nbins + 1, freq=None, unit=unit + ) + else: + bins = np.linspace(mn, mx, nbins + 1, endpoint=True) + adj = (mx - mn) * 0.001 # 0.1% of the range + if right: + bins[0] -= adj + else: + bins[-1] += adj + + return Index(bins) def _bins_to_cuts( - x, - bins: np.ndarray, + x_idx: Index, + bins: Index, right: bool = 
True, labels=None, precision: int = 3, include_lowest: bool = False, - dtype: DtypeObj | None = None, duplicates: str = "raise", ordered: bool = True, ): @@ -408,9 +428,11 @@ "invalid value for 'duplicates' parameter, valid options are: raise, drop" ) + result: Categorical | np.ndarray + if isinstance(bins, IntervalIndex): # we have a fast-path here - ids = bins.get_indexer(x) + ids = bins.get_indexer(x_idx) cat_dtype = CategoricalDtype(bins, ordered=True) result = Categorical.from_codes(ids, dtype=cat_dtype, validate=False) return result, bins @@ -425,12 +447,29 @@ bins = unique_bins side: Literal["left", "right"] = "left" if right else "right" - ids = ensure_platform_int(bins.searchsorted(x, side=side)) + + try: + ids = bins.searchsorted(x_idx, side=side) + except TypeError as err: + # e.g. test_datetime_nan_error if bins are DatetimeArray and x_idx + # is integers + if x_idx.dtype.kind == "m": + raise ValueError("bins must be of timedelta64 dtype") from err + elif x_idx.dtype.kind == bins.dtype.kind == "M": + raise ValueError( + "Cannot use timezone-naive bins with timezone-aware values, " + "or vice-versa" + ) from err + elif x_idx.dtype.kind == "M": + raise ValueError("bins must be of datetime64 dtype") from err + else: + raise + ids = ensure_platform_int(ids) if include_lowest: - ids[np.asarray(x) == bins[0]] = 1 + ids[x_idx == bins[0]] = 1 - na_mask = isna(x) | (ids == len(bins)) | (ids == 0) + na_mask = isna(x_idx) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() if labels is not False: @@ -442,7 +481,7 @@ if labels is None: labels = _format_labels( - bins, precision, right=right, include_lowest=include_lowest, dtype=dtype + bins, precision, right=right, include_lowest=include_lowest ) elif ordered and len(set(labels)) != len(labels): raise ValueError( @@ -474,7 +513,7 @@ return result, bins -def _coerce_to_type(x): +def _coerce_to_type(x: Index) -> tuple[Index, DtypeObj | None]: """ if the passed data is of datetime/timedelta, bool or nullable int type, this method converts it to numeric so that cut or qcut method can @@ -482,14 +521,8 @@ """ dtype: DtypeObj | None = None - if isinstance(x.dtype, DatetimeTZDtype): + if _is_dt_or_td(x.dtype): dtype = x.dtype - elif lib.is_np_dtype(x.dtype, "M"): - x = to_datetime(x).astype("datetime64[ns]", copy=False) - dtype = np.dtype("datetime64[ns]") - elif lib.is_np_dtype(x.dtype, "m"): - x = to_timedelta(x) - dtype = np.dtype("timedelta64[ns]") elif is_bool_dtype(x.dtype): # GH 20303 x = x.astype(np.int64) @@ -498,92 +531,35 @@ # https://github.com/pandas-dev/pandas/pull/31290 # https://github.com/pandas-dev/pandas/issues/31389 elif isinstance(x.dtype, ExtensionDtype) and is_numeric_dtype(x.dtype): - x = x.to_numpy(dtype=np.float64, na_value=np.nan) - - if dtype is not None: - # GH 19768: force NaT to NaN during integer conversion - x = np.where(x.notna(), x.view(np.int64), np.nan) - - return x, dtype - - -def _convert_bin_to_numeric_type(bins, dtype: DtypeObj | None): - """ - if the passed bin is of datetime/timedelta type, - this method converts it to integer - - Parameters - ---------- - bins : list-like of bins - dtype : dtype of data - - Raises - ------ - ValueError if bins are not of a compat dtype to dtype - """ - bins_dtype = infer_dtype(bins, skipna=False) - if lib.is_np_dtype(dtype, "m"): - if bins_dtype in ["timedelta", "timedelta64"]: - bins = to_timedelta(bins).view(np.int64) - else: - raise ValueError("bins must be of timedelta64 dtype") - elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype): - if 
bins_dtype in ["datetime", "datetime64"]: - bins = to_datetime(bins) - if lib.is_np_dtype(bins.dtype, "M"): - # As of 2.0, to_datetime may give non-nano, so we need to convert - # here until the rest of this file recognizes non-nano - bins = bins.astype("datetime64[ns]", copy=False) - bins = bins.view(np.int64) - else: - raise ValueError("bins must be of datetime64 dtype") + x_arr = x.to_numpy(dtype=np.float64, na_value=np.nan) + x = Index(x_arr) - return bins - - -def _convert_bin_to_datelike_type(bins, dtype: DtypeObj | None): - """ - Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is - datelike + return Index(x), dtype - Parameters - ---------- - bins : list-like of bins - dtype : dtype of data - Returns - ------- - bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is - datelike - """ - if isinstance(dtype, DatetimeTZDtype): - bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz) - elif lib.is_np_dtype(dtype, "mM"): - bins = Index(bins.astype(np.int64), dtype=dtype) - return bins +def _is_dt_or_td(dtype: DtypeObj) -> bool: + # Note: the dtype here comes from an Index.dtype, so we know that that any + # dt64/td64 dtype is of a supported unit. + return isinstance(dtype, DatetimeTZDtype) or lib.is_np_dtype(dtype, "mM") def _format_labels( - bins, + bins: Index, precision: int, right: bool = True, include_lowest: bool = False, - dtype: DtypeObj | None = None, ): """based on the dtype, return our labels""" closed: IntervalLeftRight = "right" if right else "left" formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta] - if isinstance(dtype, DatetimeTZDtype): - formatter = lambda x: Timestamp(x, tz=dtype.tz) - adjust = lambda x: x - Timedelta("1ns") - elif lib.is_np_dtype(dtype, "M"): - formatter = Timestamp - adjust = lambda x: x - Timedelta("1ns") - elif lib.is_np_dtype(dtype, "m"): - formatter = Timedelta - adjust = lambda x: x - Timedelta("1ns") + if _is_dt_or_td(bins.dtype): + # error: Argument 1 to "dtype_to_unit" has incompatible type + # "dtype[Any] | ExtensionDtype"; expected "DatetimeTZDtype | dtype[Any]" + unit = dtype_to_unit(bins.dtype) # type: ignore[arg-type] + formatter = lambda x: x + adjust = lambda x: x - Timedelta(1, unit=unit).as_unit(unit) else: precision = _infer_precision(precision, bins) formatter = lambda x: _round_frac(x, precision) @@ -594,10 +570,14 @@ # adjust lhs of first interval by precision to account for being right closed breaks[0] = adjust(breaks[0]) + if _is_dt_or_td(bins.dtype): + # error: "Index" has no attribute "as_unit" + breaks = type(bins)(breaks).as_unit(unit) # type: ignore[attr-defined] + return IntervalIndex.from_breaks(breaks, closed=closed) -def _preprocess_for_cut(x): +def _preprocess_for_cut(x) -> Index: """ handles preprocessing for cut where we convert passed input to array, strip the index information and store it @@ -611,10 +591,10 @@ if x.ndim != 1: raise ValueError("Input array must be 1 dimensional") - return x + return Index(x) -def _postprocess_for_cut(fac, bins, retbins: bool, dtype: DtypeObj | None, original): +def _postprocess_for_cut(fac, bins, retbins: bool, original): """ handles post processing for the cut method where we combine the index information if the originally passed @@ -626,7 +606,8 @@ if not retbins: return fac - bins = _convert_bin_to_datelike_type(bins, dtype) + if isinstance(bins, Index) and is_numeric_dtype(bins.dtype): + bins = bins._values return fac, bins @@ -646,7 +627,7 @@ return np.around(x, digits) -def 
_infer_precision(base_precision: int, bins) -> int: +def _infer_precision(base_precision: int, bins: Index) -> int: """ Infer an appropriate precision for _round_frac """ diff -Nru pandas-2.1.4+dfsg/pandas/core/reshape/util.py pandas-2.2.2+dfsg/pandas/core/reshape/util.py --- pandas-2.1.4+dfsg/pandas/core/reshape/util.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/reshape/util.py 2024-04-10 17:42:52.000000000 +0000 @@ -63,7 +63,7 @@ return [ tile_compat( np.repeat(x, b[i]), - np.prod(a[i]), # pyright: ignore[reportGeneralTypeIssues] + np.prod(a[i]), ) for i, x in enumerate(X) ] diff -Nru pandas-2.1.4+dfsg/pandas/core/series.py pandas-2.2.2+dfsg/pandas/core/series.py --- pandas-2.1.4+dfsg/pandas/core/series.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/series.py 2024-04-10 17:42:52.000000000 +0000 @@ -27,9 +27,10 @@ import numpy as np from pandas._config import ( - get_option, using_copy_on_write, + warn_copy_on_write, ) +from pandas._config.config import _get_option from pandas._libs import ( lib, @@ -46,10 +47,14 @@ InvalidIndexError, _chained_assignment_method_msg, _chained_assignment_msg, + _chained_assignment_warning_method_msg, + _chained_assignment_warning_msg, + _check_cacher, ) from pandas.util._decorators import ( Appender, Substitution, + deprecate_nonkeyword_arguments, doc, ) from pandas.util._exceptions import find_stack_level @@ -62,7 +67,9 @@ from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( LossySetitemError, - convert_dtypes, + construct_1d_arraylike_from_scalar, + find_common_type, + infer_dtype_from, maybe_box_native, maybe_cast_pointwise_result, ) @@ -77,10 +84,14 @@ validate_all_hashable, ) from pandas.core.dtypes.dtypes import ( - ArrowDtype, + CategoricalDtype, ExtensionDtype, + SparseDtype, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, ) -from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import ( isna, @@ -101,9 +112,15 @@ from pandas.core.accessor import CachedAccessor from pandas.core.apply import SeriesApply from pandas.core.arrays import ExtensionArray +from pandas.core.arrays.arrow import ( + ListAccessor, + StructAccessor, +) from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor +from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import ( + array as pd_array, extract_array, sanitize_array, ) @@ -163,13 +180,14 @@ CorrelationMethod, DropKeep, Dtype, - DtypeBackend, DtypeObj, FilePath, + Frequency, IgnoreRaise, IndexKeyFunc, IndexLabel, Level, + MutableMappingT, NaPosition, NumpySorter, NumpyValueArrayLike, @@ -375,14 +393,34 @@ dtype: Dtype | None = None, name=None, copy: bool | None = None, - fastpath: bool = False, + fastpath: bool | lib.NoDefault = lib.no_default, ) -> None: + if fastpath is not lib.no_default: + warnings.warn( + "The 'fastpath' keyword in pd.Series is deprecated and will " + "be removed in a future version.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + else: + fastpath = False + + allow_mgr = False if ( isinstance(data, (SingleBlockManager, SingleArrayManager)) and index is None and dtype is None and (copy is False or copy is None) ): + if not allow_mgr: + # GH#52419 + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. 
" + "Use public APIs instead.", + DeprecationWarning, + stacklevel=2, + ) if using_copy_on_write(): data = data.copy(deep=False) # GH#33357 called with just the SingleBlockManager @@ -394,6 +432,10 @@ self.name = name return + is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) + data_dtype = getattr(data, "dtype", None) + original_dtype = dtype + if isinstance(data, (ExtensionArray, np.ndarray)): if copy is not False and using_copy_on_write(): if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): @@ -405,13 +447,24 @@ if fastpath: # data is a ndarray, index is defined if not isinstance(data, (SingleBlockManager, SingleArrayManager)): - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) if manager == "block": data = SingleBlockManager.from_array(data, index) elif manager == "array": data = SingleArrayManager.from_array(data, index) + allow_mgr = True elif using_copy_on_write() and not copy: data = data.copy(deep=False) + + if not allow_mgr: + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. " + "Use public APIs instead.", + DeprecationWarning, + stacklevel=2, + ) + if copy: data = data.copy() # skips validation of the name @@ -422,6 +475,15 @@ if isinstance(data, SingleBlockManager) and using_copy_on_write() and not copy: data = data.copy(deep=False) + if not allow_mgr: + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. " + "Use public APIs instead.", + DeprecationWarning, + stacklevel=2, + ) + name = ibase.maybe_extract_name(name, data, type(self)) if index is not None: @@ -471,7 +533,7 @@ data = data.reindex(index, copy=copy) copy = False data = data._mgr - elif is_dict_like(data): + elif isinstance(data, Mapping): data, index = self._init_dict(data, index, dtype) dtype = None copy = False @@ -487,6 +549,16 @@ "`index` argument. `copy` must be False." ) + if not allow_mgr: + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. " + "Use public APIs instead.", + DeprecationWarning, + stacklevel=2, + ) + allow_mgr = True + elif isinstance(data, ExtensionArray): pass else: @@ -511,7 +583,7 @@ else: data = sanitize_array(data, index, dtype, copy) - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) if manager == "block": data = SingleBlockManager.from_array(data, index, refs=refs) elif manager == "array": @@ -521,8 +593,19 @@ self.name = name self._set_axis(0, index) + if original_dtype is None and is_pandas_object and data_dtype == np.object_: + if self.dtype != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The Series " + "constructor will keep the original dtype in the future. 
" + "Call `infer_objects` on the result to get the old behavior.", + FutureWarning, + stacklevel=find_stack_level(), + ) + def _init_dict( - self, data, index: Index | None = None, dtype: DtypeObj | None = None + self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None ): """ Derive the "_mgr" and "index" attributes of a new Series from a @@ -579,14 +662,17 @@ return Series def _constructor_from_mgr(self, mgr, axes): - if self._constructor is Series: - # we are pandas.Series (or a subclass that doesn't override _constructor) - ser = Series._from_mgr(mgr, axes=axes) - ser._name = None # caller is responsible for setting real name + ser = Series._from_mgr(mgr, axes=axes) + ser._name = None # caller is responsible for setting real name + + if type(self) is Series: + # This would also work `if self._constructor is Series`, but + # this check is slightly faster, benefiting the most-common case. return ser - else: - assert axes is mgr.axes - return self._constructor(mgr) + + # We assume that the subclass __init__ knows how to handle a + # pd.Series object. + return self._constructor(ser) @property def _constructor_expanddim(self) -> Callable[..., DataFrame]: @@ -598,24 +684,19 @@ return DataFrame - def _expanddim_from_mgr(self, mgr, axes) -> DataFrame: - # https://github.com/pandas-dev/pandas/pull/52132#issuecomment-1481491828 - # This is a short-term implementation that will be replaced - # with self._constructor_expanddim._constructor_from_mgr(...) - # once downstream packages (geopandas) have had a chance to implement - # their own overrides. - # error: "Callable[..., DataFrame]" has no attribute "_from_mgr" [attr-defined] - from pandas import DataFrame - - return DataFrame._from_mgr(mgr, axes=mgr.axes) - def _constructor_expanddim_from_mgr(self, mgr, axes): from pandas.core.frame import DataFrame - if self._constructor_expanddim is DataFrame: - return self._expanddim_from_mgr(mgr, axes) - assert axes is mgr.axes - return self._constructor_expanddim(mgr) + df = DataFrame._from_mgr(mgr, axes=mgr.axes) + + if type(self) is Series: + # This would also work `if self._constructor_expanddim is DataFrame`, + # but this check is slightly faster, benefiting the most-common case. + return df + + # We assume that the subclass __init__ knows how to handle a + # pd.DataFrame object. + return self._constructor_expanddim(df) # types @property @@ -798,6 +879,11 @@ """ Return the flattened underlying data as an ndarray or ExtensionArray. + .. deprecated:: 2.2.0 + Series.ravel is deprecated. The underlying array is already 1D, so + ravel is not necessary. Use :meth:`to_numpy` for conversion to a numpy + array instead. + Returns ------- numpy.ndarray or ExtensionArray @@ -810,9 +896,16 @@ Examples -------- >>> s = pd.Series([1, 2, 3]) - >>> s.ravel() + >>> s.ravel() # doctest: +SKIP array([1, 2, 3]) """ + warnings.warn( + "Series.ravel is deprecated. The underlying array is already 1D, so " + "ravel is not necessary. Use `to_numpy()` for conversion to a numpy " + "array instead.", + FutureWarning, + stacklevel=2, + ) arr = self._values.ravel(order=order) if isinstance(arr, np.ndarray) and using_copy_on_write(): arr.flags.writeable = False @@ -828,6 +921,10 @@ """ Create a new view of the Series. + .. deprecated:: 2.2.0 + ``Series.view`` is deprecated and will be removed in a future version. + Use :meth:`Series.astype` as an alternative to change the dtype. 
+ This function will return a new Series with a view of the same underlying values in memory, optionally reinterpreted with a new data type. The new data type must preserve the same size in bytes as to not @@ -858,38 +955,14 @@ Examples -------- - >>> s = pd.Series([-2, -1, 0, 1, 2], dtype='int8') - >>> s - 0 -2 - 1 -1 - 2 0 - 3 1 - 4 2 - dtype: int8 - - The 8 bit signed integer representation of `-1` is `0b11111111`, but - the same bytes represent 255 if read as an 8 bit unsigned integer: - - >>> us = s.view('uint8') - >>> us - 0 254 - 1 255 - 2 0 - 3 1 - 4 2 - dtype: uint8 - - The views share the same underlying values: - - >>> us[0] = 128 - >>> s - 0 -128 - 1 -1 - 2 0 - 3 1 - 4 2 - dtype: int8 + Use ``astype`` to change the dtype instead. """ + warnings.warn( + "Series.view is deprecated and will be removed in a future version. " + "Use ``astype`` as an alternative to change the dtype.", + FutureWarning, + stacklevel=2, + ) # self.array instead of self._values so we piggyback on NumpyExtensionArray # implementation res_values = self.array.view(dtype) @@ -897,12 +970,14 @@ if isinstance(res_ser._mgr, SingleBlockManager): blk = res_ser._mgr._block blk.refs = cast("BlockValuesRefs", self._references) - blk.refs.add_reference(blk) # type: ignore[arg-type] + blk.refs.add_reference(blk) return res_ser.__finalize__(self, method="view") # ---------------------------------------------------------------------- # NDArray Compat - def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: + def __array__( + self, dtype: npt.DTypeLike | None = None, copy: bool | None = None + ) -> np.ndarray: """ Return the values as a NumPy array. @@ -915,6 +990,9 @@ The dtype to use for the resulting NumPy array. By default, the dtype is inferred from the data. + copy : bool or None, optional + Unused. 
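A migration sketch for the two Series deprecations above (ravel and view); astype is the documented replacement for view, and a byte-level reinterpretation, if genuinely needed, can go through NumPy:

import pandas as pd

s = pd.Series([-2, -1, 0, 1, 2], dtype="int8")

s.to_numpy()                           # instead of s.ravel()
s.astype("uint8")                      # instead of s.view("uint8")
pd.Series(s.to_numpy().view("uint8"))  # raw reinterpretation of the same bytes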
+ Returns ------- numpy.ndarray @@ -1002,7 +1080,7 @@ Returns ------- - scalar (int) or Series (slice, sequence) + scalar """ return self._values[i] @@ -1010,7 +1088,8 @@ # axis kwarg is retained for compat with NDFrame method # _slice is *always* positional mgr = self._mgr.get_slice(slobj, axis=axis) - out = self._constructor(mgr, fastpath=True) + out = self._constructor_from_mgr(mgr, axes=mgr.axes) + out._name = self._name return out.__finalize__(self) def __getitem__(self, key): @@ -1018,6 +1097,8 @@ key = com.apply_if_callable(key, self) if key is Ellipsis: + if using_copy_on_write() or warn_copy_on_write(): + return self.copy(deep=False) return self key_is_scalar = is_scalar(key) @@ -1128,7 +1209,7 @@ # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) new_ser = self._constructor(self._values[indexer], index=new_index, copy=False) - if using_copy_on_write() and isinstance(indexer, slice): + if isinstance(indexer, slice): new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type] return new_ser.__finalize__(self) @@ -1170,7 +1251,7 @@ new_ser = self._constructor( new_values, index=new_index, name=self.name, copy=False ) - if using_copy_on_write() and isinstance(loc, slice): + if isinstance(loc, slice): new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type] return new_ser.__finalize__(self) @@ -1178,11 +1259,29 @@ return self.iloc[loc] def __setitem__(self, key, value) -> None: + warn = True if not PYPY and using_copy_on_write(): if sys.getrefcount(self) <= 3: warnings.warn( _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 ) + elif not PYPY and not using_copy_on_write(): + ctr = sys.getrefcount(self) + ref_count = 3 + if not warn_copy_on_write() and _check_cacher(self): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count and ( + warn_copy_on_write() + or ( + not warn_copy_on_write() + and self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr] + ) + ): + warn = False + warnings.warn( + _chained_assignment_warning_msg, FutureWarning, stacklevel=2 + ) check_dict_or_set_indexers(key) key = com.apply_if_callable(key, self) @@ -1193,10 +1292,10 @@ if isinstance(key, slice): indexer = self.index._convert_slice_indexer(key, kind="getitem") - return self._set_values(indexer, value) + return self._set_values(indexer, value, warn=warn) try: - self._set_with_engine(key, value) + self._set_with_engine(key, value, warn=warn) except KeyError: # We have a scalar (or for MultiIndex or object-dtype, scalar-like) # key that is not present in self.index. 
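For orientation, the chained-assignment warnings fed by this block surface in user code roughly as follows (default options, Copy-on-Write not enabled); a single .loc assignment is the recommended pattern:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

# Chained assignment: pandas 2.2 may emit a FutureWarning here, and the write
# will be lost once Copy-on-Write becomes the default.
df["a"][0] = 10

# Single-step assignment, always safe
df.loc[0, "a"] = 10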
@@ -1255,25 +1354,25 @@ # otherwise with listlike other we interpret series[mask] = other # as series[mask] = other[mask] try: - self._where(~key, value, inplace=True) + self._where(~key, value, inplace=True, warn=warn) except InvalidIndexError: # test_where_dups self.iloc[key] = value return else: - self._set_with(key, value) + self._set_with(key, value, warn=warn) if cacher_needs_updating: self._maybe_update_cacher(inplace=True) - def _set_with_engine(self, key, value) -> None: + def _set_with_engine(self, key, value, warn: bool = True) -> None: loc = self.index.get_loc(key) # this is equivalent to self._values[key] = value - self._mgr.setitem_inplace(loc, value) + self._mgr.setitem_inplace(loc, value, warn=warn) - def _set_with(self, key, value) -> None: + def _set_with(self, key, value, warn: bool = True) -> None: # We got here via exception-handling off of InvalidIndexError, so # key should always be listlike at this point. assert not isinstance(key, tuple) @@ -1284,7 +1383,7 @@ if not self.index._should_fallback_to_positional: # Regardless of the key type, we're treating it as labels - self._set_labels(key, value) + self._set_labels(key, value, warn=warn) else: # Note: key_type == "boolean" should not occur because that @@ -1301,23 +1400,23 @@ FutureWarning, stacklevel=find_stack_level(), ) - self._set_values(key, value) + self._set_values(key, value, warn=warn) else: - self._set_labels(key, value) + self._set_labels(key, value, warn=warn) - def _set_labels(self, key, value) -> None: + def _set_labels(self, key, value, warn: bool = True) -> None: key = com.asarray_tuplesafe(key) indexer: np.ndarray = self.index.get_indexer(key) mask = indexer == -1 if mask.any(): raise KeyError(f"{key[mask]} not in index") - self._set_values(indexer, value) + self._set_values(indexer, value, warn=warn) - def _set_values(self, key, value) -> None: + def _set_values(self, key, value, warn: bool = True) -> None: if isinstance(key, (Index, Series)): key = key._values - self._mgr = self._mgr.setitem(indexer=key, value=value) + self._mgr = self._mgr.setitem(indexer=key, value=value, warn=warn) self._maybe_update_cacher() def _set_value(self, label, value, takeable: bool = False) -> None: @@ -1853,8 +1952,6 @@ {storage_options} - .. versionadded:: 1.2.0 - **kwargs These parameters will be passed to `tabulate \ `_. @@ -1871,7 +1968,7 @@ {examples} """ return self.to_frame().to_markdown( - buf, mode, index, storage_options=storage_options, **kwargs + buf, mode=mode, index=index, storage_options=storage_options, **kwargs ) # ---------------------------------------------------------------------- @@ -1925,21 +2022,40 @@ """ return self.index - def to_dict(self, into: type[dict] = dict) -> dict: + @overload + def to_dict( + self, *, into: type[MutableMappingT] | MutableMappingT + ) -> MutableMappingT: + ... + + @overload + def to_dict(self, *, into: type[dict] = ...) -> dict: + ... + + # error: Incompatible default for argument "into" (default has type "type[ + # dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") + @deprecate_nonkeyword_arguments( + version="3.0", allowed_args=["self"], name="to_dict" + ) + def to_dict( + self, + into: type[MutableMappingT] + | MutableMappingT = dict, # type: ignore[assignment] + ) -> MutableMappingT: """ Convert Series to {label -> value} dict or dict-like object. Parameters ---------- into : class, default dict - The collections.abc.Mapping subclass to use as the return - object. 
Can be the actual class or an empty - instance of the mapping type you want. If you want a - collections.defaultdict, you must pass it initialized. + The collections.abc.MutableMapping subclass to use as the return + object. Can be the actual class or an empty instance of the mapping + type you want. If you want a collections.defaultdict, you must + pass it initialized. Returns ------- - collections.abc.Mapping + collections.abc.MutableMapping Key-value representation of Series. Examples @@ -1948,10 +2064,10 @@ >>> s.to_dict() {0: 1, 1: 2, 2: 3, 3: 4} >>> from collections import OrderedDict, defaultdict - >>> s.to_dict(OrderedDict) + >>> s.to_dict(into=OrderedDict) OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)]) >>> dd = defaultdict(list) - >>> s.to_dict(dd) + >>> s.to_dict(into=dd) defaultdict(, {0: 1, 1: 2, 2: 3, 3: 4}) """ # GH16122 @@ -2144,13 +2260,13 @@ # Statistics, overridden ndarray methods # TODO: integrate bottleneck - def count(self): + def count(self) -> int: """ Return number of non-NA/null observations in the Series. Returns ------- - int or Series (if level specified) + int Number of non-null values in the Series. See Also @@ -2689,13 +2805,11 @@ dtype: float64 """ nv.validate_round(args, kwargs) - result = self._values.round(decimals) - result = self._constructor(result, index=self.index, copy=False).__finalize__( + new_mgr = self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()) + return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( self, method="round" ) - return result - @overload def quantile( self, q: float = ..., interpolation: QuantileInterpolation = ... @@ -2734,8 +2848,8 @@ This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points `i` and `j`: - * linear: `i + (j - i) * fraction`, where `fraction` is the - fractional part of the index surrounded by `i` and `j`. + * linear: `i + (j - i) * (x-i)/(j-i)`, where `(x-i)/(j-i)` is + the fractional part of the index surrounded by `i > j`. * lower: `i`. * higher: `j`. * nearest: `i` or `j` whichever is nearest. @@ -3352,7 +3466,12 @@ # try_float=False is to match agg_series npvalues = lib.maybe_convert_objects(new_values, try_float=False) - res_values = maybe_cast_pointwise_result(npvalues, self.dtype, same_dtype=False) + # same_dtype here is a kludge to avoid casting e.g. 
[True, False] to + # ["True", "False"] + same_dtype = isinstance(self.dtype, (StringDtype, CategoricalDtype)) + res_values = maybe_cast_pointwise_result( + npvalues, self.dtype, same_dtype=same_dtype + ) return self._constructor(res_values, index=new_index, name=new_name, copy=False) def combine_first(self, other) -> Series: @@ -3401,6 +3520,13 @@ """ from pandas.core.reshape.concat import concat + if self.dtype == other.dtype: + if self.index.equals(other.index): + return self.mask(self.isna(), other) + elif self._can_hold_na and not isinstance(self.dtype, SparseDtype): + this, other = self.align(other, join="outer") + return this.mask(this.isna(), other) + new_index = self.index.union(other.index) this = self @@ -3491,6 +3617,18 @@ ChainedAssignmentError, stacklevel=2, ) + elif not PYPY and not using_copy_on_write() and self._is_view_after_cow_rules(): + ctr = sys.getrefcount(self) + ref_count = REF_COUNT + if _check_cacher(self): + # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221 + ref_count += 1 + if ctr <= ref_count: + warnings.warn( + _chained_assignment_warning_method_msg, + FutureWarning, + stacklevel=2, + ) if not isinstance(other, Series): other = Series(other) @@ -3945,6 +4083,7 @@ axis: Axis = 0, kind: SortKind = "quicksort", order: None = None, + stable: None = None, ) -> Series: """ Return the integer indices that would sort the Series values. @@ -3961,6 +4100,8 @@ information. 'mergesort' and 'stable' are the only stable algorithms. order : None Has no effect but is accepted for compatibility with numpy. + stable : None + Has no effect but is accepted for compatibility with numpy. Returns ------- @@ -3989,6 +4130,9 @@ mask = isna(values) if mask.any(): + # TODO(3.0): once this deprecation is enforced we can call + # self.array.argsort directly, which will close GH#43840 and + # GH#12694 warnings.warn( "The behavior of Series.argsort in the presence of NA values is " "deprecated. In a future version, NA values will be ordered " @@ -4210,7 +4354,19 @@ klass=_shared_doc_kwargs["klass"], extra_params=dedent( """copy : bool, default True - Whether to copy underlying data.""" + Whether to copy underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. 
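A usage note for the combine_first fast path added above: when both Series share a dtype the call now short-circuits to mask/align, but the visible result is unchanged, e.g.:

import pandas as pd

s1 = pd.Series([1.0, None, 3.0], index=["a", "b", "c"])
s2 = pd.Series([10.0, 20.0, 30.0, 40.0], index=["a", "b", "c", "d"])

s1.combine_first(s2)
# a     1.0
# b    20.0
# c     3.0
# d    40.0
# dtype: float64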
+ + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True``""" ), examples=dedent( """\ @@ -4390,7 +4546,7 @@ 3 4 dtype: object """ - if isinstance(self.dtype, ArrowDtype) and self.dtype.type == list: + if isinstance(self.dtype, ExtensionDtype): values, counts = self._values._explode() elif len(self) and is_object_dtype(self.dtype): values, counts = reshape.explode(np.asarray(self._values)) @@ -4399,7 +4555,7 @@ return result.reset_index(drop=True) if ignore_index else result if ignore_index: - index = default_index(len(values)) + index: Index = default_index(len(values)) else: index = self.index.repeat(counts) @@ -4622,7 +4778,11 @@ ) -> DataFrame | Series: # Validate axis argument self._get_axis_number(axis) - ser = self.copy(deep=False) if using_copy_on_write() else self + ser = ( + self.copy(deep=False) + if using_copy_on_write() or warn_copy_on_write() + else self + ) result = SeriesApply(ser, func=func, args=args, kwargs=kwargs).transform() return result @@ -4863,6 +5023,18 @@ Unused. Parameter needed for compatibility with DataFrame. copy : bool, default True Also copy underlying data. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` inplace : bool, default False Whether to return a new Series. If True the value of copy is ignored. level : int or level name, default None @@ -4988,8 +5160,44 @@ tolerance=tolerance, ) + @overload # type: ignore[override] + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + axis: Axis = ..., + copy: bool = ..., + inplace: Literal[True], + ) -> None: + ... + + @overload + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + axis: Axis = ..., + copy: bool = ..., + inplace: Literal[False] = ..., + ) -> Self: + ... + + @overload + def rename_axis( + self, + mapper: IndexLabel | lib.NoDefault = ..., + *, + index=..., + axis: Axis = ..., + copy: bool = ..., + inplace: bool = ..., + ) -> Self | None: + ... 
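A small, hedged example of the extension-array path Series.explode now takes (shown earlier in this hunk); assumes pyarrow is available:

import pandas as pd
import pyarrow as pa

s = pd.Series([[1, 2], None, [3]], dtype=pd.ArrowDtype(pa.list_(pa.int64())))
s.explode()
# 0       1
# 0       2
# 1    <NA>
# 2       3
# dtype: int64[pyarrow]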
+ @doc(NDFrame.rename_axis) - def rename_axis( # type: ignore[override] + def rename_axis( self, mapper: IndexLabel | lib.NoDefault = lib.no_default, *, @@ -5170,7 +5378,7 @@ Examples -------- - >>> ser = pd.Series([1,2,3]) + >>> ser = pd.Series([1, 2, 3]) >>> ser.pop(0) 1 @@ -5198,6 +5406,7 @@ show_counts=show_counts, ) + # TODO(3.0): this can be removed once GH#33302 deprecation is enforced def _replace_single(self, to_replace, method: str, inplace: bool, limit): """ Replaces values in a Series using the fill method specified when no @@ -5445,38 +5654,120 @@ return lmask & rmask - # ---------------------------------------------------------------------- - # Convert to types that support pd.NA - - def _convert_dtypes( + def case_when( self, - infer_objects: bool = True, - convert_string: bool = True, - convert_integer: bool = True, - convert_boolean: bool = True, - convert_floating: bool = True, - dtype_backend: DtypeBackend = "numpy_nullable", + caselist: list[ + tuple[ + ArrayLike | Callable[[Series], Series | np.ndarray | Sequence[bool]], + ArrayLike | Scalar | Callable[[Series], Series | np.ndarray], + ], + ], ) -> Series: - input_series = self - if infer_objects: - input_series = input_series.infer_objects() - if is_object_dtype(input_series.dtype): - input_series = input_series.copy(deep=None) - - if convert_string or convert_integer or convert_boolean or convert_floating: - inferred_dtype = convert_dtypes( - input_series._values, - convert_string, - convert_integer, - convert_boolean, - convert_floating, - infer_objects, - dtype_backend, + """ + Replace values where the conditions are True. + + Parameters + ---------- + caselist : A list of tuples of conditions and expected replacements + Takes the form: ``(condition0, replacement0)``, + ``(condition1, replacement1)``, ... . + ``condition`` should be a 1-D boolean array-like object + or a callable. If ``condition`` is a callable, + it is computed on the Series + and should return a boolean Series or array. + The callable must not change the input Series + (though pandas doesn`t check it). ``replacement`` should be a + 1-D array-like object, a scalar or a callable. + If ``replacement`` is a callable, it is computed on the Series + and should return a scalar or Series. The callable + must not change the input Series + (though pandas doesn`t check it). + + .. versionadded:: 2.2.0 + + Returns + ------- + Series + + See Also + -------- + Series.mask : Replace values where the condition is True. + + Examples + -------- + >>> c = pd.Series([6, 7, 8, 9], name='c') + >>> a = pd.Series([0, 0, 1, 2]) + >>> b = pd.Series([0, 3, 4, 5]) + + >>> c.case_when(caselist=[(a.gt(0), a), # condition, replacement + ... (b.gt(0), b)]) + 0 6 + 1 3 + 2 1 + 3 2 + Name: c, dtype: int64 + """ + if not isinstance(caselist, list): + raise TypeError( + f"The caselist argument should be a list; instead got {type(caselist)}" ) - result = input_series.astype(inferred_dtype) - else: - result = input_series.copy(deep=None) - return result + + if not caselist: + raise ValueError( + "provide at least one boolean condition, " + "with a corresponding replacement." + ) + + for num, entry in enumerate(caselist): + if not isinstance(entry, tuple): + raise TypeError( + f"Argument {num} must be a tuple; instead got {type(entry)}." + ) + if len(entry) != 2: + raise ValueError( + f"Argument {num} must have length 2; " + "a condition and replacement; " + f"instead got length {len(entry)}." 
+ ) + caselist = [ + ( + com.apply_if_callable(condition, self), + com.apply_if_callable(replacement, self), + ) + for condition, replacement in caselist + ] + default = self.copy() + conditions, replacements = zip(*caselist) + common_dtypes = [infer_dtype_from(arg)[0] for arg in [*replacements, default]] + if len(set(common_dtypes)) > 1: + common_dtype = find_common_type(common_dtypes) + updated_replacements = [] + for condition, replacement in zip(conditions, replacements): + if is_scalar(replacement): + replacement = construct_1d_arraylike_from_scalar( + value=replacement, length=len(condition), dtype=common_dtype + ) + elif isinstance(replacement, ABCSeries): + replacement = replacement.astype(common_dtype) + else: + replacement = pd_array(replacement, dtype=common_dtype) + updated_replacements.append(replacement) + replacements = updated_replacements + default = default.astype(common_dtype) + + counter = reversed(range(len(conditions))) + for position, condition, replacement in zip( + counter, conditions[::-1], replacements[::-1] + ): + try: + default = default.mask( + condition, other=replacement, axis=0, inplace=False, level=None + ) + except Exception as error: + raise ValueError( + f"Failed to apply condition{position} and replacement{position}." + ) from error + return default # error: Cannot determine type of 'isna' @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type] @@ -5626,7 +5917,7 @@ def to_timestamp( self, - freq=None, + freq: Frequency | None = None, how: Literal["s", "e", "start", "end"] = "start", copy: bool | None = None, ) -> Series: @@ -5643,6 +5934,18 @@ copy : bool, default True Whether or not to return a copy. + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + Returns ------- Series with DatetimeIndex @@ -5655,7 +5958,7 @@ 2023 1 2024 2 2025 3 - Freq: A-DEC, dtype: int64 + Freq: Y-DEC, dtype: int64 The resulting frequency of the Timestamps is `YearBegin` @@ -5664,7 +5967,7 @@ 2023-01-01 1 2024-01-01 2 2025-01-01 3 - Freq: AS-JAN, dtype: int64 + Freq: YS-JAN, dtype: int64 Using `freq` which is the offset that the Timestamps will have @@ -5674,7 +5977,7 @@ 2023-01-31 1 2024-01-31 2 2025-01-31 3 - Freq: A-JAN, dtype: int64 + Freq: YE-JAN, dtype: int64 """ if not isinstance(self.index, PeriodIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") @@ -5695,6 +5998,18 @@ copy : bool, default True Whether or not to return a copy. + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. 
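To make the case_when dtype handling above concrete, a minimal sketch: the replacements and the original Series are upcast to a common dtype before masking:

import pandas as pd

s = pd.Series([1, 2, 3], name="s")

s.case_when([(s > 2, 9.5)])  # int64 data, float replacement
# 0    1.0
# 1    2.0
# 2    9.5
# Name: s, dtype: float64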
+ + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + Returns ------- Series @@ -5709,12 +6024,12 @@ 2023 1 2024 2 2025 3 - Freq: A-DEC, dtype: int64 + Freq: Y-DEC, dtype: int64 Viewing the index >>> s.index - PeriodIndex(['2023', '2024', '2025'], dtype='period[A-DEC]') + PeriodIndex(['2023', '2024', '2025'], dtype='period[Y-DEC]') """ if not isinstance(self.index, DatetimeIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") @@ -5749,7 +6064,6 @@ See Also -------- Series.reindex : Conform Series to new index. - Series.set_index : Set Series as DataFrame index. Index : The base pandas index type. Notes @@ -5783,6 +6097,8 @@ cat = CachedAccessor("cat", CategoricalAccessor) plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) sparse = CachedAccessor("sparse", SparseAccessor) + struct = CachedAccessor("struct", StructAccessor) + list = CachedAccessor("list", ListAccessor) # ---------------------------------------------------------------------- # Add plotting methods to Series @@ -5937,60 +6253,68 @@ return result else: if fill_value is not None: + if isna(other): + return op(self, fill_value) self = self.fillna(fill_value) return op(self, other) @Appender(ops.make_flex_doc("eq", "series")) - def eq(self, other, level=None, fill_value=None, axis: Axis = 0): + def eq( + self, + other, + level: Level | None = None, + fill_value: float | None = None, + axis: Axis = 0, + ) -> Series: return self._flex_method( other, operator.eq, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("ne", "series")) - def ne(self, other, level=None, fill_value=None, axis: Axis = 0): + def ne(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.ne, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("le", "series")) - def le(self, other, level=None, fill_value=None, axis: Axis = 0): + def le(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.le, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("lt", "series")) - def lt(self, other, level=None, fill_value=None, axis: Axis = 0): + def lt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.lt, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("ge", "series")) - def ge(self, other, level=None, fill_value=None, axis: Axis = 0): + def ge(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.ge, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("gt", "series")) - def gt(self, other, level=None, fill_value=None, axis: Axis = 0): + def gt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.gt, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("add", "series")) - def add(self, other, level=None, fill_value=None, axis: Axis = 0): + def add(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.add, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("radd", "series")) - def radd(self, other, level=None, fill_value=None, axis: Axis = 0): + def radd(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, 
roperator.radd, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("sub", "series")) - def sub(self, other, level=None, fill_value=None, axis: Axis = 0): + def sub(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.sub, level=level, fill_value=fill_value, axis=axis ) @@ -5998,7 +6322,7 @@ subtract = sub @Appender(ops.make_flex_doc("rsub", "series")) - def rsub(self, other, level=None, fill_value=None, axis: Axis = 0): + def rsub(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rsub, level=level, fill_value=fill_value, axis=axis ) @@ -6010,7 +6334,7 @@ level: Level | None = None, fill_value: float | None = None, axis: Axis = 0, - ): + ) -> Series: return self._flex_method( other, operator.mul, level=level, fill_value=fill_value, axis=axis ) @@ -6018,13 +6342,13 @@ multiply = mul @Appender(ops.make_flex_doc("rmul", "series")) - def rmul(self, other, level=None, fill_value=None, axis: Axis = 0): + def rmul(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rmul, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("truediv", "series")) - def truediv(self, other, level=None, fill_value=None, axis: Axis = 0): + def truediv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.truediv, level=level, fill_value=fill_value, axis=axis ) @@ -6033,7 +6357,7 @@ divide = truediv @Appender(ops.make_flex_doc("rtruediv", "series")) - def rtruediv(self, other, level=None, fill_value=None, axis: Axis = 0): + def rtruediv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis ) @@ -6041,49 +6365,49 @@ rdiv = rtruediv @Appender(ops.make_flex_doc("floordiv", "series")) - def floordiv(self, other, level=None, fill_value=None, axis: Axis = 0): + def floordiv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.floordiv, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rfloordiv", "series")) - def rfloordiv(self, other, level=None, fill_value=None, axis: Axis = 0): + def rfloordiv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("mod", "series")) - def mod(self, other, level=None, fill_value=None, axis: Axis = 0): + def mod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.mod, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rmod", "series")) - def rmod(self, other, level=None, fill_value=None, axis: Axis = 0): + def rmod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rmod, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("pow", "series")) - def pow(self, other, level=None, fill_value=None, axis: Axis = 0): + def pow(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, operator.pow, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rpow", "series")) - def rpow(self, other, level=None, fill_value=None, axis: Axis = 0): + def rpow(self, 
other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rpow, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("divmod", "series")) - def divmod(self, other, level=None, fill_value=None, axis: Axis = 0): + def divmod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, divmod, level=level, fill_value=fill_value, axis=axis ) @Appender(ops.make_flex_doc("rdivmod", "series")) - def rdivmod(self, other, level=None, fill_value=None, axis: Axis = 0): + def rdivmod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series: return self._flex_method( other, roperator.rdivmod, level=level, fill_value=fill_value, axis=axis ) diff -Nru pandas-2.1.4+dfsg/pandas/core/shared_docs.py pandas-2.2.2+dfsg/pandas/core/shared_docs.py --- pandas-2.1.4+dfsg/pandas/core/shared_docs.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/shared_docs.py 2024-04-10 17:42:52.000000000 +0000 @@ -34,8 +34,6 @@ * scalar : when Series.agg is called with single function * Series : when DataFrame.agg is called with a single function * DataFrame : when DataFrame.agg is called with several functions - - Return scalar, Series or DataFrame. {see_also} Notes ----- @@ -210,17 +208,17 @@ Parameters ---------- -id_vars : tuple, list, or ndarray, optional +id_vars : scalar, tuple, list, or ndarray, optional Column(s) to use as identifier variables. -value_vars : tuple, list, or ndarray, optional +value_vars : scalar, tuple, list, or ndarray, optional Column(s) to unpivot. If not specified, uses all columns that are not set as `id_vars`. -var_name : scalar +var_name : scalar, default None Name to use for the 'variable' column. If None it uses ``frame.columns.name`` or 'variable'. value_name : scalar, default 'value' - Name to use for the 'value' column. -col_level : int or str, optional + Name to use for the 'value' column, can't be an existing column label. +col_level : scalar, optional If columns are a MultiIndex then use this level to melt. ignore_index : bool, default True If True, original index is ignored. If False, the original index is retained. @@ -572,8 +570,7 @@ .. deprecated:: 2.1.0 regex : bool or same types as `to_replace`, default False Whether to interpret `to_replace` and/or `value` as regular - expressions. If this is ``True`` then `to_replace` *must* be a - string. Alternatively, this could be a regular expression or a + expressions. Alternatively, this could be a regular expression or a list, dict, or array of regular expressions in which case `to_replace` must be ``None``. method : {{'pad', 'ffill', 'bfill'}} @@ -792,6 +789,32 @@ .. versionchanged:: 1.4.0 Previously the explicit ``None`` was silently ignored. + + When ``regex=True``, ``value`` is not ``None`` and `to_replace` is a string, + the replacement will be applied in all columns of the DataFrame. + + >>> df = pd.DataFrame({{'A': [0, 1, 2, 3, 4], + ... 'B': ['a', 'b', 'c', 'd', 'e'], + ... 'C': ['f', 'g', 'h', 'i', 'j']}}) + + >>> df.replace(to_replace='^[a-g]', value='e', regex=True) + A B C + 0 0 e e + 1 1 e e + 2 2 e h + 3 3 e i + 4 4 e j + + If ``value`` is not ``None`` and `to_replace` is a dictionary, the dictionary + keys will be the DataFrame columns that the replacement will be applied. 
+ + >>> df.replace(to_replace={{'B': '^[a-c]', 'C': '^[h-j]'}}, value='e', regex=True) + A B C + 0 0 e f + 1 1 e g + 2 2 e e + 3 3 d e + 4 4 e e """ _shared_docs[ diff -Nru pandas-2.1.4+dfsg/pandas/core/sorting.py pandas-2.2.2+dfsg/pandas/core/sorting.py --- pandas-2.1.4+dfsg/pandas/core/sorting.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/sorting.py 2024-04-10 17:42:52.000000000 +0000 @@ -88,7 +88,7 @@ # error: Incompatible types in assignment (expression has type # "Union[ExtensionArray, ndarray[Any, Any], Index, Series]", variable has # type "Index") - target = ensure_key_mapped(target, key, levels=level) # type:ignore[assignment] + target = ensure_key_mapped(target, key, levels=level) # type: ignore[assignment] target = target._sort_levels_monotonic() if level is not None: @@ -98,17 +98,17 @@ sort_remaining=sort_remaining, na_position=na_position, ) + elif (np.all(ascending) and target.is_monotonic_increasing) or ( + not np.any(ascending) and target.is_monotonic_decreasing + ): + # Check monotonic-ness before sort an index (GH 11080) + return None elif isinstance(target, ABCMultiIndex): + codes = [lev.codes for lev in target._get_codes_for_sorting()] indexer = lexsort_indexer( - target.codes, orders=ascending, na_position=na_position, codes_given=True + codes, orders=ascending, na_position=na_position, codes_given=True ) else: - # Check monotonic-ness before sort an index (GH 11080) - if (ascending and target.is_monotonic_increasing) or ( - not ascending and target.is_monotonic_decreasing - ): - return None - # ascending can only be a Sequence for MultiIndex indexer = nargsort( target, @@ -298,22 +298,8 @@ return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels] -def indexer_from_factorized( - labels, shape: Shape, compress: bool = True -) -> npt.NDArray[np.intp]: - ids = get_group_index(labels, shape, sort=True, xnull=False) - - if not compress: - ngroups = (ids.size and ids.max()) + 1 - else: - ids, obs = compress_group_index(ids, sort=True) - ngroups = len(obs) - - return get_group_index_sorter(ids, ngroups) - - def lexsort_indexer( - keys: list[ArrayLike] | list[Series], + keys: Sequence[ArrayLike | Index | Series], orders=None, na_position: str = "last", key: Callable | None = None, @@ -324,9 +310,9 @@ Parameters ---------- - keys : list[ArrayLike] | list[Series] - Sequence of ndarrays to be sorted by the indexer - list[Series] is only if key is not None. + keys : Sequence[ArrayLike | Index | Series] + Sequence of arrays to be sorted by the indexer + Sequence[Series] is only if key is not None. orders : bool or list of booleans, optional Determines the sorting order for each element in keys. If a list, it must be the same length as keys. 
This determines whether the @@ -346,68 +332,38 @@ """ from pandas.core.arrays import Categorical - labels = [] - shape = [] + if na_position not in ["last", "first"]: + raise ValueError(f"invalid na_position: {na_position}") + if isinstance(orders, bool): orders = [orders] * len(keys) elif orders is None: orders = [True] * len(keys) - # error: Incompatible types in assignment (expression has type - # "List[Union[ExtensionArray, ndarray[Any, Any], Index, Series]]", variable - # has type "Union[List[Union[ExtensionArray, ndarray[Any, Any]]], List[Series]]") - keys = [ensure_key_mapped(k, key) for k in keys] # type: ignore[assignment] + labels = [] for k, order in zip(keys, orders): - if na_position not in ["last", "first"]: - raise ValueError(f"invalid na_position: {na_position}") - + k = ensure_key_mapped(k, key) if codes_given: - mask = k == -1 - codes = k.copy() - # error: Item "ExtensionArray" of "Series | ExtensionArray | - # ndarray[Any, Any]" has no attribute "max" - n = codes.max() + 1 if len(codes) else 0 # type: ignore[union-attr] - + codes = cast(np.ndarray, k) + n = codes.max() + 1 if len(codes) else 0 else: cat = Categorical(k, ordered=True) + codes = cat.codes n = len(cat.categories) - codes = cat.codes.copy() - mask = cat.codes == -1 - if order: # ascending - if na_position == "last": - # error: Argument 1 to "where" has incompatible type "Union[Any, - # ExtensionArray, ndarray[Any, Any]]"; expected - # "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, - # complex, str, bytes, _NestedSequence[Union[bool, int, float, - # complex, str, bytes]]]" - codes = np.where(mask, n, codes) # type: ignore[arg-type] - else: # not order means descending - if na_position == "last": - # error: Unsupported operand types for - ("int" and "ExtensionArray") - # error: Argument 1 to "where" has incompatible type "Union[Any, - # ExtensionArray, ndarray[Any, Any]]"; expected - # "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, - # complex, str, bytes, _NestedSequence[Union[bool, int, float, - # complex, str, bytes]]]" - codes = np.where(mask, n, n - codes - 1) # type: ignore[arg-type] - elif na_position == "first": - # error: Unsupported operand types for - ("int" and "ExtensionArray") - # error: Argument 1 to "where" has incompatible type "Union[Any, - # ExtensionArray, ndarray[Any, Any]]"; expected - # "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, float, - # complex, str, bytes, _NestedSequence[Union[bool, int, float, - # complex, str, bytes]]]" - codes = np.where(mask, -1, n - codes) # type: ignore[arg-type] + mask = codes == -1 + + if na_position == "last" and mask.any(): + codes = np.where(mask, n, codes) + + # not order means descending + if not order: + codes = np.where(mask, codes, n - codes - 1) - shape.append(n + 1) labels.append(codes) - return indexer_from_factorized(labels, tuple(shape)) + return np.lexsort(labels[::-1]) def nargsort( diff -Nru pandas-2.1.4+dfsg/pandas/core/strings/accessor.py pandas-2.2.2+dfsg/pandas/core/strings/accessor.py --- pandas-2.1.4+dfsg/pandas/core/strings/accessor.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/strings/accessor.py 2024-04-10 17:42:52.000000000 +0000 @@ -44,6 +44,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays import ExtensionArray from pandas.core.base import NoNewAttributesMixin from pandas.core.construction import extract_array @@ -373,7 
+374,7 @@ if expand: result = list(result) - out = MultiIndex.from_tuples(result, names=name) + out: Index = MultiIndex.from_tuples(result, names=name) if out.nlevels == 1: # We had all tuples of length-one, which are # better represented as a regular Index. @@ -456,7 +457,7 @@ # in case of list-like `others`, all elements must be # either Series/Index/np.ndarray (1-dim)... if all( - isinstance(x, (ABCSeries, ABCIndex)) + isinstance(x, (ABCSeries, ABCIndex, ExtensionArray)) or (isinstance(x, np.ndarray) and x.ndim == 1) for x in others ): @@ -688,16 +689,18 @@ result = cat_safe(all_cols, sep) out: Index | Series + if isinstance(self._orig.dtype, CategoricalDtype): + # We need to infer the new categories. + dtype = self._orig.dtype.categories.dtype + else: + dtype = self._orig.dtype if isinstance(self._orig, ABCIndex): # add dtype for case that result is all-NA + if isna(result).all(): + dtype = object # type: ignore[assignment] - out = Index(result, dtype=object, name=self._orig.name) + out = Index(result, dtype=dtype, name=self._orig.name) else: # Series - if isinstance(self._orig.dtype, CategoricalDtype): - # We need to infer the new categories. - dtype = None - else: - dtype = self._orig.dtype res_ser = Series( result, dtype=dtype, index=data.index, name=self._orig.name, copy=False ) @@ -914,7 +917,13 @@ if is_re(pat): regex = True result = self._data.array._str_split(pat, n, expand, regex) - return self._wrap_result(result, returns_string=expand, expand=expand) + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) @Appender( _shared_docs["str_split"] @@ -932,7 +941,10 @@ @forbid_nonstring_types(["bytes"]) def rsplit(self, pat=None, *, n=-1, expand: bool = False): result = self._data.array._str_rsplit(pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) _shared_docs[ "str_partition" @@ -1028,7 +1040,13 @@ @forbid_nonstring_types(["bytes"]) def partition(self, sep: str = " ", expand: bool = True): result = self._data.array._str_partition(sep, expand) - return self._wrap_result(result, expand=expand, returns_string=expand) + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) @Appender( _shared_docs["str_partition"] @@ -1042,7 +1060,13 @@ @forbid_nonstring_types(["bytes"]) def rpartition(self, sep: str = " ", expand: bool = True): result = self._data.array._str_rpartition(sep, expand) - return self._wrap_result(result, expand=expand, returns_string=expand) + if self._data.dtype == "category": + dtype = self._data.dtype.categories.dtype + else: + dtype = object if self._data.dtype == object else None + return self._wrap_result( + result, expand=expand, returns_string=expand, dtype=dtype + ) def get(self, i): """ @@ -1312,14 +1336,14 @@ return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def match(self, pat, case: bool = True, flags: int = 0, na=None): + def match(self, pat: str, case: bool = True, flags: int = 0, na=None): """ Determine if each string starts with a match of a regular expression. 
Parameters ---------- pat : str - Character sequence or regular expression. + Character sequence. case : bool, default True If True, case sensitive. flags : int, default 0 (no flags) @@ -2748,7 +2772,7 @@ else: name = _get_single_group_name(regex) result = self._data.array._str_extract(pat, flags=flags, expand=returns_df) - return self._wrap_result(result, name=name) + return self._wrap_result(result, name=name, dtype=result_dtype) @forbid_nonstring_types(["bytes"]) def extractall(self, pat, flags: int = 0) -> DataFrame: @@ -3488,7 +3512,7 @@ raise ValueError("pattern contains no capture groups") if isinstance(arr, ABCIndex): - arr = arr.to_series().reset_index(drop=True) + arr = arr.to_series().reset_index(drop=True).astype(arr.dtype) columns = _get_group_names(regex) match_list = [] diff -Nru pandas-2.1.4+dfsg/pandas/core/strings/object_array.py pandas-2.2.2+dfsg/pandas/core/strings/object_array.py --- pandas-2.1.4+dfsg/pandas/core/strings/object_array.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/strings/object_array.py 2024-04-10 17:42:52.000000000 +0000 @@ -207,7 +207,7 @@ ) if isinstance(self, BaseStringArray): # Not going through map, so we have to do this here. - result = type(self)._from_sequence(result) + result = type(self)._from_sequence(result, dtype=self.dtype) return result def _str_match( diff -Nru pandas-2.1.4+dfsg/pandas/core/tools/datetimes.py pandas-2.2.2+dfsg/pandas/core/tools/datetimes.py --- pandas-2.1.4+dfsg/pandas/core/tools/datetimes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/tools/datetimes.py 2024-04-10 17:42:52.000000000 +0000 @@ -25,14 +25,10 @@ Timedelta, Timestamp, astype_overflowsafe, - get_unit_from_dtype, - iNaT, - is_supported_unit, - nat_strings, - parsing, + is_supported_dtype, timezones as libtimezones, ) -from pandas._libs.tslibs.conversion import precision_from_unit +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -42,7 +38,6 @@ AnyArrayLike, ArrayLike, DateTimeErrorChoices, - npt, ) from pandas.util._exceptions import find_stack_level @@ -62,20 +57,18 @@ ABCDataFrame, ABCSeries, ) -from pandas.core.dtypes.missing import notna from pandas.arrays import ( DatetimeArray, IntegerArray, NumpyExtensionArray, ) -from pandas.core import algorithms from pandas.core.algorithms import unique from pandas.core.arrays import ArrowExtensionArray from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.datetimes import ( maybe_convert_dtype, - objects_to_datetime64ns, + objects_to_datetime64, tz_to_dtype, ) from pandas.core.construction import extract_array @@ -133,7 +126,7 @@ def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None: # Try to guess the format based on the first non-NaN element, return None if can't if (first_non_null := tslib.first_non_null(arr)) != -1: - if type(first_non_nan_element := arr[first_non_null]) is str: + if type(first_non_nan_element := arr[first_non_null]) is str: # noqa: E721 # GH#32264 np.str_ object guessed_format = guess_datetime_format( first_non_nan_element, dayfirst=dayfirst @@ -318,54 +311,6 @@ return _box_as_indexlike(result._values, utc=False, name=name) -def _return_parsed_timezone_results( - result: np.ndarray, timezones, utc: bool, name: str -) -> Index: - """ - Return results from array_strptime if a %z or %Z directive was passed. 
- - Parameters - ---------- - result : ndarray[int64] - int64 date representations of the dates - timezones : ndarray - pytz timezone objects - utc : bool - Whether to convert/localize timestamps to UTC. - name : string, default None - Name for a DatetimeIndex - - Returns - ------- - tz_result : Index-like of parsed dates with timezone - """ - tz_results = np.empty(len(result), dtype=object) - non_na_timezones = set() - for zone in unique(timezones): - mask = timezones == zone - dta = DatetimeArray(result[mask]).tz_localize(zone) - if utc: - if dta.tzinfo is None: - dta = dta.tz_localize("utc") - else: - dta = dta.tz_convert("utc") - else: - if not dta.isna().all(): - non_na_timezones.add(zone) - tz_results[mask] = dta - if len(non_na_timezones) > 1: - warnings.warn( - "In a future version of pandas, parsing datetimes with mixed time " - "zones will raise an error unless `utc=True`. Please specify `utc=True` " - "to opt in to the new behaviour and silence this warning. " - "To create a `Series` with mixed offsets and `object` dtype, " - "please use `apply` and `datetime.datetime.strptime`", - FutureWarning, - stacklevel=find_stack_level(), - ) - return Index(tz_results, name=name) - - def _convert_listlike_datetimes( arg, format: str | None, @@ -439,7 +384,7 @@ return arg elif lib.is_np_dtype(arg_dtype, "M"): - if not is_supported_unit(get_unit_from_dtype(arg_dtype)): + if not is_supported_dtype(arg_dtype): # We go to closest supported reso, i.e. "s" arg = astype_overflowsafe( # TODO: looks like we incorrectly raise with errors=="ignore" @@ -487,7 +432,7 @@ if format is not None and format != "mixed": return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) - result, tz_parsed = objects_to_datetime64ns( + result, tz_parsed = objects_to_datetime64( arg, dayfirst=dayfirst, yearfirst=yearfirst, @@ -499,7 +444,10 @@ if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array # is in UTC - dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) + out_unit = np.datetime_data(result.dtype)[0] + dtype = cast(DatetimeTZDtype, tz_to_dtype(tz_parsed, out_unit)) + dt64_values = result.view(f"M8[{dtype.unit}]") + dta = DatetimeArray._simple_new(dt64_values, dtype=dtype) return DatetimeIndex._simple_new(dta, name=name) return _box_as_indexlike(result, utc=utc, name=name) @@ -516,11 +464,19 @@ """ Call array_strptime, with fallback behavior depending on 'errors'. 
""" - result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc) - if any(tz is not None for tz in timezones): - return _return_parsed_timezone_results(result, timezones, utc, name) - - return _box_as_indexlike(result, utc=utc, name=name) + result, tz_out = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc) + if tz_out is not None: + unit = np.datetime_data(result.dtype)[0] + dtype = DatetimeTZDtype(tz=tz_out, unit=unit) + dta = DatetimeArray._simple_new(result, dtype=dtype) + if utc: + dta = dta.tz_convert("UTC") + return Index(dta, name=name) + elif result.dtype != object and utc: + unit = np.datetime_data(result.dtype)[0] + res = Index(result, dtype=f"M8[{unit}, UTC]", name=name) + return res + return Index(result, dtype=result.dtype, name=name) def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: @@ -551,23 +507,19 @@ tz_parsed = None elif arg.dtype.kind == "f": - mult, _ = precision_from_unit(unit) - - mask = np.isnan(arg) | (arg == iNaT) - fvalues = (arg * mult).astype("f8", copy=False) - fvalues[mask] = 0 - - if (fvalues < Timestamp.min._value).any() or ( - fvalues > Timestamp.max._value - ).any(): - if errors != "raise": - arg = arg.astype(object) - return _to_datetime_with_unit(arg, unit, name, utc, errors) - raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") - - arr = fvalues.astype("M8[ns]", copy=False) - arr[mask] = np.datetime64("NaT", "ns") + with np.errstate(over="raise"): + try: + arr = cast_from_unit_vectorized(arg, unit=unit) + except OutOfBoundsDatetime: + if errors != "raise": + return _to_datetime_with_unit( + arg.astype(object), unit, name, utc, errors + ) + raise OutOfBoundsDatetime( + f"cannot convert input with unit '{unit}'" + ) + arr = arr.view("M8[ns]") tz_parsed = None else: arg = arg.astype(object, copy=False) @@ -848,8 +800,8 @@ to the day starting at noon on January 1, 4713 BC. - If Timestamp convertible (Timestamp, dt.datetime, np.datetimt64 or date string), origin is set to Timestamp identified by origin. - - If a float or integer, origin is the millisecond difference - relative to 1970-01-01. + - If a float or integer, origin is the difference + (in units determined by the ``unit`` argument) relative to 1970-01-01. cache : bool, default True If :const:`True`, use a cache of unique, converted dates to apply the datetime conversion. May produce significant speed-up when parsing @@ -980,16 +932,9 @@ **Non-convertible date/times** - If a date does not meet the `timestamp limitations - `_, passing ``errors='ignore'`` - will return the original input instead of raising any exception. - Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. - >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') - '13000101' >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT @@ -1079,6 +1024,16 @@ "You can safely remove this argument.", stacklevel=find_stack_level(), ) + if errors == "ignore": + # GH#54467 + warnings.warn( + "errors='ignore' is deprecated and will raise in a future version. 
" + "Use to_datetime without passing `errors` and catch exceptions " + "explicitly instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + if arg is None: return None @@ -1246,7 +1201,7 @@ values = to_numeric(values, errors=errors) # prevent overflow in case of int8 or int16 - if is_integer_dtype(values): + if is_integer_dtype(values.dtype): values = values.astype("int64", copy=False) return values @@ -1273,58 +1228,6 @@ return values -def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None: - """ - try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like, - arg is a passed in as an object dtype, but could really be ints/strings - with nan-like/or floats (e.g. with nan) - - Parameters - ---------- - arg : np.ndarray[object] - errors : {'raise','ignore','coerce'} - """ - - def calc(carg): - # calculate the actual result - carg = carg.astype(object, copy=False) - parsed = parsing.try_parse_year_month_day( - carg / 10000, carg / 100 % 100, carg % 100 - ) - return tslib.array_to_datetime(parsed, errors=errors)[0] - - def calc_with_mask(carg, mask): - result = np.empty(carg.shape, dtype="M8[ns]") - iresult = result.view("i8") - iresult[~mask] = iNaT - - masked_result = calc(carg[mask].astype(np.float64).astype(np.int64)) - result[mask] = masked_result.astype("M8[ns]") - return result - - # try intlike / strings that are ints - try: - return calc(arg.astype(np.int64)) - except (ValueError, OverflowError, TypeError): - pass - - # a float with actual np.nan - try: - carg = arg.astype(np.float64) - return calc_with_mask(carg, notna(carg)) - except (ValueError, OverflowError, TypeError): - pass - - # string with NaN-like - try: - mask = ~algorithms.isin(arg, list(nat_strings)) - return calc_with_mask(arg, mask) - except (ValueError, OverflowError, TypeError): - pass - - return None - - __all__ = [ "DateParseError", "should_cache", diff -Nru pandas-2.1.4+dfsg/pandas/core/tools/numeric.py pandas-2.2.2+dfsg/pandas/core/tools/numeric.py --- pandas-2.1.4+dfsg/pandas/core/tools/numeric.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/tools/numeric.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,10 +4,12 @@ TYPE_CHECKING, Literal, ) +import warnings import numpy as np from pandas._libs import lib +from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.cast import maybe_downcast_numeric @@ -68,6 +70,11 @@ - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. - If 'ignore', then invalid parsing will return the input. + + .. versionchanged:: 2.2 + + "ignore" is deprecated. Catch exceptions explicitly instead. + downcast : str, default None Can be 'integer', 'signed', 'unsigned', or 'float'. If not None, and if the data has been successfully cast to a @@ -134,12 +141,6 @@ 2 -3 dtype: int8 >>> s = pd.Series(['apple', '1.0', '2', -3]) - >>> pd.to_numeric(s, errors='ignore') - 0 apple - 1 1.0 - 2 2 - 3 -3 - dtype: object >>> pd.to_numeric(s, errors='coerce') 0 NaN 1 1.0 @@ -167,6 +168,15 @@ if errors not in ("ignore", "raise", "coerce"): raise ValueError("invalid error value specified") + if errors == "ignore": + # GH#54467 + warnings.warn( + "errors='ignore' is deprecated and will raise in a future version. 
" + "Use to_numeric without passing `errors` and catch exceptions " + "explicitly instead", + FutureWarning, + stacklevel=find_stack_level(), + ) check_dtype_backend(dtype_backend) @@ -219,7 +229,7 @@ values = ensure_object(values) coerce_numeric = errors not in ("ignore", "raise") try: - values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] # noqa: E501 + values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] values, set(), coerce_numeric=coerce_numeric, @@ -293,7 +303,7 @@ IntegerArray, ) - klass: type[IntegerArray] | type[BooleanArray] | type[FloatingArray] + klass: type[IntegerArray | BooleanArray | FloatingArray] if is_integer_dtype(data.dtype): klass = IntegerArray elif is_bool_dtype(data.dtype): diff -Nru pandas-2.1.4+dfsg/pandas/core/tools/timedeltas.py pandas-2.2.2+dfsg/pandas/core/tools/timedeltas.py --- pandas-2.1.4+dfsg/pandas/core/tools/timedeltas.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/tools/timedeltas.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,6 +18,7 @@ ) from pandas._libs.tslibs.timedeltas import ( Timedelta, + disallow_ambiguous_unit, parse_timedelta_unit, ) from pandas.util._exceptions import find_stack_level @@ -113,17 +114,19 @@ * 'W' * 'D' / 'days' / 'day' - * 'hours' / 'hour' / 'hr' / 'h' + * 'hours' / 'hour' / 'hr' / 'h' / 'H' * 'm' / 'minute' / 'min' / 'minutes' / 'T' - * 'S' / 'seconds' / 'sec' / 'second' + * 's' / 'seconds' / 'sec' / 'second' / 'S' * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L' * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U' * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N' - Must not be specified when `arg` context strings and ``errors="raise"``. + Must not be specified when `arg` contains strings and ``errors="raise"``. - .. deprecated:: 2.1.0 - Units 'T' and 'L' are deprecated and will be removed in a future version. + .. deprecated:: 2.2.0 + Units 'H', 'T', 'S', 'L', 'U' and 'N' are deprecated and will be removed + in a future version. Please use 'h', 'min', 's', 'ms', 'us', and 'ns' + instead of 'H', 'T', 'S', 'L', 'U' and 'N'. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. @@ -176,23 +179,20 @@ TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) """ - if unit in {"T", "t", "L", "l"}: - warnings.warn( - f"Unit '{unit}' is deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if unit is not None: unit = parse_timedelta_unit(unit) + disallow_ambiguous_unit(unit) if errors not in ("ignore", "raise", "coerce"): raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'.") - - if unit in {"Y", "y", "M"}: - raise ValueError( - "Units 'M', 'Y', and 'y' are no longer supported, as they do not " - "represent unambiguous timedelta values durations." + if errors == "ignore": + # GH#54467 + warnings.warn( + "errors='ignore' is deprecated and will raise in a future version. 
" + "Use to_timedelta without passing `errors` and catch exceptions " + "explicitly instead", + FutureWarning, + stacklevel=find_stack_level(), ) if arg is None: @@ -279,5 +279,5 @@ from pandas import TimedeltaIndex - value = TimedeltaIndex(td64arr, unit="ns", name=name) + value = TimedeltaIndex(td64arr, name=name) return value diff -Nru pandas-2.1.4+dfsg/pandas/core/tools/times.py pandas-2.2.2+dfsg/pandas/core/tools/times.py --- pandas-2.1.4+dfsg/pandas/core/tools/times.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/tools/times.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,10 +5,12 @@ time, ) from typing import TYPE_CHECKING +import warnings import numpy as np from pandas._libs.lib import is_list_like +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.generic import ( ABCIndex, @@ -52,6 +54,15 @@ ------- datetime.time """ + if errors == "ignore": + # GH#54467 + warnings.warn( + "errors='ignore' is deprecated and will raise in a future version. " + "Use to_time without passing `errors` and catch exceptions " + "explicitly instead", + FutureWarning, + stacklevel=find_stack_level(), + ) def _convert_listlike(arg, format): if isinstance(arg, (list, tuple)): diff -Nru pandas-2.1.4+dfsg/pandas/core/util/numba_.py pandas-2.2.2+dfsg/pandas/core/util/numba_.py --- pandas-2.1.4+dfsg/pandas/core/util/numba_.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/util/numba_.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,11 +1,14 @@ """Common utilities for Numba operations""" from __future__ import annotations +import types from typing import ( TYPE_CHECKING, Callable, ) +import numpy as np + from pandas.compat._optional import import_optional_dependency from pandas.errors import NumbaUtilError @@ -83,6 +86,12 @@ if numba.extending.is_jitted(func): # Don't jit a user passed jitted function numba_func = func + elif getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + # Not necessary to jit builtins or np functions + # This will mess up register_jitable + numba_func = func else: numba_func = numba.extending.register_jitable(func) diff -Nru pandas-2.1.4+dfsg/pandas/core/window/ewm.py pandas-2.2.2+dfsg/pandas/core/window/ewm.py --- pandas-2.1.4+dfsg/pandas/core/window/ewm.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/window/ewm.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,12 +12,15 @@ from pandas.util._decorators import doc from pandas.core.dtypes.common import ( - is_datetime64_ns_dtype, + is_datetime64_dtype, is_numeric_dtype, ) +from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import common +from pandas.core.arrays.datetimelike import dtype_to_unit from pandas.core.indexers.objects import ( BaseIndexer, ExponentialMovingWindowIndexer, @@ -55,6 +58,7 @@ from pandas._typing import ( Axis, TimedeltaConvertibleTypes, + npt, ) from pandas import ( @@ -100,7 +104,7 @@ def _calculate_deltas( times: np.ndarray | NDFrame, halflife: float | TimedeltaConvertibleTypes | None, -) -> np.ndarray: +) -> npt.NDArray[np.float64]: """ Return the diff of the times divided by the half-life. These values are used in the calculation of the ewm mean. 
@@ -118,9 +122,11 @@ np.ndarray Diff of the times divided by the half-life """ + unit = dtype_to_unit(times.dtype) + if isinstance(times, ABCSeries): + times = times._values _times = np.asarray(times.view(np.int64), dtype=np.float64) - # TODO: generalize to non-nano? - _halflife = float(Timedelta(halflife).as_unit("ns")._value) + _halflife = float(Timedelta(halflife).as_unit(unit)._value) return np.diff(_times) / _halflife @@ -363,8 +369,12 @@ if self.times is not None: if not self.adjust: raise NotImplementedError("times is not supported with adjust=False.") - if not is_datetime64_ns_dtype(self.times): - raise ValueError("times must be datetime64[ns] dtype.") + times_dtype = getattr(self.times, "dtype", None) + if not ( + is_datetime64_dtype(times_dtype) + or isinstance(times_dtype, DatetimeTZDtype) + ): + raise ValueError("times must be datetime64 dtype.") if len(self.times) != len(obj): raise ValueError("times must be the same length as the object.") if not isinstance(self.halflife, (str, datetime.timedelta, np.timedelta64)): @@ -1068,7 +1078,7 @@ result_kwargs["columns"] = self._selected_obj.columns else: result_kwargs["name"] = self._selected_obj.name - np_array = self._selected_obj.astype(np.float64).to_numpy() + np_array = self._selected_obj.astype(np.float64, copy=False).to_numpy() ewma_func = generate_online_numba_ewma_func( **get_jit_arguments(self.engine_kwargs) ) diff -Nru pandas-2.1.4+dfsg/pandas/core/window/online.py pandas-2.2.2+dfsg/pandas/core/window/online.py --- pandas-2.1.4+dfsg/pandas/core/window/online.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/window/online.py 2024-04-10 17:42:52.000000000 +0000 @@ -52,7 +52,7 @@ exponentially weighted mean accounting minimum periods. """ result = np.empty(values.shape) - weighted_avg = values[0] + weighted_avg = values[0].copy() nobs = (~np.isnan(weighted_avg)).astype(np.int64) result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan) diff -Nru pandas-2.1.4+dfsg/pandas/core/window/rolling.py pandas-2.2.2+dfsg/pandas/core/window/rolling.py --- pandas-2.1.4+dfsg/pandas/core/window/rolling.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/core/window/rolling.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,7 +14,6 @@ Any, Callable, Literal, - cast, ) import numpy as np @@ -39,6 +38,7 @@ is_numeric_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -104,6 +104,7 @@ NDFrameT, QuantileInterpolation, WindowingRankType, + npt, ) from pandas import ( @@ -404,11 +405,12 @@ result[name] = extra_col @property - def _index_array(self): + def _index_array(self) -> npt.NDArray[np.int64] | None: # TODO: why do we get here with e.g. MultiIndex? 
- if needs_i8_conversion(self._on.dtype): - idx = cast("PeriodIndex | DatetimeIndex | TimedeltaIndex", self._on) - return idx.asi8 + if isinstance(self._on, (PeriodIndex, DatetimeIndex, TimedeltaIndex)): + return self._on.asi8 + elif isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM": + return self._on.to_numpy(dtype=np.int64) return None def _resolve_output(self, out: DataFrame, obj: DataFrame) -> DataFrame: @@ -439,7 +441,7 @@ self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None ) -> Series: """ - Series version of _apply_blockwise + Series version of _apply_columnwise """ obj = self._create_data(self._selected_obj) @@ -455,7 +457,7 @@ index = self._slice_axis_for_step(obj.index, result) return obj._constructor(result, index=index, name=obj.name) - def _apply_blockwise( + def _apply_columnwise( self, homogeneous_func: Callable[..., ArrayLike], name: str, @@ -614,7 +616,7 @@ return result if self.method == "single": - return self._apply_blockwise(homogeneous_func, name, numeric_only) + return self._apply_columnwise(homogeneous_func, name, numeric_only) else: return self._apply_tablewise(homogeneous_func, name, numeric_only) @@ -940,6 +942,11 @@ For `Series` this parameter is unused and defaults to 0. + .. deprecated:: 2.1.0 + + The axis keyword is deprecated. For ``axis=1``, + transpose the DataFrame first instead. + closed : str, default None If ``'right'``, the first point in the window is excluded from calculations. @@ -952,10 +959,6 @@ Default ``None`` (``'right'``). - .. versionchanged:: 1.2.0 - - The closed parameter with fixed windows is now supported. - step : int, default None .. versionadded:: 1.5.0 @@ -1231,7 +1234,9 @@ return result - return self._apply_blockwise(homogeneous_func, name, numeric_only)[:: self.step] + return self._apply_columnwise(homogeneous_func, name, numeric_only)[ + :: self.step + ] @doc( _shared_docs["aggregate"], @@ -1867,6 +1872,7 @@ if ( self.obj.empty or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex)) + or (isinstance(self._on.dtype, ArrowDtype) and self._on.dtype.kind in "mM") ) and isinstance(self.window, (str, BaseOffset, timedelta)): self._validate_datetimelike_monotonic() @@ -2434,14 +2440,14 @@ create_section_header("Examples"), dedent( """\ - >>> ser = pd.Series([1, 5, 2, 7, 12, 6]) + >>> ser = pd.Series([1, 5, 2, 7, 15, 6]) >>> ser.rolling(3).skew().round(6) 0 NaN 1 NaN 2 1.293343 3 -0.585583 - 4 0.000000 - 5 1.545393 + 4 0.670284 + 5 1.652317 dtype: float64 """ ), @@ -2789,12 +2795,12 @@ >>> v1 = [3, 3, 3, 5, 8] >>> v2 = [3, 4, 4, 4, 8] - >>> # numpy returns a 2X2 array, the correlation coefficient - >>> # is the number at entry [0][1] - >>> print(f"{{np.corrcoef(v1[:-1], v2[:-1])[0][1]:.6f}}") - 0.333333 - >>> print(f"{{np.corrcoef(v1[1:], v2[1:])[0][1]:.6f}}") - 0.916949 + >>> np.corrcoef(v1[:-1], v2[:-1]) + array([[1. , 0.33333333], + [0.33333333, 1. ]]) + >>> np.corrcoef(v1[1:], v2[1:]) + array([[1. , 0.9169493], + [0.9169493, 1. ]]) >>> s1 = pd.Series(v1) >>> s2 = pd.Series(v2) >>> s1.rolling(4).corr(s2) @@ -2808,15 +2814,18 @@ The below example shows a similar rolling calculation on a DataFrame using the pairwise option. - >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\ - [46., 31.], [50., 36.]]) - >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7)) - [[1. 0.6263001] - [0.6263001 1. ]] - >>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7)) - [[1. 0.5553681] - [0.5553681 1. 
]] - >>> df = pd.DataFrame(matrix, columns=['X','Y']) + >>> matrix = np.array([[51., 35.], + ... [49., 30.], + ... [47., 32.], + ... [46., 31.], + ... [50., 36.]]) + >>> np.corrcoef(matrix[:-1, 0], matrix[:-1, 1]) + array([[1. , 0.6263001], + [0.6263001, 1. ]]) + >>> np.corrcoef(matrix[1:, 0], matrix[1:, 1]) + array([[1. , 0.55536811], + [0.55536811, 1. ]]) + >>> df = pd.DataFrame(matrix, columns=['X', 'Y']) >>> df X Y 0 51.0 35.0 diff -Nru pandas-2.1.4+dfsg/pandas/errors/__init__.py pandas-2.2.2+dfsg/pandas/errors/__init__.py --- pandas-2.1.4+dfsg/pandas/errors/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/errors/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -329,8 +329,6 @@ """ Error raised when an operation would introduce duplicate labels. - .. versionadded:: 1.2.0 - Examples -------- >>> s = pd.Series([0, 1, 2], index=['a', 'b', 'c']).set_flags( @@ -503,6 +501,55 @@ ) +_chained_assignment_warning_msg = ( + "ChainedAssignmentError: behaviour will change in pandas 3.0!\n" + "You are setting values through chained assignment. Currently this works " + "in certain cases, but when using Copy-on-Write (which will become the " + "default behaviour in pandas 3.0) this will never work to update the " + "original DataFrame or Series, because the intermediate object on which " + "we are setting values will behave as a copy.\n" + "A typical example is when you are setting values in a column of a " + "DataFrame, like:\n\n" + 'df["col"][row_indexer] = value\n\n' + 'Use `df.loc[row_indexer, "col"] = values` instead, to perform the ' + "assignment in a single step and ensure this keeps updating the original `df`.\n\n" + "See the caveats in the documentation: " + "https://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy\n" +) + + +_chained_assignment_warning_method_msg = ( + "A value is trying to be set on a copy of a DataFrame or Series " + "through chained assignment using an inplace method.\n" + "The behavior will change in pandas 3.0. This inplace method will " + "never work because the intermediate object on which we are setting " + "values always behaves as a copy.\n\n" + "For example, when doing 'df[col].method(value, inplace=True)', try " + "using 'df.method({col: value}, inplace=True)' or " + "df[col] = df[col].method(value) instead, to perform " + "the operation inplace on the original object.\n\n" +) + + +def _check_cacher(obj): + # This is a mess, selection paths that return a view set the _cacher attribute + # on the Series; most of them also set _item_cache which adds 1 to our relevant + # reference count, but iloc does not, so we have to check if we are actually + # in the item cache + if hasattr(obj, "_cacher"): + parent = obj._cacher[1]() + # parent could be dead + if parent is None: + return False + if hasattr(parent, "_item_cache"): + if obj._cacher[0] in parent._item_cache: + # Check if we are actually the item from item_cache, iloc creates a + # new object + return obj is parent._item_cache[obj._cacher[0]] + return False + + class NumExprClobberingError(NameError): """ Exception raised when trying to use a built-in numexpr name as a variable name. 
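As a minimal illustration of the pattern recommended by the new chained-assignment warning text added above (a sketch only, not part of the diff; the frame, column name and filter below are hypothetical, assuming pandas >= 2.2), the single-step .loc assignment replaces the chained form:

# Sketch: why the warning suggests df.loc[row_indexer, "col"] = value.
import pandas as pd

df = pd.DataFrame({"col": [1, 2, 3, 4]})

# Chained assignment sets values on an intermediate object and, under
# Copy-on-Write (default in pandas 3.0), will no longer update `df`:
#     df["col"][df["col"] > 2] = 0
# Preferred: one indexing step that always updates the original frame.
df.loc[df["col"] > 2, "col"] = 0
print(df)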
diff -Nru pandas-2.1.4+dfsg/pandas/io/clipboard/__init__.py pandas-2.2.2+dfsg/pandas/io/clipboard/__init__.py --- pandas-2.1.4+dfsg/pandas/io/clipboard/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/clipboard/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,7 +4,7 @@ A cross-platform clipboard module for Python, with copy & paste functions for plain text. By Al Sweigart al@inventwithpython.com -BSD License +Licence at LICENSES/PYPERCLIP_LICENSE Usage: import pyperclip @@ -17,9 +17,12 @@ On Windows, no additional modules are needed. On Mac, the pyobjc module is used, falling back to the pbcopy and pbpaste cli commands. (These commands should come with OS X.). -On Linux, install xclip or xsel via package manager. For example, in Debian: +On Linux, install xclip, xsel, or wl-clipboard (for "wayland" sessions) via +package manager. +For example, in Debian: sudo apt-get install xclip sudo apt-get install xsel + sudo apt-get install wl-clipboard Otherwise on Linux, you will need the PyQt5 modules installed. @@ -28,12 +31,11 @@ Cygwin is currently not supported. Security Note: This module runs programs with these names: - - which - - where - pbcopy - pbpaste - xclip - xsel + - wl-copy/wl-paste - klipper - qdbus A malicious user could rename or add programs with these names, tricking @@ -41,7 +43,7 @@ """ -__version__ = "1.7.0" +__version__ = "1.8.2" import contextlib @@ -55,7 +57,7 @@ ) import os import platform -from shutil import which +from shutil import which as _executable_exists import subprocess import time import warnings @@ -74,25 +76,14 @@ EXCEPT_MSG = """ Pyperclip could not find a copy/paste mechanism for your system. For more information, please visit - https://pyperclip.readthedocs.io/en/latest/#not-implemented-error + https://pyperclip.readthedocs.io/en/latest/index.html#not-implemented-error """ ENCODING = "utf-8" -# The "which" unix command finds where a command is. -if platform.system() == "Windows": - WHICH_CMD = "where" -else: - WHICH_CMD = "which" - -def _executable_exists(name): - return ( - subprocess.call( - [WHICH_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - == 0 - ) +class PyperclipTimeoutException(PyperclipException): + pass def _stringifyText(text) -> str: @@ -229,6 +220,32 @@ return copy_xsel, paste_xsel +def init_wl_clipboard(): + PRIMARY_SELECTION = "-p" + + def copy_wl(text, primary=False): + text = _stringifyText(text) # Converts non-str values to str. + args = ["wl-copy"] + if primary: + args.append(PRIMARY_SELECTION) + if not text: + args.append("--clear") + subprocess.check_call(args, close_fds=True) + else: + p = subprocess.Popen(args, stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=text.encode(ENCODING)) + + def paste_wl(primary=False): + args = ["wl-paste", "-n"] + if primary: + args.append(PRIMARY_SELECTION) + p = subprocess.Popen(args, stdout=subprocess.PIPE, close_fds=True) + stdout, _stderr = p.communicate() + return stdout.decode(ENCODING) + + return copy_wl, paste_wl + + def init_klipper_clipboard(): def copy_klipper(text): text = _stringifyText(text) # Converts non-str values to str. 
@@ -534,7 +551,7 @@ return init_windows_clipboard() if platform.system() == "Linux": - if which("wslconfig.exe"): + if _executable_exists("wslconfig.exe"): return init_wsl_clipboard() # Setup for the macOS platform: @@ -549,6 +566,8 @@ # Setup for the LINUX platform: if HAS_DISPLAY: + if os.environ.get("WAYLAND_DISPLAY") and _executable_exists("wl-copy"): + return init_wl_clipboard() if _executable_exists("xsel"): return init_xsel_clipboard() if _executable_exists("xclip"): @@ -602,6 +621,7 @@ "qt": init_qt_clipboard, # TODO - split this into 'qtpy', 'pyqt4', and 'pyqt5' "xclip": init_xclip_clipboard, "xsel": init_xsel_clipboard, + "wl-clipboard": init_wl_clipboard, "klipper": init_klipper_clipboard, "windows": init_windows_clipboard, "no": init_no_clipboard, @@ -671,7 +691,56 @@ copy, paste = lazy_load_stub_copy, lazy_load_stub_paste -__all__ = ["copy", "paste", "set_clipboard", "determine_clipboard"] +def waitForPaste(timeout=None): + """This function call blocks until a non-empty text string exists on the + clipboard. It returns this text. + + This function raises PyperclipTimeoutException if timeout was set to + a number of seconds that has elapsed without non-empty text being put on + the clipboard.""" + startTime = time.time() + while True: + clipboardText = paste() + if clipboardText != "": + return clipboardText + time.sleep(0.01) + + if timeout is not None and time.time() > startTime + timeout: + raise PyperclipTimeoutException( + "waitForPaste() timed out after " + str(timeout) + " seconds." + ) + + +def waitForNewPaste(timeout=None): + """This function call blocks until a new text string exists on the + clipboard that is different from the text that was there when the function + was first called. It returns this text. + + This function raises PyperclipTimeoutException if timeout was set to + a number of seconds that has elapsed without non-empty text being put on + the clipboard.""" + startTime = time.time() + originalText = paste() + while True: + currentText = paste() + if currentText != originalText: + return currentText + time.sleep(0.01) + + if timeout is not None and time.time() > startTime + timeout: + raise PyperclipTimeoutException( + "waitForNewPaste() timed out after " + str(timeout) + " seconds." 
+ ) + + +__all__ = [ + "copy", + "paste", + "waitForPaste", + "waitForNewPaste", + "set_clipboard", + "determine_clipboard", +] # pandas aliases clipboard_get = paste diff -Nru pandas-2.1.4+dfsg/pandas/io/common.py pandas-2.2.2+dfsg/pandas/io/common.py --- pandas-2.1.4+dfsg/pandas/io/common.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/common.py 2024-04-10 17:42:52.000000000 +0000 @@ -30,6 +30,7 @@ import tarfile from typing import ( IO, + TYPE_CHECKING, Any, AnyStr, DefaultDict, @@ -51,13 +52,7 @@ from pandas._typing import ( BaseBuffer, - CompressionDict, - CompressionOptions, - FilePath, - ReadBuffer, ReadCsvBuffer, - StorageOptions, - WriteBuffer, ) from pandas.compat import ( get_bz2_file, @@ -73,8 +68,8 @@ is_integer, is_list_like, ) +from pandas.core.dtypes.generic import ABCMultiIndex -from pandas.core.indexes.api import MultiIndex from pandas.core.shared_docs import _shared_docs _VALID_URLS = set(uses_relative + uses_netloc + uses_params) @@ -84,6 +79,21 @@ BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer) +if TYPE_CHECKING: + from types import TracebackType + + from pandas._typing import ( + CompressionDict, + CompressionOptions, + FilePath, + ReadBuffer, + StorageOptions, + WriteBuffer, + ) + + from pandas import MultiIndex + + @dataclasses.dataclass class IOArgs: """ @@ -138,7 +148,12 @@ def __enter__(self) -> IOHandles[AnyStr]: return self - def __exit__(self, *args: Any) -> None: + def __exit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + traceback: TracebackType | None, + ) -> None: self.close() @@ -314,11 +329,8 @@ {storage_options} - .. versionadded:: 1.2.0 - ..versionchange:: 1.2.0 - - Returns the dataclass IOArgs. + Returns the dataclass IOArgs. """ filepath_or_buffer = stringify_path(filepath_or_buffer) @@ -696,8 +708,6 @@ storage_options: StorageOptions = None Passed to _get_filepath_or_buffer - .. versionchanged:: 1.2.0 - Returns the dataclass IOHandles """ # Windows does not default to utf-8. Set to utf-8 for a consistent behavior @@ -1215,7 +1225,7 @@ return bool( len(columns) - and not isinstance(columns, MultiIndex) + and not isinstance(columns, ABCMultiIndex) and all(isinstance(c, tuple) for c in columns if c not in list(index_col)) ) diff -Nru pandas-2.1.4+dfsg/pandas/io/excel/_base.py pandas-2.2.2+dfsg/pandas/io/excel/_base.py --- pandas-2.1.4+dfsg/pandas/io/excel/_base.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/excel/_base.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,5 @@ from __future__ import annotations -import abc from collections.abc import ( Hashable, Iterable, @@ -81,12 +80,13 @@ IntStrT, ReadBuffer, Self, + SequenceNotStr, StorageOptions, WriteExcelBuffer, ) _read_excel_doc = ( """ -Read an Excel file into a pandas DataFrame. +Read an Excel file into a ``pandas`` ``DataFrame``. Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions read from a local filesystem or URL. Supports an option to read @@ -112,7 +112,7 @@ Strings are used for sheet names. Integers are used in zero-indexed sheet positions (chart sheets do not count as a sheet position). Lists of strings/integers are used to request multiple sheets. - Specify None to get all worksheets. + Specify ``None`` to get all worksheets. Available cases: @@ -121,7 +121,7 @@ * ``"Sheet1"``: Load sheet with name "Sheet1" * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" as a dict of `DataFrame` - * None: All worksheets. + * ``None``: All worksheets. 
header : int, list of int, default 0 Row (0-indexed) to use for the column labels of the parsed @@ -155,37 +155,29 @@ Returns a subset of the columns according to behavior above. dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}} - Use `object` to preserve data as stored in Excel and not interpret dtype. + Use ``object`` to preserve data as stored in Excel and not interpret dtype, + which will necessarily result in ``object`` dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. -engine : str, default None + If you use ``None``, it will infer the dtype of each column based on the data. +engine : {{'openpyxl', 'calamine', 'odf', 'pyxlsb', 'xlrd'}}, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". Engine compatibility : - - "xlrd" supports old-style Excel files (.xls). - - "openpyxl" supports newer Excel file formats. - - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - - "pyxlsb" supports Binary Excel files. - - .. versionchanged:: 1.2.0 - The engine `xlrd `_ - now only supports old-style ``.xls`` files. - When ``engine=None``, the following logic will be - used to determine the engine: - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is an xls format, - ``xlrd`` will be used. - - Otherwise if ``path_or_buffer`` is in xlsb format, - ``pyxlsb`` will be used. - - .. versionadded:: 1.3.0 - - Otherwise ``openpyxl`` will be used. - - .. versionchanged:: 1.3.0 - + - ``openpyxl`` supports newer Excel file formats. + - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. + - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). + - ``pyxlsb`` supports Binary Excel files. + - ``xlrd`` supports old-style Excel files (.xls). + + When ``engine=None``, the following logic will be used to determine the engine: + + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, ``xlrd`` will be used. + - Otherwise if ``path_or_buffer`` is in xlsb format, ``pyxlsb`` will be used. + - Otherwise ``openpyxl`` will be used. converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one @@ -211,34 +203,34 @@ + """'. keep_default_na : bool, default True Whether or not to include the default NaN values when parsing the data. - Depending on whether `na_values` is passed in, the behavior is as follows: + Depending on whether ``na_values`` is passed in, the behavior is as follows: - * If `keep_default_na` is True, and `na_values` are specified, `na_values` - is appended to the default NaN values used for parsing. - * If `keep_default_na` is True, and `na_values` are not specified, only + * If ``keep_default_na`` is True, and ``na_values`` are specified, + ``na_values`` is appended to the default NaN values used for parsing. + * If ``keep_default_na`` is True, and ``na_values`` are not specified, only the default NaN values are used for parsing. - * If `keep_default_na` is False, and `na_values` are specified, only - the NaN values specified `na_values` are used for parsing. 
- * If `keep_default_na` is False, and `na_values` are not specified, no + * If ``keep_default_na`` is False, and ``na_values`` are specified, only + the NaN values specified ``na_values`` are used for parsing. + * If ``keep_default_na`` is False, and ``na_values`` are not specified, no strings will be parsed as NaN. - Note that if `na_filter` is passed in as False, the `keep_default_na` and - `na_values` parameters will be ignored. + Note that if `na_filter` is passed in as False, the ``keep_default_na`` and + ``na_values`` parameters will be ignored. na_filter : bool, default True Detect missing value markers (empty strings and the value of na_values). In - data without any NAs, passing na_filter=False can improve the performance - of reading a large file. + data without any NAs, passing ``na_filter=False`` can improve the + performance of reading a large file. verbose : bool, default False Indicate number of NA values placed in non-numeric columns. parse_dates : bool, list-like, or dict, default False The behavior is as follows: - * bool. If True -> try parsing the index. - * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + * ``bool``. If True -> try parsing the index. + * ``list`` of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column. - * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + * ``list`` of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as a single date column. - * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call + * ``dict``, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call result 'foo' If a column or index contains an unparsable date, the entire column or @@ -288,8 +280,6 @@ Rows at the end to skip (0-indexed). {storage_options} - .. versionadded:: 1.2.0 - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' Back-end data type applied to the resultant :class:`DataFrame` (still experimental). Behaviour is as follows: @@ -368,7 +358,8 @@ 1 NaN 2 2 #Comment 3 -Comment lines in the excel input file can be skipped using the `comment` kwarg +Comment lines in the excel input file can be skipped using the +``comment`` kwarg. 
>>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP Name Value @@ -386,8 +377,8 @@ sheet_name: str | int = ..., *, header: int | Sequence[int] | None = ..., - names: list[str] | None = ..., - index_col: int | Sequence[int] | None = ..., + names: SequenceNotStr[Hashable] | range | None = ..., + index_col: int | str | Sequence[int] | None = ..., usecols: int | str | Sequence[int] @@ -395,7 +386,7 @@ | Callable[[str], bool] | None = ..., dtype: DtypeArg | None = ..., - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., false_values: Iterable[Hashable] | None = ..., @@ -425,8 +416,8 @@ sheet_name: list[IntStrT] | None, *, header: int | Sequence[int] | None = ..., - names: list[str] | None = ..., - index_col: int | Sequence[int] | None = ..., + names: SequenceNotStr[Hashable] | range | None = ..., + index_col: int | str | Sequence[int] | None = ..., usecols: int | str | Sequence[int] @@ -434,7 +425,7 @@ | Callable[[str], bool] | None = ..., dtype: DtypeArg | None = ..., - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ..., + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = ..., converters: dict[str, Callable] | dict[int, Callable] | None = ..., true_values: Iterable[Hashable] | None = ..., false_values: Iterable[Hashable] | None = ..., @@ -464,8 +455,8 @@ sheet_name: str | int | list[IntStrT] | None = 0, *, header: int | Sequence[int] | None = 0, - names: list[str] | None = None, - index_col: int | Sequence[int] | None = None, + names: SequenceNotStr[Hashable] | range | None = None, + index_col: int | str | Sequence[int] | None = None, usecols: int | str | Sequence[int] @@ -473,7 +464,7 @@ | Callable[[str], bool] | None = None, dtype: DtypeArg | None = None, - engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None, + engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb", "calamine"] | None = None, converters: dict[str, Callable] | dict[int, Callable] | None = None, true_values: Iterable[Hashable] | None = None, false_values: Iterable[Hashable] | None = None, @@ -549,7 +540,7 @@ _WorkbookT = TypeVar("_WorkbookT") -class BaseExcelReader(Generic[_WorkbookT], metaclass=abc.ABCMeta): +class BaseExcelReader(Generic[_WorkbookT]): book: _WorkbookT def __init__( @@ -589,13 +580,11 @@ ) @property - @abc.abstractmethod def _workbook_class(self) -> type[_WorkbookT]: - pass + raise NotImplementedError - @abc.abstractmethod def load_workbook(self, filepath_or_buffer, engine_kwargs) -> _WorkbookT: - pass + raise NotImplementedError def close(self) -> None: if hasattr(self, "book"): @@ -611,21 +600,17 @@ self.handles.close() @property - @abc.abstractmethod def sheet_names(self) -> list[str]: - pass + raise NotImplementedError - @abc.abstractmethod def get_sheet_by_name(self, name: str): - pass + raise NotImplementedError - @abc.abstractmethod def get_sheet_by_index(self, index: int): - pass + raise NotImplementedError - @abc.abstractmethod def get_sheet_data(self, sheet, rows: int | None = None): - pass + raise NotImplementedError def raise_if_bad_sheet_by_index(self, index: int) -> None: n_sheets = len(self.sheet_names) @@ -683,7 +668,7 @@ ---------- header : int, list of int, or None See read_excel docstring. - index_col : int, list of int, or None + index_col : int, str, list of int, or None See read_excel docstring. 
skiprows : list-like, int, callable, or None See read_excel docstring. @@ -735,7 +720,7 @@ self, sheet_name: str | int | list[int] | list[str] | None = 0, header: int | Sequence[int] | None = 0, - names=None, + names: SequenceNotStr[Hashable] | range | None = None, index_col: int | Sequence[int] | None = None, usecols=None, dtype: DtypeArg | None = None, @@ -940,7 +925,7 @@ @doc(storage_options=_shared_docs["storage_options"]) -class ExcelWriter(Generic[_WorkbookT], metaclass=abc.ABCMeta): +class ExcelWriter(Generic[_WorkbookT]): """ Class for writing DataFrame objects into excel sheets. @@ -972,8 +957,6 @@ File mode to use (write or append). Append does not work with fsspec URLs. {storage_options} - .. versionadded:: 1.2.0 - if_sheet_exists : {{'error', 'new', 'replace', 'overlay'}}, default 'error' How to behave when trying to write to a sheet that already exists (append mode only). @@ -1178,20 +1161,19 @@ return self._engine @property - @abc.abstractmethod def sheets(self) -> dict[str, Any]: """Mapping of sheet names to sheet objects.""" + raise NotImplementedError @property - @abc.abstractmethod def book(self) -> _WorkbookT: """ Book instance. Class type will depend on the engine used. This attribute can be used to access engine-specific features. """ + raise NotImplementedError - @abc.abstractmethod def _write_cells( self, cells, @@ -1214,12 +1196,13 @@ freeze_panes: int tuple of length 2 contains the bottom-most row and right-most column to freeze """ + raise NotImplementedError - @abc.abstractmethod def _save(self) -> None: """ Save workbook to disk. """ + raise NotImplementedError def __init__( self, @@ -1463,13 +1446,15 @@ .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` + Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, ``calamine`` Engine compatibility : - ``xlrd`` supports old-style Excel files (.xls). - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. + - ``calamine`` supports Excel (.xls, .xlsx, .xlsm, .xlsb) + and OpenDocument (.ods) file formats. .. versionchanged:: 1.2.0 @@ -1505,6 +1490,7 @@ ... 
df1 = pd.read_excel(xls, "Sheet1") # doctest: +SKIP """ + from pandas.io.excel._calamine import CalamineReader from pandas.io.excel._odfreader import ODFReader from pandas.io.excel._openpyxl import OpenpyxlReader from pandas.io.excel._pyxlsb import PyxlsbReader @@ -1515,6 +1501,7 @@ "openpyxl": OpenpyxlReader, "odf": ODFReader, "pyxlsb": PyxlsbReader, + "calamine": CalamineReader, } def __init__( @@ -1590,7 +1577,7 @@ self, sheet_name: str | int | list[int] | list[str] | None = 0, header: int | Sequence[int] | None = 0, - names=None, + names: SequenceNotStr[Hashable] | range | None = None, index_col: int | Sequence[int] | None = None, usecols=None, converters=None, diff -Nru pandas-2.1.4+dfsg/pandas/io/excel/_calamine.py pandas-2.2.2+dfsg/pandas/io/excel/_calamine.py --- pandas-2.1.4+dfsg/pandas/io/excel/_calamine.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/excel/_calamine.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,121 @@ +from __future__ import annotations + +from datetime import ( + date, + datetime, + time, + timedelta, +) +from typing import ( + TYPE_CHECKING, + Any, + Union, +) + +from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import doc + +import pandas as pd +from pandas.core.shared_docs import _shared_docs + +from pandas.io.excel._base import BaseExcelReader + +if TYPE_CHECKING: + from python_calamine import ( + CalamineSheet, + CalamineWorkbook, + ) + + from pandas._typing import ( + FilePath, + NaTType, + ReadBuffer, + Scalar, + StorageOptions, + ) + +_CellValue = Union[int, float, str, bool, time, date, datetime, timedelta] + + +class CalamineReader(BaseExcelReader["CalamineWorkbook"]): + @doc(storage_options=_shared_docs["storage_options"]) + def __init__( + self, + filepath_or_buffer: FilePath | ReadBuffer[bytes], + storage_options: StorageOptions | None = None, + engine_kwargs: dict | None = None, + ) -> None: + """ + Reader using calamine engine (xlsx/xls/xlsb/ods). + + Parameters + ---------- + filepath_or_buffer : str, path to be parsed or + an open readable stream. + {storage_options} + engine_kwargs : dict, optional + Arbitrary keyword arguments passed to excel engine. 
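# Usage sketch, not part of the patch: CalamineReader is registered above under the
# "calamine" key of ExcelFile._engines, so ExcelFile dispatches to it as well.
# Assumes python-calamine is installed; "book.ods" is a hypothetical file name.
import pandas as pd

with pd.ExcelFile("book.ods", engine="calamine") as xls:
    print(xls.sheet_names)
    df = xls.parse(xls.sheet_names[0])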
+ """ + import_optional_dependency("python_calamine") + super().__init__( + filepath_or_buffer, + storage_options=storage_options, + engine_kwargs=engine_kwargs, + ) + + @property + def _workbook_class(self) -> type[CalamineWorkbook]: + from python_calamine import CalamineWorkbook + + return CalamineWorkbook + + def load_workbook( + self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs: Any + ) -> CalamineWorkbook: + from python_calamine import load_workbook + + return load_workbook(filepath_or_buffer, **engine_kwargs) + + @property + def sheet_names(self) -> list[str]: + from python_calamine import SheetTypeEnum + + return [ + sheet.name + for sheet in self.book.sheets_metadata + if sheet.typ == SheetTypeEnum.WorkSheet + ] + + def get_sheet_by_name(self, name: str) -> CalamineSheet: + self.raise_if_bad_sheet_by_name(name) + return self.book.get_sheet_by_name(name) + + def get_sheet_by_index(self, index: int) -> CalamineSheet: + self.raise_if_bad_sheet_by_index(index) + return self.book.get_sheet_by_index(index) + + def get_sheet_data( + self, sheet: CalamineSheet, file_rows_needed: int | None = None + ) -> list[list[Scalar | NaTType | time]]: + def _convert_cell(value: _CellValue) -> Scalar | NaTType | time: + if isinstance(value, float): + val = int(value) + if val == value: + return val + else: + return value + elif isinstance(value, date): + return pd.Timestamp(value) + elif isinstance(value, timedelta): + return pd.Timedelta(value) + elif isinstance(value, time): + return value + + return value + + rows: list[list[_CellValue]] = sheet.to_python( + skip_empty_area=False, nrows=file_rows_needed + ) + data = [[_convert_cell(cell) for cell in row] for row in rows] + + return data diff -Nru pandas-2.1.4+dfsg/pandas/io/excel/_odfreader.py pandas-2.2.2+dfsg/pandas/io/excel/_odfreader.py --- pandas-2.1.4+dfsg/pandas/io/excel/_odfreader.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/excel/_odfreader.py 2024-04-10 17:42:52.000000000 +0000 @@ -150,7 +150,7 @@ max_row_len = len(table_row) row_repeat = self._get_row_repeat(sheet_row) - if self._is_empty_row(sheet_row): + if len(table_row) == 0: empty_rows += row_repeat else: # add blank rows to our table @@ -182,16 +182,6 @@ return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1)) - def _is_empty_row(self, row) -> bool: - """ - Helper function to find empty rows - """ - for column in row.childNodes: - if len(column.childNodes) > 0: - return False - - return True - def _get_cell_value(self, cell) -> Scalar | NaTType: from odf.namespaces import OFFICENS @@ -238,8 +228,10 @@ """ from odf.element import Element from odf.namespaces import TEXTNS + from odf.office import Annotation from odf.text import S + office_annotation = Annotation().qname text_s = S().qname value = [] @@ -249,6 +241,8 @@ if fragment.qname == text_s: spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) value.append(" " * spaces) + elif fragment.qname == office_annotation: + continue else: # recursive impl needed in case of nested fragments # with multiple spaces diff -Nru pandas-2.1.4+dfsg/pandas/io/excel/_odswriter.py pandas-2.2.2+dfsg/pandas/io/excel/_odswriter.py --- pandas-2.1.4+dfsg/pandas/io/excel/_odswriter.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/excel/_odswriter.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,6 +2,7 @@ from collections import defaultdict import datetime +import json from typing import ( TYPE_CHECKING, Any, @@ -10,8 +11,6 @@ overload, ) -from pandas._libs import json - 
from pandas.io.excel._base import ExcelWriter from pandas.io.excel._util import ( combine_kwargs, @@ -193,7 +192,15 @@ if isinstance(val, bool): value = str(val).lower() pvalue = str(val).upper() - if isinstance(val, datetime.datetime): + return ( + pvalue, + TableCell( + valuetype="boolean", + booleanvalue=value, + attributes=attributes, + ), + ) + elif isinstance(val, datetime.datetime): # Fast formatting value = val.isoformat() # Slow but locale-dependent @@ -211,17 +218,20 @@ pvalue, TableCell(valuetype="date", datevalue=value, attributes=attributes), ) + elif isinstance(val, str): + return ( + pvalue, + TableCell( + valuetype="string", + stringvalue=value, + attributes=attributes, + ), + ) else: - class_to_cell_type = { - str: "string", - int: "float", - float: "float", - bool: "boolean", - } return ( pvalue, TableCell( - valuetype=class_to_cell_type[type(val)], + valuetype="float", value=value, attributes=attributes, ), @@ -257,7 +267,7 @@ if style is None: return None - style_key = json.ujson_dumps(style) + style_key = json.dumps(style) if style_key in self._style_dict: return self._style_dict[style_key] name = f"pd{len(self._style_dict)+1}" diff -Nru pandas-2.1.4+dfsg/pandas/io/excel/_openpyxl.py pandas-2.2.2+dfsg/pandas/io/excel/_openpyxl.py --- pandas-2.1.4+dfsg/pandas/io/excel/_openpyxl.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/excel/_openpyxl.py 2024-04-10 17:42:52.000000000 +0000 @@ -567,12 +567,11 @@ ) -> Workbook: from openpyxl import load_workbook + default_kwargs = {"read_only": True, "data_only": True, "keep_links": False} + return load_workbook( filepath_or_buffer, - read_only=True, - data_only=True, - keep_links=False, - **engine_kwargs, + **(default_kwargs | engine_kwargs), ) @property diff -Nru pandas-2.1.4+dfsg/pandas/io/excel/_xlrd.py pandas-2.2.2+dfsg/pandas/io/excel/_xlrd.py --- pandas-2.1.4+dfsg/pandas/io/excel/_xlrd.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/excel/_xlrd.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,7 @@ from __future__ import annotations from datetime import time +import math from typing import TYPE_CHECKING import numpy as np @@ -120,9 +121,11 @@ elif cell_typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less surprising - val = int(cell_contents) - if val == cell_contents: - cell_contents = val + if math.isfinite(cell_contents): + # GH54564 - don't attempt to convert NaN/Inf + val = int(cell_contents) + if val == cell_contents: + cell_contents = val return cell_contents data = [] diff -Nru pandas-2.1.4+dfsg/pandas/io/excel/_xlsxwriter.py pandas-2.2.2+dfsg/pandas/io/excel/_xlsxwriter.py --- pandas-2.1.4+dfsg/pandas/io/excel/_xlsxwriter.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/excel/_xlsxwriter.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,12 +1,11 @@ from __future__ import annotations +import json from typing import ( TYPE_CHECKING, Any, ) -from pandas._libs import json - from pandas.io.excel._base import ExcelWriter from pandas.io.excel._util import ( combine_kwargs, @@ -262,7 +261,7 @@ for cell in cells: val, fmt = self._value_with_fmt(cell.val) - stylekey = json.ujson_dumps(cell.style) + stylekey = json.dumps(cell.style) if fmt: stylekey += fmt diff -Nru pandas-2.1.4+dfsg/pandas/io/feather_format.py pandas-2.2.2+dfsg/pandas/io/feather_format.py --- pandas-2.1.4+dfsg/pandas/io/feather_format.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/feather_format.py 2024-04-10 
17:42:52.000000000 +0000 @@ -50,9 +50,6 @@ df : DataFrame path : str, path object, or file-like object {storage_options} - - .. versionadded:: 1.2.0 - **kwargs : Additional keywords passed to `pyarrow.feather.write_feather`. @@ -93,8 +90,6 @@ Whether to parallelize reading using multiple threads. {storage_options} - .. versionadded:: 1.2.0 - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' Back-end data type applied to the resultant :class:`DataFrame` (still experimental). Behaviour is as follows: @@ -118,7 +113,7 @@ from pyarrow import feather # import utils to register the pyarrow extension types - import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401,E501 + import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401 check_dtype_backend(dtype_backend) diff -Nru pandas-2.1.4+dfsg/pandas/io/formats/csvs.py pandas-2.2.2+dfsg/pandas/io/formats/csvs.py --- pandas-2.1.4+dfsg/pandas/io/formats/csvs.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/formats/csvs.py 2024-04-10 17:42:52.000000000 +0000 @@ -21,6 +21,7 @@ import numpy as np from pandas._libs import writers as libwriters +from pandas._typing import SequenceNotStr from pandas.util._decorators import cache_readonly from pandas.core.dtypes.generic import ( @@ -43,6 +44,7 @@ IndexLabel, StorageOptions, WriteBuffer, + npt, ) from pandas.io.formats.format import DataFrameFormatter @@ -52,7 +54,7 @@ class CSVFormatter: - cols: np.ndarray + cols: npt.NDArray[np.object_] def __init__( self, @@ -109,7 +111,7 @@ return self.fmt.decimal @property - def header(self) -> bool | list[str]: + def header(self) -> bool | SequenceNotStr[str]: return self.fmt.header @property @@ -148,7 +150,9 @@ def has_mi_columns(self) -> bool: return bool(isinstance(self.obj.columns, ABCMultiIndex)) - def _initialize_columns(self, cols: Iterable[Hashable] | None) -> np.ndarray: + def _initialize_columns( + self, cols: Iterable[Hashable] | None + ) -> npt.NDArray[np.object_]: # validate mi options if self.has_mi_columns: if cols is not None: @@ -157,7 +161,7 @@ if cols is not None: if isinstance(cols, ABCIndex): - cols = cols._format_native_types(**self._number_format) + cols = cols._get_values_for_csv(**self._number_format) else: cols = list(cols) self.obj = self.obj.loc[:, cols] @@ -165,7 +169,7 @@ # update columns to include possible multiplicity of dupes # and make sure cols is just a list of labels new_cols = self.obj.columns - return new_cols._format_native_types(**self._number_format) + return new_cols._get_values_for_csv(**self._number_format) def _initialize_chunksize(self, chunksize: int | None) -> int: if chunksize is None: @@ -213,7 +217,7 @@ return bool(self._has_aliases or self.header) @property - def write_cols(self) -> Sequence[Hashable]: + def write_cols(self) -> SequenceNotStr[Hashable]: if self._has_aliases: assert not isinstance(self.header, bool) if len(self.header) != len(self.cols): @@ -222,9 +226,9 @@ ) return self.header else: - # self.cols is an ndarray derived from Index._format_native_types, + # self.cols is an ndarray derived from Index._get_values_for_csv, # so its entries are strings, i.e. 
hashable - return cast(Sequence[Hashable], self.cols) + return cast(SequenceNotStr[Hashable], self.cols) @property def encoded_labels(self) -> list[Hashable]: @@ -313,10 +317,10 @@ slicer = slice(start_i, end_i) df = self.obj.iloc[slicer] - res = df._mgr.to_native_types(**self._number_format) - data = [res.iget_values(i) for i in range(len(res.items))] + res = df._get_values_for_csv(**self._number_format) + data = list(res._iter_column_arrays()) - ix = self.data_index[slicer]._format_native_types(**self._number_format) + ix = self.data_index[slicer]._get_values_for_csv(**self._number_format) libwriters.write_csv_rows( data, ix, diff -Nru pandas-2.1.4+dfsg/pandas/io/formats/excel.py pandas-2.2.2+dfsg/pandas/io/formats/excel.py --- pandas-2.1.4+dfsg/pandas/io/formats/excel.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/formats/excel.py 2024-04-10 17:42:52.000000000 +0000 @@ -623,8 +623,8 @@ return columns = self.columns - level_strs = columns.format( - sparsify=self.merge_cells, adjoin=False, names=False + level_strs = columns._format_multi( + sparsify=self.merge_cells, include_names=False ) level_lengths = get_level_lengths(level_strs) coloffset = 0 @@ -813,8 +813,8 @@ if self.merge_cells: # Format hierarchical rows as merged cells. - level_strs = self.df.index.format( - sparsify=True, adjoin=False, names=False + level_strs = self.df.index._format_multi( + sparsify=True, include_names=False ) level_lengths = get_level_lengths(level_strs) @@ -921,7 +921,6 @@ {storage_options} - .. versionadded:: 1.2.0 engine_kwargs: dict, optional Arbitrary keyword arguments passed to excel engine. """ @@ -941,9 +940,7 @@ if isinstance(writer, ExcelWriter): need_save = False else: - # error: Cannot instantiate abstract class 'ExcelWriter' with abstract - # attributes 'engine', 'save', 'supported_extensions' and 'write_cells' - writer = ExcelWriter( # type: ignore[abstract] + writer = ExcelWriter( writer, engine=engine, storage_options=storage_options, diff -Nru pandas-2.1.4+dfsg/pandas/io/formats/format.py pandas-2.2.2+dfsg/pandas/io/formats/format.py --- pandas-2.1.4+dfsg/pandas/io/formats/format.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/formats/format.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,15 +7,11 @@ from collections.abc import ( Generator, Hashable, - Iterable, Mapping, Sequence, ) from contextlib import contextmanager -from csv import ( - QUOTE_NONE, - QUOTE_NONNUMERIC, -) +from csv import QUOTE_NONE from decimal import Decimal from functools import partial from io import StringIO @@ -23,14 +19,12 @@ import re from shutil import get_terminal_size from typing import ( - IO, TYPE_CHECKING, Any, Callable, Final, cast, ) -from unicodedata import east_asian_width import numpy as np @@ -45,9 +39,6 @@ NaT, Timedelta, Timestamp, - get_unit_from_dtype, - iNaT, - periods_per_day, ) from pandas._libs.tslibs.nattype import NaTType @@ -72,12 +63,12 @@ from pandas.core.arrays import ( Categorical, DatetimeArray, + ExtensionArray, TimedeltaArray, ) from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.construction import extract_array from pandas.core.indexes.api import ( Index, MultiIndex, @@ -105,6 +96,7 @@ FloatFormatType, FormattersType, IndexLabel, + SequenceNotStr, StorageOptions, WriteBuffer, ) @@ -140,9 +132,6 @@ floats. This function must return a unicode string and will be applied only to the non-``NaN`` elements, with ``NaN`` being handled by ``na_rep``. - - .. 
versionchanged:: 1.2.0 - sparsify : bool, optional, default True Set to False for a DataFrame with a hierarchical index to print every multiindex key at each row. @@ -174,7 +163,7 @@ Character recognized as decimal separator, e.g. ',' in Europe. """ -_VALID_JUSTIFY_PARAMETERS = ( +VALID_JUSTIFY_PARAMETERS = ( "left", "right", "center", @@ -197,75 +186,16 @@ """ -class CategoricalFormatter: - def __init__( - self, - categorical: Categorical, - buf: IO[str] | None = None, - length: bool = True, - na_rep: str = "NaN", - footer: bool = True, - ) -> None: - self.categorical = categorical - self.buf = buf if buf is not None else StringIO("") - self.na_rep = na_rep - self.length = length - self.footer = footer - self.quoting = QUOTE_NONNUMERIC - - def _get_footer(self) -> str: - footer = "" - - if self.length: - if footer: - footer += ", " - footer += f"Length: {len(self.categorical)}" - - level_info = self.categorical._repr_categories_info() - - # Levels are added in a newline - if footer: - footer += "\n" - footer += level_info - - return str(footer) - - def _get_formatted_values(self) -> list[str]: - return format_array( - self.categorical._internal_get_values(), - None, - float_format=None, - na_rep=self.na_rep, - quoting=self.quoting, - ) - - def to_string(self) -> str: - categorical = self.categorical - - if len(categorical) == 0: - if self.footer: - return self._get_footer() - else: - return "" - - fmt_values = self._get_formatted_values() - - fmt_values = [i.strip() for i in fmt_values] - values = ", ".join(fmt_values) - result = ["[" + values + "]"] - if self.footer: - footer = self._get_footer() - if footer: - result.append(footer) - - return str("\n".join(result)) - - class SeriesFormatter: + """ + Implement the main logic of Series.to_string, which underlies + Series.__repr__. 
+ """ + def __init__( self, series: Series, - buf: IO[str] | None = None, + *, length: bool | str = True, header: bool = True, index: bool = True, @@ -277,7 +207,7 @@ min_rows: int | None = None, ) -> None: self.series = series - self.buf = buf if buf is not None else StringIO() + self.buf = StringIO() self.name = name self.na_rep = na_rep self.header = header @@ -290,7 +220,7 @@ float_format = get_option("display.float_format") self.float_format = float_format self.dtype = dtype - self.adj = get_adjustment() + self.adj = printing.get_adjustment() self._chk_truncate() @@ -325,11 +255,12 @@ name = self.series.name footer = "" - if getattr(self.series.index, "freq", None) is not None: - assert isinstance( - self.series.index, (DatetimeIndex, PeriodIndex, TimedeltaIndex) - ) - footer += f"Freq: {self.series.index.freqstr}" + index = self.series.index + if ( + isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)) + and index.freq is not None + ): + footer += f"Freq: {index.freqstr}" if self.name is not False and name is not None: if footer: @@ -355,24 +286,13 @@ # level infos are added to the end and in a new line, like it is done # for Categoricals if isinstance(self.tr_series.dtype, CategoricalDtype): - level_info = self.tr_series._values._repr_categories_info() + level_info = self.tr_series._values._get_repr_footer() if footer: footer += "\n" footer += level_info return str(footer) - def _get_formatted_index(self) -> tuple[list[str], bool]: - index = self.tr_series.index - - if isinstance(index, MultiIndex): - have_header = any(name for name in index.names) - fmt_index = index.format(names=True) - else: - have_header = index.name is not None - fmt_index = index.format(name=True) - return fmt_index, have_header - def _get_formatted_values(self) -> list[str]: return format_array( self.tr_series._values, @@ -389,7 +309,14 @@ if len(series) == 0: return f"{type(self.series).__name__}([], {footer})" - fmt_index, have_header = self._get_formatted_index() + index = series.index + have_header = _has_names(index) + if isinstance(index, MultiIndex): + fmt_index = index._format_multi(include_names=True, sparsify=None) + adj = printing.get_adjustment() + fmt_index = adj.adjoin(2, *fmt_index).split("\n") + else: + fmt_index = index._format_flat(include_name=True) fmt_values = self._get_formatted_values() if self.is_truncated_vertically: @@ -421,69 +348,6 @@ return str("".join(result)) -class TextAdjustment: - def __init__(self) -> None: - self.encoding = get_option("display.encoding") - - def len(self, text: str) -> int: - return len(text) - - def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]: - return printing.justify(texts, max_len, mode=mode) - - def adjoin(self, space: int, *lists, **kwargs) -> str: - return printing.adjoin( - space, *lists, strlen=self.len, justfunc=self.justify, **kwargs - ) - - -class EastAsianTextAdjustment(TextAdjustment): - def __init__(self) -> None: - super().__init__() - if get_option("display.unicode.ambiguous_as_wide"): - self.ambiguous_width = 2 - else: - self.ambiguous_width = 1 - - # Definition of East Asian Width - # https://unicode.org/reports/tr11/ - # Ambiguous width can be changed by option - self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1} - - def len(self, text: str) -> int: - """ - Calculate display width considering unicode East Asian Width - """ - if not isinstance(text, str): - return len(text) - - return sum( - self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text - ) - - def justify( - 
self, texts: Iterable[str], max_len: int, mode: str = "right" - ) -> list[str]: - # re-calculate padding space per str considering East Asian Width - def _get_pad(t): - return max_len - self.len(t) + len(t) - - if mode == "left": - return [x.ljust(_get_pad(x)) for x in texts] - elif mode == "center": - return [x.center(_get_pad(x)) for x in texts] - else: - return [x.rjust(_get_pad(x)) for x in texts] - - -def get_adjustment() -> TextAdjustment: - use_east_asian_width = get_option("display.unicode.east_asian_width") - if use_east_asian_width: - return EastAsianTextAdjustment() - else: - return TextAdjustment() - - def get_dataframe_repr_params() -> dict[str, Any]: """Get the parameters used to repr(dataFrame) calls using DataFrame.to_string. @@ -535,16 +399,9 @@ True """ width, height = get_terminal_size() - max_rows = ( - height - if get_option("display.max_rows") == 0 - else get_option("display.max_rows") - ) - min_rows = ( - height - if get_option("display.max_rows") == 0 - else get_option("display.min_rows") - ) + max_rows_opt = get_option("display.max_rows") + max_rows = height if max_rows_opt == 0 else max_rows_opt + min_rows = height if max_rows_opt == 0 else get_option("display.min_rows") return { "name": True, @@ -556,7 +413,11 @@ class DataFrameFormatter: - """Class for processing dataframe formatting options and data.""" + """ + Class for processing dataframe formatting options and data. + + Used by DataFrame.to_string, which backs DataFrame.__repr__. + """ __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring @@ -566,7 +427,7 @@ frame: DataFrame, columns: Axes | None = None, col_space: ColspaceArgType | None = None, - header: bool | list[str] = True, + header: bool | SequenceNotStr[str] = True, index: bool = True, na_rep: str = "NaN", formatters: FormattersType | None = None, @@ -606,7 +467,7 @@ self.tr_frame = self.frame self.truncate() - self.adj = get_adjustment() + self.adj = printing.get_adjustment() def get_strcols(self) -> list[list[str]]: """ @@ -832,9 +693,9 @@ assert self.max_rows_fitted is not None row_num = self.max_rows_fitted // 2 if row_num >= 1: - head = self.tr_frame.iloc[:row_num, :] - tail = self.tr_frame.iloc[-row_num:, :] - self.tr_frame = concat((head, tail)) + _len = len(self.tr_frame) + _slice = np.hstack([np.arange(row_num), np.arange(_len - row_num, _len)]) + self.tr_frame = self.tr_frame.iloc[_slice] else: row_num = cast(int, self.max_rows) self.tr_frame = self.tr_frame.iloc[:row_num, :] @@ -918,7 +779,7 @@ columns = frame.columns if isinstance(columns, MultiIndex): - fmt_columns = columns.format(sparsify=False, adjoin=False) + fmt_columns = columns._format_multi(sparsify=False, include_names=False) fmt_columns = list(zip(*fmt_columns)) dtypes = self.frame.dtypes._values @@ -935,15 +796,15 @@ return " " + y return y - str_columns = list( + str_columns_tuple = list( zip(*([space_format(x, y) for y in x] for x in fmt_columns)) ) - if self.sparsify and len(str_columns): - str_columns = sparsify_labels(str_columns) + if self.sparsify and len(str_columns_tuple): + str_columns_tuple = sparsify_labels(str_columns_tuple) - str_columns = [list(x) for x in zip(*str_columns)] + str_columns = [list(x) for x in zip(*str_columns_tuple)] else: - fmt_columns = columns.format() + fmt_columns = columns._format_flat(include_name=False) dtypes = self.frame.dtypes need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) str_columns = [ @@ -962,14 +823,15 @@ fmt = self._get_formatter("__index__") if isinstance(index, MultiIndex): - 
fmt_index = index.format( + fmt_index = index._format_multi( sparsify=self.sparsify, - adjoin=False, - names=self.show_row_idx_names, + include_names=self.show_row_idx_names, formatter=fmt, ) else: - fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)] + fmt_index = [ + index._format_flat(include_name=self.show_row_idx_names, formatter=fmt) + ] fmt_index = [ tuple( @@ -1168,16 +1030,16 @@ """ Perform serialization. Write to buf or return as string if buf is None. """ - with get_buffer(buf, encoding=encoding) as f: - f.write(string) + with _get_buffer(buf, encoding=encoding) as fd: + fd.write(string) if buf is None: # error: "WriteBuffer[str]" has no attribute "getvalue" - return f.getvalue() # type: ignore[attr-defined] + return fd.getvalue() # type: ignore[attr-defined] return None @contextmanager -def get_buffer( +def _get_buffer( buf: FilePath | WriteBuffer[str] | None, encoding: str | None = None ) -> Generator[WriteBuffer[str], None, None] | Generator[StringIO, None, None]: """ @@ -1215,7 +1077,7 @@ def format_array( - values: Any, + values: ArrayLike, formatter: Callable | None, float_format: FloatFormatType | None = None, na_rep: str = "NaN", @@ -1232,7 +1094,7 @@ Parameters ---------- - values + values : np.ndarray or ExtensionArray formatter float_format na_rep @@ -1246,7 +1108,7 @@ the leading space to pad between columns. When formatting an Index subclass - (e.g. IntervalIndex._format_native_types), we don't want the + (e.g. IntervalIndex._get_values_for_csv), we don't want the leading space since it should be left-aligned. fallback_formatter @@ -1254,21 +1116,24 @@ ------- List[str] """ - fmt_klass: type[GenericArrayFormatter] + fmt_klass: type[_GenericArrayFormatter] if lib.is_np_dtype(values.dtype, "M"): - fmt_klass = Datetime64Formatter + fmt_klass = _Datetime64Formatter + values = cast(DatetimeArray, values) elif isinstance(values.dtype, DatetimeTZDtype): - fmt_klass = Datetime64TZFormatter + fmt_klass = _Datetime64TZFormatter + values = cast(DatetimeArray, values) elif lib.is_np_dtype(values.dtype, "m"): - fmt_klass = Timedelta64Formatter + fmt_klass = _Timedelta64Formatter + values = cast(TimedeltaArray, values) elif isinstance(values.dtype, ExtensionDtype): - fmt_klass = ExtensionArrayFormatter + fmt_klass = _ExtensionArrayFormatter elif lib.is_np_dtype(values.dtype, "fc"): fmt_klass = FloatArrayFormatter elif lib.is_np_dtype(values.dtype, "iu"): - fmt_klass = IntArrayFormatter + fmt_klass = _IntArrayFormatter else: - fmt_klass = GenericArrayFormatter + fmt_klass = _GenericArrayFormatter if space is None: space = 12 @@ -1296,10 +1161,10 @@ return fmt_obj.get_result() -class GenericArrayFormatter: +class _GenericArrayFormatter: def __init__( self, - values: Any, + values: ArrayLike, digits: int = 7, formatter: Callable | None = None, na_rep: str = "NaN", @@ -1354,18 +1219,16 @@ def _format(x): if self.na_rep is not None and is_scalar(x) and isna(x): - try: - # try block for np.isnat specifically - # determine na_rep if x is None or NaT-like - if x is None: - return "None" - elif x is NA: - return str(NA) - elif x is NaT or np.isnat(x): - return "NaT" - except (TypeError, ValueError): - # np.isnat only handles datetime or timedelta objects - pass + if x is None: + return "None" + elif x is NA: + return str(NA) + elif lib.is_float(x) and np.isinf(x): + # TODO(3.0): this will be unreachable when use_inf_as_na + # deprecation is enforced + return str(x) + elif x is NaT or isinstance(x, (np.datetime64, np.timedelta64)): + return "NaT" return self.na_rep elif 
isinstance(x, PandasObject): return str(x) @@ -1375,10 +1238,10 @@ # object dtype return str(formatter(x)) - vals = extract_array(self.values, extract_numpy=True) + vals = self.values if not isinstance(vals, np.ndarray): raise TypeError( - "ExtensionArray formatting should use ExtensionArrayFormatter" + "ExtensionArray formatting should use _ExtensionArrayFormatter" ) inferred = lib.map_infer(vals, is_float) is_float_type = ( @@ -1408,7 +1271,7 @@ return fmt_values -class FloatArrayFormatter(GenericArrayFormatter): +class FloatArrayFormatter(_GenericArrayFormatter): def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) @@ -1609,7 +1472,7 @@ return list(self.get_result_as_array()) -class IntArrayFormatter(GenericArrayFormatter): +class _IntArrayFormatter(_GenericArrayFormatter): def _format_strings(self) -> list[str]: if self.leading_space is False: formatter_str = lambda x: f"{x:d}".format(x=x) @@ -1620,10 +1483,12 @@ return fmt_values -class Datetime64Formatter(GenericArrayFormatter): +class _Datetime64Formatter(_GenericArrayFormatter): + values: DatetimeArray + def __init__( self, - values: np.ndarray | Series | DatetimeIndex | DatetimeArray, + values: DatetimeArray, nat_rep: str = "NaT", date_format: None = None, **kwargs, @@ -1636,21 +1501,20 @@ """we by definition have DO NOT have a TZ""" values = self.values - if not isinstance(values, DatetimeIndex): - values = DatetimeIndex(values) - - if self.formatter is not None and callable(self.formatter): + if self.formatter is not None: return [self.formatter(x) for x in values] - fmt_values = values._data._format_native_types( + fmt_values = values._format_native_types( na_rep=self.nat_rep, date_format=self.date_format ) return fmt_values.tolist() -class ExtensionArrayFormatter(GenericArrayFormatter): +class _ExtensionArrayFormatter(_GenericArrayFormatter): + values: ExtensionArray + def _format_strings(self) -> list[str]: - values = extract_array(self.values, extract_numpy=True) + values = self.values formatter = self.formatter fallback_formatter = None @@ -1661,7 +1525,7 @@ # Categorical is special for now, so that we can preserve tzinfo array = values._internal_get_values() else: - array = np.asarray(values) + array = np.asarray(values, dtype=object) fmt_values = format_array( array, @@ -1724,7 +1588,8 @@ raise ValueError("percentiles should all be in the interval [0,1]") percentiles = 100 * percentiles - percentiles_round_type = percentiles.round().astype(int) + prec = get_precision(percentiles) + percentiles_round_type = percentiles.round(prec).astype(int) int_idx = np.isclose(percentiles_round_type, percentiles) @@ -1733,14 +1598,7 @@ return [i + "%" for i in out] unique_pcts = np.unique(percentiles) - to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None - to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None - - # Least precision that keeps percentiles unique after rounding - prec = -np.floor( - np.log10(np.min(np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end))) - ).astype(int) - prec = max(1, prec) + prec = get_precision(unique_pcts) out = np.empty_like(percentiles, dtype=object) out[int_idx] = percentiles[int_idx].round().astype(int).astype(str) @@ -1748,29 +1606,14 @@ return [i + "%" for i in out] -def is_dates_only(values: np.ndarray | DatetimeArray | Index | DatetimeIndex) -> bool: - # return a boolean if we are only dates (and don't have a timezone) - if not isinstance(values, Index): - values = values.ravel() - - if not isinstance(values, (DatetimeArray, DatetimeIndex)): - 
values = DatetimeIndex(values) - - if values.tz is not None: - return False - - values_int = values.asi8 - consider_values = values_int != iNaT - # error: Argument 1 to "py_get_unit_from_dtype" has incompatible type - # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]" - reso = get_unit_from_dtype(values.dtype) # type: ignore[arg-type] - ppd = periods_per_day(reso) - - # TODO: can we reuse is_date_array_normalized? would need a skipna kwd - even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0 - if even_days: - return True - return False +def get_precision(array: np.ndarray | Sequence[float]) -> int: + to_begin = array[0] if array[0] > 0 else None + to_end = 100 - array[-1] if array[-1] < 100 else None + diff = np.ediff1d(array, to_begin=to_begin, to_end=to_end) + diff = abs(diff) + prec = -np.floor(np.log10(np.min(diff))).astype(int) + prec = max(1, prec) + return prec def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str: @@ -1798,12 +1641,12 @@ def get_format_datetime64( - is_dates_only_: bool, nat_rep: str = "NaT", date_format: str | None = None + is_dates_only: bool, nat_rep: str = "NaT", date_format: str | None = None ) -> Callable: """Return a formatter callable taking a datetime64 as input and providing a string as output""" - if is_dates_only_: + if is_dates_only: return lambda x: _format_datetime64_dateonly( x, nat_rep=nat_rep, date_format=date_format ) @@ -1811,26 +1654,12 @@ return lambda x: _format_datetime64(x, nat_rep=nat_rep) -def get_format_datetime64_from_values( - values: np.ndarray | DatetimeArray | DatetimeIndex, date_format: str | None -) -> str | None: - """given values and a date_format, return a string format""" - if isinstance(values, np.ndarray) and values.ndim > 1: - # We don't actually care about the order of values, and DatetimeIndex - # only accepts 1D values - values = values.ravel() - - ido = is_dates_only(values) - if ido: - # Only dates and no timezone: provide a default format - return date_format or "%Y-%m-%d" - return date_format - +class _Datetime64TZFormatter(_Datetime64Formatter): + values: DatetimeArray -class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self) -> list[str]: """we by definition have a TZ""" - ido = is_dates_only(self.values) + ido = self.values._is_dates_only values = self.values.astype(object) formatter = self.formatter or get_format_datetime64( ido, date_format=self.date_format @@ -1840,27 +1669,28 @@ return fmt_values -class Timedelta64Formatter(GenericArrayFormatter): +class _Timedelta64Formatter(_GenericArrayFormatter): + values: TimedeltaArray + def __init__( self, - values: np.ndarray | TimedeltaIndex, + values: TimedeltaArray, nat_rep: str = "NaT", - box: bool = False, **kwargs, ) -> None: + # TODO: nat_rep is never passed, na_rep is. 
super().__init__(values, **kwargs) self.nat_rep = nat_rep - self.box = box def _format_strings(self) -> list[str]: formatter = self.formatter or get_format_timedelta64( - self.values, nat_rep=self.nat_rep, box=self.box + self.values, nat_rep=self.nat_rep, box=False ) return [formatter(x) for x in self.values] def get_format_timedelta64( - values: np.ndarray | TimedeltaIndex | TimedeltaArray, + values: TimedeltaArray, nat_rep: str | float = "NaT", box: bool = False, ) -> Callable: @@ -1870,20 +1700,7 @@ If box, then show the return in quotes """ - values_int = values.view(np.int64) - - consider_values = values_int != iNaT - - one_day_nanos = 86400 * 10**9 - # error: Unsupported operand types for % ("ExtensionArray" and "int") - not_midnight = values_int % one_day_nanos != 0 # type: ignore[operator] - # error: Argument 1 to "__call__" of "ufunc" has incompatible type - # "Union[Any, ExtensionArray, ndarray]"; expected - # "Union[Union[int, float, complex, str, bytes, generic], - # Sequence[Union[int, float, complex, str, bytes, generic]], - # Sequence[Sequence[Any]], _SupportsArray]" - both = np.logical_and(consider_values, not_midnight) # type: ignore[arg-type] - even_days = both.sum() == 0 + even_days = values._is_dates_only if even_days: format = None @@ -1910,13 +1727,13 @@ strings: list[str], justify: str = "right", minimum: int | None = None, - adj: TextAdjustment | None = None, + adj: printing._TextAdjustment | None = None, ) -> list[str]: if len(strings) == 0 or justify == "all": return strings if adj is None: - adjustment = get_adjustment() + adjustment = printing.get_adjustment() else: adjustment = adj @@ -1940,7 +1757,7 @@ return result -def _trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> list[str]: +def _trim_zeros_complex(str_complexes: ArrayLike, decimal: str = ".") -> list[str]: """ Separates the real and imaginary parts from the complex number, and executes the _trim_zeros_float method on each of those. @@ -1986,7 +1803,7 @@ def _trim_zeros_float( - str_floats: np.ndarray | list[str], decimal: str = "." + str_floats: ArrayLike | list[str], decimal: str = "." ) -> list[str]: """ Trims the maximum number of trailing zeros equally from @@ -1999,7 +1816,7 @@ def is_number_with_decimal(x) -> bool: return re.match(number_regex, x) is not None - def should_trim(values: np.ndarray | list[str]) -> bool: + def should_trim(values: ArrayLike | list[str]) -> bool: """ Determine if an array of strings should be trimmed. 
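# Worked illustration (assumption: standalone re-statement, not pandas code) of the
# get_precision() helper factored out above for format_percentiles: choose the
# smallest number of digits that keeps neighbouring percentile labels distinct
# after rounding.
import numpy as np


def precision_for(percentiles):
    to_begin = percentiles[0] if percentiles[0] > 0 else None
    to_end = 100 - percentiles[-1] if percentiles[-1] < 100 else None
    diff = np.abs(np.ediff1d(percentiles, to_begin=to_begin, to_end=to_end))
    return max(1, -int(np.floor(np.log10(diff.min()))))


print(precision_for(np.array([25.0, 50.0, 75.0])))   # 1
print(precision_for(np.array([24.99, 25.0, 50.0])))  # 2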
diff -Nru pandas-2.1.4+dfsg/pandas/io/formats/html.py pandas-2.2.2+dfsg/pandas/io/formats/html.py --- pandas-2.1.4+dfsg/pandas/io/formats/html.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/formats/html.py 2024-04-10 17:42:52.000000000 +0000 @@ -282,7 +282,7 @@ sentinel = lib.no_default else: sentinel = False - levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False) + levels = self.columns._format_multi(sparsify=sentinel, include_names=False) level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 for lnum, (records, values) in enumerate(zip(level_lengths, levels)): @@ -437,7 +437,8 @@ if fmt is not None: index_values = self.fmt.tr_frame.index.map(fmt) else: - index_values = self.fmt.tr_frame.index.format() + # only reached with non-Multi index + index_values = self.fmt.tr_frame.index._format_flat(include_name=False) row: list[str] = [] for i in range(nrows): @@ -480,13 +481,13 @@ nrows = len(frame) assert isinstance(frame.index, MultiIndex) - idx_values = frame.index.format(sparsify=False, adjoin=False, names=False) + idx_values = frame.index._format_multi(sparsify=False, include_names=False) idx_values = list(zip(*idx_values)) if self.fmt.sparsify: # GH3547 sentinel = lib.no_default - levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False) + levels = frame.index._format_multi(sparsify=sentinel, include_names=False) level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 @@ -579,7 +580,7 @@ ) idx_values = list( - zip(*frame.index.format(sparsify=False, adjoin=False, names=False)) + zip(*frame.index._format_multi(sparsify=False, include_names=False)) ) row = [] row.extend(idx_values[i]) @@ -606,7 +607,8 @@ return {i: self.fmt.format_col(i) for i in range(self.ncols)} def _get_columns_formatted_values(self) -> list[str]: - return self.columns.format() + # only reached with non-Multi Index + return self.columns._format_flat(include_name=False) def write_style(self) -> None: # We use the "scoped" attribute here so that the desired @@ -633,7 +635,7 @@ else: element_props.append(("thead th", "text-align", "right")) template_mid = "\n\n".join(template_select % t for t in element_props) - template = dedent("\n".join((template_first, template_mid, template_last))) + template = dedent(f"{template_first}\n{template_mid}\n{template_last}") self.write(template) def render(self) -> list[str]: diff -Nru pandas-2.1.4+dfsg/pandas/io/formats/info.py pandas-2.2.2+dfsg/pandas/io/formats/info.py --- pandas-2.1.4+dfsg/pandas/io/formats/info.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/formats/info.py 2024-04-10 17:42:52.000000000 +0000 @@ -356,7 +356,7 @@ return memory_usage -class BaseInfo(ABC): +class _BaseInfo(ABC): """ Base class for DataFrameInfo and SeriesInfo. @@ -439,7 +439,7 @@ pass -class DataFrameInfo(BaseInfo): +class DataFrameInfo(_BaseInfo): """ Class storing dataframe-specific info. """ @@ -503,7 +503,7 @@ verbose: bool | None, show_counts: bool | None, ) -> None: - printer = DataFrameInfoPrinter( + printer = _DataFrameInfoPrinter( info=self, max_cols=max_cols, verbose=verbose, @@ -512,7 +512,7 @@ printer.to_buffer(buf) -class SeriesInfo(BaseInfo): +class SeriesInfo(_BaseInfo): """ Class storing series-specific info. 
""" @@ -538,7 +538,7 @@ "Argument `max_cols` can only be passed " "in DataFrame.info, not Series.info" ) - printer = SeriesInfoPrinter( + printer = _SeriesInfoPrinter( info=self, verbose=verbose, show_counts=show_counts, @@ -572,7 +572,7 @@ return self.data.memory_usage(index=True, deep=deep) -class InfoPrinterAbstract: +class _InfoPrinterAbstract: """ Class for printing dataframe or series info. """ @@ -586,11 +586,11 @@ fmt.buffer_put_lines(buf, lines) @abstractmethod - def _create_table_builder(self) -> TableBuilderAbstract: + def _create_table_builder(self) -> _TableBuilderAbstract: """Create instance of table builder.""" -class DataFrameInfoPrinter(InfoPrinterAbstract): +class _DataFrameInfoPrinter(_InfoPrinterAbstract): """ Class for printing dataframe info. @@ -650,27 +650,27 @@ else: return show_counts - def _create_table_builder(self) -> DataFrameTableBuilder: + def _create_table_builder(self) -> _DataFrameTableBuilder: """ Create instance of table builder based on verbosity and display settings. """ if self.verbose: - return DataFrameTableBuilderVerbose( + return _DataFrameTableBuilderVerbose( info=self.info, with_counts=self.show_counts, ) elif self.verbose is False: # specifically set to False, not necessarily None - return DataFrameTableBuilderNonVerbose(info=self.info) + return _DataFrameTableBuilderNonVerbose(info=self.info) elif self.exceeds_info_cols: - return DataFrameTableBuilderNonVerbose(info=self.info) + return _DataFrameTableBuilderNonVerbose(info=self.info) else: - return DataFrameTableBuilderVerbose( + return _DataFrameTableBuilderVerbose( info=self.info, with_counts=self.show_counts, ) -class SeriesInfoPrinter(InfoPrinterAbstract): +class _SeriesInfoPrinter(_InfoPrinterAbstract): """Class for printing series info. Parameters @@ -694,17 +694,17 @@ self.verbose = verbose self.show_counts = self._initialize_show_counts(show_counts) - def _create_table_builder(self) -> SeriesTableBuilder: + def _create_table_builder(self) -> _SeriesTableBuilder: """ Create instance of table builder based on verbosity. """ if self.verbose or self.verbose is None: - return SeriesTableBuilderVerbose( + return _SeriesTableBuilderVerbose( info=self.info, with_counts=self.show_counts, ) else: - return SeriesTableBuilderNonVerbose(info=self.info) + return _SeriesTableBuilderNonVerbose(info=self.info) def _initialize_show_counts(self, show_counts: bool | None) -> bool: if show_counts is None: @@ -713,13 +713,13 @@ return show_counts -class TableBuilderAbstract(ABC): +class _TableBuilderAbstract(ABC): """ Abstract builder for info table. """ _lines: list[str] - info: BaseInfo + info: _BaseInfo @abstractmethod def get_lines(self) -> list[str]: @@ -769,7 +769,7 @@ self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") -class DataFrameTableBuilder(TableBuilderAbstract): +class _DataFrameTableBuilder(_TableBuilderAbstract): """ Abstract builder for dataframe info table. @@ -820,7 +820,7 @@ self._lines.append(f"memory usage: {self.memory_usage_string}") -class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): +class _DataFrameTableBuilderNonVerbose(_DataFrameTableBuilder): """ Dataframe info table builder for non-verbose output. """ @@ -838,7 +838,7 @@ self._lines.append(self.ids._summary(name="Columns")) -class TableBuilderVerboseMixin(TableBuilderAbstract): +class _TableBuilderVerboseMixin(_TableBuilderAbstract): """ Mixin for verbose info output. 
""" @@ -931,7 +931,7 @@ yield pprint_thing(dtype) -class DataFrameTableBuilderVerbose(DataFrameTableBuilder, TableBuilderVerboseMixin): +class _DataFrameTableBuilderVerbose(_DataFrameTableBuilder, _TableBuilderVerboseMixin): """ Dataframe info table builder for verbose output. """ @@ -997,7 +997,7 @@ yield pprint_thing(col) -class SeriesTableBuilder(TableBuilderAbstract): +class _SeriesTableBuilder(_TableBuilderAbstract): """ Abstract builder for series info table. @@ -1029,7 +1029,7 @@ """Add lines to the info table, pertaining to non-empty series.""" -class SeriesTableBuilderNonVerbose(SeriesTableBuilder): +class _SeriesTableBuilderNonVerbose(_SeriesTableBuilder): """ Series info table builder for non-verbose output. """ @@ -1043,7 +1043,7 @@ self.add_memory_usage_line() -class SeriesTableBuilderVerbose(SeriesTableBuilder, TableBuilderVerboseMixin): +class _SeriesTableBuilderVerbose(_SeriesTableBuilder, _TableBuilderVerboseMixin): """ Series info table builder for verbose output. """ diff -Nru pandas-2.1.4+dfsg/pandas/io/formats/printing.py pandas-2.2.2+dfsg/pandas/io/formats/printing.py --- pandas-2.1.4+dfsg/pandas/io/formats/printing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/formats/printing.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,11 +15,14 @@ TypeVar, Union, ) +from unicodedata import east_asian_width from pandas._config import get_option from pandas.core.dtypes.inference import is_sequence +from pandas.io.formats.console import get_console_size + EscapeChars = Union[Mapping[str, str], Iterable[str]] _KT = TypeVar("_KT") _VT = TypeVar("_VT") @@ -42,7 +45,7 @@ function used to justify str. Needed for unicode handling. """ strlen = kwargs.pop("strlen", len) - justfunc = kwargs.pop("justfunc", justify) + justfunc = kwargs.pop("justfunc", _adj_justify) newLists = [] lengths = [max(map(strlen, x)) + space for x in lists[:-1]] @@ -57,7 +60,7 @@ return "\n".join("".join(lines) for lines in toJoin) -def justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]: +def _adj_justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]: """ Perform ljust, center, rjust against string or list-like """ @@ -314,9 +317,6 @@ ------- summary string """ - from pandas.io.formats.console import get_console_size - from pandas.io.formats.format import get_adjustment - display_width, _ = get_console_size() if display_width is None: display_width = get_option("display.width") or 80 @@ -501,3 +501,72 @@ def __repr__(self) -> str: return pprint_thing(self) + + +class _TextAdjustment: + def __init__(self) -> None: + self.encoding = get_option("display.encoding") + + def len(self, text: str) -> int: + return len(text) + + def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]: + """ + Perform ljust, center, rjust against string or list-like + """ + if mode == "left": + return [x.ljust(max_len) for x in texts] + elif mode == "center": + return [x.center(max_len) for x in texts] + else: + return [x.rjust(max_len) for x in texts] + + def adjoin(self, space: int, *lists, **kwargs) -> str: + return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs) + + +class _EastAsianTextAdjustment(_TextAdjustment): + def __init__(self) -> None: + super().__init__() + if get_option("display.unicode.ambiguous_as_wide"): + self.ambiguous_width = 2 + else: + self.ambiguous_width = 1 + + # Definition of East Asian Width + # https://unicode.org/reports/tr11/ + # Ambiguous width can be changed by option + self._EAW_MAP = 
{"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1} + + def len(self, text: str) -> int: + """ + Calculate display width considering unicode East Asian Width + """ + if not isinstance(text, str): + return len(text) + + return sum( + self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text + ) + + def justify( + self, texts: Iterable[str], max_len: int, mode: str = "right" + ) -> list[str]: + # re-calculate padding space per str considering East Asian Width + def _get_pad(t): + return max_len - self.len(t) + len(t) + + if mode == "left": + return [x.ljust(_get_pad(x)) for x in texts] + elif mode == "center": + return [x.center(_get_pad(x)) for x in texts] + else: + return [x.rjust(_get_pad(x)) for x in texts] + + +def get_adjustment() -> _TextAdjustment: + use_east_asian_width = get_option("display.unicode.east_asian_width") + if use_east_asian_width: + return _EastAsianTextAdjustment() + else: + return _TextAdjustment() diff -Nru pandas-2.1.4+dfsg/pandas/io/formats/string.py pandas-2.2.2+dfsg/pandas/io/formats/string.py --- pandas-2.1.4+dfsg/pandas/io/formats/string.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/formats/string.py 2024-04-10 17:42:52.000000000 +0000 @@ -28,7 +28,7 @@ def to_string(self) -> str: text = self._get_string_representation() if self.fmt.should_show_dimensions: - text = "".join([text, self.fmt.dimensions_info]) + text = f"{text}{self.fmt.dimensions_info}" return text def _get_strcols(self) -> list[list[str]]: diff -Nru pandas-2.1.4+dfsg/pandas/io/formats/style.py pandas-2.2.2+dfsg/pandas/io/formats/style.py --- pandas-2.1.4+dfsg/pandas/io/formats/style.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/formats/style.py 2024-04-10 17:42:52.000000000 +0000 @@ -161,9 +161,6 @@ uuid_len : int, default 5 If ``uuid`` is not specified, the length of the ``uuid`` to randomly generate expressed in hex characters, in range [0, 32]. - - .. versionadded:: 1.2.0 - decimal : str, optional Character used as decimal separator for floats, complex and integers. If not given uses ``pandas.options.styler.format.decimal``. @@ -2517,23 +2514,14 @@ in their respective tuple form. The dict values should be a list as specified in the form with CSS selectors and props that will be applied to the specified row or column. - - .. versionchanged:: 1.2.0 - axis : {0 or 'index', 1 or 'columns', None}, default 0 Apply to each column (``axis=0`` or ``'index'``), to each row (``axis=1`` or ``'columns'``). Only used if `table_styles` is dict. - - .. versionadded:: 1.2.0 - overwrite : bool, default True Styles are replaced if `True`, or extended if `False`. CSS rules are preserved so most recent styles set will dominate if selectors intersect. - - .. versionadded:: 1.2.0 - css_class_names : dict, optional A dict of strings used to replace the default CSS classes described below. @@ -2722,7 +2710,7 @@ - Boolean - ValueError: cannot supply ``subset`` and ``level`` simultaneously. - Note this method only hides the identifed elements so can be chained to hide + Note this method only hides the identified elements so can be chained to hide multiple elements in sequence. Examples @@ -4082,8 +4070,9 @@ return ret values = data.to_numpy() - left = np.nanmin(values) if vmin is None else vmin - right = np.nanmax(values) if vmax is None else vmax + # A tricky way to address the issue where np.nanmin/np.nanmax fail to handle pd.NA. 
+ left = np.nanmin(data.min(skipna=True)) if vmin is None else vmin + right = np.nanmax(data.max(skipna=True)) if vmax is None else vmax z: float = 0 # adjustment to translate data if align == "mid": diff -Nru pandas-2.1.4+dfsg/pandas/io/formats/style_render.py pandas-2.2.2+dfsg/pandas/io/formats/style_render.py --- pandas-2.1.4+dfsg/pandas/io/formats/style_render.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/formats/style_render.py 2024-04-10 17:42:52.000000000 +0000 @@ -1652,9 +1652,9 @@ Result is a dictionary of (level, initial_position): span """ if isinstance(index, MultiIndex): - levels = index.format(sparsify=lib.no_default, adjoin=False) + levels = index._format_multi(sparsify=lib.no_default, include_names=False) else: - levels = index.format() + levels = index._format_flat(include_name=False) if hidden_elements is None: hidden_elements = [] @@ -2357,7 +2357,7 @@ return latex_styles -def _escape_latex(s): +def _escape_latex(s: str) -> str: r""" Replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, ``{``, ``}``, ``~``, ``^``, and ``\`` in the string with LaTeX-safe sequences. @@ -2392,7 +2392,7 @@ ) -def _math_mode_with_dollar(s): +def _math_mode_with_dollar(s: str) -> str: r""" All characters in LaTeX math mode are preserved. @@ -2425,7 +2425,7 @@ return "".join(res).replace(r"rt8§=§7wz", r"\$") -def _math_mode_with_parentheses(s): +def _math_mode_with_parentheses(s: str) -> str: r""" All characters in LaTeX math mode are preserved. @@ -2461,7 +2461,7 @@ return "".join(res) -def _escape_latex_math(s): +def _escape_latex_math(s: str) -> str: r""" All characters in LaTeX math mode are preserved. diff -Nru pandas-2.1.4+dfsg/pandas/io/formats/xml.py pandas-2.2.2+dfsg/pandas/io/formats/xml.py --- pandas-2.1.4+dfsg/pandas/io/formats/xml.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/formats/xml.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,10 +8,15 @@ from typing import ( TYPE_CHECKING, Any, + final, ) +import warnings from pandas.errors import AbstractMethodError -from pandas.util._decorators import doc +from pandas.util._decorators import ( + cache_readonly, + doc, +) from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.missing import isna @@ -40,7 +45,7 @@ storage_options=_shared_docs["storage_options"], compression_options=_shared_docs["compression_options"] % "path_or_buffer", ) -class BaseXMLFormatter: +class _BaseXMLFormatter: """ Subclass for formatting data in XML. @@ -137,14 +142,14 @@ self.storage_options = storage_options self.orig_cols = self.frame.columns.tolist() - self.frame_dicts = self.process_dataframe() + self.frame_dicts = self._process_dataframe() - self.validate_columns() - self.validate_encoding() - self.prefix_uri = self.get_prefix_uri() - self.handle_indexes() + self._validate_columns() + self._validate_encoding() + self.prefix_uri = self._get_prefix_uri() + self._handle_indexes() - def build_tree(self) -> bytes: + def _build_tree(self) -> bytes: """ Build tree from data. @@ -153,7 +158,8 @@ """ raise AbstractMethodError(self) - def validate_columns(self) -> None: + @final + def _validate_columns(self) -> None: """ Validate elems_cols and attrs_cols. @@ -174,7 +180,8 @@ f"{type(self.elem_cols).__name__} is not a valid type for elem_cols" ) - def validate_encoding(self) -> None: + @final + def _validate_encoding(self) -> None: """ Validate encoding. 
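# Sketch (illustrative data, not part of the patch) of the problem the Styler.bar
# change above works around: np.nanmin/np.nanmax fail on values containing pd.NA,
# so the bounds are now taken from the nullable-aware pandas reductions with
# skipna=True before handing a plain scalar to numpy.
import numpy as np
import pandas as pd

s = pd.Series([1, 2, pd.NA], dtype="Int64")
left = np.nanmin(s.min(skipna=True))   # 1: reduce with pandas first, then nanmin
right = np.nanmax(s.max(skipna=True))  # 2
print(left, right)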
@@ -188,7 +195,8 @@ codecs.lookup(self.encoding) - def process_dataframe(self) -> dict[int | str, dict[str, Any]]: + @final + def _process_dataframe(self) -> dict[int | str, dict[str, Any]]: """ Adjust Data Frame to fit xml output. @@ -202,11 +210,18 @@ df = df.reset_index() if self.na_rep is not None: - df = df.fillna(self.na_rep) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + df = df.fillna(self.na_rep) return df.to_dict(orient="index") - def handle_indexes(self) -> None: + @final + def _handle_indexes(self) -> None: """ Handle indexes. @@ -227,7 +242,7 @@ if self.elem_cols: self.elem_cols = indexes + self.elem_cols - def get_prefix_uri(self) -> str: + def _get_prefix_uri(self) -> str: """ Get uri of namespace prefix. @@ -241,7 +256,8 @@ raise AbstractMethodError(self) - def other_namespaces(self) -> dict: + @final + def _other_namespaces(self) -> dict: """ Define other namespaces. @@ -260,7 +276,8 @@ return nmsp_dict - def build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any: + @final + def _build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any: """ Create attributes of row. @@ -280,6 +297,7 @@ raise KeyError(f"no valid column, {col}") return elem_row + @final def _get_flat_col_name(self, col: str | tuple) -> str: flat_col = col if isinstance(col, tuple): @@ -290,17 +308,20 @@ ) return f"{self.prefix_uri}{flat_col}" - def build_elems(self, d: dict[str, Any], elem_row: Any) -> None: + @cache_readonly + def _sub_element_cls(self): + raise AbstractMethodError(self) + + @final + def _build_elems(self, d: dict[str, Any], elem_row: Any) -> None: """ Create child elements of row. This method adds child elements using elem_cols to row element and works with tuples for multindex or hierarchical columns. """ + sub_element_cls = self._sub_element_cls - raise AbstractMethodError(self) - - def _build_elems(self, sub_element_cls, d: dict[str, Any], elem_row: Any) -> None: if not self.elem_cols: return @@ -312,8 +333,9 @@ except KeyError: raise KeyError(f"no valid column, {col}") + @final def write_output(self) -> str | None: - xml_doc = self.build_tree() + xml_doc = self._build_tree() if self.path_or_buffer is not None: with get_handle( @@ -330,13 +352,13 @@ return xml_doc.decode(self.encoding).rstrip() -class EtreeXMLFormatter(BaseXMLFormatter): +class EtreeXMLFormatter(_BaseXMLFormatter): """ Class for formatting data in xml using Python standard library modules: `xml.etree.ElementTree` and `xml.dom.minidom`. 
""" - def build_tree(self) -> bytes: + def _build_tree(self) -> bytes: from xml.etree.ElementTree import ( Element, SubElement, @@ -344,7 +366,7 @@ ) self.root = Element( - f"{self.prefix_uri}{self.root_name}", attrib=self.other_namespaces() + f"{self.prefix_uri}{self.root_name}", attrib=self._other_namespaces() ) for d in self.frame_dicts.values(): @@ -352,11 +374,11 @@ if not self.attr_cols and not self.elem_cols: self.elem_cols = list(d.keys()) - self.build_elems(d, elem_row) + self._build_elems(d, elem_row) else: - elem_row = self.build_attribs(d, elem_row) - self.build_elems(d, elem_row) + elem_row = self._build_attribs(d, elem_row) + self._build_elems(d, elem_row) self.out_xml = tostring( self.root, @@ -366,7 +388,7 @@ ) if self.pretty_print: - self.out_xml = self.prettify_tree() + self.out_xml = self._prettify_tree() if self.stylesheet is not None: raise ValueError( @@ -375,7 +397,7 @@ return self.out_xml - def get_prefix_uri(self) -> str: + def _get_prefix_uri(self) -> str: from xml.etree.ElementTree import register_namespace uri = "" @@ -395,12 +417,13 @@ return uri - def build_elems(self, d: dict[str, Any], elem_row: Any) -> None: + @cache_readonly + def _sub_element_cls(self): from xml.etree.ElementTree import SubElement - self._build_elems(SubElement, d, elem_row) + return SubElement - def prettify_tree(self) -> bytes: + def _prettify_tree(self) -> bytes: """ Output tree for pretty print format. @@ -414,7 +437,7 @@ return dom.toprettyxml(indent=" ", encoding=self.encoding) -class LxmlXMLFormatter(BaseXMLFormatter): +class LxmlXMLFormatter(_BaseXMLFormatter): """ Class for formatting data in xml using Python standard library modules: `xml.etree.ElementTree` and `xml.dom.minidom`. @@ -423,9 +446,9 @@ def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.convert_empty_str_key() + self._convert_empty_str_key() - def build_tree(self) -> bytes: + def _build_tree(self) -> bytes: """ Build tree from data. @@ -445,11 +468,11 @@ if not self.attr_cols and not self.elem_cols: self.elem_cols = list(d.keys()) - self.build_elems(d, elem_row) + self._build_elems(d, elem_row) else: - elem_row = self.build_attribs(d, elem_row) - self.build_elems(d, elem_row) + elem_row = self._build_attribs(d, elem_row) + self._build_elems(d, elem_row) self.out_xml = tostring( self.root, @@ -460,11 +483,11 @@ ) if self.stylesheet is not None: - self.out_xml = self.transform_doc() + self.out_xml = self._transform_doc() return self.out_xml - def convert_empty_str_key(self) -> None: + def _convert_empty_str_key(self) -> None: """ Replace zero-length string in `namespaces`. @@ -475,7 +498,7 @@ if self.namespaces and "" in self.namespaces.keys(): self.namespaces[None] = self.namespaces.pop("", "default") - def get_prefix_uri(self) -> str: + def _get_prefix_uri(self) -> str: uri = "" if self.namespaces: if self.prefix: @@ -490,12 +513,13 @@ return uri - def build_elems(self, d: dict[str, Any], elem_row: Any) -> None: + @cache_readonly + def _sub_element_cls(self): from lxml.etree import SubElement - self._build_elems(SubElement, d, elem_row) + return SubElement - def transform_doc(self) -> bytes: + def _transform_doc(self) -> bytes: """ Parse stylesheet from file or buffer and run it. 
diff -Nru pandas-2.1.4+dfsg/pandas/io/gbq.py pandas-2.2.2+dfsg/pandas/io/gbq.py --- pandas-2.1.4+dfsg/pandas/io/gbq.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/gbq.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,8 +5,10 @@ TYPE_CHECKING, Any, ) +import warnings from pandas.compat._optional import import_optional_dependency +from pandas.util._exceptions import find_stack_level if TYPE_CHECKING: import google.auth @@ -43,6 +45,10 @@ """ Load data from Google BigQuery. + .. deprecated:: 2.2.0 + + Please use ``pandas_gbq.read_gbq`` instead. + This function requires the `pandas-gbq package `__. @@ -178,6 +184,13 @@ ... dialect="standard" ... ) # doctest: +SKIP """ + warnings.warn( + "read_gbq is deprecated and will be removed in a future version. " + "Please use pandas_gbq.read_gbq instead: " + "https://pandas-gbq.readthedocs.io/en/latest/api.html#pandas_gbq.read_gbq", + FutureWarning, + stacklevel=find_stack_level(), + ) pandas_gbq = _try_import() kwargs: dict[str, str | bool | int | None] = {} @@ -219,6 +232,13 @@ progress_bar: bool = True, credentials: google.auth.credentials.Credentials | None = None, ) -> None: + warnings.warn( + "to_gbq is deprecated and will be removed in a future version. " + "Please use pandas_gbq.to_gbq instead: " + "https://pandas-gbq.readthedocs.io/en/latest/api.html#pandas_gbq.to_gbq", + FutureWarning, + stacklevel=find_stack_level(), + ) pandas_gbq = _try_import() pandas_gbq.to_gbq( dataframe, diff -Nru pandas-2.1.4+dfsg/pandas/io/html.py pandas-2.2.2+dfsg/pandas/io/html.py --- pandas-2.1.4+dfsg/pandas/io/html.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/html.py 2024-04-10 17:42:52.000000000 +0000 @@ -57,6 +57,7 @@ BaseBuffer, DtypeBackend, FilePath, + HTMLFlavors, ReadBuffer, StorageOptions, ) @@ -268,7 +269,7 @@ # Both lxml and BeautifulSoup have the same implementation: return obj.get(attr) - def _href_getter(self, obj): + def _href_getter(self, obj) -> str | None: """ Return a href if the DOM node contains a child or None. @@ -391,7 +392,7 @@ """ raise AbstractMethodError(self) - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: """ Return whether an individual DOM node matches a tag @@ -590,14 +591,8 @@ :class:`pandas.io.html._HtmlFrameParser`. """ - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - from bs4 import SoupStrainer - - self._strainer = SoupStrainer("table") - def _parse_tables(self, document, match, attrs): - element_name = self._strainer.name + element_name = "table" tables = document.find_all(element_name, attrs=attrs) if not tables: raise ValueError("No tables found") @@ -628,7 +623,7 @@ def _text_getter(self, obj): return obj.text - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: return obj.name == tag def _parse_td(self, row): @@ -757,7 +752,7 @@ raise ValueError(f"No tables found matching regex {repr(pattern)}") return tables - def _equals_tag(self, obj, tag): + def _equals_tag(self, obj, tag) -> bool: return obj.tag == tag def _build_doc(self): @@ -889,13 +884,13 @@ } -def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]: +def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]: """ Choose the parser based on the input flavor. Parameters ---------- - flavor : str + flavor : {{"lxml", "html5lib", "bs4"}} or None The type of parser to use. This must be a valid backend. 
Returns @@ -1033,7 +1028,7 @@ io: FilePath | ReadBuffer[str], *, match: str | Pattern = ".+", - flavor: str | None = None, + flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None, header: int | Sequence[int] | None = None, index_col: int | Sequence[int] | None = None, skiprows: int | Sequence[int] | slice | None = None, @@ -1074,11 +1069,11 @@ This value is converted to a regular expression so that there is consistent behavior between Beautiful Soup and lxml. - flavor : str, optional - The parsing engine to use. 'bs4' and 'html5lib' are synonymous with - each other, they are both there for backwards compatibility. The - default of ``None`` tries to use ``lxml`` to parse and if that fails it - falls back on ``bs4`` + ``html5lib``. + flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional + The parsing engine (or list of parsing engines) to use. 'bs4' and + 'html5lib' are synonymous with each other, they are both there for + backwards compatibility. The default of ``None`` tries to use ``lxml`` + to parse and if that fails it falls back on ``bs4`` + ``html5lib``. header : int or list-like, optional The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to diff -Nru pandas-2.1.4+dfsg/pandas/io/json/__init__.py pandas-2.2.2+dfsg/pandas/io/json/__init__.py --- pandas-2.1.4+dfsg/pandas/io/json/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/json/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,14 +1,14 @@ from pandas.io.json._json import ( read_json, to_json, - ujson_dumps as dumps, - ujson_loads as loads, + ujson_dumps, + ujson_loads, ) from pandas.io.json._table_schema import build_table_schema __all__ = [ - "dumps", - "loads", + "ujson_dumps", + "ujson_loads", "read_json", "to_json", "build_table_schema", diff -Nru pandas-2.1.4+dfsg/pandas/io/json/_json.py pandas-2.2.2+dfsg/pandas/io/json/_json.py --- pandas-2.1.4+dfsg/pandas/io/json/_json.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/json/_json.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,6 +14,7 @@ Generic, Literal, TypeVar, + final, overload, ) import warnings @@ -32,13 +33,16 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend -from pandas.core.dtypes.common import ensure_str +from pandas.core.dtypes.common import ( + ensure_str, + is_string_dtype, +) from pandas.core.dtypes.dtypes import PeriodDtype -from pandas.core.dtypes.generic import ABCIndex from pandas import ( ArrowDtype, DataFrame, + Index, MultiIndex, Series, isna, @@ -82,6 +86,7 @@ JSONEngine, JSONSerializable, ReadBuffer, + Self, StorageOptions, WriteBuffer, ) @@ -174,7 +179,7 @@ if mode == "a" and (not lines or orient != "records"): msg = ( - "mode='a' (append) is only supported when" + "mode='a' (append) is only supported when " "lines is True and orient is 'records'" ) raise ValueError(msg) @@ -250,7 +255,7 @@ self.is_copy = None self._format_axes() - def _format_axes(self): + def _format_axes(self) -> None: raise AbstractMethodError(self) def write(self) -> str: @@ -282,7 +287,7 @@ else: return self.obj - def _format_axes(self): + def _format_axes(self) -> None: if not self.obj.index.is_unique and self.orient == "index": raise ValueError(f"Series index must be unique for orient='{self.orient}'") @@ -299,7 +304,7 @@ obj_to_write = self.obj return obj_to_write - def _format_axes(self): + def _format_axes(self) -> None: """ Try to format axes if they are datelike. """ @@ -644,11 +649,6 @@ for more information on ``chunksize``. 
This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. - - .. versionchanged:: 1.2 - - ``JsonReader`` is a context manager. - {decompression_options} .. versionchanged:: 1.4.0 Zstandard support. @@ -660,8 +660,6 @@ {storage_options} - .. versionadded:: 1.2.0 - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' Back-end data type applied to the resultant :class:`DataFrame` (still experimental). Behaviour is as follows: @@ -759,6 +757,19 @@ {{"index":"row 2","col 1":"c","col 2":"d"}}]\ }}\ ' + + The following example uses ``dtype_backend="numpy_nullable"`` + + >>> data = '''{{"index": {{"0": 0, "1": 1}}, + ... "a": {{"0": 1, "1": null}}, + ... "b": {{"0": 2.5, "1": 4.5}}, + ... "c": {{"0": true, "1": false}}, + ... "d": {{"0": "a", "1": "b"}}, + ... "e": {{"0": 1577.2, "1": 1577.1}}}}''' + >>> pd.read_json(StringIO(data), dtype_backend="numpy_nullable") + index a b c d e + 0 0 1 2.5 True a 1577.2 + 1 1 4.5 False b 1577.1 """ if orient == "table" and dtype: raise ValueError("cannot pass both dtype and orient='table'") @@ -1056,7 +1067,7 @@ if self.handles is not None: self.handles.close() - def __iter__(self: JsonReader[FrameSeriesStrT]) -> JsonReader[FrameSeriesStrT]: + def __iter__(self) -> Self: return self @overload @@ -1099,7 +1110,7 @@ else: return obj - def __enter__(self) -> JsonReader[FrameSeriesStrT]: + def __enter__(self) -> Self: return self def __exit__( @@ -1122,10 +1133,11 @@ "us": 31536000000000, "ns": 31536000000000000, } + json: str def __init__( self, - json, + json: str, orient, dtype: DtypeArg | None = None, convert_axes: bool = True, @@ -1160,7 +1172,8 @@ self.obj: DataFrame | Series | None = None self.dtype_backend = dtype_backend - def check_keys_split(self, decoded) -> None: + @final + def check_keys_split(self, decoded: dict) -> None: """ Checks that dict has only the appropriate keys for orient='split'. """ @@ -1169,6 +1182,7 @@ bad_keys_joined = ", ".join(bad_keys) raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}") + @final def parse(self): self._parse() @@ -1179,9 +1193,10 @@ self._try_convert_types() return self.obj - def _parse(self): + def _parse(self) -> None: raise AbstractMethodError(self) + @final def _convert_axes(self) -> None: """ Try to convert axes. @@ -1189,34 +1204,49 @@ obj = self.obj assert obj is not None # for mypy for axis_name in obj._AXIS_ORDERS: - new_axis, result = self._try_convert_data( + ax = obj._get_axis(axis_name) + ser = Series(ax, dtype=ax.dtype, copy=False) + new_ser, result = self._try_convert_data( name=axis_name, - data=obj._get_axis(axis_name), + data=ser, use_dtypes=False, convert_dates=True, + is_axis=True, ) if result: + new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False) setattr(self.obj, axis_name, new_axis) - def _try_convert_types(self): + def _try_convert_types(self) -> None: raise AbstractMethodError(self) + @final def _try_convert_data( self, name: Hashable, - data, + data: Series, use_dtypes: bool = True, convert_dates: bool | list[str] = True, - ): + is_axis: bool = False, + ) -> tuple[Series, bool]: """ - Try to parse a ndarray like into a column by inferring dtype. + Try to parse a Series into a column by inferring dtype. 
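Both the XML writer earlier in this patch and the JSON parser here wrap their fillna calls in a warnings filter, silencing the "Downcasting object dtype arrays" FutureWarning that pandas 2.x can emit when filling object-dtype data, rather than letting it surface to callers. A minimal sketch of the same pattern, outside the patched code:

    import warnings

    import pandas as pd

    ser = pd.Series([True, None], dtype=object)

    with warnings.catch_warnings():
        # Mirror the hunks in this patch: ignore the downcasting FutureWarning
        # that fillna may raise on object-dtype data.
        warnings.filterwarnings(
            "ignore",
            "Downcasting object dtype arrays",
            category=FutureWarning,
        )
        filled = ser.fillna(False)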
""" # don't try to coerce, unless a force conversion if use_dtypes: if not self.dtype: if all(notna(data)): return data, False - return data.fillna(np.nan), True + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + filled = data.fillna(np.nan) + + return filled, True elif self.dtype is True: pass @@ -1236,23 +1266,25 @@ if result: return new_data, True - if self.dtype_backend is not lib.no_default and not isinstance(data, ABCIndex): + converted = False + if self.dtype_backend is not lib.no_default and not is_axis: # Fall through for conversion later on return data, True - elif data.dtype == "object": + elif is_string_dtype(data.dtype): # try float try: data = data.astype("float64") + converted = True except (TypeError, ValueError): pass - if data.dtype.kind == "f": - if data.dtype != "float64": - # coerce floats to 64 - try: - data = data.astype("float64") - except (TypeError, ValueError): - pass + if data.dtype.kind == "f" and data.dtype != "float64": + # coerce floats to 64 + try: + data = data.astype("float64") + converted = True + except (TypeError, ValueError): + pass # don't coerce 0-len data if len(data) and data.dtype in ("float", "object"): @@ -1261,14 +1293,15 @@ new_data = data.astype("int64") if (new_data == data).all(): data = new_data + converted = True except (TypeError, ValueError, OverflowError): pass - # coerce ints to 64 - if data.dtype == "int": - # coerce floats to 64 + if data.dtype == "int" and data.dtype != "int64": + # coerce ints to 64 try: data = data.astype("int64") + converted = True except (TypeError, ValueError): pass @@ -1277,9 +1310,10 @@ if self.orient == "split": return data, False - return data, True + return data, converted - def _try_convert_to_date(self, data): + @final + def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: """ Try to parse a ndarray like into a date column. @@ -1291,6 +1325,10 @@ return data, False new_data = data + + if new_data.dtype == "string": + new_data = new_data.astype(object) + if new_data.dtype == "object": try: new_data = data.astype("int64") @@ -1325,13 +1363,11 @@ return new_data, True return data, False - def _try_convert_dates(self): - raise AbstractMethodError(self) - class SeriesParser(Parser): _default_orient = "index" _split_keys = ("name", "index", "data") + obj: Series | None def _parse(self) -> None: data = ujson_loads(self.json, precise_float=self.precise_float) @@ -1356,6 +1392,7 @@ class FrameParser(Parser): _default_orient = "columns" _split_keys = ("columns", "index", "data") + obj: DataFrame | None def _parse(self) -> None: json = self.json @@ -1393,12 +1430,16 @@ ujson_loads(json, precise_float=self.precise_float), dtype=None ) - def _process_converter(self, f, filt=None) -> None: + def _process_converter( + self, + f: Callable[[Hashable, Series], tuple[Series, bool]], + filt: Callable[[Hashable], bool] | None = None, + ) -> None: """ Take a conversion function and possibly recreate the frame. """ if filt is None: - filt = lambda col, c: True + filt = lambda col: True obj = self.obj assert obj is not None # for mypy @@ -1406,7 +1447,7 @@ needs_new_obj = False new_obj = {} for i, (col, c) in enumerate(obj.items()): - if filt(col, c): + if filt(col): new_data, result = f(col, c) if result: c = new_data @@ -1443,6 +1484,10 @@ """ Return if this col is ok to try for a date parse. 
""" + if col in convert_dates: + return True + if not self.keep_default_dates: + return False if not isinstance(col, str): return False @@ -1457,9 +1502,4 @@ return True return False - self._process_converter( - lambda col, c: self._try_convert_to_date(c), - lambda col, c: ( - (self.keep_default_dates and is_ok(col)) or col in convert_dates - ), - ) + self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok) diff -Nru pandas-2.1.4+dfsg/pandas/io/json/_table_schema.py pandas-2.2.2+dfsg/pandas/io/json/_table_schema.py --- pandas-2.1.4+dfsg/pandas/io/json/_table_schema.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/json/_table_schema.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,6 +15,7 @@ from pandas._libs import lib from pandas._libs.json import ujson_loads from pandas._libs.tslibs import timezones +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import _registry as registry @@ -34,6 +35,8 @@ from pandas import DataFrame import pandas.core.common as com +from pandas.tseries.frequencies import to_offset + if TYPE_CHECKING: from pandas._typing import ( DtypeObj, @@ -207,8 +210,12 @@ if field.get("tz"): return f"datetime64[ns, {field['tz']}]" elif field.get("freq"): + # GH#9586 rename frequency M to ME for offsets + offset = to_offset(field["freq"]) + freq_n, freq_name = offset.n, offset.name + freq = freq_to_period_freqstr(freq_n, freq_name) # GH#47747 using datetime over period to minimize the change surface - return f"period[{field['freq']}]" + return f"period[{freq}]" else: return "datetime64[ns]" elif typ == "any": diff -Nru pandas-2.1.4+dfsg/pandas/io/orc.py pandas-2.2.2+dfsg/pandas/io/orc.py --- pandas-2.1.4+dfsg/pandas/io/orc.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/orc.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,17 +12,9 @@ from pandas._config import using_pyarrow_string_dtype from pandas._libs import lib -from pandas.compat import pa_version_under8p0 from pandas.compat._optional import import_optional_dependency from pandas.util._validators import check_dtype_backend -from pandas.core.dtypes.common import is_unsigned_integer_dtype -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - IntervalDtype, - PeriodDtype, -) - import pandas as pd from pandas.core.indexes.api import default_index @@ -168,7 +160,7 @@ (e.g. via builtin open function). If path is None, a bytes object is returned. engine : str, default 'pyarrow' - ORC library to use. Pyarrow must be >= 7.0.0. + ORC library to use. index : bool, optional If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. @@ -224,20 +216,9 @@ if df.index.name is not None: raise ValueError("orc does not serialize index meta-data on a default index") - # If unsupported dtypes are found raise NotImplementedError - # In Pyarrow 8.0.0 this check will no longer be needed - if pa_version_under8p0: - for dtype in df.dtypes: - if isinstance( - dtype, (IntervalDtype, CategoricalDtype, PeriodDtype) - ) or is_unsigned_integer_dtype(dtype): - raise NotImplementedError( - "The dtype of one or more columns is not supported yet." 
- ) - if engine != "pyarrow": raise ValueError("engine must be 'pyarrow'") - engine = import_optional_dependency(engine, min_version="7.0.0") + engine = import_optional_dependency(engine, min_version="10.0.1") pa = import_optional_dependency("pyarrow") orc = import_optional_dependency("pyarrow.orc") diff -Nru pandas-2.1.4+dfsg/pandas/io/parquet.py pandas-2.2.2+dfsg/pandas/io/parquet.py --- pandas-2.1.4+dfsg/pandas/io/parquet.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/parquet.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,6 +13,7 @@ from warnings import catch_warnings from pandas._config import using_pyarrow_string_dtype +from pandas._config.config import _get_option from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -165,7 +166,7 @@ import pyarrow.parquet # import utils to register the pyarrow extension types - import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401,E501 + import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401 self.api = pyarrow @@ -206,9 +207,10 @@ and hasattr(path_or_handle, "name") and isinstance(path_or_handle.name, (str, bytes)) ): - path_or_handle = path_or_handle.name - if isinstance(path_or_handle, bytes): - path_or_handle = path_or_handle.decode() + if isinstance(path_or_handle.name, bytes): + path_or_handle = path_or_handle.name.decode() + else: + path_or_handle = path_or_handle.name try: if partition_cols is not None: @@ -254,11 +256,11 @@ mapping = _arrow_dtype_mapping() to_pandas_kwargs["types_mapper"] = mapping.get elif dtype_backend == "pyarrow": - to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] # noqa: E501 + to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] elif using_pyarrow_string_dtype(): to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() - manager = get_option("mode.data_manager") + manager = _get_option("mode.data_manager", silent=True) if manager == "array": to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] @@ -428,9 +430,6 @@ returned as bytes. If a string, it will be used as Root Directory path when writing a partitioned dataset. The engine fastparquet does not accept file-like objects. - - .. versionchanged:: 1.2.0 - engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' Parquet library to use. If 'auto', then the option ``io.parquet.engine`` is used. The default ``io.parquet.engine`` @@ -459,8 +458,6 @@ Must be None if path is not a string. {storage_options} - .. versionadded:: 1.2.0 - filesystem : fsspec or pyarrow filesystem, default None Filesystem object to use when reading the parquet file. Only implemented for ``engine="pyarrow"``. 
diff -Nru pandas-2.1.4+dfsg/pandas/io/parsers/arrow_parser_wrapper.py pandas-2.2.2+dfsg/pandas/io/parsers/arrow_parser_wrapper.py --- pandas-2.1.4+dfsg/pandas/io/parsers/arrow_parser_wrapper.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/parsers/arrow_parser_wrapper.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,12 +1,19 @@ from __future__ import annotations from typing import TYPE_CHECKING +import warnings from pandas._config import using_pyarrow_string_dtype from pandas._libs import lib from pandas.compat._optional import import_optional_dependency +from pandas.errors import ( + ParserError, + ParserWarning, +) +from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.inference import is_integer import pandas as pd @@ -34,7 +41,7 @@ self._parse_kwds() - def _parse_kwds(self): + def _parse_kwds(self) -> None: """ Validates keywords before passing to pyarrow. """ @@ -58,6 +65,7 @@ "escapechar": "escape_char", "skip_blank_lines": "ignore_empty_lines", "decimal": "decimal_point", + "quotechar": "quote_char", } for pandas_name, pyarrow_name in mapping.items(): if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None: @@ -85,6 +93,30 @@ and option_name in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") } + + on_bad_lines = self.kwds.get("on_bad_lines") + if on_bad_lines is not None: + if callable(on_bad_lines): + self.parse_options["invalid_row_handler"] = on_bad_lines + elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR: + self.parse_options[ + "invalid_row_handler" + ] = None # PyArrow raises an exception by default + elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN: + + def handle_warning(invalid_row) -> str: + warnings.warn( + f"Expected {invalid_row.expected_columns} columns, but found " + f"{invalid_row.actual_columns}: {invalid_row.text}", + ParserWarning, + stacklevel=find_stack_level(), + ) + return "skip" + + self.parse_options["invalid_row_handler"] = handle_warning + elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP: + self.parse_options["invalid_row_handler"] = lambda _: "skip" + self.convert_options = { option_name: option_value for option_name, option_value in self.kwds.items() @@ -100,6 +132,12 @@ ) } self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] + # autogenerated column names are prefixed with 'f' in pyarrow.csv + if self.header is None and "include_columns" in self.convert_options: + self.convert_options["include_columns"] = [ + f"f{n}" for n in self.convert_options["include_columns"] + ] + self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header @@ -167,7 +205,13 @@ # Ignore non-existent columns from dtype mapping # like other parsers do if isinstance(self.dtype, dict): - self.dtype = {k: v for k, v in self.dtype.items() if k in frame.columns} + self.dtype = { + k: pandas_dtype(v) + for k, v in self.dtype.items() + if k in frame.columns + } + else: + self.dtype = pandas_dtype(self.dtype) try: frame = frame.astype(self.dtype) except TypeError as e: @@ -175,6 +219,17 @@ raise ValueError(e) return frame + def _validate_usecols(self, usecols) -> None: + if lib.is_list_like(usecols) and not all(isinstance(x, str) for x in usecols): + raise ValueError( + "The pyarrow engine does not allow 'usecols' to be integer " + "column positions. Pass a list of string column names instead." 
+ ) + elif callable(usecols): + raise ValueError( + "The pyarrow engine does not allow 'usecols' to be a callable." + ) + def read(self) -> DataFrame: """ Reads the contents of a CSV file into a DataFrame and @@ -190,12 +245,32 @@ pyarrow_csv = import_optional_dependency("pyarrow.csv") self._get_pyarrow_options() - table = pyarrow_csv.read_csv( - self.src, - read_options=pyarrow_csv.ReadOptions(**self.read_options), - parse_options=pyarrow_csv.ParseOptions(**self.parse_options), - convert_options=pyarrow_csv.ConvertOptions(**self.convert_options), - ) + try: + convert_options = pyarrow_csv.ConvertOptions(**self.convert_options) + except TypeError: + include = self.convert_options.get("include_columns", None) + if include is not None: + self._validate_usecols(include) + + nulls = self.convert_options.get("null_values", set()) + if not lib.is_list_like(nulls) or not all( + isinstance(x, str) for x in nulls + ): + raise TypeError( + "The 'pyarrow' engine requires all na_values to be strings" + ) + + raise + + try: + table = pyarrow_csv.read_csv( + self.src, + read_options=pyarrow_csv.ReadOptions(**self.read_options), + parse_options=pyarrow_csv.ParseOptions(**self.parse_options), + convert_options=convert_options, + ) + except pa.ArrowInvalid as e: + raise ParserError(e) from e dtype_backend = self.kwds["dtype_backend"] @@ -222,6 +297,7 @@ frame = table.to_pandas(types_mapper=dtype_mapping.get) elif using_pyarrow_string_dtype(): frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + else: frame = table.to_pandas() return self._finalize_pandas_output(frame) diff -Nru pandas-2.1.4+dfsg/pandas/io/parsers/base_parser.py pandas-2.2.2+dfsg/pandas/io/parsers/base_parser.py --- pandas-2.1.4+dfsg/pandas/io/parsers/base_parser.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/parsers/base_parser.py 2024-04-10 17:42:52.000000000 +0000 @@ -63,6 +63,7 @@ from pandas.core import algorithms from pandas.core.arrays import ( ArrowExtensionArray, + BaseMaskedArray, BooleanArray, Categorical, ExtensionArray, @@ -711,7 +712,7 @@ values, na_values, False, - convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] # noqa: E501 + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] ) except (ValueError, TypeError): # e.g. 
encountering datetime string gets ValueError @@ -747,7 +748,7 @@ np.asarray(values), true_values=self.true_values, false_values=self.false_values, - convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] # noqa: E501 + convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] ) if result.dtype == np.bool_ and non_default_dtype_backend: if bool_mask is None: @@ -756,14 +757,23 @@ elif result.dtype == np.object_ and non_default_dtype_backend: # read_excel sends array of datetime objects if not lib.is_datetime_array(result, skipna=True): - result = StringDtype().construct_array_type()._from_sequence(values) + dtype = StringDtype() + cls = dtype.construct_array_type() + result = cls._from_sequence(values, dtype=dtype) if dtype_backend == "pyarrow": pa = import_optional_dependency("pyarrow") if isinstance(result, np.ndarray): result = ArrowExtensionArray(pa.array(result, from_pandas=True)) + elif isinstance(result, BaseMaskedArray): + if result._mask.all(): + # We want an arrow null array here + result = ArrowExtensionArray(pa.array([None] * len(result))) + else: + result = ArrowExtensionArray( + pa.array(result._data, mask=result._mask) + ) else: - # ExtensionArray result = ArrowExtensionArray( pa.array(result.to_numpy(), from_pandas=True) ) @@ -810,7 +820,7 @@ if isinstance(cast_type, BooleanDtype): # error: Unexpected keyword argument "true_values" for # "_from_sequence_of_strings" of "ExtensionArray" - return array_type._from_sequence_of_strings( # type: ignore[call-arg] # noqa: E501 + return array_type._from_sequence_of_strings( # type: ignore[call-arg] values, dtype=cast_type, true_values=self.true_values, @@ -1150,14 +1160,19 @@ ".*parsing datetimes with mixed time zones will raise an error", category=FutureWarning, ) - result = tools.to_datetime( - ensure_object(strs), - format=date_fmt, - utc=False, - dayfirst=dayfirst, - errors="ignore", - cache=cache_dates, - ) + str_objs = ensure_object(strs) + try: + result = tools.to_datetime( + str_objs, + format=date_fmt, + utc=False, + dayfirst=dayfirst, + cache=cache_dates, + ) + except (ValueError, TypeError): + # test_usecols_with_parse_dates4 + return str_objs + if isinstance(result, DatetimeIndex): arr = result.to_numpy() arr.flags.writeable = True @@ -1172,17 +1187,22 @@ "will raise an error", category=FutureWarning, ) - result = tools.to_datetime( - date_parser( - *(unpack_if_single_element(arg) for arg in date_cols) - ), - errors="ignore", - cache=cache_dates, + pre_parsed = date_parser( + *(unpack_if_single_element(arg) for arg in date_cols) ) + try: + result = tools.to_datetime( + pre_parsed, + cache=cache_dates, + ) + except (ValueError, TypeError): + # test_read_csv_with_custom_date_parser + result = pre_parsed if isinstance(result, datetime.datetime): raise Exception("scalar parser") return result except Exception: + # e.g. test_datetime_fractional_seconds with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -1190,13 +1210,15 @@ "will raise an error", category=FutureWarning, ) - return tools.to_datetime( - parsing.try_parse_dates( - parsing.concat_date_cols(date_cols), - parser=date_parser, - ), - errors="ignore", + pre_parsed = parsing.try_parse_dates( + parsing.concat_date_cols(date_cols), + parser=date_parser, ) + try: + return tools.to_datetime(pre_parsed) + except (ValueError, TypeError): + # TODO: not reached in tests 2023-10-27; needed? 
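The converter hunks above drop errors="ignore" from the tools.to_datetime calls and instead wrap each call in try/except, returning the unconverted values when parsing fails, which lines up with the deprecation of errors="ignore" in pandas 2.2. The fallback pattern on its own looks roughly like:

    import pandas as pd

    values = ["2024-01-01", "not a date"]

    # pandas 2.2 style: attempt the conversion and fall back to the raw values,
    # instead of calling to_datetime(values, errors="ignore").
    try:
        result = pd.to_datetime(values)
    except (ValueError, TypeError):
        result = values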
+ return pre_parsed return converter diff -Nru pandas-2.1.4+dfsg/pandas/io/parsers/python_parser.py pandas-2.2.2+dfsg/pandas/io/parsers/python_parser.py --- pandas-2.1.4+dfsg/pandas/io/parsers/python_parser.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/parsers/python_parser.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,7 +13,6 @@ import csv from io import StringIO import re -import sys from typing import ( IO, TYPE_CHECKING, @@ -21,6 +20,7 @@ Literal, cast, ) +import warnings import numpy as np @@ -28,8 +28,10 @@ from pandas.errors import ( EmptyDataError, ParserError, + ParserWarning, ) from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_bool_dtype, @@ -613,8 +615,8 @@ ] if missing_usecols: raise ParserError( - "Defining usecols without of bounds indices is not allowed. " - f"{missing_usecols} are out of bounds.", + "Defining usecols with out-of-bounds indices is not allowed. " + f"{missing_usecols} are out-of-bounds.", ) col_indices = self.usecols @@ -778,8 +780,11 @@ if self.on_bad_lines == self.BadLineHandleMethod.ERROR: raise ParserError(msg) if self.on_bad_lines == self.BadLineHandleMethod.WARN: - base = f"Skipping line {row_num}: " - sys.stderr.write(base + msg + "\n") + warnings.warn( + f"Skipping line {row_num}: {msg}\n", + ParserWarning, + stacklevel=find_stack_level(), + ) def _next_iter_line(self, row_num: int) -> list[Scalar] | None: """ @@ -1112,18 +1117,18 @@ new_rows = [] try: if rows is not None: - rows_to_skip = 0 - if self.skiprows is not None and self.pos is not None: - # Only read additional rows if pos is in skiprows - rows_to_skip = len( - set(self.skiprows) - set(range(self.pos)) - ) - - for _ in range(rows + rows_to_skip): + row_index = 0 + row_ct = 0 + offset = self.pos if self.pos is not None else 0 + while row_ct < rows: # assert for mypy, data is Iterator[str] or None, would # error in next assert self.data is not None - new_rows.append(next(self.data)) + new_row = next(self.data) + if not self.skipfunc(offset + row_index): + row_ct += 1 + row_index += 1 + new_rows.append(new_row) len_new_rows = len(new_rows) new_rows = self._remove_skipped_rows(new_rows) @@ -1132,11 +1137,11 @@ rows = 0 while True: - new_row = self._next_iter_line(row_num=self.pos + rows + 1) + next_row = self._next_iter_line(row_num=self.pos + rows + 1) rows += 1 - if new_row is not None: - new_rows.append(new_row) + if next_row is not None: + new_rows.append(next_row) len_new_rows = len(new_rows) except StopIteration: diff -Nru pandas-2.1.4+dfsg/pandas/io/parsers/readers.py pandas-2.2.2+dfsg/pandas/io/parsers/readers.py --- pandas-2.1.4+dfsg/pandas/io/parsers/readers.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/parsers/readers.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,7 +5,10 @@ """ from __future__ import annotations -from collections import abc +from collections import ( + abc, + defaultdict, +) import csv import sys from textwrap import fill @@ -23,6 +26,8 @@ import numpy as np +from pandas._config import using_copy_on_write + from pandas._libs import lib from pandas._libs.parsers import STR_NA_VALUES from pandas.errors import ( @@ -36,10 +41,13 @@ from pandas.core.dtypes.common import ( is_file_like, is_float, + is_hashable, is_integer, is_list_like, + pandas_dtype, ) +from pandas import Series from pandas.core.frame import DataFrame from pandas.core.indexes.api import RangeIndex from pandas.core.shared_docs import _shared_docs @@ -65,6 
+73,7 @@ if TYPE_CHECKING: from collections.abc import ( Hashable, + Iterable, Mapping, Sequence, ) @@ -76,10 +85,11 @@ DtypeArg, DtypeBackend, FilePath, - HashableT, IndexLabel, ReadCsvBuffer, + Self, StorageOptions, + UsecolsArgType, ) _doc_read_csv_and_table = ( r""" @@ -140,7 +150,7 @@ Note: ``index_col=False`` can be used to force pandas to *not* use the first column as the index, e.g., when you have a malformed file with delimiters at the end of each line. -usecols : list of Hashable or Callable, optional +usecols : Sequence of Hashable or Callable, optional Subset of columns to select, denoted either by column labels or column indices. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings @@ -230,19 +240,22 @@ performance of reading a large file. verbose : bool, default False Indicate number of ``NA`` values placed in non-numeric columns. + + .. deprecated:: 2.2.0 skip_blank_lines : bool, default True If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. parse_dates : bool, list of Hashable, list of lists or dict of {{Hashable : list}}, \ default False The behavior is as follows: - * ``bool``. If ``True`` -> try parsing the index. + * ``bool``. If ``True`` -> try parsing the index. Note: Automatically set to + ``True`` if ``date_format`` or ``date_parser`` arguments have been passed. * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse - as a single date column. + as a single date column. Values are joined with a space before parsing. * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call - result 'foo' + result 'foo'. Values are joined with a space before parsing. If a column or index cannot be represented as an array of ``datetime``, say because of an unparsable value or a mixture of timezones, the column @@ -278,11 +291,20 @@ Use ``date_format`` instead, or read in as ``object`` and then apply :func:`~pandas.to_datetime` as-needed. date_format : str or dict of column -> format, optional - Format to use for parsing dates when used in conjunction with ``parse_dates``. - For anything more complex, please read in as ``object`` and then apply - :func:`~pandas.to_datetime` as-needed. + Format to use for parsing dates when used in conjunction with ``parse_dates``. + The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See + `strftime documentation + `_ for more information on choices, though + note that :const:`"%f"` will parse all the way up to nanoseconds. + You can also pass: + + - "ISO8601", to parse any `ISO8601 `_ + time string (not necessarily in exactly the same format); + - "mixed", to infer the format for each element individually. This is risky, + and you should probably use it along with `dayfirst`. - .. versionadded:: 2.0.0 + .. versionadded:: 2.0.0 dayfirst : bool, default False DD/MM format dates, international and European format. cache_dates : bool, default True @@ -293,10 +315,6 @@ iterator : bool, default False Return ``TextFileReader`` object for iteration or getting chunks with ``get_chunk()``. - - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. chunksize : int, optional Number of lines to read from the file per chunk. Passing a value will cause the function to return a ``TextFileReader`` object for iteration. @@ -304,9 +322,6 @@ `_ for more information on ``iterator`` and ``chunksize``. - .. 
versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. {decompression_options} .. versionchanged:: 1.4.0 Zstandard support. @@ -346,17 +361,6 @@ standard encodings `_ . - .. versionchanged:: 1.2 - - When ``encoding`` is ``None``, ``errors='replace'`` is passed to - ``open()``. Otherwise, ``errors='strict'`` is passed to ``open()``. - This behavior was previously only the case for ``engine='python'``. - - .. versionchanged:: 1.3.0 - - ``encoding_errors`` is a new argument. ``encoding`` has no longer an - influence on how encoding errors are handled. - encoding_errors : str, optional, default 'strict' How encoding errors are treated. `List of possible values `_ . @@ -389,11 +393,21 @@ expected, a ``ParserWarning`` will be emitted while dropping extra elements. Only supported when ``engine='python'`` + .. versionchanged:: 2.2.0 + + - Callable, function with signature + as described in `pyarrow documentation + `_ when ``engine='pyarrow'`` + delim_whitespace : bool, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option is set to ``True``, nothing should be passed in for the ``delimiter`` parameter. + + .. deprecated:: 2.2.0 + Use ``sep="\\s+"`` instead. low_memory : bool, default True Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. To ensure no mixed @@ -411,12 +425,8 @@ ``'legacy'`` for the original lower precision pandas converter, and ``'round_trip'`` for the round-trip converter. - .. versionchanged:: 1.2 - {storage_options} - .. versionadded:: 1.2 - dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' Back-end data type applied to the resultant :class:`DataFrame` (still experimental). 
Behaviour is as follows: @@ -482,7 +492,6 @@ "thousands", "memory_map", "dialect", - "on_bad_lines", "delim_whitespace", "quoting", "lineterminator", @@ -627,7 +636,7 @@ header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -637,16 +646,18 @@ skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., skipfooter: int = ..., nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., - keep_default_na: bool = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., na_filter: bool = ..., - verbose: bool = ..., + verbose: bool | lib.NoDefault = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool = ..., + keep_date_col: bool | lib.NoDefault = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[True], @@ -664,7 +675,7 @@ encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., on_bad_lines=..., - delim_whitespace: bool = ..., + delim_whitespace: bool | lib.NoDefault = ..., low_memory: bool = ..., memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., @@ -684,7 +695,7 @@ header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -694,16 +705,19 @@ skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., skipfooter: int = ..., nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., - verbose: bool = ..., + verbose: bool | lib.NoDefault = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool = ..., + keep_date_col: bool | lib.NoDefault = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -721,7 +735,7 @@ encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., on_bad_lines=..., - delim_whitespace: bool = ..., + delim_whitespace: bool | lib.NoDefault = ..., low_memory: bool = ..., memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., @@ -741,7 +755,7 @@ header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg 
| None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -751,16 +765,19 @@ skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., skipfooter: int = ..., nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., - verbose: bool = ..., + verbose: bool | lib.NoDefault = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool = ..., + keep_date_col: bool | lib.NoDefault = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[False] = ..., @@ -778,7 +795,7 @@ encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., on_bad_lines=..., - delim_whitespace: bool = ..., + delim_whitespace: bool | lib.NoDefault = ..., low_memory: bool = ..., memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., @@ -798,7 +815,7 @@ header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -808,16 +825,19 @@ skiprows: list[int] | int | Callable[[Hashable], bool] | None = ..., skipfooter: int = ..., nrows: int | None = ..., - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., - verbose: bool = ..., + verbose: bool | lib.NoDefault = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool = ..., + keep_date_col: bool | lib.NoDefault = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -835,7 +855,7 @@ encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., on_bad_lines=..., - delim_whitespace: bool = ..., + delim_whitespace: bool | lib.NoDefault = ..., low_memory: bool = ..., memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., @@ -866,7 +886,7 @@ header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] | Callable[[Hashable], bool] | None = None, + usecols: UsecolsArgType = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -878,17 +898,20 @@ skipfooter: int = 0, nrows: int | None = None, # NA and Missing Data Handling - na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None, + na_values: Hashable + | Iterable[Hashable] + | Mapping[Hashable, Iterable[Hashable]] + | None = None, keep_default_na: bool = True, na_filter: bool = True, - verbose: bool = False, + 
verbose: bool | lib.NoDefault = lib.no_default, skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, infer_datetime_format: bool | lib.NoDefault = lib.no_default, - keep_date_col: bool = False, + keep_date_col: bool | lib.NoDefault = lib.no_default, date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | None = None, + date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, # Iteration @@ -910,13 +933,45 @@ # Error Handling on_bad_lines: str = "error", # Internal - delim_whitespace: bool = False, + delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, float_precision: Literal["high", "legacy"] | None = None, storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: + if keep_date_col is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'keep_date_col' keyword in pd.read_csv is deprecated and " + "will be removed in a future version. Explicitly remove unwanted " + "columns after parsing instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + keep_date_col = False + + if lib.is_list_like(parse_dates): + # GH#55569 + depr = False + # error: Item "bool" of "bool | Sequence[Hashable] | None" has no + # attribute "__iter__" (not iterable) + if not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] + depr = True + elif isinstance(parse_dates, dict) and any( + lib.is_list_like(x) for x in parse_dates.values() + ): + depr = True + if depr: + warnings.warn( + "Support for nested sequences for 'parse_dates' in pd.read_csv " + "is deprecated. Combine the desired columns with pd.to_datetime " + "after parsing instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if infer_datetime_format is not lib.no_default: warnings.warn( "The argument 'infer_datetime_format' is deprecated and will " @@ -927,6 +982,29 @@ FutureWarning, stacklevel=find_stack_level(), ) + + if delim_whitespace is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'delim_whitespace' keyword in pd.read_csv is deprecated and " + "will be removed in a future version. 
Use ``sep='\\s+'`` instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + delim_whitespace = False + + if verbose is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'verbose' keyword in pd.read_csv is deprecated and " + "will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + verbose = False + # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -958,7 +1036,7 @@ header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -971,13 +1049,13 @@ na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., - verbose: bool = ..., + verbose: bool | lib.NoDefault = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool = ..., + keep_date_col: bool | lib.NoDefault = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[True], @@ -1015,7 +1093,7 @@ header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1028,13 +1106,13 @@ na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., - verbose: bool = ..., + verbose: bool | lib.NoDefault = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool = ..., + keep_date_col: bool | lib.NoDefault = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -1072,7 +1150,7 @@ header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1085,13 +1163,13 @@ na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., - verbose: bool = ..., + verbose: bool | lib.NoDefault = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool = ..., + keep_date_col: bool | lib.NoDefault = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[False] = ..., @@ -1129,7 +1207,7 
@@ header: int | Sequence[int] | None | Literal["infer"] = ..., names: Sequence[Hashable] | None | lib.NoDefault = ..., index_col: IndexLabel | Literal[False] | None = ..., - usecols: list[HashableT] | Callable[[Hashable], bool] | None = ..., + usecols: UsecolsArgType = ..., dtype: DtypeArg | None = ..., engine: CSVEngine | None = ..., converters: Mapping[Hashable, Callable] | None = ..., @@ -1142,13 +1220,13 @@ na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = ..., keep_default_na: bool = ..., na_filter: bool = ..., - verbose: bool = ..., + verbose: bool | lib.NoDefault = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., infer_datetime_format: bool | lib.NoDefault = ..., - keep_date_col: bool = ..., + keep_date_col: bool | lib.NoDefault = ..., date_parser: Callable | lib.NoDefault = ..., - date_format: str | None = ..., + date_format: str | dict[Hashable, str] | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -1199,7 +1277,7 @@ header: int | Sequence[int] | None | Literal["infer"] = "infer", names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default, index_col: IndexLabel | Literal[False] | None = None, - usecols: list[HashableT] | Callable[[Hashable], bool] | None = None, + usecols: UsecolsArgType = None, # General Parsing Configuration dtype: DtypeArg | None = None, engine: CSVEngine | None = None, @@ -1214,14 +1292,14 @@ na_values: Sequence[str] | Mapping[str, Sequence[str]] | None = None, keep_default_na: bool = True, na_filter: bool = True, - verbose: bool = False, + verbose: bool | lib.NoDefault = lib.no_default, skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] = False, infer_datetime_format: bool | lib.NoDefault = lib.no_default, - keep_date_col: bool = False, + keep_date_col: bool | lib.NoDefault = lib.no_default, date_parser: Callable | lib.NoDefault = lib.no_default, - date_format: str | None = None, + date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, # Iteration @@ -1243,13 +1321,36 @@ # Error Handling on_bad_lines: str = "error", # Internal - delim_whitespace: bool = False, + delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, float_precision: str | None = None, storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: + if keep_date_col is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'keep_date_col' keyword in pd.read_table is deprecated and " + "will be removed in a future version. Explicitly remove unwanted " + "columns after parsing instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + keep_date_col = False + + # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__" + if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] + # GH#55569 + warnings.warn( + "Support for nested sequences for 'parse_dates' in pd.read_table " + "is deprecated. 
Combine the desired columns with pd.to_datetime " + "after parsing instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if infer_datetime_format is not lib.no_default: warnings.warn( "The argument 'infer_datetime_format' is deprecated and will " @@ -1261,6 +1362,28 @@ stacklevel=find_stack_level(), ) + if delim_whitespace is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'delim_whitespace' keyword in pd.read_table is deprecated and " + "will be removed in a future version. Use ``sep='\\s+'`` instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + delim_whitespace = False + + if verbose is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'verbose' keyword in pd.read_table is deprecated and " + "will be removed in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + verbose = False + # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -1282,6 +1405,51 @@ return _read(filepath_or_buffer, kwds) +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: Literal[True], + chunksize: int | None = ..., + **kwds, +) -> TextFileReader: + ... + + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: bool = ..., + chunksize: int, + **kwds, +) -> TextFileReader: + ... + + +@overload +def read_fwf( + filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], + *, + colspecs: Sequence[tuple[int, int]] | str | None = ..., + widths: Sequence[int] | None = ..., + infer_nrows: int = ..., + dtype_backend: DtypeBackend | lib.NoDefault = ..., + iterator: Literal[False] = ..., + chunksize: None = ..., + **kwds, +) -> DataFrame: + ... 
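The deprecations added above for pd.read_csv and pd.read_table (delim_whitespace, verbose, keep_date_col, and nested sequences for parse_dates) all point at documented replacements. A minimal migration sketch, assuming a small whitespace-delimited buffer with hypothetical 'date', 'time' and 'value' columns:

import io
import pandas as pd

buf = io.StringIO("date time value\n2024-01-01 12:00:00 1.5\n")

# Instead of delim_whitespace=True, pass sep=r"\s+"; instead of
# parse_dates=[["date", "time"]] with keep_date_col, combine the columns
# with pd.to_datetime after parsing and drop them explicitly.
df = pd.read_csv(buf, sep=r"\s+")
df["timestamp"] = pd.to_datetime(df["date"] + " " + df["time"])
df = df.drop(columns=["date", "time"])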
+ + def read_fwf( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], *, @@ -1289,6 +1457,8 @@ widths: Sequence[int] | None = None, infer_nrows: int = 100, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + iterator: bool = False, + chunksize: int | None = None, **kwds, ) -> DataFrame | TextFileReader: r""" @@ -1387,6 +1557,8 @@ kwds["colspecs"] = colspecs kwds["infer_nrows"] = infer_nrows kwds["engine"] = "python-fwf" + kwds["iterator"] = iterator + kwds["chunksize"] = chunksize check_dtype_backend(dtype_backend) kwds["dtype_backend"] = dtype_backend @@ -1635,7 +1807,10 @@ # Converting values to NA keep_default_na = options["keep_default_na"] - na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) + floatify = engine != "pyarrow" + na_values, na_fvalues = _clean_na_values( + na_values, keep_default_na, floatify=floatify + ) # handle skiprows; this is internally handled by the # c-engine, so only need for python and pyarrow parsers @@ -1762,7 +1937,40 @@ else: new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + if hasattr(self, "orig_options"): + dtype_arg = self.orig_options.get("dtype", None) + else: + dtype_arg = None + + if isinstance(dtype_arg, dict): + dtype = defaultdict(lambda: None) # type: ignore[var-annotated] + dtype.update(dtype_arg) + elif dtype_arg is not None and pandas_dtype(dtype_arg) in ( + np.str_, + np.object_, + ): + dtype = defaultdict(lambda: dtype_arg) + else: + dtype = None + + if dtype is not None: + new_col_dict = {} + for k, v in col_dict.items(): + d = ( + dtype[k] + if pandas_dtype(dtype[k]) in (np.str_, np.object_) + else None + ) + new_col_dict[k] = Series(v, index=index, dtype=d, copy=False) + else: + new_col_dict = col_dict + + df = DataFrame( + new_col_dict, + columns=columns, + index=index, + copy=not using_copy_on_write(), + ) self._currow += new_rows return df @@ -1776,7 +1984,7 @@ size = min(size, self.nrows - self._currow) return self.read(nrows=size) - def __enter__(self) -> TextFileReader: + def __enter__(self) -> Self: return self def __exit__( @@ -1840,14 +2048,12 @@ values. The options are `None` or `high` for the ordinary converter, `legacy` for the original lower precision pandas converter, and `round_trip` for the round-trip converter. - - .. versionchanged:: 1.2 """ kwds["engine"] = "python" return TextFileReader(*args, **kwds) -def _clean_na_values(na_values, keep_default_na: bool = True): +def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = True): na_fvalues: set | dict if na_values is None: if keep_default_na: @@ -1875,7 +2081,7 @@ else: if not is_list_like(na_values): na_values = [na_values] - na_values = _stringify_na_values(na_values) + na_values = _stringify_na_values(na_values, floatify) if keep_default_na: na_values = na_values | STR_NA_VALUES @@ -1897,7 +2103,7 @@ return result -def _stringify_na_values(na_values): +def _stringify_na_values(na_values, floatify: bool): """return a stringified and numeric for these values""" result: list[str | float] = [] for x in na_values: @@ -1912,13 +2118,15 @@ result.append(f"{v}.0") result.append(str(v)) - result.append(v) - except (TypeError, ValueError, OverflowError): - pass - try: - result.append(int(x)) + if floatify: + result.append(v) except (TypeError, ValueError, OverflowError): pass + if floatify: + try: + result.append(int(x)) + except (TypeError, ValueError, OverflowError): + pass return set(result) @@ -1950,6 +2158,9 @@ used as the sep. Equivalent to setting ``sep='\\s+'``. 
If this option is set to True, nothing should be passed in for the ``delimiter`` parameter. + + .. deprecated:: 2.2.0 + Use ``sep="\\s+"`` instead. engine : {{'c', 'python'}} Parser engine to use. The C engine is faster while the python engine is currently more feature-complete. @@ -2037,9 +2248,10 @@ elif on_bad_lines == "skip": kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP elif callable(on_bad_lines): - if engine != "python": + if engine not in ["python", "pyarrow"]: raise ValueError( - "on_bad_line can only be a callable function if engine='python'" + "on_bad_line can only be a callable function " + "if engine='python' or 'pyarrow'" ) kwds["on_bad_lines"] = on_bad_lines else: diff -Nru pandas-2.1.4+dfsg/pandas/io/pickle.py pandas-2.2.2+dfsg/pandas/io/pickle.py --- pandas-2.1.4+dfsg/pandas/io/pickle.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/pickle.py 2024-04-10 17:42:52.000000000 +0000 @@ -67,8 +67,6 @@ {storage_options} - .. versionadded:: 1.2.0 - .. [1] https://docs.python.org/3/library/pickle.html See Also @@ -143,8 +141,6 @@ {storage_options} - .. versionadded:: 1.2.0 - Returns ------- same type as object stored in file diff -Nru pandas-2.1.4+dfsg/pandas/io/pytables.py pandas-2.2.2+dfsg/pandas/io/pytables.py --- pandas-2.1.4+dfsg/pandas/io/pytables.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/pytables.py 2024-04-10 17:42:52.000000000 +0000 @@ -30,6 +30,7 @@ from pandas._config import ( config, get_option, + using_copy_on_write, using_pyarrow_string_dtype, ) @@ -1706,7 +1707,7 @@ # ------------------------------------------------------------------------ # private methods - def _check_if_open(self): + def _check_if_open(self) -> None: if not self.is_open: raise ClosedFileError(f"{self._path} file is not open!") @@ -1729,7 +1730,7 @@ errors: str = "strict", ) -> GenericFixed | Table: """return a suitable class to operate""" - cls: type[GenericFixed] | type[Table] + cls: type[GenericFixed | Table] if value is not None and not isinstance(value, (Series, DataFrame)): raise TypeError("value must be None, Series, or DataFrame") @@ -2117,7 +2118,7 @@ ] ) - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: object) -> bool: """compare 2 col items""" return all( getattr(self, a, None) == getattr(other, a, None) @@ -2151,14 +2152,13 @@ val_kind = _ensure_decoded(self.kind) values = _maybe_convert(values, val_kind, encoding, errors) - kwargs = {} kwargs["name"] = _ensure_decoded(self.index_name) if self.freq is not None: kwargs["freq"] = _ensure_decoded(self.freq) - factory: type[Index] | type[DatetimeIndex] = Index + factory: type[Index | DatetimeIndex] = Index if lib.is_np_dtype(values.dtype, "M") or isinstance( values.dtype, DatetimeTZDtype ): @@ -2168,8 +2168,10 @@ # error: Incompatible types in assignment (expression has type # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type # "Union[Type[Index], Type[DatetimeIndex]]") - factory = lambda x, **kwds: PeriodIndex( # type: ignore[assignment] - ordinal=x, **kwds + factory = lambda x, **kwds: PeriodIndex.from_ordinals( # type: ignore[assignment] + x, freq=kwds.get("freq", None) + )._rename( + kwds["name"] ) # making an Index instance could throw a number of different errors @@ -2424,7 +2426,7 @@ ] ) - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: object) -> bool: """compare 2 col items""" return all( getattr(self, a, None) == getattr(other, a, None) @@ -2576,7 +2578,7 @@ dtype = _ensure_decoded(dtype_name) # reverse converts - if dtype 
== "datetime64": + if dtype.startswith("datetime64"): # recreate with tz if indicated converted = _set_tz(converted, tz, coerce=True) @@ -2822,7 +2824,7 @@ "cannot read on an abstract storer: subclasses should implement" ) - def write(self, **kwargs): + def write(self, obj, **kwargs) -> None: raise NotImplementedError( "cannot write on an abstract storer: subclasses should implement" ) @@ -2869,7 +2871,9 @@ def f(values, freq=None, tz=None): # data are already in UTC, localize and convert if tz present - dta = DatetimeArray._simple_new(values.values, freq=freq) + dta = DatetimeArray._simple_new( + values.values, dtype=values.dtype, freq=freq + ) result = DatetimeIndex._simple_new(dta, name=None) if tz is not None: result = result.tz_localize("UTC").tz_convert(tz) @@ -2936,8 +2940,7 @@ for n in self.attributes: setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) - # error: Signature of "write" incompatible with supertype "Fixed" - def write(self, obj, **kwargs) -> None: # type: ignore[override] + def write(self, obj, **kwargs) -> None: self.set_attrs() def read_array(self, key: str, start: int | None = None, stop: int | None = None): @@ -2961,7 +2964,7 @@ else: ret = node[start:stop] - if dtype == "datetime64": + if dtype and dtype.startswith("datetime64"): # reconstruct a timezone if indicated tz = getattr(attrs, "tz", None) ret = _set_tz(ret, tz, coerce=True) @@ -3170,7 +3173,7 @@ elif lib.is_np_dtype(value.dtype, "M"): self._handle.create_array(self.group, key, value.view("i8")) - getattr(self.group, key)._v_attrs.value_type = "datetime64" + getattr(self.group, key)._v_attrs.value_type = str(value.dtype) elif isinstance(value.dtype, DatetimeTZDtype): # store as UTC # with a zone @@ -3185,7 +3188,7 @@ # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no # attribute "tz" node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr] - node._v_attrs.value_type = "datetime64" + node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]" elif lib.is_np_dtype(value.dtype, "m"): self._handle.create_array(self.group, key, value.view("i8")) getattr(self.group, key)._v_attrs.value_type = "timedelta64" @@ -3225,8 +3228,7 @@ result = result.astype("string[pyarrow_numpy]") return result - # error: Signature of "write" incompatible with supertype "Fixed" - def write(self, obj, **kwargs) -> None: # type: ignore[override] + def write(self, obj, **kwargs) -> None: super().write(obj, **kwargs) self.write_index("index", obj.index) self.write_array("values", obj) @@ -3297,13 +3299,16 @@ if len(dfs) > 0: out = concat(dfs, axis=1, copy=True) + if using_copy_on_write(): + # with CoW, concat ignores the copy keyword. 
Here, we still want + # to copy to enforce optimized column-major layout + out = out.copy() out = out.reindex(columns=items, copy=False) return out return DataFrame(columns=axes[0], index=axes[1]) - # error: Signature of "write" incompatible with supertype "Fixed" - def write(self, obj, **kwargs) -> None: # type: ignore[override] + def write(self, obj, **kwargs) -> None: super().write(obj, **kwargs) # TODO(ArrayManager) HDFStore relies on accessing the blocks @@ -4060,7 +4065,7 @@ if isinstance(data_converted.dtype, CategoricalDtype): ordered = data_converted.ordered meta = "category" - metadata = np.array(data_converted.categories, copy=False).ravel() + metadata = np.asarray(data_converted.categories).ravel() data, dtype_name = _get_data_and_dtype_name(data_converted) @@ -4354,7 +4359,7 @@ """ raise NotImplementedError("WORMTable needs to implement read") - def write(self, **kwargs) -> None: + def write(self, obj, **kwargs) -> None: """ write in a format that we can search later on (but cannot append to): write out the indices and the values using _write_array @@ -4691,7 +4696,6 @@ selection = Selection(self, where=where, start=start, stop=stop) # apply the selection filters & axis orderings df = self.process_axes(df, selection=selection, columns=columns) - return df @@ -4711,12 +4715,13 @@ def get_object(cls, obj, transposed: bool): return obj - def write(self, obj, data_columns=None, **kwargs): + # error: Signature of "write" incompatible with supertype "Fixed" + def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override] """we are going to write this as a frame table""" if not isinstance(obj, DataFrame): name = obj.name or "values" obj = obj.to_frame(name) - return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs) + super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs) def read( self, @@ -4749,7 +4754,8 @@ pandas_kind = "series_table" table_type = "appendable_multiseries" - def write(self, obj, **kwargs): + # error: Signature of "write" incompatible with supertype "Fixed" + def write(self, obj, **kwargs) -> None: # type: ignore[override] """we are going to write this as a frame table""" name = obj.name or "values" newobj, self.levels = self.validate_multiindex(obj) @@ -4757,7 +4763,7 @@ cols = list(self.levels) cols.append(name) newobj.columns = Index(cols) - return super().write(obj=newobj, **kwargs) + super().write(obj=newobj, **kwargs) class GenericTable(AppendableFrameTable): @@ -4822,7 +4828,8 @@ return _indexables - def write(self, **kwargs): + # error: Signature of "write" incompatible with supertype "AppendableTable" + def write(self, **kwargs) -> None: # type: ignore[override] raise NotImplementedError("cannot write on an generic table") @@ -4838,7 +4845,8 @@ def table_type_short(self) -> str: return "appendable_multi" - def write(self, obj, data_columns=None, **kwargs): + # error: Signature of "write" incompatible with supertype "Fixed" + def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override] if data_columns is None: data_columns = [] elif data_columns is True: @@ -4848,7 +4856,7 @@ for n in self.levels: if n not in data_columns: data_columns.insert(0, n) - return super().write(obj=obj, data_columns=data_columns, **kwargs) + super().write(obj=obj, data_columns=data_columns, **kwargs) def read( self, @@ -4930,11 +4938,12 @@ # call below (which returns an ndarray). So we are only non-lossy # if `tz` matches `values.tz`. 
assert values.tz is None or values.tz == tz + if values.tz is not None: + return values if tz is not None: if isinstance(values, DatetimeIndex): name = values.name - values = values.asi8 else: name = None values = values.ravel() @@ -5017,8 +5026,12 @@ def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index: index: Index | np.ndarray - if kind == "datetime64": - index = DatetimeIndex(data) + if kind.startswith("datetime64"): + if kind == "datetime64": + # created before we stored resolution information + index = DatetimeIndex(data) + else: + index = DatetimeIndex(data.view(kind)) elif kind == "timedelta64": index = TimedeltaIndex(data) elif kind == "date": @@ -5192,6 +5205,8 @@ def _get_converter(kind: str, encoding: str, errors: str): if kind == "datetime64": return lambda x: np.asarray(x, dtype="M8[ns]") + elif "datetime64" in kind: + return lambda x: np.asarray(x, dtype=kind) elif kind == "string": return lambda x: _unconvert_string_array( x, nan_rep=None, encoding=encoding, errors=errors @@ -5201,7 +5216,7 @@ def _need_convert(kind: str) -> bool: - if kind in ("datetime64", "string"): + if kind in ("datetime64", "string") or "datetime64" in kind: return True return False @@ -5246,7 +5261,7 @@ elif dtype_str.startswith(("int", "uint")): kind = "integer" elif dtype_str.startswith("datetime64"): - kind = "datetime64" + kind = dtype_str elif dtype_str.startswith("timedelta"): kind = "timedelta64" elif dtype_str.startswith("bool"): @@ -5271,8 +5286,11 @@ if isinstance(data, Categorical): data = data.codes - # For datetime64tz we need to drop the TZ in tests TODO: why? - dtype_name = data.dtype.name.split("[")[0] + if isinstance(data.dtype, DatetimeTZDtype): + # For datetime64tz we need to drop the TZ in tests TODO: why? + dtype_name = f"datetime64[{data.dtype.unit}]" + else: + dtype_name = data.dtype.name if data.dtype.kind in "mM": data = np.asarray(data.view("i8")) diff -Nru pandas-2.1.4+dfsg/pandas/io/sas/sas7bdat.py pandas-2.2.2+dfsg/pandas/io/sas/sas7bdat.py --- pandas-2.1.4+dfsg/pandas/io/sas/sas7bdat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/sas/sas7bdat.py 2024-04-10 17:42:52.000000000 +0000 @@ -21,10 +21,7 @@ timedelta, ) import sys -from typing import ( - TYPE_CHECKING, - cast, -) +from typing import TYPE_CHECKING import numpy as np @@ -39,14 +36,13 @@ Parser, get_subheader_index, ) -from pandas.errors import ( - EmptyDataError, - OutOfBoundsDatetime, -) +from pandas._libs.tslibs.conversion import cast_from_unit_vectorized +from pandas.errors import EmptyDataError import pandas as pd from pandas import ( DataFrame, + Timestamp, isna, ) @@ -62,6 +58,10 @@ ) +_unix_origin = Timestamp("1970-01-01") +_sas_origin = Timestamp("1960-01-01") + + def _parse_datetime(sas_datetime: float, unit: str): if isna(sas_datetime): return pd.NaT @@ -86,7 +86,7 @@ ---------- sas_datetimes : {Series, Sequence[float]} Dates or datetimes in SAS - unit : {str} + unit : {'d', 's'} "d" if the floats represent dates, "s" for datetimes Returns @@ -94,12 +94,16 @@ Series Series of datetime64 dtype or datetime.datetime. 
""" - try: - return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01") - except OutOfBoundsDatetime: - s_series = sas_datetimes.apply(_parse_datetime, unit=unit) - s_series = cast(pd.Series, s_series) - return s_series + td = (_sas_origin - _unix_origin).as_unit("s") + if unit == "s": + millis = cast_from_unit_vectorized( + sas_datetimes._values, unit="s", out_unit="ms" + ) + dt64ms = millis.view("M8[ms]") + td + return pd.Series(dt64ms, index=sas_datetimes.index, copy=False) + else: + vals = np.array(sas_datetimes, dtype="M8[D]") + td + return pd.Series(vals, dtype="M8[s]", index=sas_datetimes.index, copy=False) class _Column: @@ -723,7 +727,7 @@ if self._column_types[j] == b"d": col_arr = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d") - rslt[name] = pd.Series(col_arr, dtype=np.float64, index=ix) + rslt[name] = pd.Series(col_arr, dtype=np.float64, index=ix, copy=False) if self.convert_dates: if self.column_formats[j] in const.sas_date_formats: rslt[name] = _convert_datetimes(rslt[name], "d") @@ -731,7 +735,7 @@ rslt[name] = _convert_datetimes(rslt[name], "s") jb += 1 elif self._column_types[j] == b"s": - rslt[name] = pd.Series(self._string_chunk[js, :], index=ix) + rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False) if self.convert_text and (self.encoding is not None): rslt[name] = self._decode_string(rslt[name].str) js += 1 diff -Nru pandas-2.1.4+dfsg/pandas/io/sas/sas_xport.py pandas-2.2.2+dfsg/pandas/io/sas/sas_xport.py --- pandas-2.1.4+dfsg/pandas/io/sas/sas_xport.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/sas/sas_xport.py 2024-04-10 17:42:52.000000000 +0000 @@ -288,7 +288,7 @@ def _get_row(self): return self.filepath_or_buffer.read(80).decode() - def _read_header(self): + def _read_header(self) -> None: self.filepath_or_buffer.seek(0) # read file header diff -Nru pandas-2.1.4+dfsg/pandas/io/sas/sasreader.py pandas-2.2.2+dfsg/pandas/io/sas/sasreader.py --- pandas-2.1.4+dfsg/pandas/io/sas/sasreader.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/sas/sasreader.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,9 +3,12 @@ """ from __future__ import annotations +from abc import ( + ABC, + abstractmethod, +) from typing import ( TYPE_CHECKING, - Protocol, overload, ) @@ -23,23 +26,26 @@ CompressionOptions, FilePath, ReadBuffer, + Self, ) from pandas import DataFrame -class ReaderBase(Protocol): +class ReaderBase(ABC): """ Protocol for XportReader and SAS7BDATReader classes. """ + @abstractmethod def read(self, nrows: int | None = None) -> DataFrame: ... + @abstractmethod def close(self) -> None: ... - def __enter__(self) -> ReaderBase: + def __enter__(self) -> Self: return self def __exit__( @@ -110,16 +116,8 @@ Encoding for text data. If None, text data are stored as raw bytes. chunksize : int Read file `chunksize` lines at a time, returns iterator. - - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. iterator : bool, defaults to False If True, returns an iterator for reading the file incrementally. - - .. versionchanged:: 1.2 - - ``TextFileReader`` is a context manager. 
{decompression_options} Returns diff -Nru pandas-2.1.4+dfsg/pandas/io/spss.py pandas-2.2.2+dfsg/pandas/io/spss.py --- pandas-2.1.4+dfsg/pandas/io/spss.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/spss.py 2024-04-10 17:42:52.000000000 +0000 @@ -63,9 +63,10 @@ raise TypeError("usecols must be list-like.") usecols = list(usecols) # pyreadstat requires a list - df, _ = pyreadstat.read_sav( + df, metadata = pyreadstat.read_sav( stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals ) + df.attrs = metadata.__dict__ if dtype_backend is not lib.no_default: df = df.convert_dtypes(dtype_backend=dtype_backend) return df diff -Nru pandas-2.1.4+dfsg/pandas/io/sql.py pandas-2.2.2+dfsg/pandas/io/sql.py --- pandas-2.1.4+dfsg/pandas/io/sql.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/sql.py 2024-04-10 17:42:52.000000000 +0000 @@ -32,6 +32,8 @@ import numpy as np +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -45,7 +47,10 @@ is_dict_like, is_list_like, ) -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + DatetimeTZDtype, +) from pandas.core.dtypes.missing import isna from pandas import get_option @@ -56,6 +61,7 @@ from pandas.core.arrays import ArrowExtensionArray from pandas.core.base import PandasObject import pandas.core.common as com +from pandas.core.common import maybe_make_list from pandas.core.internals.construction import convert_object_array from pandas.core.tools.datetimes import to_datetime @@ -105,6 +111,12 @@ # Format can take on custom to_datetime argument values such as # {"errors": "coerce"} or {"dayfirst": True} error: DateTimeErrorChoices = format.pop("errors", None) or "ignore" + if error == "ignore": + try: + return to_datetime(col, **format) + except (TypeError, ValueError): + # TODO: not reached 2023-10-27; needed? + return col return to_datetime(col, errors=error, **format) else: # Allow passing of formatting string for integers @@ -138,7 +150,7 @@ if isinstance(df_col.dtype, DatetimeTZDtype) or col_name in parse_dates: try: fmt = parse_dates[col_name] - except TypeError: + except (KeyError, TypeError): fmt = None data_frame.isetitem(i, _handle_date_column(df_col, format=fmt)) @@ -160,9 +172,17 @@ ) if dtype_backend == "pyarrow": pa = import_optional_dependency("pyarrow") - arrays = [ - ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays - ] + + result_arrays = [] + for arr in arrays: + pa_array = pa.array(arr, from_pandas=True) + if arr.dtype == "string": + # TODO: Arrow still infers strings arrays as regular strings instead + # of large_string, which is what we preserver everywhere else for + # dtype_backend="pyarrow". 
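The read_spss change above attaches the pyreadstat metadata to the returned frame via DataFrame.attrs. A sketch, assuming pyreadstat is installed and 'survey.sav' is a hypothetical file; the available keys come from pyreadstat's metadata object:

import pandas as pd

df = pd.read_spss("survey.sav")                   # hypothetical .sav file
labels = df.attrs.get("column_names_to_labels")   # e.g. variable labels, if present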
We may want to reconsider this + pa_array = pa_array.cast(pa.string()) + result_arrays.append(ArrowExtensionArray(pa_array)) + arrays = result_arrays # type: ignore[assignment] if arrays: df = DataFrame(dict(zip(list(range(len(columns))), arrays))) df.columns = columns @@ -180,7 +200,7 @@ dtype: DtypeArg | None = None, dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ): - """Wrap result set of query in a DataFrame.""" + """Wrap result set of a SQLAlchemy query in a DataFrame.""" frame = _convert_arrays_to_dataframe(data, columns, coerce_float, dtype_backend) if dtype: @@ -194,6 +214,26 @@ return frame +def _wrap_result_adbc( + df: DataFrame, + *, + index_col=None, + parse_dates=None, + dtype: DtypeArg | None = None, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", +) -> DataFrame: + """Wrap result set of a SQLAlchemy query in a DataFrame.""" + if dtype: + df = df.astype(dtype) + + df = _parse_date_columns(df, parse_dates) + + if index_col is not None: + df = df.set_index(index_col) + + return df + + def execute(sql, con, params=None): """ Execute the given SQL query using the provided connection object. @@ -553,12 +593,13 @@ ---------- sql : str or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name. - con : SQLAlchemy connectable, str, or sqlite3 connection + con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible - for engine disposal and connection closure for the SQLAlchemy connectable; str - connections are closed automatically. See - `here `_. + for engine disposal and connection closure for the ADBC connection and + SQLAlchemy connectable; str connections are closed automatically. See + `here `_. index_col : str or list of str, optional, default: None Column(s) to set as index(MultiIndex). coerce_float : bool, default True @@ -642,6 +683,17 @@ int_column date_column 0 0 2012-11-10 1 1 2010-11-12 + + .. versionadded:: 2.2.0 + + pandas now supports reading via ADBC drivers + + >>> from adbc_driver_postgresql import dbapi # doctest:+SKIP + >>> with dbapi.connect('postgres:///db_name') as conn: # doctest:+SKIP + ... pd.read_sql('SELECT int_column FROM test_data', conn) + int_column + 0 0 + 1 1 """ check_dtype_backend(dtype_backend) @@ -713,8 +765,9 @@ frame : DataFrame, Series name : str Name of SQL table. - con : SQLAlchemy connectable(engine/connection) or database string URI + con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection or sqlite3 DBAPI2 connection + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. @@ -769,7 +822,8 @@ Notes ----- The returned rows affected is the sum of the ``rowcount`` attribute of ``sqlite3.Cursor`` - or SQLAlchemy connectable. The returned value may not reflect the exact number of written + or SQLAlchemy connectable. If using ADBC the returned rows are the result + of ``Cursor.adbc_ingest``. The returned value may not reflect the exact number of written rows as stipulated in the `sqlite3 `__ or `SQLAlchemy `__ @@ -808,7 +862,8 @@ ---------- table_name: string Name of SQL table. 
- con: SQLAlchemy connectable(engine/connection) or sqlite3 DBAPI2 connection + con: ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. @@ -850,6 +905,10 @@ if sqlalchemy is not None and isinstance(con, (str, sqlalchemy.engine.Connectable)): return SQLDatabase(con, schema, need_transaction) + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(con, adbc.Connection): + return ADBCDatabase(con) + warnings.warn( "pandas only supports SQLAlchemy connectable (engine/connection) or " "database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 " @@ -953,11 +1012,12 @@ def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: """ - Alternative to _execute_insert for DBs support multivalue INSERT. + Alternative to _execute_insert for DBs support multi-value INSERT. Note: multi-value insert is usually faster for analytics DBs and tables containing a few columns but performance degrades quickly with increase of columns. + """ from sqlalchemy import insert @@ -1451,7 +1511,7 @@ keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: pass @@ -2010,7 +2070,7 @@ keys: list[str] | None = None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: table = SQLTable( table_name, self, @@ -2024,6 +2084,358 @@ # ---- SQL without SQLAlchemy --- + + +class ADBCDatabase(PandasSQL): + """ + This class enables conversion between DataFrame and SQL databases + using ADBC to handle DataBase abstraction. + + Parameters + ---------- + con : adbc_driver_manager.dbapi.Connection + """ + + def __init__(self, con) -> None: + self.con = con + + @contextmanager + def run_transaction(self): + with self.con.cursor() as cur: + try: + yield cur + except Exception: + self.con.rollback() + raise + self.con.commit() + + def execute(self, sql: str | Select | TextClause, params=None): + if not isinstance(sql, str): + raise TypeError("Query must be a string unless using sqlalchemy.") + args = [] if params is None else [params] + cur = self.con.cursor() + try: + cur.execute(sql, *args) + return cur + except Exception as exc: + try: + self.con.rollback() + except Exception as inner_exc: # pragma: no cover + ex = DatabaseError( + f"Execution failed on sql: {sql}\n{exc}\nunable to rollback" + ) + raise ex from inner_exc + + ex = DatabaseError(f"Execution failed on sql '{sql}': {exc}") + raise ex from exc + + def read_table( + self, + table_name: str, + index_col: str | list[str] | None = None, + coerce_float: bool = True, + parse_dates=None, + columns=None, + schema: str | None = None, + chunksize: int | None = None, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", + ) -> DataFrame | Iterator[DataFrame]: + """ + Read SQL database table into a DataFrame. + + Parameters + ---------- + table_name : str + Name of SQL table in database. + coerce_float : bool, default True + Raises NotImplementedError + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. 
+ - Dict of ``{column_name: arg}``, where the arg corresponds + to the keyword arguments of :func:`pandas.to_datetime`. + Especially useful with databases without native Datetime support, + such as SQLite. + columns : list, default: None + List of column names to select from SQL table. + schema : string, default None + Name of SQL schema in database to query (if database flavor + supports this). If specified, this overwrites the default + schema of the SQL database object. + chunksize : int, default None + Raises NotImplementedError + dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + Returns + ------- + DataFrame + + See Also + -------- + pandas.read_sql_table + SQLDatabase.read_query + + """ + if coerce_float is not True: + raise NotImplementedError( + "'coerce_float' is not implemented for ADBC drivers" + ) + if chunksize: + raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") + + if columns: + if index_col: + index_select = maybe_make_list(index_col) + else: + index_select = [] + to_select = index_select + columns + select_list = ", ".join(f'"{x}"' for x in to_select) + else: + select_list = "*" + if schema: + stmt = f"SELECT {select_list} FROM {schema}.{table_name}" + else: + stmt = f"SELECT {select_list} FROM {table_name}" + + mapping: type[ArrowDtype] | None | Callable + if dtype_backend == "pyarrow": + mapping = ArrowDtype + elif dtype_backend == "numpy_nullable": + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping().get + elif using_pyarrow_string_dtype(): + from pandas.io._util import arrow_string_types_mapper + + arrow_string_types_mapper() + else: + mapping = None + + with self.con.cursor() as cur: + cur.execute(stmt) + df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + + return _wrap_result_adbc( + df, + index_col=index_col, + parse_dates=parse_dates, + ) + + def read_query( + self, + sql: str, + index_col: str | list[str] | None = None, + coerce_float: bool = True, + parse_dates=None, + params=None, + chunksize: int | None = None, + dtype: DtypeArg | None = None, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", + ) -> DataFrame | Iterator[DataFrame]: + """ + Read SQL query into a DataFrame. + + Parameters + ---------- + sql : str + SQL query to be executed. + index_col : string, optional, default: None + Column name to use as index for the returned DataFrame object. + coerce_float : bool, default True + Raises NotImplementedError + params : list, tuple or dict, optional, default: None + Raises NotImplementedError + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict + corresponds to the keyword arguments of + :func:`pandas.to_datetime` Especially useful with databases + without native Datetime support, such as SQLite. + chunksize : int, default None + Raises NotImplementedError + dtype : Type name or dict of columns + Data type for data or columns. E.g. 
np.float64 or + {'a': np.float64, 'b': np.int32, 'c': 'Int64'} + + .. versionadded:: 1.3.0 + + Returns + ------- + DataFrame + + See Also + -------- + read_sql_table : Read SQL database table into a DataFrame. + read_sql + + """ + if coerce_float is not True: + raise NotImplementedError( + "'coerce_float' is not implemented for ADBC drivers" + ) + if params: + raise NotImplementedError("'params' is not implemented for ADBC drivers") + if chunksize: + raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") + + mapping: type[ArrowDtype] | None | Callable + if dtype_backend == "pyarrow": + mapping = ArrowDtype + elif dtype_backend == "numpy_nullable": + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping().get + else: + mapping = None + + with self.con.cursor() as cur: + cur.execute(sql) + df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + + return _wrap_result_adbc( + df, + index_col=index_col, + parse_dates=parse_dates, + dtype=dtype, + ) + + read_sql = read_query + + def to_sql( + self, + frame, + name: str, + if_exists: Literal["fail", "replace", "append"] = "fail", + index: bool = True, + index_label=None, + schema: str | None = None, + chunksize: int | None = None, + dtype: DtypeArg | None = None, + method: Literal["multi"] | Callable | None = None, + engine: str = "auto", + **engine_kwargs, + ) -> int | None: + """ + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + frame : DataFrame + name : string + Name of SQL table. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + - fail: If table exists, do nothing. + - replace: If table exists, drop it, recreate it, and insert data. + - append: If table exists, insert data. Create if does not exist. + index : boolean, default True + Write DataFrame index as a column. + index_label : string or sequence, default None + Raises NotImplementedError + schema : string, default None + Name of SQL schema in database to write to (if database flavor + supports this). If specified, this overwrites the default + schema of the SQLDatabase object. 
+ chunksize : int, default None + Raises NotImplementedError + dtype : single type or dict of column name to SQL type, default None + Raises NotImplementedError + method : {None', 'multi', callable}, default None + Raises NotImplementedError + engine : {'auto', 'sqlalchemy'}, default 'auto' + Raises NotImplementedError if not set to 'auto' + """ + if index_label: + raise NotImplementedError( + "'index_label' is not implemented for ADBC drivers" + ) + if chunksize: + raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") + if dtype: + raise NotImplementedError("'dtype' is not implemented for ADBC drivers") + if method: + raise NotImplementedError("'method' is not implemented for ADBC drivers") + if engine != "auto": + raise NotImplementedError( + "engine != 'auto' not implemented for ADBC drivers" + ) + + if schema: + table_name = f"{schema}.{name}" + else: + table_name = name + + # pandas if_exists="append" will still create the + # table if it does not exist; ADBC is more explicit with append/create + # as applicable modes, so the semantics get blurred across + # the libraries + mode = "create" + if self.has_table(name, schema): + if if_exists == "fail": + raise ValueError(f"Table '{table_name}' already exists.") + elif if_exists == "replace": + with self.con.cursor() as cur: + cur.execute(f"DROP TABLE {table_name}") + elif if_exists == "append": + mode = "append" + + import pyarrow as pa + + try: + tbl = pa.Table.from_pandas(frame, preserve_index=index) + except pa.ArrowNotImplementedError as exc: + raise ValueError("datatypes not supported") from exc + + with self.con.cursor() as cur: + total_inserted = cur.adbc_ingest( + table_name=name, data=tbl, mode=mode, db_schema_name=schema + ) + + self.con.commit() + return total_inserted + + def has_table(self, name: str, schema: str | None = None) -> bool: + meta = self.con.adbc_get_objects( + db_schema_filter=schema, table_name_filter=name + ).read_all() + + for catalog_schema in meta["catalog_db_schemas"].to_pylist(): + if not catalog_schema: + continue + for schema_record in catalog_schema: + if not schema_record: + continue + + for table_record in schema_record["db_schema_tables"]: + if table_record["table_name"] == name: + return True + + return False + + def _create_sql_schema( + self, + frame: DataFrame, + table_name: str, + keys: list[str] | None = None, + dtype: DtypeArg | None = None, + schema: str | None = None, + ) -> str: + raise NotImplementedError("not implemented for adbc") + + # sqlite-specific sql strings and handler class # dictionary used for readability purposes _SQL_TYPES = { @@ -2466,7 +2878,7 @@ keys=None, dtype: DtypeArg | None = None, schema: str | None = None, - ): + ) -> str: table = SQLiteTable( table_name, self, @@ -2497,17 +2909,16 @@ name of SQL table keys : string or sequence, default: None columns to use a primary key - con: an open SQL database connection object or a SQLAlchemy connectable + con: ADBC Connection, SQLAlchemy connectable, sqlite3 connection, default: None + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that - library, default: None + library If a DBAPI2 object, only sqlite3 is supported. dtype : dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should be a SQLAlchemy type, or a string for sqlite3 fallback connection. schema: str, default: None Optional specifying the schema to be used in creating the table. - - .. 
versionadded:: 1.2.0 """ with pandasSQL_builder(con=con) as pandas_sql: return pandas_sql._create_sql_schema( diff -Nru pandas-2.1.4+dfsg/pandas/io/stata.py pandas-2.2.2+dfsg/pandas/io/stata.py --- pandas-2.1.4+dfsg/pandas/io/stata.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/stata.py 2024-04-10 17:42:52.000000000 +0000 @@ -23,7 +23,6 @@ from typing import ( IO, TYPE_CHECKING, - Any, AnyStr, Callable, Final, @@ -48,9 +47,11 @@ ) from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_object, is_numeric_dtype, + is_string_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype @@ -63,10 +64,9 @@ to_datetime, to_timedelta, ) -from pandas.core.arrays.boolean import BooleanDtype -from pandas.core.arrays.integer import IntegerDtype from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index +from pandas.core.indexes.range import RangeIndex from pandas.core.series import Series from pandas.core.shared_docs import _shared_docs @@ -84,6 +84,7 @@ CompressionOptions, FilePath, ReadBuffer, + Self, StorageOptions, WriteBuffer, ) @@ -233,10 +234,7 @@ stata_epoch: Final = datetime(1960, 1, 1) -# TODO: Add typing. As of January 2020 it is not possible to type this function since -# mypy doesn't understand that a Series and an int can be combined using mathematical -# operations. (+, -). -def _stata_elapsed_date_to_datetime_vec(dates, fmt) -> Series: +def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series: """ Convert from SIF to datetime. https://www.stata.com/help.cgi?datetime @@ -344,10 +342,7 @@ has_bad_values = False if bad_locs.any(): has_bad_values = True - # reset cache to avoid SettingWithCopy checks (we own the DataFrame and the - # `dates` Series is used to overwrite itself in the DataFramae) - dates._reset_cacher() - dates[bad_locs] = 1.0 # Replace with NaT + dates._values[bad_locs] = 1.0 # Replace with NaT dates = dates.astype(np.int64) if fmt.startswith(("%tc", "tc")): # Delta ms relative to base @@ -428,9 +423,9 @@ d["year"] = date_index._data.year d["month"] = date_index._data.month if days: - days_in_ns = dates.view(np.int64) - to_datetime( + days_in_ns = dates._values.view(np.int64) - to_datetime( d["year"], format="%Y" - ).view(np.int64) + )._values.view(np.int64) d["days"] = days_in_ns // NS_PER_DAY elif infer_dtype(dates, skipna=False) == "datetime": @@ -464,11 +459,10 @@ bad_loc = isna(dates) index = dates.index if bad_loc.any(): - dates = Series(dates) if lib.is_np_dtype(dates.dtype, "M"): - dates[bad_loc] = to_datetime(stata_epoch) + dates._values[bad_loc] = to_datetime(stata_epoch) else: - dates[bad_loc] = stata_epoch + dates._values[bad_loc] = stata_epoch if fmt in ["%tc", "tc"]: d = parse_dates_safe(dates, delta=True) @@ -500,11 +494,11 @@ else: raise ValueError(f"Format {fmt} is not a known Stata date format") - conv_dates = Series(conv_dates, dtype=np.float64) + conv_dates = Series(conv_dates, dtype=np.float64, copy=False) missing_value = struct.unpack(" None: """Encode value labels.""" self.text_len = 0 @@ -819,7 +814,7 @@ self.labname = labname self._encoding = encoding - self.value_labels: list[tuple[float, str]] = sorted( + self.value_labels = sorted( # type: ignore[assignment] value_labels.items(), key=lambda x: x[0] ) self._prepare_value_labels() @@ -935,7 +930,7 @@ def __repr__(self) -> str: return f"{type(self)}({self})" - def __eq__(self, other: Any) -> bool: + def __eq__(self, other: object) -> 
bool: return ( isinstance(other, type(self)) and self.string == other.string @@ -1054,7 +1049,7 @@ } # Reserved words cannot be used as variable names - self.RESERVED_WORDS = ( + self.RESERVED_WORDS = { "aggregate", "array", "boolean", @@ -1115,7 +1110,7 @@ "_se", "with", "_n", - ) + } class StataReader(StataParser, abc.Iterator): @@ -1138,7 +1133,6 @@ storage_options: StorageOptions | None = None, ) -> None: super().__init__() - self._col_sizes: list[int] = [] # Arguments to the reader (can be temporarily overridden in # calls to read). @@ -1163,7 +1157,6 @@ # State variables for the file self._close_file: Callable[[], None] | None = None - self._has_string_data = False self._missing_values = False self._can_read_value_labels = False self._column_selector_set = False @@ -1212,7 +1205,7 @@ self._read_header() self._setup_dtype() - def __enter__(self) -> StataReader: + def __enter__(self) -> Self: """enter context manager""" self._entered = True return self @@ -1293,11 +1286,6 @@ else: self._read_old_header(first_char) - self._has_string_data = len([x for x in self._typlist if type(x) is int]) > 0 - - # calculate size of a data record - self._col_sizes = [self._calcsize(typ) for typ in self._typlist] - def _read_new_header(self) -> None: # The first part of the header is common to 117 - 119. self._path_or_buf.read(27) # stata_dta>
@@ -1358,29 +1346,21 @@ self, seek_vartypes: int ) -> tuple[list[int | str], list[str | np.dtype]]: self._path_or_buf.seek(seek_vartypes) - raw_typlist = [self._read_uint16() for _ in range(self._nvar)] - - def f(typ: int) -> int | str: + typlist = [] + dtyplist = [] + for _ in range(self._nvar): + typ = self._read_uint16() if typ <= 2045: - return typ - try: - return self.TYPE_MAP_XML[typ] - except KeyError as err: - raise ValueError(f"cannot convert stata types [{typ}]") from err - - typlist = [f(x) for x in raw_typlist] - - def g(typ: int) -> str | np.dtype: - if typ <= 2045: - return str(typ) - try: - return self.DTYPE_MAP_XML[typ] - except KeyError as err: - raise ValueError(f"cannot convert stata dtype [{typ}]") from err - - dtyplist = [g(x) for x in raw_typlist] + typlist.append(typ) + dtyplist.append(str(typ)) + else: + try: + typlist.append(self.TYPE_MAP_XML[typ]) # type: ignore[arg-type] + dtyplist.append(self.DTYPE_MAP_XML[typ]) # type: ignore[arg-type] + except KeyError as err: + raise ValueError(f"cannot convert stata types [{typ}]") from err - return typlist, dtyplist + return typlist, dtyplist # type: ignore[return-value] def _get_varlist(self) -> list[str]: # 33 in order formats, 129 in formats 118 and 119 @@ -1558,11 +1538,6 @@ return self._dtype - def _calcsize(self, fmt: int | str) -> int: - if isinstance(fmt, int): - return fmt - return struct.calcsize(self._byteorder + fmt) - def _decode(self, s: bytes) -> str: # have bytes not strings, so must decode s = s.partition(b"\0")[0] @@ -1785,52 +1760,36 @@ # If index is not specified, use actual row number rather than # restarting at 0 for each chunk. if index_col is None: - rng = range(self._lines_read - read_lines, self._lines_read) - data.index = Index(rng) # set attr instead of set_index to avoid copy + data.index = RangeIndex( + self._lines_read - read_lines, self._lines_read + ) # set attr instead of set_index to avoid copy if columns is not None: data = self._do_select_columns(data, columns) # Decode strings for col, typ in zip(data, self._typlist): - if type(typ) is int: + if isinstance(typ, int): data[col] = data[col].apply(self._decode) data = self._insert_strls(data) - cols_ = np.where([dtyp is not None for dtyp in self._dtyplist])[0] # Convert columns (if needed) to match input type - ix = data.index - requires_type_conversion = False - data_formatted = [] - for i in cols_: - if self._dtyplist[i] is not None: - col = data.columns[i] - dtype = data[col].dtype - if dtype != np.dtype(object) and dtype != self._dtyplist[i]: - requires_type_conversion = True - data_formatted.append( - (col, Series(data[col], ix, self._dtyplist[i])) - ) - else: - data_formatted.append((col, data[col])) - if requires_type_conversion: - data = DataFrame.from_dict(dict(data_formatted)) - del data_formatted + valid_dtypes = [i for i, dtyp in enumerate(self._dtyplist) if dtyp is not None] + object_type = np.dtype(object) + for idx in valid_dtypes: + dtype = data.iloc[:, idx].dtype + if dtype not in (object_type, self._dtyplist[idx]): + data.isetitem(idx, data.iloc[:, idx].astype(dtype)) data = self._do_convert_missing(data, convert_missing) if convert_dates: - - def any_startswith(x: str) -> bool: - return any(x.startswith(fmt) for fmt in _date_formats) - - cols = np.where([any_startswith(x) for x in self._fmtlist])[0] - for i in cols: - col = data.columns[i] - data[col] = _stata_elapsed_date_to_datetime_vec( - data[col], self._fmtlist[i] - ) + for i, fmt in enumerate(self._fmtlist): + if any(fmt.startswith(date_fmt) for date_fmt in 
_date_formats): + data.isetitem( + i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt) + ) if convert_categoricals and self._format_version > 108: data = self._do_convert_categoricals( @@ -1864,14 +1823,14 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame: # Check for missing values, and replace if found replacements = {} - for i, colname in enumerate(data): + for i in range(len(data.columns)): fmt = self._typlist[i] if fmt not in self.VALID_RANGE: continue fmt = cast(str, fmt) # only strs in VALID_RANGE nmin, nmax = self.VALID_RANGE[fmt] - series = data[colname] + series = data.iloc[:, i] # appreciably faster to do this with ndarray instead of Series svals = series._values @@ -1901,11 +1860,10 @@ # Note: operating on ._values is much faster than directly # TODO: can we fix that? replacement._values[missing] = np.nan - replacements[colname] = replacement - + replacements[i] = replacement if replacements: - for col, value in replacements.items(): - data[col] = value + for idx, value in replacements.items(): + data.isetitem(idx, value) return data def _insert_strls(self, data: DataFrame) -> DataFrame: @@ -1915,7 +1873,7 @@ if typ != "Q": continue # Wrap v_o in a string to allow uint64 values as keys on 32bit OS - data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]] + data.isetitem(i, [self.GSO[str(k)] for k in data.iloc[:, i]]) return data def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFrame: @@ -1960,10 +1918,11 @@ """ Converts categorical columns to Categorical type. """ - value_labels = list(value_label_dict.keys()) + if not value_label_dict: + return data cat_converted_data = [] for col, label in zip(data, lbllist): - if label in value_labels: + if label in value_label_dict: # Explicit call with ordered=True vl = value_label_dict[label] keys = np.array(list(vl.keys())) @@ -2329,8 +2288,6 @@ {storage_options} - .. versionadded:: 1.2.0 - value_labels : dict of dicts Dictionary containing columns as keys and dictionaries of column value to labels as values. The combined length of all labels for a single @@ -2464,7 +2421,7 @@ Check for categorical columns, retain categorical information for Stata file and convert categorical data to int """ - is_cat = [isinstance(data[col].dtype, CategoricalDtype) for col in data] + is_cat = [isinstance(dtype, CategoricalDtype) for dtype in data.dtypes] if not any(is_cat): return data @@ -2981,7 +2938,14 @@ for i, col in enumerate(data): typ = typlist[i] if typ <= self._max_string_length: - data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + dc = data[col].fillna("") + data[col] = dc.apply(_pad_bytes, args=(typ,)) stype = f"S{typ}" dtypes[col] = stype data[col] = data[col].astype(stype) diff -Nru pandas-2.1.4+dfsg/pandas/io/xml.py pandas-2.2.2+dfsg/pandas/io/xml.py --- pandas-2.1.4+dfsg/pandas/io/xml.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/io/xml.py 2024-04-10 17:42:52.000000000 +0000 @@ -88,7 +88,7 @@ Parse only the attributes at the specified ``xpath``. names : list - Column names for :class:`~pandas.DataFrame`of parsed XML data. + Column names for :class:`~pandas.DataFrame` of parsed XML data. dtype : dict Data type for data or columns. E.g. {{'a': np.float64, @@ -1058,7 +1058,7 @@ Examples -------- - >>> import io + >>> from io import StringIO >>> xml = ''' ... ... @@ -1078,7 +1078,7 @@ ... 
... ''' - >>> df = pd.read_xml(io.StringIO(xml)) + >>> df = pd.read_xml(StringIO(xml)) >>> df shape degrees sides 0 square 360 4.0 @@ -1092,7 +1092,7 @@ ... ... ''' - >>> df = pd.read_xml(io.StringIO(xml), xpath=".//row") + >>> df = pd.read_xml(StringIO(xml), xpath=".//row") >>> df shape degrees sides 0 square 360 4.0 @@ -1118,7 +1118,7 @@ ... ... ''' - >>> df = pd.read_xml(io.StringIO(xml), + >>> df = pd.read_xml(StringIO(xml), ... xpath="//doc:row", ... namespaces={{"doc": "https://example.com"}}) >>> df @@ -1126,6 +1126,34 @@ 0 square 360 4.0 1 circle 360 NaN 2 triangle 180 3.0 + + >>> xml_data = ''' + ... + ... + ... 0 + ... 1 + ... 2.5 + ... True + ... a + ... 2019-12-31 00:00:00 + ... + ... + ... 1 + ... 4.5 + ... False + ... b + ... 2019-12-31 00:00:00 + ... + ... + ... ''' + + >>> df = pd.read_xml(StringIO(xml_data), + ... dtype_backend="numpy_nullable", + ... parse_dates=["e"]) + >>> df + index a b c d e + 0 0 1 2.5 True a 2019-12-31 + 1 1 4.5 False b 2019-12-31 """ check_dtype_backend(dtype_backend) diff -Nru pandas-2.1.4+dfsg/pandas/plotting/_core.py pandas-2.2.2+dfsg/pandas/plotting/_core.py --- pandas-2.1.4+dfsg/pandas/plotting/_core.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/plotting/_core.py 2024-04-10 17:42:52.000000000 +0000 @@ -37,11 +37,15 @@ from pandas._typing import IndexLabel - from pandas import DataFrame + from pandas import ( + DataFrame, + Series, + ) + from pandas.core.groupby.generic import DataFrameGroupBy def hist_series( - self, + self: Series, by=None, ax=None, grid: bool = True, @@ -237,10 +241,10 @@ .. plot:: :context: close-figs - >>> df = pd.DataFrame({ - ... 'length': [1.5, 0.5, 1.2, 0.9, 3], - ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1] - ... }, index=['pig', 'rabbit', 'duck', 'chicken', 'horse']) + >>> data = {'length': [1.5, 0.5, 1.2, 0.9, 3], + ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1]} + >>> index = ['pig', 'rabbit', 'duck', 'chicken', 'horse'] + >>> df = pd.DataFrame(data, index=index) >>> hist = df.hist(bins=3) """ plot_backend = _get_plot_backend(backend) @@ -512,7 +516,7 @@ @Substitution(data="", backend=_backend_doc) @Appender(_boxplot_doc) def boxplot_frame( - self, + self: DataFrame, column=None, by=None, ax=None, @@ -542,7 +546,7 @@ def boxplot_frame_groupby( - grouped, + grouped: DataFrameGroupBy, subplots: bool = True, column=None, fontsize: int | None = None, @@ -603,10 +607,10 @@ >>> import itertools >>> tuples = [t for t in itertools.product(range(1000), range(4))] >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) - >>> data = np.random.randn(len(index),4) + >>> data = np.random.randn(len(index), 4) >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index) >>> grouped = df.groupby(level='lvl1') - >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8,10)) # doctest: +SKIP + >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8, 10)) # doctest: +SKIP The ``subplots=False`` option shows the boxplots in a single figure. @@ -721,10 +725,6 @@ Name to use for the xlabel on x-axis. Default uses index name as xlabel, or the x-column name for planar plots. - .. versionchanged:: 1.2.0 - - Now applicable to planar plots (`scatter`, `hexbin`). - .. versionchanged:: 2.0.0 Now applicable to histograms. @@ -733,10 +733,6 @@ Name to use for the ylabel on y-axis. Default will show no ylabel, or the y-column name for planar plots. - .. versionchanged:: 1.2.0 - - Now applicable to planar plots (`scatter`, `hexbin`). - .. versionchanged:: 2.0.0 Now applicable to histograms. 
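Among the docstring updates above, the area-plot example switches from freq='M' to freq='ME', reflecting the pandas 2.2 rename of the month-end frequency alias. A minimal illustration:

import pandas as pd

# "M" is deprecated in favour of "ME" (month end) in pandas 2.2, which is why
# the example above now builds its index with freq="ME".
idx = pd.date_range(start="2018/01/01", end="2018/07/01", freq="ME")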
@@ -843,11 +839,11 @@ _kind_aliases = {"density": "kde"} _all_kinds = _common_kinds + _series_kinds + _dataframe_kinds - def __init__(self, data) -> None: + def __init__(self, data: Series | DataFrame) -> None: self._parent = data @staticmethod - def _get_call_args(backend_name: str, data, args, kwargs): + def _get_call_args(backend_name: str, data: Series | DataFrame, args, kwargs): """ This function makes calls to this accessor `__call__` method compatible with the previous `SeriesPlotMethods.__call__` and @@ -961,7 +957,10 @@ return plot_backend.plot(self._parent, x=x, y=y, kind=kind, **kwargs) if kind not in self._all_kinds: - raise ValueError(f"{kind} is not a valid plot kind") + raise ValueError( + f"{kind} is not a valid plot kind " + f"Valid plot kinds: {self._all_kinds}" + ) # The original data structured can be transformed before passed to the # backend. For example, for DataFrame is common to set the index as the @@ -1393,9 +1392,7 @@ .. plot:: :context: close-figs - >>> df = pd.DataFrame( - ... np.random.randint(1, 7, 6000), - ... columns = ['one']) + >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) @@ -1573,7 +1570,7 @@ ... 'signups': [5, 5, 6, 12, 14, 13], ... 'visits': [20, 42, 28, 62, 81, 50], ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01', - ... freq='M')) + ... freq='ME')) >>> ax = df.plot.area() Area plots are stacked by default. To produce an unstacked plot, @@ -1887,7 +1884,7 @@ # entry_points lost dict API ~ PY 3.10 # https://github.com/python/importlib_metadata/issues/298 if hasattr(eps, "select"): - entry = eps.select(group=key) # pyright: ignore[reportGeneralTypeIssues] + entry = eps.select(group=key) else: # Argument 2 to "get" of "dict" has incompatible type "Tuple[]"; # expected "EntryPoints" [arg-type] diff -Nru pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/boxplot.py pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/boxplot.py --- pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/boxplot.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/boxplot.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,9 +10,12 @@ from matplotlib.artist import setp import numpy as np +from pandas._libs import lib +from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import is_dict_like +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import remove_na_arraylike import pandas as pd @@ -35,11 +38,30 @@ from collections.abc import Collection from matplotlib.axes import Axes + from matplotlib.figure import Figure from matplotlib.lines import Line2D from pandas._typing import MatplotlibColor +def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> None: + """Set the tick labels of a given axis. + + Due to https://github.com/matplotlib/matplotlib/pull/17266, we need to handle the + case of repeated ticks (due to `FixedLocator`) and thus we duplicate the number of + labels. 
+ """ + ticks = ax.get_xticks() if is_vertical else ax.get_yticks() + if len(ticks) != len(labels): + i, remainder = divmod(len(ticks), len(labels)) + assert remainder == 0, remainder + labels *= i + if is_vertical: + ax.set_xticklabels(labels, **kwargs) + else: + ax.set_yticklabels(labels, **kwargs) + + class BoxPlot(LinePlot): @property def _kind(self) -> Literal["box"]: @@ -62,7 +84,6 @@ # Do not call LinePlot.__init__ which may fill nan MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called - def _args_adjust(self) -> None: if self.subplots: # Disable label ax sharing. Otherwise, all subplots shows last # column label @@ -74,17 +95,18 @@ # error: Signature of "_plot" incompatible with supertype "MPLPlot" @classmethod def _plot( # type: ignore[override] - cls, ax, y, column_num=None, return_type: str = "axes", **kwds + cls, ax: Axes, y: np.ndarray, column_num=None, return_type: str = "axes", **kwds ): + ys: np.ndarray | list[np.ndarray] if y.ndim == 2: - y = [remove_na_arraylike(v) for v in y] + ys = [remove_na_arraylike(v) for v in y] # Boxplot fails with empty arrays, so need to add a NaN # if any cols are empty # GH 8181 - y = [v if v.size > 0 else np.array([np.nan]) for v in y] + ys = [v if v.size > 0 else np.array([np.nan]) for v in ys] else: - y = remove_na_arraylike(y) - bp = ax.boxplot(y, **kwds) + ys = remove_na_arraylike(y) + bp = ax.boxplot(ys, **kwds) if return_type == "dict": return bp, bp @@ -93,36 +115,50 @@ else: return ax, bp - def _validate_color_args(self): - if "color" in self.kwds: - if self.colormap is not None: - warnings.warn( - "'color' and 'colormap' cannot be used " - "simultaneously. Using 'color'", - stacklevel=find_stack_level(), - ) - self.color = self.kwds.pop("color") + def _validate_color_args(self, color, colormap): + if color is lib.no_default: + return None + + if colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used " + "simultaneously. Using 'color'", + stacklevel=find_stack_level(), + ) - if isinstance(self.color, dict): - valid_keys = ["boxes", "whiskers", "medians", "caps"] - for key in self.color: - if key not in valid_keys: - raise ValueError( - f"color dict contains invalid key '{key}'. " - f"The key must be either {valid_keys}" - ) - else: - self.color = None + if isinstance(color, dict): + valid_keys = ["boxes", "whiskers", "medians", "caps"] + for key in color: + if key not in valid_keys: + raise ValueError( + f"color dict contains invalid key '{key}'. 
" + f"The key must be either {valid_keys}" + ) + return color + @cache_readonly + def _color_attrs(self): # get standard colors for default - colors = get_standard_colors(num_colors=3, colormap=self.colormap, color=None) # use 2 colors by default, for box/whisker and median # flier colors isn't needed here # because it can be specified by ``sym`` kw - self._boxes_c = colors[0] - self._whiskers_c = colors[0] - self._medians_c = colors[2] - self._caps_c = colors[0] + return get_standard_colors(num_colors=3, colormap=self.colormap, color=None) + + @cache_readonly + def _boxes_c(self): + return self._color_attrs[0] + + @cache_readonly + def _whiskers_c(self): + return self._color_attrs[0] + + @cache_readonly + def _medians_c(self): + return self._color_attrs[2] + + @cache_readonly + def _caps_c(self): + return self._color_attrs[0] def _get_colors( self, @@ -148,18 +184,10 @@ medians = self.color or self._medians_c caps = self.color or self._caps_c - # GH 30346, when users specifying those arguments explicitly, our defaults - # for these four kwargs should be overridden; if not, use Pandas settings - if not self.kwds.get("boxprops"): - setp(bp["boxes"], color=boxes, alpha=1) - if not self.kwds.get("whiskerprops"): - setp(bp["whiskers"], color=whiskers, alpha=1) - if not self.kwds.get("medianprops"): - setp(bp["medians"], color=medians, alpha=1) - if not self.kwds.get("capprops"): - setp(bp["caps"], color=caps, alpha=1) + color_tup = (boxes, whiskers, medians, caps) + maybe_color_bp(bp, color_tup=color_tup, **self.kwds) - def _make_plot(self) -> None: + def _make_plot(self, fig: Figure) -> None: if self.subplots: self._return_obj = pd.Series(dtype=object) @@ -170,7 +198,10 @@ else self.data ) - for i, (label, y) in enumerate(self._iter_data(data=data)): + # error: Argument "data" to "_iter_data" of "MPLPlot" has + # incompatible type "object"; expected "DataFrame | + # dict[Hashable, Series | DataFrame]" + for i, (label, y) in enumerate(self._iter_data(data=data)): # type: ignore[arg-type] ax = self._get_ax(i) kwds = self.kwds.copy() @@ -182,9 +213,9 @@ # When `by` is assigned, the ticklabels will become unique grouped # values, instead of label which is used as subtitle in this case. - ticklabels = [ - pprint_thing(col) for col in self.data.columns.levels[0] - ] + # error: "Index" has no attribute "levels"; maybe "nlevels"? 
+ levels = self.data.columns.levels # type: ignore[attr-defined] + ticklabels = [pprint_thing(col) for col in levels[0]] else: ticklabels = [pprint_thing(label)] @@ -193,7 +224,9 @@ ) self.maybe_color_bp(bp) self._return_obj[label] = ret - self._set_ticklabels(ax, ticklabels) + _set_ticklabels( + ax=ax, labels=ticklabels, is_vertical=self.orientation == "vertical" + ) else: y = self.data.values.T ax = self._get_ax(0) @@ -205,22 +238,17 @@ self.maybe_color_bp(bp) self._return_obj = ret - labels = [left for left, _ in self._iter_data()] - labels = [pprint_thing(left) for left in labels] + labels = [pprint_thing(left) for left in self.data.columns] if not self.use_index: labels = [pprint_thing(key) for key in range(len(labels))] - self._set_ticklabels(ax, labels) - - def _set_ticklabels(self, ax: Axes, labels: list[str]) -> None: - if self.orientation == "vertical": - ax.set_xticklabels(labels) - else: - ax.set_yticklabels(labels) + _set_ticklabels( + ax=ax, labels=labels, is_vertical=self.orientation == "vertical" + ) def _make_legend(self) -> None: pass - def _post_plot_logic(self, ax, data) -> None: + def _post_plot_logic(self, ax: Axes, data) -> None: # GH 45465: make sure that the boxplot doesn't ignore xlabel/ylabel if self.xlabel: ax.set_xlabel(pprint_thing(self.xlabel)) @@ -242,6 +270,19 @@ return self._return_obj +def maybe_color_bp(bp, color_tup, **kwds) -> None: + # GH#30346, when users specifying those arguments explicitly, our defaults + # for these four kwargs should be overridden; if not, use Pandas settings + if not kwds.get("boxprops"): + setp(bp["boxes"], color=color_tup[0], alpha=1) + if not kwds.get("whiskerprops"): + setp(bp["whiskers"], color=color_tup[1], alpha=1) + if not kwds.get("medianprops"): + setp(bp["medians"], color=color_tup[2], alpha=1) + if not kwds.get("capprops"): + setp(bp["caps"], color=color_tup[3], alpha=1) + + def _grouped_plot_by_column( plotf, data, @@ -322,7 +363,7 @@ if return_type not in BoxPlot._valid_return_types: raise ValueError("return_type must be {'axes', 'dict', 'both'}") - if isinstance(data, pd.Series): + if isinstance(data, ABCSeries): data = data.to_frame("x") column = "x" @@ -355,18 +396,6 @@ return result - def maybe_color_bp(bp, **kwds) -> None: - # GH 30346, when users specifying those arguments explicitly, our defaults - # for these four kwargs should be overridden; if not, use Pandas settings - if not kwds.get("boxprops"): - setp(bp["boxes"], color=colors[0], alpha=1) - if not kwds.get("whiskerprops"): - setp(bp["whiskers"], color=colors[1], alpha=1) - if not kwds.get("medianprops"): - setp(bp["medians"], color=colors[2], alpha=1) - if not kwds.get("capprops"): - setp(bp["caps"], color=colors[3], alpha=1) - def plot_group(keys, values, ax: Axes, **kwds): # GH 45465: xlabel/ylabel need to be popped out before plotting happens xlabel, ylabel = kwds.pop("xlabel", None), kwds.pop("ylabel", None) @@ -382,17 +411,10 @@ ax.tick_params(axis="both", labelsize=fontsize) # GH 45465: x/y are flipped when "vert" changes - is_vertical = kwds.get("vert", True) - ticks = ax.get_xticks() if is_vertical else ax.get_yticks() - if len(ticks) != len(keys): - i, remainder = divmod(len(ticks), len(keys)) - assert remainder == 0, remainder - keys *= i - if is_vertical: - ax.set_xticklabels(keys, rotation=rot) - else: - ax.set_yticklabels(keys, rotation=rot) - maybe_color_bp(bp, **kwds) + _set_ticklabels( + ax=ax, labels=keys, is_vertical=kwds.get("vert", True), rotation=rot + ) + maybe_color_bp(bp, color_tup=colors, **kwds) # Return axes in multiplot 
case, maybe revisit later # 985 if return_type == "dict": diff -Nru pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/converter.py pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/converter.py --- pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/converter.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/converter.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,7 +11,6 @@ from typing import ( TYPE_CHECKING, Any, - Final, cast, ) import warnings @@ -31,8 +30,14 @@ Timestamp, to_offset, ) -from pandas._libs.tslibs.dtypes import FreqGroup -from pandas._typing import F +from pandas._libs.tslibs.dtypes import ( + FreqGroup, + periods_per_day, +) +from pandas._typing import ( + F, + npt, +) from pandas.core.dtypes.common import ( is_float, @@ -59,17 +64,10 @@ if TYPE_CHECKING: from collections.abc import Generator - from pandas._libs.tslibs.offsets import BaseOffset + from matplotlib.axis import Axis -# constants -HOURS_PER_DAY: Final = 24.0 -MIN_PER_HOUR: Final = 60.0 -SEC_PER_MIN: Final = 60.0 - -SEC_PER_HOUR: Final = SEC_PER_MIN * MIN_PER_HOUR -SEC_PER_DAY: Final = SEC_PER_HOUR * HOURS_PER_DAY + from pandas._libs.tslibs.offsets import BaseOffset -MUSEC_PER_DAY: Final = 10**6 * SEC_PER_DAY _mpl_units = {} # Cache for units overwritten by us @@ -191,7 +189,7 @@ def __init__(self, locs) -> None: self.locs = locs - def __call__(self, x, pos: int = 0) -> str: + def __call__(self, x, pos: int | None = 0) -> str: """ Return the time of day as a formatted string. @@ -368,8 +366,14 @@ locator = MilliSecondLocator(self.tz) locator.set_axis(self.axis) - locator.axis.set_view_interval(*self.axis.get_view_interval()) - locator.axis.set_data_interval(*self.axis.get_data_interval()) + # error: Item "None" of "Axis | _DummyAxis | _AxisWrapper | None" + # has no attribute "get_data_interval" + locator.axis.set_view_interval( # type: ignore[union-attr] + *self.axis.get_view_interval() # type: ignore[union-attr] + ) + locator.axis.set_data_interval( # type: ignore[union-attr] + *self.axis.get_data_interval() # type: ignore[union-attr] + ) return locator return mdates.AutoDateLocator.get_locator(self, dmin, dmax) @@ -424,7 +428,7 @@ ) interval = self._get_interval() - freq = f"{interval}L" + freq = f"{interval}ms" tz = self.tz.tzname(None) st = dmin.replace(tzinfo=None) ed = dmin.replace(tzinfo=None) @@ -507,7 +511,7 @@ return (min_spacing, maj_spacing) -def period_break(dates: PeriodIndex, period: str) -> np.ndarray: +def _period_break(dates: PeriodIndex, period: str) -> npt.NDArray[np.intp]: """ Returns the indices where the given period changes. @@ -518,12 +522,17 @@ period : str Name of the period to monitor. """ + mask = _period_break_mask(dates, period) + return np.nonzero(mask)[0] + + +def _period_break_mask(dates: PeriodIndex, period: str) -> npt.NDArray[np.bool_]: current = getattr(dates, period) previous = getattr(dates - 1 * dates.freq, period) - return np.nonzero(current - previous)[0] + return current != previous -def has_level_label(label_flags: np.ndarray, vmin: float) -> bool: +def has_level_label(label_flags: npt.NDArray[np.intp], vmin: float) -> bool: """ Returns true if the ``label_flags`` indicate there is at least one label for this level. 
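To make the refactored tick-finder helpers above concrete, here is a small sketch of the "period break" computation that _period_break / _period_break_mask perform, written against public pandas APIs only; the date range and monitored field are arbitrary choices for illustration:

import numpy as np
import pandas as pd

dates = pd.period_range("2023-12-28", periods=8, freq="D")

# A break is any position where the monitored calendar field differs from the
# previous period's value; monitoring the month makes the mask True exactly
# where a new month starts.
current = dates.month
previous = (dates - 1 * dates.freq).month
mask = current != previous
breaks = np.nonzero(mask)[0]  # index positions of the breaks

print(breaks)  # [4] -> 2024-01-01 is the first period falling in a new month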
@@ -539,45 +548,52 @@ return True -def _daily_finder(vmin, vmax, freq: BaseOffset): +def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]: # error: "BaseOffset" has no attribute "_period_dtype_code" dtype_code = freq._period_dtype_code # type: ignore[attr-defined] freq_group = FreqGroup.from_period_dtype_code(dtype_code) - periodsperday = -1 + ppd = -1 # placeholder for above-day freqs if dtype_code >= FreqGroup.FR_HR.value: - if freq_group == FreqGroup.FR_NS: - periodsperday = 24 * 60 * 60 * 1000000000 - elif freq_group == FreqGroup.FR_US: - periodsperday = 24 * 60 * 60 * 1000000 - elif freq_group == FreqGroup.FR_MS: - periodsperday = 24 * 60 * 60 * 1000 - elif freq_group == FreqGroup.FR_SEC: - periodsperday = 24 * 60 * 60 - elif freq_group == FreqGroup.FR_MIN: - periodsperday = 24 * 60 - elif freq_group == FreqGroup.FR_HR: - periodsperday = 24 - else: # pragma: no cover - raise ValueError(f"unexpected frequency: {dtype_code}") - periodsperyear = 365 * periodsperday - periodspermonth = 28 * periodsperday - + # error: "BaseOffset" has no attribute "_creso" + ppd = periods_per_day(freq._creso) # type: ignore[attr-defined] + ppm = 28 * ppd + ppy = 365 * ppd elif freq_group == FreqGroup.FR_BUS: - periodsperyear = 261 - periodspermonth = 19 + ppm = 19 + ppy = 261 elif freq_group == FreqGroup.FR_DAY: - periodsperyear = 365 - periodspermonth = 28 + ppm = 28 + ppy = 365 elif freq_group == FreqGroup.FR_WK: - periodsperyear = 52 - periodspermonth = 3 - else: # pragma: no cover - raise ValueError("unexpected frequency") + ppm = 3 + ppy = 52 + elif freq_group == FreqGroup.FR_MTH: + ppm = 1 + ppy = 12 + elif freq_group == FreqGroup.FR_QTR: + ppm = -1 # placerholder + ppy = 4 + elif freq_group == FreqGroup.FR_ANN: + ppm = -1 # placeholder + ppy = 1 + else: + raise NotImplementedError(f"Unsupported frequency: {dtype_code}") + + return ppd, ppm, ppy + + +def _daily_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: + # error: "BaseOffset" has no attribute "_period_dtype_code" + dtype_code = freq._period_dtype_code # type: ignore[attr-defined] + + periodsperday, periodspermonth, periodsperyear = _get_periods_per_ymd(freq) # save this for later usage vmin_orig = vmin + (vmin, vmax) = (int(vmin), int(vmax)) + span = vmax - vmin + 1 with warnings.catch_warnings(): warnings.filterwarnings( @@ -586,22 +602,11 @@ warnings.filterwarnings( "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning ) - (vmin, vmax) = ( - Period(ordinal=int(vmin), freq=freq), - Period(ordinal=int(vmax), freq=freq), - ) - assert isinstance(vmin, Period) - assert isinstance(vmax, Period) - span = vmax.ordinal - vmin.ordinal + 1 - - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", "Period with BDay freq is deprecated", category=FutureWarning - ) - warnings.filterwarnings( - "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning + dates_ = period_range( + start=Period(ordinal=vmin, freq=freq), + end=Period(ordinal=vmax, freq=freq), + freq=freq, ) - dates_ = period_range(start=vmin, end=vmax, freq=freq) # Initialize the output info = np.zeros( @@ -623,45 +628,38 @@ # Case 1. 
Less than a month if span <= periodspermonth: - day_start = period_break(dates_, "day") - month_start = period_break(dates_, "month") - - def _hour_finder(label_interval, force_year_start) -> None: - _hour = dates_.hour - _prev_hour = (dates_ - 1 * dates_.freq).hour - hour_start = (_hour - _prev_hour) != 0 + day_start = _period_break(dates_, "day") + month_start = _period_break(dates_, "month") + year_start = _period_break(dates_, "year") + + def _hour_finder(label_interval: int, force_year_start: bool) -> None: + target = dates_.hour + mask = _period_break_mask(dates_, "hour") info_maj[day_start] = True - info_min[hour_start & (_hour % label_interval == 0)] = True - year_start = period_break(dates_, "year") - info_fmt[hour_start & (_hour % label_interval == 0)] = "%H:%M" + info_min[mask & (target % label_interval == 0)] = True + info_fmt[mask & (target % label_interval == 0)] = "%H:%M" info_fmt[day_start] = "%H:%M\n%d-%b" info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" if force_year_start and not has_level_label(year_start, vmin_orig): info_fmt[first_label(day_start)] = "%H:%M\n%d-%b\n%Y" - def _minute_finder(label_interval) -> None: - hour_start = period_break(dates_, "hour") - _minute = dates_.minute - _prev_minute = (dates_ - 1 * dates_.freq).minute - minute_start = (_minute - _prev_minute) != 0 + def _minute_finder(label_interval: int) -> None: + target = dates_.minute + hour_start = _period_break(dates_, "hour") + mask = _period_break_mask(dates_, "minute") info_maj[hour_start] = True - info_min[minute_start & (_minute % label_interval == 0)] = True - year_start = period_break(dates_, "year") - info_fmt = info["fmt"] - info_fmt[minute_start & (_minute % label_interval == 0)] = "%H:%M" + info_min[mask & (target % label_interval == 0)] = True + info_fmt[mask & (target % label_interval == 0)] = "%H:%M" info_fmt[day_start] = "%H:%M\n%d-%b" info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" - def _second_finder(label_interval) -> None: - minute_start = period_break(dates_, "minute") - _second = dates_.second - _prev_second = (dates_ - 1 * dates_.freq).second - second_start = (_second - _prev_second) != 0 - info["maj"][minute_start] = True - info["min"][second_start & (_second % label_interval == 0)] = True - year_start = period_break(dates_, "year") - info_fmt = info["fmt"] - info_fmt[second_start & (_second % label_interval == 0)] = "%H:%M:%S" + def _second_finder(label_interval: int) -> None: + target = dates_.second + minute_start = _period_break(dates_, "minute") + mask = _period_break_mask(dates_, "second") + info_maj[minute_start] = True + info_min[mask & (target % label_interval == 0)] = True + info_fmt[mask & (target % label_interval == 0)] = "%H:%M:%S" info_fmt[day_start] = "%H:%M:%S\n%d-%b" info_fmt[year_start] = "%H:%M:%S\n%d-%b\n%Y" @@ -700,8 +698,6 @@ else: info_maj[month_start] = True info_min[day_start] = True - year_start = period_break(dates_, "year") - info_fmt = info["fmt"] info_fmt[day_start] = "%d" info_fmt[month_start] = "%d\n%b" info_fmt[year_start] = "%d\n%b\n%Y" @@ -713,15 +709,15 @@ # Case 2. 
Less than three months elif span <= periodsperyear // 4: - month_start = period_break(dates_, "month") + month_start = _period_break(dates_, "month") info_maj[month_start] = True if dtype_code < FreqGroup.FR_HR.value: info["min"] = True else: - day_start = period_break(dates_, "day") + day_start = _period_break(dates_, "day") info["min"][day_start] = True - week_start = period_break(dates_, "week") - year_start = period_break(dates_, "year") + week_start = _period_break(dates_, "week") + year_start = _period_break(dates_, "year") info_fmt[week_start] = "%d" info_fmt[month_start] = "\n\n%b" info_fmt[year_start] = "\n\n%b\n%Y" @@ -732,9 +728,9 @@ info_fmt[first_label(month_start)] = "\n\n%b\n%Y" # Case 3. Less than 14 months ............... elif span <= 1.15 * periodsperyear: - year_start = period_break(dates_, "year") - month_start = period_break(dates_, "month") - week_start = period_break(dates_, "week") + year_start = _period_break(dates_, "year") + month_start = _period_break(dates_, "month") + week_start = _period_break(dates_, "week") info_maj[month_start] = True info_min[week_start] = True info_min[year_start] = False @@ -745,17 +741,17 @@ info_fmt[first_label(month_start)] = "%b\n%Y" # Case 4. Less than 2.5 years ............... elif span <= 2.5 * periodsperyear: - year_start = period_break(dates_, "year") - quarter_start = period_break(dates_, "quarter") - month_start = period_break(dates_, "month") + year_start = _period_break(dates_, "year") + quarter_start = _period_break(dates_, "quarter") + month_start = _period_break(dates_, "month") info_maj[quarter_start] = True info_min[month_start] = True info_fmt[quarter_start] = "%b" info_fmt[year_start] = "%b\n%Y" # Case 4. Less than 4 years ................. elif span <= 4 * periodsperyear: - year_start = period_break(dates_, "year") - month_start = period_break(dates_, "month") + year_start = _period_break(dates_, "year") + month_start = _period_break(dates_, "month") info_maj[year_start] = True info_min[month_start] = True info_min[year_start] = False @@ -766,15 +762,15 @@ info_fmt[year_start] = "%b\n%Y" # Case 5. Less than 11 years ................ elif span <= 11 * periodsperyear: - year_start = period_break(dates_, "year") - quarter_start = period_break(dates_, "quarter") + year_start = _period_break(dates_, "year") + quarter_start = _period_break(dates_, "quarter") info_maj[year_start] = True info_min[quarter_start] = True info_min[year_start] = False info_fmt[year_start] = "%Y" # Case 6. More than 12 years ................ else: - year_start = period_break(dates_, "year") + year_start = _period_break(dates_, "year") year_break = dates_[year_start].year nyears = span / periodsperyear (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) @@ -787,8 +783,8 @@ return info -def _monthly_finder(vmin, vmax, freq): - periodsperyear = 12 +def _monthly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: + _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) @@ -823,6 +819,7 @@ quarter_start = (dates_ % 3 == 0).nonzero() info_maj[year_start] = True # TODO: Check the following : is it really info['fmt'] ? 
+ # 2023-09-15 this is reached in test_finder_monthly info["fmt"][quarter_start] = True info["min"] = True @@ -857,8 +854,8 @@ return info -def _quarterly_finder(vmin, vmax, freq): - periodsperyear = 4 +def _quarterly_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: + _, _, periodsperyear = _get_periods_per_ymd(freq) vmin_orig = vmin (vmin, vmax) = (int(vmin), int(vmax)) span = vmax - vmin + 1 @@ -904,7 +901,8 @@ return info -def _annual_finder(vmin, vmax, freq): +def _annual_finder(vmin, vmax, freq: BaseOffset) -> np.ndarray: + # Note: small difference here vs other finders in adding 1 to vmax (vmin, vmax) = (int(vmin), int(vmax + 1)) span = vmax - vmin + 1 @@ -917,8 +915,9 @@ (min_anndef, maj_anndef) = _get_default_annual_spacing(span) major_idx = dates_ % maj_anndef == 0 + minor_idx = dates_ % min_anndef == 0 info["maj"][major_idx] = True - info["min"][(dates_ % min_anndef == 0)] = True + info["min"][minor_idx] = True info["fmt"][major_idx] = "%Y" return info @@ -959,6 +958,8 @@ day : {int}, optional """ + axis: Axis + def __init__( self, freq: BaseOffset, @@ -970,7 +971,7 @@ day: int = 1, plot_obj=None, ) -> None: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) self.freq = freq self.base = base (self.quarter, self.month, self.day) = (quarter, month, day) @@ -982,10 +983,7 @@ def _get_default_locs(self, vmin, vmax): """Returns the default locations of ticks.""" - if self.plot_obj.date_axis_info is None: - self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq) - - locator = self.plot_obj.date_axis_info + locator = self.finder(vmin, vmax, self.freq) if self.isminor: return np.compress(locator["min"], locator["val"]) @@ -996,9 +994,6 @@ # axis calls Locator.set_axis inside set_m_formatter vi = tuple(self.axis.get_view_interval()) - if vi != self.plot_obj.view_interval: - self.plot_obj.date_axis_info = None - self.plot_obj.view_interval = vi vmin, vmax = vi if vmax < vmin: vmin, vmax = vmax, vmin @@ -1008,7 +1003,9 @@ base = self.base (d, m) = divmod(vmin, base) vmin = (d + 1) * base - locs = list(range(vmin, vmax + 1, base)) + # error: No overload variant of "range" matches argument types "float", + # "float", "int" + locs = list(range(vmin, vmax + 1, base)) # type: ignore[call-overload] return locs def autoscale(self): @@ -1047,6 +1044,8 @@ Whether the formatter works in dynamic mode or not. 
""" + axis: Axis + def __init__( self, freq: BaseOffset, @@ -1054,7 +1053,7 @@ dynamic_mode: bool = True, plot_obj=None, ) -> None: - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) self.format = None self.freq = freq self.locs: list[Any] = [] # unused, for matplotlib compat @@ -1067,9 +1066,7 @@ def _set_default_format(self, vmin, vmax): """Returns the default ticks spacing.""" - if self.plot_obj.date_axis_info is None: - self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq) - info = self.plot_obj.date_axis_info + info = self.finder(vmin, vmax, self.freq) if self.isminor: format = np.compress(info["min"] & np.logical_not(info["maj"]), info) @@ -1085,15 +1082,12 @@ self.locs = locs - (vmin, vmax) = vi = tuple(self.axis.get_view_interval()) - if vi != self.plot_obj.view_interval: - self.plot_obj.date_axis_info = None - self.plot_obj.view_interval = vi + (vmin, vmax) = tuple(self.axis.get_view_interval()) if vmax < vmin: (vmin, vmax) = (vmax, vmin) self._set_default_format(vmin, vmax) - def __call__(self, x, pos: int = 0) -> str: + def __call__(self, x, pos: int | None = 0) -> str: if self.formatdict is None: return "" else: @@ -1116,12 +1110,14 @@ Formats the ticks along an axis controlled by a :class:`TimedeltaIndex`. """ + axis: Axis + @staticmethod def format_timedelta_ticks(x, pos, n_decimals: int) -> str: """ Convert seconds to 'D days HH:MM:SS.F' """ - s, ns = divmod(x, 10**9) + s, ns = divmod(x, 10**9) # TODO(non-nano): this looks like it assumes ns m, s = divmod(s, 60) h, m = divmod(m, 60) d, h = divmod(h, 24) @@ -1133,7 +1129,7 @@ s = f"{int(d):d} days {s}" return s - def __call__(self, x, pos: int = 0) -> str: + def __call__(self, x, pos: int | None = 0) -> str: (vmin, vmax) = tuple(self.axis.get_view_interval()) n_decimals = min(int(np.ceil(np.log10(100 * 10**9 / abs(vmax - vmin)))), 9) return self.format_timedelta_ticks(x, pos, n_decimals) diff -Nru pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/core.py pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/core.py --- pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/core.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/core.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,23 +7,29 @@ from collections.abc import ( Hashable, Iterable, + Iterator, Sequence, ) from typing import ( TYPE_CHECKING, + Any, Literal, + cast, + final, ) import warnings import matplotlib as mpl import numpy as np +from pandas._libs import lib from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_any_real_numeric_dtype, + is_bool, is_float, is_float_dtype, is_hashable, @@ -40,15 +46,13 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, + ABCDatetimeIndex, ABCIndex, ABCMultiIndex, ABCPeriodIndex, ABCSeries, ) -from pandas.core.dtypes.missing import ( - isna, - notna, -) +from pandas.core.dtypes.missing import isna import pandas.core.common as com from pandas.core.frame import DataFrame @@ -80,13 +84,17 @@ from matplotlib.artist import Artist from matplotlib.axes import Axes from matplotlib.axis import Axis + from matplotlib.figure import Figure from pandas._typing import ( IndexLabel, + NDFrameT, PlottingOrientation, npt, ) + from pandas import Series + def _color_in_style(style: str) -> bool: """ @@ -120,7 +128,7 @@ def orientation(self) -> str | None: return None - axes: np.ndarray # of Axes objects + data: DataFrame def __init__( self, @@ -128,7 +136,7 @@ 
kind=None, by: IndexLabel | None = None, subplots: bool | Sequence[Sequence[str]] = False, - sharex=None, + sharex: bool | None = None, sharey: bool = False, use_index: bool = True, figsize: tuple[float, float] | None = None, @@ -151,12 +159,18 @@ layout=None, include_bool: bool = False, column: IndexLabel | None = None, + *, + logx: bool | None | Literal["sym"] = False, + logy: bool | None | Literal["sym"] = False, + loglog: bool | None | Literal["sym"] = False, + mark_right: bool = True, + stacked: bool = False, + label: Hashable | None = None, + style=None, **kwds, ) -> None: import matplotlib.pyplot as plt - self.data = data - # if users assign an empty list or tuple, raise `ValueError` # similar to current `df.box` and `df.hist` APIs. if by in ([], ()): @@ -187,19 +201,11 @@ self.kind = kind - self.subplots = self._validate_subplots_kwarg(subplots) - - if sharex is None: - # if by is defined, subplots are used and sharex should be False - if ax is None and by is None: - self.sharex = True - else: - # if we get an axis, the users should do the visibility - # setting... - self.sharex = False - else: - self.sharex = sharex + self.subplots = type(self)._validate_subplots_kwarg( + subplots, data, kind=self._kind + ) + self.sharex = type(self)._validate_sharex(sharex, ax, by) self.sharey = sharey self.figsize = figsize self.layout = layout @@ -232,25 +238,28 @@ self.legend_handles: list[Artist] = [] self.legend_labels: list[Hashable] = [] - self.logx = kwds.pop("logx", False) - self.logy = kwds.pop("logy", False) - self.loglog = kwds.pop("loglog", False) - self.label = kwds.pop("label", None) - self.style = kwds.pop("style", None) - self.mark_right = kwds.pop("mark_right", True) - self.stacked = kwds.pop("stacked", False) + self.logx = type(self)._validate_log_kwd("logx", logx) + self.logy = type(self)._validate_log_kwd("logy", logy) + self.loglog = type(self)._validate_log_kwd("loglog", loglog) + self.label = label + self.style = style + self.mark_right = mark_right + self.stacked = stacked + # ax may be an Axes object or (if self.subplots) an ndarray of + # Axes objects self.ax = ax - self.fig = fig - self.axes = np.array([], dtype=object) # "real" version get set in `generate` + # TODO: deprecate fig keyword as it is ignored, not passed in tests + # as of 2023-11-05 # parse errorbar input if given xerr = kwds.pop("xerr", None) yerr = kwds.pop("yerr", None) - self.errors = { - kw: self._parse_errorbars(kw, err) - for kw, err in zip(["xerr", "yerr"], [xerr, yerr]) - } + nseries = self._get_nseries(data) + xerr, data = type(self)._parse_errorbars("xerr", xerr, data, nseries) + yerr, data = type(self)._parse_errorbars("yerr", yerr, data, nseries) + self.errors = {"xerr": xerr, "yerr": yerr} + self.data = data if not isinstance(secondary_y, (bool, tuple, list, np.ndarray, ABCIndex)): secondary_y = [secondary_y] @@ -270,10 +279,47 @@ self.kwds = kwds - self._validate_color_args() + color = kwds.pop("color", lib.no_default) + self.color = self._validate_color_args(color, self.colormap) + assert "color" not in self.kwds + + self.data = self._ensure_frame(self.data) + + @final + @staticmethod + def _validate_sharex(sharex: bool | None, ax, by) -> bool: + if sharex is None: + # if by is defined, subplots are used and sharex should be False + if ax is None and by is None: # pylint: disable=simplifiable-if-statement + sharex = True + else: + # if we get an axis, the users should do the visibility + # setting... 
+ sharex = False + elif not is_bool(sharex): + raise TypeError("sharex must be a bool or None") + return bool(sharex) + + @classmethod + def _validate_log_kwd( + cls, + kwd: str, + value: bool | None | Literal["sym"], + ) -> bool | None | Literal["sym"]: + if ( + value is None + or isinstance(value, bool) + or (isinstance(value, str) and value == "sym") + ): + return value + raise ValueError( + f"keyword '{kwd}' should be bool, None, or 'sym', not '{value}'" + ) + @final + @staticmethod def _validate_subplots_kwarg( - self, subplots: bool | Sequence[Sequence[str]] + subplots: bool | Sequence[Sequence[str]], data: Series | DataFrame, kind: str ) -> bool | list[tuple[int, ...]]: """ Validate the subplots parameter @@ -310,18 +356,18 @@ "area", "pie", ) - if self._kind not in supported_kinds: + if kind not in supported_kinds: raise ValueError( "When subplots is an iterable, kind must be " - f"one of {', '.join(supported_kinds)}. Got {self._kind}." + f"one of {', '.join(supported_kinds)}. Got {kind}." ) - if isinstance(self.data, ABCSeries): + if isinstance(data, ABCSeries): raise NotImplementedError( "An iterable subplots for a Series is not supported." ) - columns = self.data.columns + columns = data.columns if isinstance(columns, ABCMultiIndex): raise NotImplementedError( "An iterable subplots for a DataFrame with a MultiIndex column " @@ -377,34 +423,31 @@ out.append((idx_loc,)) return out - def _validate_color_args(self): - if ( - "color" in self.kwds - and self.nseries == 1 - and self.kwds["color"] is not None - and not is_list_like(self.kwds["color"]) - ): + def _validate_color_args(self, color, colormap): + if color is lib.no_default: + # It was not provided by the user + if "colors" in self.kwds and colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used simultaneously. " + "Using 'color'", + stacklevel=find_stack_level(), + ) + return None + if self.nseries == 1 and color is not None and not is_list_like(color): # support series.plot(color='green') - self.kwds["color"] = [self.kwds["color"]] + color = [color] - if ( - "color" in self.kwds - and isinstance(self.kwds["color"], tuple) - and self.nseries == 1 - and len(self.kwds["color"]) in (3, 4) - ): + if isinstance(color, tuple) and self.nseries == 1 and len(color) in (3, 4): # support RGB and RGBA tuples in series plot - self.kwds["color"] = [self.kwds["color"]] + color = [color] - if ( - "color" in self.kwds or "colors" in self.kwds - ) and self.colormap is not None: + if colormap is not None: warnings.warn( "'color' and 'colormap' cannot be used simultaneously. Using 'color'", stacklevel=find_stack_level(), ) - if "color" in self.kwds and self.style is not None: + if self.style is not None: if is_list_like(self.style): styles = self.style else: @@ -417,57 +460,61 @@ "'color' keyword argument. Please use one or the " "other or pass 'style' without a color symbol" ) + return color - def _iter_data(self, data=None, keep_index: bool = False, fillna=None): - if data is None: - data = self.data - if fillna is not None: - data = data.fillna(fillna) - + @final + @staticmethod + def _iter_data( + data: DataFrame | dict[Hashable, Series | DataFrame] + ) -> Iterator[tuple[Hashable, np.ndarray]]: for col, values in data.items(): - if keep_index is True: - yield col, values - else: - yield col, values.values + # This was originally written to use values.values before EAs + # were implemented; adding np.asarray(...) to keep consistent + # typing. 
+ yield col, np.asarray(values.values) - @property - def nseries(self) -> int: + def _get_nseries(self, data: Series | DataFrame) -> int: # When `by` is explicitly assigned, grouped data size will be defined, and # this will determine number of subplots to have, aka `self.nseries` - if self.data.ndim == 1: + if data.ndim == 1: return 1 elif self.by is not None and self._kind == "hist": return len(self._grouped) elif self.by is not None and self._kind == "box": return len(self.columns) else: - return self.data.shape[1] + return data.shape[1] + @final + @property + def nseries(self) -> int: + return self._get_nseries(self.data) + + @final def draw(self) -> None: self.plt.draw_if_interactive() + @final def generate(self) -> None: - self._args_adjust() self._compute_plot_data() - self._setup_subplots() - self._make_plot() + fig = self.fig + self._make_plot(fig) self._add_table() self._make_legend() - self._adorn_subplots() + self._adorn_subplots(fig) for ax in self.axes: - self._post_plot_logic_common(ax, self.data) + self._post_plot_logic_common(ax) self._post_plot_logic(ax, self.data) - @abstractmethod - def _args_adjust(self) -> None: - pass - - def _has_plotted_object(self, ax: Axes) -> bool: + @final + @staticmethod + def _has_plotted_object(ax: Axes) -> bool: """check whether ax has data""" return len(ax.lines) != 0 or len(ax.artists) != 0 or len(ax.containers) != 0 - def _maybe_right_yaxis(self, ax: Axes, axes_num): + @final + def _maybe_right_yaxis(self, ax: Axes, axes_num: int) -> Axes: if not self.on_right(axes_num): # secondary axes may be passed via ax kw return self._get_ax_layer(ax) @@ -482,9 +529,16 @@ # otherwise, create twin axes orig_ax, new_ax = ax, ax.twinx() # TODO: use Matplotlib public API when available - new_ax._get_lines = orig_ax._get_lines - new_ax._get_patches_for_fill = orig_ax._get_patches_for_fill - orig_ax.right_ax, new_ax.left_ax = new_ax, orig_ax + new_ax._get_lines = orig_ax._get_lines # type: ignore[attr-defined] + # TODO #54485 + new_ax._get_patches_for_fill = ( # type: ignore[attr-defined] + orig_ax._get_patches_for_fill # type: ignore[attr-defined] + ) + # TODO #54485 + orig_ax.right_ax, new_ax.left_ax = ( # type: ignore[attr-defined] + new_ax, + orig_ax, + ) if not self._has_plotted_object(orig_ax): # no data on left y orig_ax.get_yaxis().set_visible(False) @@ -493,9 +547,22 @@ new_ax.set_yscale("log") elif self.logy == "sym" or self.loglog == "sym": new_ax.set_yscale("symlog") - return new_ax + return new_ax # type: ignore[return-value] + + @final + @cache_readonly + def fig(self) -> Figure: + return self._axes_and_fig[1] - def _setup_subplots(self): + @final + @cache_readonly + # TODO: can we annotate this as both a Sequence[Axes] and ndarray[object]? + def axes(self) -> Sequence[Axes]: + return self._axes_and_fig[0] + + @final + @cache_readonly + def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]: if self.subplots: naxes = ( self.nseries if isinstance(self.subplots, bool) else len(self.subplots) @@ -520,14 +587,6 @@ axes = flatten_axes(axes) - valid_log = {False, True, "sym", None} - input_log = {self.logx, self.logy, self.loglog} - if input_log - valid_log: - invalid_log = next(iter(input_log - valid_log)) - raise ValueError( - f"Boolean, None and 'sym' are valid options, '{invalid_log}' is given." 
- ) - if self.logx is True or self.loglog is True: [a.set_xscale("log") for a in axes] elif self.logx == "sym" or self.loglog == "sym": @@ -538,8 +597,8 @@ elif self.logy == "sym" or self.loglog == "sym": [a.set_yscale("symlog") for a in axes] - self.fig = fig - self.axes = axes + axes_seq = cast(Sequence["Axes"], axes) + return axes_seq, fig @property def result(self): @@ -548,7 +607,8 @@ """ if self.subplots: if self.layout is not None and not is_list_like(self.ax): - return self.axes.reshape(*self.layout) + # error: "Sequence[Any]" has no attribute "reshape" + return self.axes.reshape(*self.layout) # type: ignore[attr-defined] else: return self.axes else: @@ -565,7 +625,9 @@ else: return self.axes[0] - def _convert_to_ndarray(self, data): + @final + @staticmethod + def _convert_to_ndarray(data): # GH31357: categorical columns are processed separately if isinstance(data.dtype, CategoricalDtype): return data @@ -583,9 +645,8 @@ return data - def _compute_plot_data(self): - data = self.data - + @final + def _ensure_frame(self, data) -> DataFrame: if isinstance(data, ABCSeries): label = self.label if label is None and data.name is None: @@ -598,6 +659,11 @@ elif self._kind in ("hist", "box"): cols = self.columns if self.by is None else self.columns + self.by data = data.loc[:, cols] + return data + + @final + def _compute_plot_data(self) -> None: + data = self.data # GH15079 reconstruct data if by is defined if self.by is not None: @@ -622,24 +688,21 @@ # GH 18755, include object and category type for scatter plot if self._kind == "scatter": - include_type.extend(["object", "category"]) + include_type.extend(["object", "category", "string"]) numeric_data = data.select_dtypes(include=include_type, exclude=exclude_type) - try: - is_empty = numeric_data.columns.empty - except AttributeError: - is_empty = not len(numeric_data) - + is_empty = numeric_data.shape[-1] == 0 # no non-numeric frames or series allowed if is_empty: raise TypeError("no numeric data to plot") - self.data = numeric_data.apply(self._convert_to_ndarray) + self.data = numeric_data.apply(type(self)._convert_to_ndarray) - def _make_plot(self): + def _make_plot(self, fig: Figure) -> None: raise AbstractMethodError(self) + @final def _add_table(self) -> None: if self.table is False: return @@ -650,33 +713,43 @@ ax = self._get_ax(0) tools.table(ax, data) - def _post_plot_logic_common(self, ax, data): + @final + def _post_plot_logic_common(self, ax: Axes) -> None: """Common post process for each axes""" if self.orientation == "vertical" or self.orientation is None: - self._apply_axis_properties(ax.xaxis, rot=self.rot, fontsize=self.fontsize) - self._apply_axis_properties(ax.yaxis, fontsize=self.fontsize) + type(self)._apply_axis_properties( + ax.xaxis, rot=self.rot, fontsize=self.fontsize + ) + type(self)._apply_axis_properties(ax.yaxis, fontsize=self.fontsize) if hasattr(ax, "right_ax"): - self._apply_axis_properties(ax.right_ax.yaxis, fontsize=self.fontsize) + type(self)._apply_axis_properties( + ax.right_ax.yaxis, fontsize=self.fontsize + ) elif self.orientation == "horizontal": - self._apply_axis_properties(ax.yaxis, rot=self.rot, fontsize=self.fontsize) - self._apply_axis_properties(ax.xaxis, fontsize=self.fontsize) + type(self)._apply_axis_properties( + ax.yaxis, rot=self.rot, fontsize=self.fontsize + ) + type(self)._apply_axis_properties(ax.xaxis, fontsize=self.fontsize) if hasattr(ax, "right_ax"): - self._apply_axis_properties(ax.right_ax.yaxis, fontsize=self.fontsize) + type(self)._apply_axis_properties( + 
ax.right_ax.yaxis, fontsize=self.fontsize + ) else: # pragma no cover raise ValueError @abstractmethod - def _post_plot_logic(self, ax, data) -> None: + def _post_plot_logic(self, ax: Axes, data) -> None: """Post process for each axes. Overridden in child classes""" - def _adorn_subplots(self): + @final + def _adorn_subplots(self, fig: Figure) -> None: """Common post process unrelated to data""" if len(self.axes) > 0: - all_axes = self._get_subplots() - nrows, ncols = self._get_axes_layout() + all_axes = self._get_subplots(fig) + nrows, ncols = self._get_axes_layout(fig) handle_shared_axes( axarr=all_axes, nplots=len(all_axes), @@ -723,7 +796,7 @@ for ax, title in zip(self.axes, self.title): ax.set_title(title) else: - self.fig.suptitle(self.title) + fig.suptitle(self.title) else: if is_list_like(self.title): msg = ( @@ -733,8 +806,10 @@ raise ValueError(msg) self.axes[0].set_title(self.title) + @final + @staticmethod def _apply_axis_properties( - self, axis: Axis, rot=None, fontsize: int | None = None + axis: Axis, rot=None, fontsize: int | None = None ) -> None: """ Tick creation within matplotlib is reasonably expensive and is @@ -751,6 +826,7 @@ if fontsize is not None: label.set_fontsize(fontsize) + @final @property def legend_title(self) -> str | None: if not isinstance(self.data.columns, ABCMultiIndex): @@ -762,6 +838,7 @@ stringified = map(pprint_thing, self.data.columns.names) return ",".join(stringified) + @final def _mark_right_label(self, label: str, index: int) -> str: """ Append ``(right)`` to the label of a line if it's plotted on the right axis. @@ -772,6 +849,7 @@ label += " (right)" return label + @final def _append_legend_handles_labels(self, handle: Artist, label: str) -> None: """ Append current handle and label to ``legend_handles`` and ``legend_labels``. @@ -817,7 +895,9 @@ if ax.get_visible(): ax.legend(loc="best") - def _get_ax_legend(self, ax: Axes): + @final + @staticmethod + def _get_ax_legend(ax: Axes): """ Take in axes and return ax and legend under different scenarios """ @@ -832,6 +912,7 @@ ax = other_ax return ax, leg + @final @cache_readonly def plt(self): import matplotlib.pyplot as plt @@ -840,24 +921,27 @@ _need_to_set_index = False - def _get_xticks(self, convert_period: bool = False): + @final + def _get_xticks(self): index = self.data.index is_datetype = index.inferred_type in ("datetime", "date", "datetime64", "time") + # TODO: be stricter about x? + x: list[int] | np.ndarray if self.use_index: - if convert_period and isinstance(index, ABCPeriodIndex): - self.data = self.data.reindex(index=index.sort_values()) - x = self.data.index.to_timestamp()._mpl_repr() + if isinstance(index, ABCPeriodIndex): + # test_mixed_freq_irreg_period + x = index.to_timestamp()._mpl_repr() + # TODO: why do we need to do to_timestamp() here but not other + # places where we call mpl_repr? elif is_any_real_numeric_dtype(index.dtype): # Matplotlib supports numeric values or datetime objects as # xaxis values. Taking LBYL approach here, by the time # matplotlib raises exception when using non numeric/datetime # values for xaxis, several actions are already taken by plt. 
x = index._mpl_repr() - elif is_datetype: - self.data = self.data[notna(self.data.index)] - self.data = self.data.sort_index() - x = self.data.index._mpl_repr() + elif isinstance(index, ABCDatetimeIndex) or is_datetype: + x = index._mpl_repr() else: self._need_to_set_index = True x = list(range(len(index))) @@ -894,6 +978,7 @@ """Specify whether xlabel/ylabel should be used to override index name""" return self.xlabel + @final def _get_index_name(self) -> str | None: if isinstance(self.data.index, ABCMultiIndex): name = self.data.index.names @@ -913,6 +998,7 @@ return name + @final @classmethod def _get_ax_layer(cls, ax, primary: bool = True): """get left (primary) or right (secondary) axes""" @@ -921,6 +1007,7 @@ else: return getattr(ax, "right_ax", ax) + @final def _col_idx_to_axis_idx(self, col_idx: int) -> int: """Return the index of the axis where the column at col_idx should be plotted""" if isinstance(self.subplots, list): @@ -934,13 +1021,15 @@ # subplots is True: one ax per column return col_idx + @final def _get_ax(self, i: int): # get the twinx ax if appropriate if self.subplots: i = self._col_idx_to_axis_idx(i) ax = self.axes[i] ax = self._maybe_right_yaxis(ax, i) - self.axes[i] = ax + # error: Unsupported target for indexed assignment ("Sequence[Any]") + self.axes[i] = ax # type: ignore[index] else: ax = self.axes[0] ax = self._maybe_right_yaxis(ax, i) @@ -948,23 +1037,18 @@ ax.get_yaxis().set_visible(True) return ax - @classmethod - def get_default_ax(cls, ax) -> None: - import matplotlib.pyplot as plt - - if ax is None and len(plt.get_fignums()) > 0: - with plt.rc_context(): - ax = plt.gca() - ax = cls._get_ax_layer(ax) - - def on_right(self, i): + @final + def on_right(self, i: int): if isinstance(self.secondary_y, bool): return self.secondary_y if isinstance(self.secondary_y, (tuple, list, np.ndarray, ABCIndex)): return self.data.columns[i] in self.secondary_y - def _apply_style_colors(self, colors, kwds, col_num, label: str): + @final + def _apply_style_colors( + self, colors, kwds: dict[str, Any], col_num: int, label: str + ): """ Manage style and color based on column number and its label. Returns tuple of appropriate style and kwds which "color" may be added. @@ -997,14 +1081,22 @@ ): if num_colors is None: num_colors = self.nseries - + if color_kwds == "color": + color = self.color + else: + color = self.kwds.get(color_kwds) return get_standard_colors( num_colors=num_colors, colormap=self.colormap, - color=self.kwds.get(color_kwds), + color=color, ) - def _parse_errorbars(self, label, err): + # TODO: tighter typing for first return? + @final + @staticmethod + def _parse_errorbars( + label: str, err, data: NDFrameT, nseries: int + ) -> tuple[Any, NDFrameT]: """ Look for error keyword arguments and return the actual errorbar data or return the error DataFrame/dict @@ -1024,7 +1116,7 @@ should be in a ``Mx2xN`` array. 
""" if err is None: - return None + return None, data def match_labels(data, e): e = e.reindex(data.index) @@ -1032,7 +1124,7 @@ # key-matched DataFrame if isinstance(err, ABCDataFrame): - err = match_labels(self.data, err) + err = match_labels(data, err) # key-matched dict elif isinstance(err, dict): pass @@ -1040,16 +1132,16 @@ # Series of error values elif isinstance(err, ABCSeries): # broadcast error series across data - err = match_labels(self.data, err) + err = match_labels(data, err) err = np.atleast_2d(err) - err = np.tile(err, (self.nseries, 1)) + err = np.tile(err, (nseries, 1)) # errors are a column in the dataframe elif isinstance(err, str): - evalues = self.data[err].values - self.data = self.data[self.data.columns.drop(err)] + evalues = data[err].values + data = data[data.columns.drop(err)] err = np.atleast_2d(evalues) - err = np.tile(err, (self.nseries, 1)) + err = np.tile(err, (nseries, 1)) elif is_list_like(err): if is_iterator(err): @@ -1061,41 +1153,45 @@ err_shape = err.shape # asymmetrical error bars - if isinstance(self.data, ABCSeries) and err_shape[0] == 2: + if isinstance(data, ABCSeries) and err_shape[0] == 2: err = np.expand_dims(err, 0) err_shape = err.shape - if err_shape[2] != len(self.data): + if err_shape[2] != len(data): raise ValueError( "Asymmetrical error bars should be provided " - f"with the shape (2, {len(self.data)})" + f"with the shape (2, {len(data)})" ) - elif isinstance(self.data, ABCDataFrame) and err.ndim == 3: + elif isinstance(data, ABCDataFrame) and err.ndim == 3: if ( - (err_shape[0] != self.nseries) + (err_shape[0] != nseries) or (err_shape[1] != 2) - or (err_shape[2] != len(self.data)) + or (err_shape[2] != len(data)) ): raise ValueError( "Asymmetrical error bars should be provided " - f"with the shape ({self.nseries}, 2, {len(self.data)})" + f"with the shape ({nseries}, 2, {len(data)})" ) # broadcast errors to each data series if len(err) == 1: - err = np.tile(err, (self.nseries, 1)) + err = np.tile(err, (nseries, 1)) elif is_number(err): - err = np.tile([err], (self.nseries, len(self.data))) + err = np.tile( + [err], + (nseries, len(data)), + ) else: msg = f"No valid {label} detected" raise ValueError(msg) - return err + return err, data + @final def _get_errorbars( self, label=None, index=None, xerr: bool = True, yerr: bool = True - ): + ) -> dict[str, Any]: errors = {} for kw, flag in zip(["xerr", "yerr"], [xerr, yerr]): @@ -1114,17 +1210,22 @@ errors[kw] = err return errors - def _get_subplots(self): - from matplotlib.axes import Subplot + @final + def _get_subplots(self, fig: Figure): + if Version(mpl.__version__) < Version("3.8"): + from matplotlib.axes import Subplot as Klass + else: + from matplotlib.axes import Axes as Klass return [ ax - for ax in self.fig.get_axes() - if (isinstance(ax, Subplot) and ax.get_subplotspec() is not None) + for ax in fig.get_axes() + if (isinstance(ax, Klass) and ax.get_subplotspec() is not None) ] - def _get_axes_layout(self) -> tuple[int, int]: - axes = self._get_subplots() + @final + def _get_axes_layout(self, fig: Figure) -> tuple[int, int]: + axes = self._get_subplots(fig) x_set = set() y_set = set() for ax in axes: @@ -1151,28 +1252,25 @@ if is_integer(y) and not self.data.columns._holds_integer(): y = self.data.columns[y] - # Scatter plot allows to plot objects data - if self._kind == "hexbin": - if len(self.data[x]._get_numeric_data()) == 0: - raise ValueError(self._kind + " requires x column to be numeric") - if len(self.data[y]._get_numeric_data()) == 0: - raise ValueError(self._kind + " 
requires y column to be numeric") - self.x = x self.y = y - @property - def nseries(self) -> int: + @final + def _get_nseries(self, data: Series | DataFrame) -> int: return 1 + @final def _post_plot_logic(self, ax: Axes, data) -> None: x, y = self.x, self.y xlabel = self.xlabel if self.xlabel is not None else pprint_thing(x) ylabel = self.ylabel if self.ylabel is not None else pprint_thing(y) - ax.set_xlabel(xlabel) - ax.set_ylabel(ylabel) + # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible + # type "Hashable"; expected "str" + ax.set_xlabel(xlabel) # type: ignore[arg-type] + ax.set_ylabel(ylabel) # type: ignore[arg-type] - def _plot_colorbar(self, ax: Axes, **kwds): + @final + def _plot_colorbar(self, ax: Axes, *, fig: Figure, **kwds): # Addresses issues #10611 and #10678: # When plotting scatterplots and hexbinplots in IPython # inline backend the colorbar axis height tends not to @@ -1189,7 +1287,7 @@ # use the last one which contains the latest information # about the ax img = ax.collections[-1] - return self.fig.colorbar(img, ax=ax, **kwds) + return fig.colorbar(img, ax=ax, **kwds) class ScatterPlot(PlanePlot): @@ -1197,19 +1295,35 @@ def _kind(self) -> Literal["scatter"]: return "scatter" - def __init__(self, data, x, y, s=None, c=None, **kwargs) -> None: + def __init__( + self, + data, + x, + y, + s=None, + c=None, + *, + colorbar: bool | lib.NoDefault = lib.no_default, + norm=None, + **kwargs, + ) -> None: if s is None: # hide the matplotlib default for size, in case we want to change # the handling of this argument later s = 20 elif is_hashable(s) and s in data.columns: s = data[s] - super().__init__(data, x, y, s=s, **kwargs) + self.s = s + + self.colorbar = colorbar + self.norm = norm + + super().__init__(data, x, y, **kwargs) if is_integer(c) and not self.data.columns._holds_integer(): c = self.data.columns[c] self.c = c - def _make_plot(self): + def _make_plot(self, fig: Figure) -> None: x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] @@ -1219,7 +1333,50 @@ self.data[c].dtype, CategoricalDtype ) - color = self.kwds.pop("color", None) + color = self.color + c_values = self._get_c_values(color, color_by_categorical, c_is_column) + norm, cmap = self._get_norm_and_cmap(c_values, color_by_categorical) + cb = self._get_colorbar(c_values, c_is_column) + + if self.legend: + label = self.label + else: + label = None + scatter = ax.scatter( + data[x].values, + data[y].values, + c=c_values, + label=label, + cmap=cmap, + norm=norm, + s=self.s, + **self.kwds, + ) + if cb: + cbar_label = c if c_is_column else "" + cbar = self._plot_colorbar(ax, fig=fig, label=cbar_label) + if color_by_categorical: + n_cats = len(self.data[c].cat.categories) + cbar.set_ticks(np.linspace(0.5, n_cats - 0.5, n_cats)) + cbar.ax.set_yticklabels(self.data[c].cat.categories) + + if label is not None: + self._append_legend_handles_labels( + # error: Argument 2 to "_append_legend_handles_labels" of + # "MPLPlot" has incompatible type "Hashable"; expected "str" + scatter, + label, # type: ignore[arg-type] + ) + + errors_x = self._get_errorbars(label=x, index=0, yerr=False) + errors_y = self._get_errorbars(label=y, index=0, xerr=False) + if len(errors_x) > 0 or len(errors_y) > 0: + err_kwds = dict(errors_x, **errors_y) + err_kwds["ecolor"] = scatter.get_facecolor()[0] + ax.errorbar(data[x].values, data[y].values, linestyle="none", **err_kwds) + + def _get_c_values(self, color, color_by_categorical: bool, c_is_column: bool): + c = self.c if c is not None and color is not None: raise 
TypeError("Specify exactly one of `c` and `color`") if c is None and color is None: @@ -1232,7 +1389,10 @@ c_values = self.data[c].values else: c_values = c + return c_values + def _get_norm_and_cmap(self, c_values, color_by_categorical: bool): + c = self.c if self.colormap is not None: cmap = mpl.colormaps.get_cmap(self.colormap) # cmap is only used if c_values are integers, otherwise UserWarning. @@ -1244,55 +1404,28 @@ else: cmap = None - if color_by_categorical: + if color_by_categorical and cmap is not None: from matplotlib import colors n_cats = len(self.data[c].cat.categories) cmap = colors.ListedColormap([cmap(i) for i in range(cmap.N)]) bounds = np.linspace(0, n_cats, n_cats + 1) norm = colors.BoundaryNorm(bounds, cmap.N) + # TODO: warn that we are ignoring self.norm if user specified it? + # Doesn't happen in any tests 2023-11-09 else: - norm = self.kwds.pop("norm", None) + norm = self.norm + return norm, cmap + + def _get_colorbar(self, c_values, c_is_column: bool) -> bool: # plot colorbar if # 1. colormap is assigned, and # 2.`c` is a column containing only numeric values plot_colorbar = self.colormap or c_is_column - cb = self.kwds.pop("colorbar", is_numeric_dtype(c_values) and plot_colorbar) - - if self.legend and hasattr(self, "label"): - label = self.label - else: - label = None - scatter = ax.scatter( - data[x].values, - data[y].values, - c=c_values, - label=label, - cmap=cmap, - norm=norm, - **self.kwds, - ) - if cb: - cbar_label = c if c_is_column else "" - cbar = self._plot_colorbar(ax, label=cbar_label) - if color_by_categorical: - cbar.set_ticks(np.linspace(0.5, n_cats - 0.5, n_cats)) - cbar.ax.set_yticklabels(self.data[c].cat.categories) - - if label is not None: - self._append_legend_handles_labels(scatter, label) - else: - self.legend = False - - errors_x = self._get_errorbars(label=x, index=0, yerr=False) - errors_y = self._get_errorbars(label=y, index=0, xerr=False) - if len(errors_x) > 0 or len(errors_y) > 0: - err_kwds = dict(errors_x, **errors_y) - err_kwds["ecolor"] = scatter.get_facecolor()[0] - ax.errorbar(data[x].values, data[y].values, linestyle="none", **err_kwds) - - def _args_adjust(self) -> None: - pass + cb = self.colorbar + if cb is lib.no_default: + return is_numeric_dtype(c_values) and plot_colorbar + return cb class HexBinPlot(PlanePlot): @@ -1300,19 +1433,27 @@ def _kind(self) -> Literal["hexbin"]: return "hexbin" - def __init__(self, data, x, y, C=None, **kwargs) -> None: + def __init__(self, data, x, y, C=None, *, colorbar: bool = True, **kwargs) -> None: super().__init__(data, x, y, **kwargs) if is_integer(C) and not self.data.columns._holds_integer(): C = self.data.columns[C] self.C = C - def _make_plot(self) -> None: + self.colorbar = colorbar + + # Scatter plot allows to plot objects data + if len(self.data[self.x]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires x column to be numeric") + if len(self.data[self.y]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires y column to be numeric") + + def _make_plot(self, fig: Figure) -> None: x, y, data, C = self.x, self.y, self.data, self.C ax = self.axes[0] # pandas uses colormap, matplotlib uses cmap. 
cmap = self.colormap or "BuGn" cmap = mpl.colormaps.get_cmap(cmap) - cb = self.kwds.pop("colorbar", True) + cb = self.colorbar if C is None: c_values = None @@ -1321,14 +1462,11 @@ ax.hexbin(data[x].values, data[y].values, C=c_values, cmap=cmap, **self.kwds) if cb: - self._plot_colorbar(ax) + self._plot_colorbar(ax, fig=fig) def _make_legend(self) -> None: pass - def _args_adjust(self) -> None: - pass - class LinePlot(MPLPlot): _default_rot = 0 @@ -1351,27 +1489,32 @@ if "x_compat" in self.kwds: self.x_compat = bool(self.kwds.pop("x_compat")) + @final def _is_ts_plot(self) -> bool: # this is slightly deceptive return not self.x_compat and self.use_index and self._use_dynamic_x() - def _use_dynamic_x(self): + @final + def _use_dynamic_x(self) -> bool: return use_dynamic_x(self._get_ax(0), self.data) - def _make_plot(self) -> None: + def _make_plot(self, fig: Figure) -> None: if self._is_ts_plot(): data = maybe_convert_index(self._get_ax(0), self.data) x = data.index # dummy, not used plotf = self._ts_plot - it = self._iter_data(data=data, keep_index=True) + it = data.items() else: - x = self._get_xticks(convert_period=True) + x = self._get_xticks() # error: Incompatible types in assignment (expression has type # "Callable[[Any, Any, Any, Any, Any, Any, KwArg(Any)], Any]", variable has # type "Callable[[Any, Any, Any, Any, KwArg(Any)], Any]") plotf = self._plot # type: ignore[assignment] - it = self._iter_data() + # error: Incompatible types in assignment (expression has type + # "Iterator[tuple[Hashable, ndarray[Any, Any]]]", variable has + # type "Iterable[tuple[Hashable, Series]]") + it = self._iter_data(data=self.data) # type: ignore[assignment] stacking_id = self._get_stacking_id() is_errorbar = com.any_not_none(*self.errors.values()) @@ -1380,12 +1523,21 @@ for i, (label, y) in enumerate(it): ax = self._get_ax(i) kwds = self.kwds.copy() - style, kwds = self._apply_style_colors(colors, kwds, i, label) + if self.color is not None: + kwds["color"] = self.color + style, kwds = self._apply_style_colors( + colors, + kwds, + i, + # error: Argument 4 to "_apply_style_colors" of "MPLPlot" has + # incompatible type "Hashable"; expected "str" + label, # type: ignore[arg-type] + ) errors = self._get_errorbars(label=label, index=i) kwds = dict(kwds, **errors) - label = pprint_thing(label) # .encode('utf-8') + label = pprint_thing(label) label = self._mark_right_label(label, index=i) kwds["label"] = label @@ -1411,7 +1563,14 @@ # error: Signature of "_plot" incompatible with supertype "MPLPlot" @classmethod def _plot( # type: ignore[override] - cls, ax: Axes, x, y, style=None, column_num=None, stacking_id=None, **kwds + cls, + ax: Axes, + x, + y: np.ndarray, + style=None, + column_num=None, + stacking_id=None, + **kwds, ): # column_num is used to get the target column from plotf in line and # area plots @@ -1422,45 +1581,57 @@ cls._update_stacker(ax, stacking_id, y) return lines - def _ts_plot(self, ax: Axes, x, data, style=None, **kwds): + @final + def _ts_plot(self, ax: Axes, x, data: Series, style=None, **kwds): # accept x to be consistent with normal plot func, # x is not passed to tsplot as it uses data.index as x coordinate # column_num must be in kwds for stacking purpose freq, data = maybe_resample(data, ax, kwds) # Set ax with freq info - decorate_axes(ax, freq, kwds) + decorate_axes(ax, freq) # digging deeper if hasattr(ax, "left_ax"): - decorate_axes(ax.left_ax, freq, kwds) + decorate_axes(ax.left_ax, freq) if hasattr(ax, "right_ax"): - decorate_axes(ax.right_ax, freq, kwds) - 
ax._plot_data.append((data, self._kind, kwds)) + decorate_axes(ax.right_ax, freq) + # TODO #54485 + ax._plot_data.append((data, self._kind, kwds)) # type: ignore[attr-defined] - lines = self._plot(ax, data.index, data.values, style=style, **kwds) + lines = self._plot(ax, data.index, np.asarray(data.values), style=style, **kwds) # set date formatter, locators and rescale limits - format_dateaxis(ax, ax.freq, data.index) + # TODO #54485 + format_dateaxis(ax, ax.freq, data.index) # type: ignore[arg-type, attr-defined] return lines - def _get_stacking_id(self): + @final + def _get_stacking_id(self) -> int | None: if self.stacked: return id(self.data) else: return None + @final @classmethod def _initialize_stacker(cls, ax: Axes, stacking_id, n: int) -> None: if stacking_id is None: return if not hasattr(ax, "_stacker_pos_prior"): - ax._stacker_pos_prior = {} + # TODO #54485 + ax._stacker_pos_prior = {} # type: ignore[attr-defined] if not hasattr(ax, "_stacker_neg_prior"): - ax._stacker_neg_prior = {} - ax._stacker_pos_prior[stacking_id] = np.zeros(n) - ax._stacker_neg_prior[stacking_id] = np.zeros(n) + # TODO #54485 + ax._stacker_neg_prior = {} # type: ignore[attr-defined] + # TODO #54485 + ax._stacker_pos_prior[stacking_id] = np.zeros(n) # type: ignore[attr-defined] + # TODO #54485 + ax._stacker_neg_prior[stacking_id] = np.zeros(n) # type: ignore[attr-defined] + @final @classmethod - def _get_stacked_values(cls, ax: Axes, stacking_id, values, label): + def _get_stacked_values( + cls, ax: Axes, stacking_id: int | None, values: np.ndarray, label + ) -> np.ndarray: if stacking_id is None: return values if not hasattr(ax, "_stacker_pos_prior"): @@ -1468,9 +1639,17 @@ cls._initialize_stacker(ax, stacking_id, len(values)) if (values >= 0).all(): - return ax._stacker_pos_prior[stacking_id] + values + # TODO #54485 + return ( + ax._stacker_pos_prior[stacking_id] # type: ignore[attr-defined] + + values + ) elif (values <= 0).all(): - return ax._stacker_neg_prior[stacking_id] + values + # TODO #54485 + return ( + ax._stacker_neg_prior[stacking_id] # type: ignore[attr-defined] + + values + ) raise ValueError( "When stacked is True, each column must be either " @@ -1478,17 +1657,17 @@ f"Column '{label}' contains both positive and negative values" ) + @final @classmethod - def _update_stacker(cls, ax: Axes, stacking_id, values) -> None: + def _update_stacker(cls, ax: Axes, stacking_id: int | None, values) -> None: if stacking_id is None: return if (values >= 0).all(): - ax._stacker_pos_prior[stacking_id] += values + # TODO #54485 + ax._stacker_pos_prior[stacking_id] += values # type: ignore[attr-defined] elif (values <= 0).all(): - ax._stacker_neg_prior[stacking_id] += values - - def _args_adjust(self) -> None: - pass + # TODO #54485 + ax._stacker_neg_prior[stacking_id] += values # type: ignore[attr-defined] def _post_plot_logic(self, ax: Axes, data) -> None: from matplotlib.ticker import FixedLocator @@ -1504,7 +1683,9 @@ if self._need_to_set_index: xticks = ax.get_xticks() xticklabels = [get_label(x) for x in xticks] - ax.xaxis.set_major_locator(FixedLocator(xticks)) + # error: Argument 1 to "FixedLocator" has incompatible type "ndarray[Any, + # Any]"; expected "Sequence[float]" + ax.xaxis.set_major_locator(FixedLocator(xticks)) # type: ignore[arg-type] ax.set_xticklabels(xticklabels) # If the index is an irregular time series, then by default @@ -1538,7 +1719,13 @@ def __init__(self, data, **kwargs) -> None: kwargs.setdefault("stacked", True) - data = data.fillna(value=0) + with 
warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + data = data.fillna(value=0) LinePlot.__init__(self, data, **kwargs) if not self.stacked: @@ -1554,7 +1741,7 @@ cls, ax: Axes, x, - y, + y: np.ndarray, style=None, column_num=None, stacking_id=None, @@ -1577,9 +1764,11 @@ if stacking_id is None: start = np.zeros(len(y)) elif (y >= 0).all(): - start = ax._stacker_pos_prior[stacking_id] + # TODO #54485 + start = ax._stacker_pos_prior[stacking_id] # type: ignore[attr-defined] elif (y <= 0).all(): - start = ax._stacker_neg_prior[stacking_id] + # TODO #54485 + start = ax._stacker_neg_prior[stacking_id] # type: ignore[attr-defined] else: start = np.zeros(len(y)) @@ -1593,9 +1782,6 @@ res = [rect] return res - def _args_adjust(self) -> None: - pass - def _post_plot_logic(self, ax: Axes, data) -> None: LinePlot._post_plot_logic(self, ax, data) @@ -1619,42 +1805,63 @@ def orientation(self) -> PlottingOrientation: return "vertical" - def __init__(self, data, **kwargs) -> None: + def __init__( + self, + data, + *, + align="center", + bottom=0, + left=0, + width=0.5, + position=0.5, + log=False, + **kwargs, + ) -> None: # we have to treat a series differently than a # 1-column DataFrame w.r.t. color handling self._is_series = isinstance(data, ABCSeries) - self.bar_width = kwargs.pop("width", 0.5) - pos = kwargs.pop("position", 0.5) - kwargs.setdefault("align", "center") + self.bar_width = width + self._align = align + self._position = position self.tick_pos = np.arange(len(data)) - self.bottom = kwargs.pop("bottom", 0) - self.left = kwargs.pop("left", 0) + if is_list_like(bottom): + bottom = np.array(bottom) + if is_list_like(left): + left = np.array(left) + self.bottom = bottom + self.left = left + + self.log = log - self.log = kwargs.pop("log", False) MPLPlot.__init__(self, data, **kwargs) + @cache_readonly + def ax_pos(self) -> np.ndarray: + return self.tick_pos - self.tickoffset + + @cache_readonly + def tickoffset(self): if self.stacked or self.subplots: - self.tickoffset = self.bar_width * pos - if kwargs["align"] == "edge": - self.lim_offset = self.bar_width / 2 - else: - self.lim_offset = 0 - elif kwargs["align"] == "edge": + return self.bar_width * self._position + elif self._align == "edge": w = self.bar_width / self.nseries - self.tickoffset = self.bar_width * (pos - 0.5) + w * 0.5 - self.lim_offset = w * 0.5 + return self.bar_width * (self._position - 0.5) + w * 0.5 else: - self.tickoffset = self.bar_width * pos - self.lim_offset = 0 + return self.bar_width * self._position - self.ax_pos = self.tick_pos - self.tickoffset - - def _args_adjust(self) -> None: - if is_list_like(self.bottom): - self.bottom = np.array(self.bottom) - if is_list_like(self.left): - self.left = np.array(self.left) + @cache_readonly + def lim_offset(self): + if self.stacked or self.subplots: + if self._align == "edge": + return self.bar_width / 2 + else: + return 0 + elif self._align == "edge": + w = self.bar_width / self.nseries + return w * 0.5 + else: + return 0 # error: Signature of "_plot" incompatible with supertype "MPLPlot" @classmethod @@ -1662,7 +1869,7 @@ cls, ax: Axes, x, - y, + y: np.ndarray, w, start: int | npt.NDArray[np.intp] = 0, log: bool = False, @@ -1674,14 +1881,15 @@ def _start_base(self): return self.bottom - def _make_plot(self) -> None: + def _make_plot(self, fig: Figure) -> None: colors = self._get_colors() ncolors = len(colors) pos_prior = neg_prior = np.zeros(len(self.data)) K = self.nseries - for i, (label, 
y) in enumerate(self._iter_data(fillna=0)): + data = self.data.fillna(0) + for i, (label, y) in enumerate(self._iter_data(data=data)): ax = self._get_ax(i) kwds = self.kwds.copy() if self._is_series: @@ -1705,6 +1913,7 @@ start = 1 start = start + self._start_base + kwds["align"] = self._align if self.subplots: w = self.bar_width / 2 rect = self._plot( @@ -1759,7 +1968,14 @@ self._decorate_ticks(ax, self._get_index_name(), str_index, s_edge, e_edge) - def _decorate_ticks(self, ax: Axes, name, ticklabels, start_edge, end_edge) -> None: + def _decorate_ticks( + self, + ax: Axes, + name: str | None, + ticklabels: list[str], + start_edge: float, + end_edge: float, + ) -> None: ax.set_xlim((start_edge, end_edge)) if self.xticks is not None: @@ -1793,7 +2009,7 @@ cls, ax: Axes, x, - y, + y: np.ndarray, w, start: int | npt.NDArray[np.intp] = 0, log: bool = False, @@ -1804,14 +2020,23 @@ def _get_custom_index_name(self): return self.ylabel - def _decorate_ticks(self, ax: Axes, name, ticklabels, start_edge, end_edge) -> None: + def _decorate_ticks( + self, + ax: Axes, + name: str | None, + ticklabels: list[str], + start_edge: float, + end_edge: float, + ) -> None: # horizontal bars ax.set_ylim((start_edge, end_edge)) ax.set_yticks(self.tick_pos) ax.set_yticklabels(ticklabels) if name is not None and self.use_index: ax.set_ylabel(name) - ax.set_xlabel(self.xlabel) + # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible type + # "Hashable | None"; expected "str" + ax.set_xlabel(self.xlabel) # type: ignore[arg-type] class PiePlot(MPLPlot): @@ -1827,20 +2052,30 @@ raise ValueError(f"{self._kind} plot doesn't allow negative values") MPLPlot.__init__(self, data, kind=kind, **kwargs) - def _args_adjust(self) -> None: - self.grid = False - self.logy = False - self.logx = False - self.loglog = False + @classmethod + def _validate_log_kwd( + cls, + kwd: str, + value: bool | None | Literal["sym"], + ) -> bool | None | Literal["sym"]: + super()._validate_log_kwd(kwd=kwd, value=value) + if value is not False: + warnings.warn( + f"PiePlot ignores the '{kwd}' keyword", + UserWarning, + stacklevel=find_stack_level(), + ) + return False - def _validate_color_args(self) -> None: - pass + def _validate_color_args(self, color, colormap) -> None: + # TODO: warn if color is passed and ignored? 
+ return None - def _make_plot(self) -> None: + def _make_plot(self, fig: Figure) -> None: colors = self._get_colors(num_colors=len(self.data), color_kwds="colors") self.kwds.setdefault("colors", colors) - for i, (label, y) in enumerate(self._iter_data()): + for i, (label, y) in enumerate(self._iter_data(data=self.data)): ax = self._get_ax(i) if label is not None: label = pprint_thing(label) diff -Nru pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/groupby.py pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/groupby.py --- pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/groupby.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/groupby.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,21 +7,26 @@ from pandas.core.dtypes.missing import remove_na_arraylike from pandas import ( - DataFrame, MultiIndex, - Series, concat, ) from pandas.plotting._matplotlib.misc import unpack_single_str_list if TYPE_CHECKING: + from collections.abc import Hashable + from pandas._typing import IndexLabel + from pandas import ( + DataFrame, + Series, + ) + def create_iter_data_given_by( data: DataFrame, kind: str = "hist" -) -> dict[str, DataFrame | Series]: +) -> dict[Hashable, DataFrame | Series]: """ Create data for iteration given `by` is assigned or not, and it is only used in both hist and boxplot. @@ -46,10 +51,10 @@ >>> import numpy as np >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')] - >>> mi = MultiIndex.from_tuples(tuples) + >>> mi = pd.MultiIndex.from_tuples(tuples) >>> value = [[1, 3, np.nan, np.nan], ... [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]] - >>> data = DataFrame(value, columns=mi) + >>> data = pd.DataFrame(value, columns=mi) >>> create_iter_data_given_by(data) {'h1': h1 a b @@ -102,7 +107,7 @@ Examples -------- >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]} - >>> df = DataFrame(d) + >>> df = pd.DataFrame(d) >>> reconstruct_data_with_by(df, by='h', cols=['a', 'b']) h1 h2 a b a b @@ -126,9 +131,7 @@ return data -def reformat_hist_y_given_by( - y: Series | np.ndarray, by: IndexLabel | None -) -> Series | np.ndarray: +def reformat_hist_y_given_by(y: np.ndarray, by: IndexLabel | None) -> np.ndarray: """Internal function to reformat y given `by` is applied or not for hist plot. 
If by is None, input y is 1-d with NaN removed; and if by is not None, groupby diff -Nru pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/hist.py pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/hist.py --- pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/hist.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/hist.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,7 +2,9 @@ from typing import ( TYPE_CHECKING, + Any, Literal, + final, ) import numpy as np @@ -39,10 +41,14 @@ if TYPE_CHECKING: from matplotlib.axes import Axes + from matplotlib.figure import Figure from pandas._typing import PlottingOrientation - from pandas import DataFrame + from pandas import ( + DataFrame, + Series, + ) class HistPlot(LinePlot): @@ -55,46 +61,50 @@ data, bins: int | np.ndarray | list[np.ndarray] = 10, bottom: int | np.ndarray = 0, + *, + range=None, + weights=None, **kwargs, ) -> None: - self.bins = bins # use mpl default + if is_list_like(bottom): + bottom = np.array(bottom) self.bottom = bottom + + self._bin_range = range + self.weights = weights + self.xlabel = kwargs.get("xlabel") self.ylabel = kwargs.get("ylabel") # Do not call LinePlot.__init__ which may fill nan MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called - def _args_adjust(self) -> None: - # calculate bin number separately in different subplots - # where subplots are created based on by argument - if is_integer(self.bins): + self.bins = self._adjust_bins(bins) + + def _adjust_bins(self, bins: int | np.ndarray | list[np.ndarray]): + if is_integer(bins): if self.by is not None: by_modified = unpack_single_str_list(self.by) grouped = self.data.groupby(by_modified)[self.columns] - self.bins = [self._calculate_bins(group) for key, group in grouped] + bins = [self._calculate_bins(group, bins) for key, group in grouped] else: - self.bins = self._calculate_bins(self.data) - - if is_list_like(self.bottom): - self.bottom = np.array(self.bottom) + bins = self._calculate_bins(self.data, bins) + return bins - def _calculate_bins(self, data: DataFrame) -> np.ndarray: + def _calculate_bins(self, data: Series | DataFrame, bins) -> np.ndarray: """Calculate bins given data""" nd_values = data.infer_objects(copy=False)._get_numeric_data() values = np.ravel(nd_values) values = values[~isna(values)] - hist, bins = np.histogram( - values, bins=self.bins, range=self.kwds.get("range", None) - ) + hist, bins = np.histogram(values, bins=bins, range=self._bin_range) return bins # error: Signature of "_plot" incompatible with supertype "LinePlot" @classmethod def _plot( # type: ignore[override] cls, - ax, - y, + ax: Axes, + y: np.ndarray, style=None, bottom: int | np.ndarray = 0, column_num: int = 0, @@ -113,7 +123,7 @@ cls._update_stacker(ax, stacking_id, n) return patches - def _make_plot(self) -> None: + def _make_plot(self, fig: Figure) -> None: colors = self._get_colors() stacking_id = self._get_stacking_id() @@ -124,10 +134,14 @@ else self.data ) - for i, (label, y) in enumerate(self._iter_data(data=data)): + # error: Argument "data" to "_iter_data" of "MPLPlot" has incompatible + # type "object"; expected "DataFrame | dict[Hashable, Series | DataFrame]" + for i, (label, y) in enumerate(self._iter_data(data=data)): # type: ignore[arg-type] ax = self._get_ax(i) kwds = self.kwds.copy() + if self.color is not None: + kwds["color"] = self.color label = pprint_thing(label) label = self._mark_right_label(label, index=i) @@ -137,7 +151,7 @@ if style is not None: kwds["style"] = style - kwds = 
self._make_plot_keywords(kwds, y) + self._make_plot_keywords(kwds, y) # the bins is multi-dimension array now and each plot need only 1-d and # when by is applied, label should be columns that are grouped @@ -146,21 +160,8 @@ kwds["label"] = self.columns kwds.pop("color") - # We allow weights to be a multi-dimensional array, e.g. a (10, 2) array, - # and each sub-array (10,) will be called in each iteration. If users only - # provide 1D array, we assume the same weights is used for all iterations - weights = kwds.get("weights", None) - if weights is not None: - if np.ndim(weights) != 1 and np.shape(weights)[-1] != 1: - try: - weights = weights[:, i] - except IndexError as err: - raise ValueError( - "weights must have the same shape as data, " - "or be a single column" - ) from err - weights = weights[~isna(y)] - kwds["weights"] = weights + if self.weights is not None: + kwds["weights"] = type(self)._get_column_weights(self.weights, i, y) y = reformat_hist_y_given_by(y, self.by) @@ -172,20 +173,47 @@ self._append_legend_handles_labels(artists[0], label) - def _make_plot_keywords(self, kwds, y): + def _make_plot_keywords(self, kwds: dict[str, Any], y: np.ndarray) -> None: """merge BoxPlot/KdePlot properties to passed kwds""" # y is required for KdePlot kwds["bottom"] = self.bottom kwds["bins"] = self.bins - return kwds + + @final + @staticmethod + def _get_column_weights(weights, i: int, y): + # We allow weights to be a multi-dimensional array, e.g. a (10, 2) array, + # and each sub-array (10,) will be called in each iteration. If users only + # provide 1D array, we assume the same weights is used for all iterations + if weights is not None: + if np.ndim(weights) != 1 and np.shape(weights)[-1] != 1: + try: + weights = weights[:, i] + except IndexError as err: + raise ValueError( + "weights must have the same shape as data, " + "or be a single column" + ) from err + weights = weights[~isna(y)] + return weights def _post_plot_logic(self, ax: Axes, data) -> None: if self.orientation == "horizontal": - ax.set_xlabel("Frequency" if self.xlabel is None else self.xlabel) - ax.set_ylabel(self.ylabel) + # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible + # type "Hashable"; expected "str" + ax.set_xlabel( + "Frequency" + if self.xlabel is None + else self.xlabel # type: ignore[arg-type] + ) + ax.set_ylabel(self.ylabel) # type: ignore[arg-type] else: - ax.set_xlabel(self.xlabel) - ax.set_ylabel("Frequency" if self.ylabel is None else self.ylabel) + ax.set_xlabel(self.xlabel) # type: ignore[arg-type] + ax.set_ylabel( + "Frequency" + if self.ylabel is None + else self.ylabel # type: ignore[arg-type] + ) @property def orientation(self) -> PlottingOrientation: @@ -204,17 +232,18 @@ def orientation(self) -> Literal["vertical"]: return "vertical" - def __init__(self, data, bw_method=None, ind=None, **kwargs) -> None: + def __init__( + self, data, bw_method=None, ind=None, *, weights=None, **kwargs + ) -> None: # Do not call LinePlot.__init__ which may fill nan MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called self.bw_method = bw_method self.ind = ind + self.weights = weights - def _args_adjust(self) -> None: - pass - - def _get_ind(self, y): - if self.ind is None: + @staticmethod + def _get_ind(y: np.ndarray, ind): + if ind is None: # np.nanmax() and np.nanmin() ignores the missing values sample_range = np.nanmax(y) - np.nanmin(y) ind = np.linspace( @@ -222,27 +251,26 @@ np.nanmax(y) + 0.5 * sample_range, 1000, ) - elif is_integer(self.ind): + elif 
is_integer(ind): sample_range = np.nanmax(y) - np.nanmin(y) ind = np.linspace( np.nanmin(y) - 0.5 * sample_range, np.nanmax(y) + 0.5 * sample_range, - self.ind, + ind, ) - else: - ind = self.ind return ind @classmethod - def _plot( + # error: Signature of "_plot" incompatible with supertype "MPLPlot" + def _plot( # type: ignore[override] cls, - ax, - y, + ax: Axes, + y: np.ndarray, style=None, bw_method=None, ind=None, column_num=None, - stacking_id=None, + stacking_id: int | None = None, **kwds, ): from scipy.stats import gaussian_kde @@ -254,18 +282,17 @@ lines = MPLPlot._plot(ax, ind, y, style=style, **kwds) return lines - def _make_plot_keywords(self, kwds, y): + def _make_plot_keywords(self, kwds: dict[str, Any], y: np.ndarray) -> None: kwds["bw_method"] = self.bw_method - kwds["ind"] = self._get_ind(y) - return kwds + kwds["ind"] = type(self)._get_ind(y, ind=self.ind) - def _post_plot_logic(self, ax, data) -> None: + def _post_plot_logic(self, ax: Axes, data) -> None: ax.set_ylabel("Density") def _grouped_plot( plotf, - data, + data: Series | DataFrame, column=None, by=None, numeric_only: bool = True, @@ -308,7 +335,7 @@ def _grouped_hist( - data, + data: Series | DataFrame, column=None, by=None, ax=None, @@ -390,7 +417,7 @@ def hist_series( - self, + self: Series, by=None, ax=None, grid: bool = True, @@ -430,8 +457,14 @@ ax.grid(grid) axes = np.array([ax]) + # error: Argument 1 to "set_ticks_props" has incompatible type "ndarray[Any, + # dtype[Any]]"; expected "Axes | Sequence[Axes]" set_ticks_props( - axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + axes, # type: ignore[arg-type] + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, ) else: @@ -462,7 +495,7 @@ def hist_frame( - data, + data: DataFrame, column=None, by=None, grid: bool = True, diff -Nru pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/style.py pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/style.py --- pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/style.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/style.py 2024-04-10 17:42:52.000000000 +0000 @@ -269,7 +269,9 @@ """ conv = matplotlib.colors.ColorConverter() try: - conv.to_rgba(color) + # error: Argument 1 to "to_rgba" of "ColorConverter" has incompatible type + # "str | Sequence[float]"; expected "tuple[float, float, float] | ..." 
+ conv.to_rgba(color) # type: ignore[arg-type] except ValueError: return False else: diff -Nru pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/timeseries.py pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/timeseries.py --- pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/timeseries.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/timeseries.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,8 +5,10 @@ import functools from typing import ( TYPE_CHECKING, + Any, cast, ) +import warnings import numpy as np @@ -15,7 +17,10 @@ Period, to_offset, ) -from pandas._libs.tslibs.dtypes import FreqGroup +from pandas._libs.tslibs.dtypes import ( + OFFSET_TO_PERIOD_FREQSTR, + FreqGroup, +) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -40,10 +45,13 @@ from matplotlib.axes import Axes + from pandas._typing import NDFrameT + from pandas import ( DataFrame, DatetimeIndex, Index, + PeriodIndex, Series, ) @@ -51,8 +59,15 @@ # Plotting functions and monkey patches -def maybe_resample(series: Series, ax: Axes, kwargs): +def maybe_resample(series: Series, ax: Axes, kwargs: dict[str, Any]): # resample against axes freq if necessary + + if "how" in kwargs: + raise ValueError( + "'how' is not a valid keyword for plotting functions. If plotting " + "multiple objects on shared axes, resample manually first." + ) + freq, ax_freq = _get_freq(ax, series) if freq is None: # pragma: no cover @@ -71,9 +86,12 @@ ) freq = ax_freq elif _is_sup(freq, ax_freq): # one is weekly - how = kwargs.pop("how", "last") - series = getattr(series.resample("D"), how)().dropna() - series = getattr(series.resample(ax_freq), how)().dropna() + # Resampling with PeriodDtype is deprecated, so we convert to + # DatetimeIndex, resample, then convert back. + ser_ts = series.to_timestamp() + ser_d = ser_ts.resample("D").last().dropna() + ser_freq = ser_d.resample(ax_freq).last().dropna() + series = ser_freq.to_period(ax_freq) freq = ax_freq elif is_subperiod(freq, ax_freq) or _is_sub(freq, ax_freq): _upsample_others(ax, freq, kwargs) @@ -94,10 +112,10 @@ ) -def _upsample_others(ax: Axes, freq, kwargs) -> None: +def _upsample_others(ax: Axes, freq: BaseOffset, kwargs: dict[str, Any]) -> None: legend = ax.get_legend() - lines, labels = _replot_ax(ax, freq, kwargs) - _replot_ax(ax, freq, kwargs) + lines, labels = _replot_ax(ax, freq) + _replot_ax(ax, freq) other_ax = None if hasattr(ax, "left_ax"): @@ -106,25 +124,26 @@ other_ax = ax.right_ax if other_ax is not None: - rlines, rlabels = _replot_ax(other_ax, freq, kwargs) + rlines, rlabels = _replot_ax(other_ax, freq) lines.extend(rlines) labels.extend(rlabels) if legend is not None and kwargs.get("legend", True) and len(lines) > 0: - title = legend.get_title().get_text() + title: str | None = legend.get_title().get_text() if title == "None": title = None ax.legend(lines, labels, loc="best", title=title) -def _replot_ax(ax: Axes, freq, kwargs): +def _replot_ax(ax: Axes, freq: BaseOffset): data = getattr(ax, "_plot_data", None) # clear current axes and data - ax._plot_data = [] + # TODO #54485 + ax._plot_data = [] # type: ignore[attr-defined] ax.clear() - decorate_axes(ax, freq, kwargs) + decorate_axes(ax, freq) lines = [] labels = [] @@ -133,7 +152,8 @@ series = series.copy() idx = series.index.asfreq(freq, how="S") series.index = idx - ax._plot_data.append((series, plotf, kwds)) + # TODO #54485 + ax._plot_data.append((series, plotf, kwds)) # type: ignore[attr-defined] # for tsplot if isinstance(plotf, str): @@ -147,20 +167,17 @@ return lines, labels -def 
decorate_axes(ax: Axes, freq, kwargs) -> None: +def decorate_axes(ax: Axes, freq: BaseOffset) -> None: """Initialize axes for time-series plotting""" if not hasattr(ax, "_plot_data"): - ax._plot_data = [] + # TODO #54485 + ax._plot_data = [] # type: ignore[attr-defined] - ax.freq = freq + # TODO #54485 + ax.freq = freq # type: ignore[attr-defined] xaxis = ax.get_xaxis() - xaxis.freq = freq - if not hasattr(ax, "legendlabels"): - ax.legendlabels = [kwargs.get("label", None)] - else: - ax.legendlabels.append(kwargs.get("label", None)) - ax.view_interval = None - ax.date_axis_info = None + # TODO #54485 + xaxis.freq = freq # type: ignore[attr-defined] def _get_ax_freq(ax: Axes): @@ -188,7 +205,10 @@ def _get_period_alias(freq: timedelta | BaseOffset | str) -> str | None: - freqstr = to_offset(freq).rule_code + if isinstance(freq, BaseOffset): + freqstr = freq.name + else: + freqstr = to_offset(freq, is_period=True).rule_code return get_period_alias(freqstr) @@ -198,7 +218,7 @@ freq = getattr(series.index, "freq", None) if freq is None: freq = getattr(series.index, "inferred_freq", None) - freq = to_offset(freq) + freq = to_offset(freq, is_period=True) ax_freq = _get_ax_freq(ax) @@ -232,7 +252,10 @@ # FIXME: hack this for 0.10.1, creating more technical debt...sigh if isinstance(data.index, ABCDatetimeIndex): # error: "BaseOffset" has no attribute "_period_dtype_code" - base = to_offset(freq_str)._period_dtype_code # type: ignore[attr-defined] + freq_str = OFFSET_TO_PERIOD_FREQSTR.get(freq_str, freq_str) + base = to_offset( + freq_str, is_period=True + )._period_dtype_code # type: ignore[attr-defined] x = data.index if base <= FreqGroup.FR_DAY.value: return x[:1].is_normalized @@ -256,7 +279,7 @@ return freq -def maybe_convert_index(ax: Axes, data): +def maybe_convert_index(ax: Axes, data: NDFrameT) -> NDFrameT: # tsplot converts automatically, but don't want to convert index # over and over for DataFrames if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)): @@ -276,8 +299,6 @@ freq_str = _get_period_alias(freq) - import warnings - with warnings.catch_warnings(): # suppress Period[B] deprecation warning # TODO: need to find an alternative to this before the deprecation @@ -295,8 +316,7 @@ return data -# Patch methods for subplot. Only format_dateaxis is currently used. -# Do we need the rest for convenience? +# Patch methods for subplot. def _format_coord(freq, t, y) -> str: @@ -304,7 +324,9 @@ return f"t = {time_period} y = {y:8f}" -def format_dateaxis(subplot, freq, index) -> None: +def format_dateaxis( + subplot, freq: BaseOffset, index: DatetimeIndex | PeriodIndex +) -> None: """ Pretty-formats the date axis (x-axis). 
diff -Nru pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/tools.py pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/tools.py --- pandas-2.1.4+dfsg/pandas/plotting/_matplotlib/tools.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/plotting/_matplotlib/tools.py 2024-04-10 17:42:52.000000000 +0000 @@ -52,10 +52,12 @@ def format_date_labels(ax: Axes, rot) -> None: # mini version of autofmt_xdate for label in ax.get_xticklabels(): - label.set_ha("right") + label.set_horizontalalignment("right") label.set_rotation(rot) fig = ax.get_figure() - maybe_adjust_figure(fig, bottom=0.2) + if fig is not None: + # should always be a Figure but can technically be None + maybe_adjust_figure(fig, bottom=0.2) def table( @@ -76,8 +78,14 @@ cellText = data.values + # error: Argument "cellText" to "table" has incompatible type "ndarray[Any, + # Any]"; expected "Sequence[Sequence[str]] | None" return matplotlib.table.table( - ax, cellText=cellText, rowLabels=rowLabels, colLabels=colLabels, **kwargs + ax, + cellText=cellText, # type: ignore[arg-type] + rowLabels=rowLabels, + colLabels=colLabels, + **kwargs, ) @@ -369,12 +377,12 @@ "_has_externally_shared_axis() needs 'x' or 'y' as a second parameter" ) - axes = axes.get_siblings(ax1) + axes_siblings = axes.get_siblings(ax1) # Retain ax1 and any of its siblings which aren't in the same position as it ax1_points = ax1.get_position().get_points() - for ax2 in axes: + for ax2 in axes_siblings: if not np.array_equal(ax1_points, ax2.get_position().get_points()): return True diff -Nru pandas-2.1.4+dfsg/pandas/plotting/_misc.py pandas-2.2.2+dfsg/pandas/plotting/_misc.py --- pandas-2.1.4+dfsg/pandas/plotting/_misc.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/plotting/_misc.py 2024-04-10 17:42:52.000000000 +0000 @@ -329,7 +329,7 @@ **kwargs, ) -> Axes: """ - Generate a matplotlib plot for visualising clusters of multivariate data. + Generate a matplotlib plot for visualizing clusters of multivariate data. Andrews curves have the functional form: @@ -439,7 +439,7 @@ :context: close-figs >>> s = pd.Series(np.random.uniform(size=100)) - >>> pd.plotting.bootstrap_plot(s) + >>> pd.plotting.bootstrap_plot(s) # doctest: +SKIP
""" plot_backend = _get_plot_backend("matplotlib") diff -Nru pandas-2.1.4+dfsg/pandas/tests/apply/conftest.py pandas-2.2.2+dfsg/pandas/tests/apply/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/apply/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/apply/conftest.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -import numpy as np -import pytest - -from pandas import DataFrame - - -@pytest.fixture -def int_frame_const_col(): - """ - Fixture for DataFrame of ints which are constant per column - - Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3] - """ - df = DataFrame( - np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, - columns=["A", "B", "C"], - ) - return df diff -Nru pandas-2.1.4+dfsg/pandas/tests/apply/test_frame_apply.py pandas-2.2.2+dfsg/pandas/tests/apply/test_frame_apply.py --- pandas-2.1.4+dfsg/pandas/tests/apply/test_frame_apply.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/apply/test_frame_apply.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,20 +18,44 @@ from pandas.tests.frame.common import zip_frames -def test_apply(float_frame): +@pytest.fixture +def int_frame_const_col(): + """ + Fixture for DataFrame of ints which are constant per column + + Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3] + """ + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) + return df + + +@pytest.fixture(params=["python", pytest.param("numba", marks=pytest.mark.single_cpu)]) +def engine(request): + if request.param == "numba": + pytest.importorskip("numba") + return request.param + + +def test_apply(float_frame, engine, request): + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine not supporting numpy ufunc yet") + request.node.add_marker(mark) with np.errstate(all="ignore"): # ufunc result = np.sqrt(float_frame["A"]) - expected = float_frame.apply(np.sqrt)["A"] + expected = float_frame.apply(np.sqrt, engine=engine)["A"] tm.assert_series_equal(result, expected) # aggregator - result = float_frame.apply(np.mean)["A"] + result = float_frame.apply(np.mean, engine=engine)["A"] expected = np.mean(float_frame["A"]) assert result == expected d = float_frame.index[0] - result = float_frame.apply(np.mean, axis=1) + result = float_frame.apply(np.mean, axis=1, engine=engine) expected = np.mean(float_frame.xs(d)) assert result[d] == expected assert result.index is float_frame.index @@ -39,8 +63,13 @@ @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) -def test_apply_args(float_frame, axis, raw): - result = float_frame.apply(lambda x, y: x + y, axis, args=(1,), raw=raw) +def test_apply_args(float_frame, axis, raw, engine, request): + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine doesn't support args") + request.node.add_marker(mark) + result = float_frame.apply( + lambda x, y: x + y, axis, args=(1,), raw=raw, engine=engine + ) expected = float_frame + 1 tm.assert_frame_equal(result, expected) @@ -87,30 +116,30 @@ @pytest.mark.parametrize("func", [np.sqrt, np.mean]) -def test_apply_empty(func): +def test_apply_empty(func, engine): # empty empty_frame = DataFrame() - result = empty_frame.apply(func) + result = empty_frame.apply(func, engine=engine) assert result.empty -def test_apply_float_frame(float_frame): +def test_apply_float_frame(float_frame, engine): no_rows = float_frame[:0] - result = no_rows.apply(lambda x: x.mean()) + result = no_rows.apply(lambda x: 
x.mean(), engine=engine) expected = Series(np.nan, index=float_frame.columns) tm.assert_series_equal(result, expected) no_cols = float_frame.loc[:, []] - result = no_cols.apply(lambda x: x.mean(), axis=1) + result = no_cols.apply(lambda x: x.mean(), axis=1, engine=engine) expected = Series(np.nan, index=float_frame.index) tm.assert_series_equal(result, expected) -def test_apply_empty_except_index(): +def test_apply_empty_except_index(engine): # GH 2476 expected = DataFrame(index=["a"]) - result = expected.apply(lambda x: x["a"], axis=1) + result = expected.apply(lambda x: x["a"], axis=1, engine=engine) tm.assert_frame_equal(result, expected) @@ -235,36 +264,52 @@ @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_float_frame(float_frame, axis): +def test_apply_raw_float_frame(float_frame, axis, engine): + if engine == "numba": + pytest.skip("numba can't handle when UDF returns None.") + def _assert_raw(x): assert isinstance(x, np.ndarray) assert x.ndim == 1 - float_frame.apply(_assert_raw, axis=axis, raw=True) + float_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_float_frame_lambda(float_frame, axis): - result = float_frame.apply(np.mean, axis=axis, raw=True) +def test_apply_raw_float_frame_lambda(float_frame, axis, engine): + result = float_frame.apply(np.mean, axis=axis, engine=engine, raw=True) expected = float_frame.apply(lambda x: x.values.mean(), axis=axis) tm.assert_series_equal(result, expected) -def test_apply_raw_float_frame_no_reduction(float_frame): +def test_apply_raw_float_frame_no_reduction(float_frame, engine): # no reduction - result = float_frame.apply(lambda x: x * 2, raw=True) + result = float_frame.apply(lambda x: x * 2, engine=engine, raw=True) expected = float_frame * 2 tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_raw_mixed_type_frame(mixed_type_frame, axis): +def test_apply_raw_mixed_type_frame(axis, engine): + if engine == "numba": + pytest.skip("isinstance check doesn't work with numba") + def _assert_raw(x): assert isinstance(x, np.ndarray) assert x.ndim == 1 # Mixed dtype (GH-32423) - mixed_type_frame.apply(_assert_raw, axis=axis, raw=True) + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "float32": np.array([1.0] * 10, dtype="float32"), + "int32": np.array([1] * 10, dtype="int32"), + }, + index=np.arange(10), + ) + df.apply(_assert_raw, axis=axis, engine=engine, raw=True) def test_apply_axis1(float_frame): @@ -301,14 +346,14 @@ ) @pytest.mark.parametrize("raw", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_empty_infer_type(ax, func, raw, axis): +def test_apply_empty_infer_type(ax, func, raw, axis, engine, request): df = DataFrame(**{ax: ["a", "b", "c"]}) with np.errstate(all="ignore"): test_res = func(np.array([], dtype="f8")) is_reduction = not isinstance(test_res, np.ndarray) - result = df.apply(func, axis=axis, raw=raw) + result = df.apply(func, axis=axis, engine=engine, raw=raw) if is_reduction: agg_axis = df._get_agg_axis(axis) assert isinstance(result, Series) @@ -608,8 +653,10 @@ assert names == list(df.index) -def test_apply_raw_function_runs_once(): +def test_apply_raw_function_runs_once(engine): # https://github.com/pandas-dev/pandas/issues/34506 + if engine == "numba": + pytest.skip("appending to list outside of numba func is not supported") df = DataFrame({"a": [1, 2, 3]}) values = [] # Save row values function is applied to @@ -624,7 +671,7 @@ for func in [reducing_function, 
non_reducing_function]: del values[:] - df.apply(func, raw=True, axis=1) + df.apply(func, engine=engine, raw=True, axis=1) assert values == list(df.a.to_list()) @@ -818,7 +865,7 @@ { "a": Series(np.random.default_rng(2).standard_normal(4)), "b": ["a", "list", "of", "words"], - "ts": date_range("2016-10-01", periods=4, freq="H"), + "ts": date_range("2016-10-01", periods=4, freq="h"), } ) @@ -960,45 +1007,69 @@ tm.assert_frame_equal(result, expected) -def test_result_type_broadcast(int_frame_const_col): +def test_result_type_broadcast(int_frame_const_col, request, engine): # result_type should be consistent no matter which # path we take in the code + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine doesn't support list return") + request.node.add_marker(mark) df = int_frame_const_col # broadcast result - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") + result = df.apply( + lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine + ) expected = df.copy() tm.assert_frame_equal(result, expected) -def test_result_type_broadcast_series_func(int_frame_const_col): +def test_result_type_broadcast_series_func(int_frame_const_col, engine, request): # result_type should be consistent no matter which # path we take in the code + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba Series constructor only support ndarrays not list data" + ) + request.node.add_marker(mark) df = int_frame_const_col columns = ["other", "col", "names"] result = df.apply( - lambda x: Series([1, 2, 3], index=columns), axis=1, result_type="broadcast" + lambda x: Series([1, 2, 3], index=columns), + axis=1, + result_type="broadcast", + engine=engine, ) expected = df.copy() tm.assert_frame_equal(result, expected) -def test_result_type_series_result(int_frame_const_col): +def test_result_type_series_result(int_frame_const_col, engine, request): # result_type should be consistent no matter which # path we take in the code + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba Series constructor only support ndarrays not list data" + ) + request.node.add_marker(mark) df = int_frame_const_col # series result - result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1, engine=engine) expected = df.copy() tm.assert_frame_equal(result, expected) -def test_result_type_series_result_other_index(int_frame_const_col): +def test_result_type_series_result_other_index(int_frame_const_col, engine, request): # result_type should be consistent no matter which # path we take in the code + + if engine == "numba": + mark = pytest.mark.xfail( + reason="no support in numba Series constructor for list of columns" + ) + request.node.add_marker(mark) df = int_frame_const_col # series result with other index columns = ["other", "col", "names"] - result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1) + result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1, engine=engine) expected = df.copy() expected.columns = columns tm.assert_frame_equal(result, expected) @@ -1232,7 +1303,7 @@ result = df.agg(["min"]) expected = DataFrame( - [[1, 1.0, "bar", Timestamp("20130101")]], + [[1, 1.0, "bar", Timestamp("20130101").as_unit("ns")]], index=["min"], columns=df.columns, ) @@ -1358,25 +1429,34 @@ @pytest.mark.parametrize("num_cols", [2, 3, 5]) -def test_frequency_is_original(num_cols): +def test_frequency_is_original(num_cols, engine, request): # GH 22150 + if engine == 
"numba": + mark = pytest.mark.xfail(reason="numba engine only supports numeric indices") + request.node.add_marker(mark) index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"]) original = index.copy() df = DataFrame(1, index=index, columns=range(num_cols)) - df.apply(lambda x: x) + df.apply(lambda x: x, engine=engine) assert index.freq == original.freq -def test_apply_datetime_tz_issue(): +def test_apply_datetime_tz_issue(engine, request): # GH 29052 + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba engine doesn't support non-numeric indexes" + ) + request.node.add_marker(mark) + timestamps = [ Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), ] df = DataFrame(data=[0, 1, 2], index=timestamps) - result = df.apply(lambda x: x.name, axis=1) + result = df.apply(lambda x: x.name, axis=1, engine=engine) expected = Series(index=timestamps, data=timestamps) tm.assert_series_equal(result, expected) @@ -1384,13 +1464,16 @@ @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) @pytest.mark.parametrize("method", ["min", "max", "sum"]) -def test_mixed_column_raises(df, method): +def test_mixed_column_raises(df, method, using_infer_string): # GH 16832 if method == "sum": - msg = r'can only concatenate str \(not "int"\) to str' + msg = r'can only concatenate str \(not "int"\) to str|does not support' else: msg = "not supported between instances of 'str' and 'float'" - with pytest.raises(TypeError, match=msg): + if not using_infer_string: + with pytest.raises(TypeError, match=msg): + getattr(df, method)() + else: getattr(df, method)() @@ -1404,7 +1487,7 @@ tm.assert_series_equal(result, expected) -def test_apply_mutating(using_array_manager, using_copy_on_write): +def test_apply_mutating(using_array_manager, using_copy_on_write, warn_copy_on_write): # GH#35462 case where applied func pins a new BlockManager to a row df = DataFrame({"a": range(100), "b": range(100, 200)}) df_orig = df.copy() @@ -1418,7 +1501,8 @@ expected = df.copy() expected["a"] += 1 - result = df.apply(func, axis=1) + with tm.assert_cow_warning(warn_copy_on_write): + result = df.apply(func, axis=1) tm.assert_frame_equal(result, expected) if using_copy_on_write or using_array_manager: @@ -1439,10 +1523,15 @@ tm.assert_series_equal(result, expected) -def test_apply_no_suffix_index(): +def test_apply_no_suffix_index(engine, request): # GH36189 + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba engine doesn't support list-likes/dict-like callables" + ) + request.node.add_marker(mark) pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"]) - result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) + result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()], engine=engine) expected = DataFrame( {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""] ) @@ -1450,10 +1539,12 @@ tm.assert_frame_equal(result, expected) -def test_apply_raw_returns_string(): +def test_apply_raw_returns_string(engine): # https://github.com/pandas-dev/pandas/issues/35940 + if engine == "numba": + pytest.skip("No object dtype support in numba") df = DataFrame({"A": ["aa", "bbb"]}) - result = df.apply(lambda x: x[0], axis=1, raw=True) + result = df.apply(lambda x: x[0], engine=engine, axis=1, raw=True) expected = Series(["aa", "bbb"]) tm.assert_series_equal(result, expected) @@ -1489,10 +1580,17 @@ tm.assert_frame_equal(result, expected) -def 
test_apply_getitem_axis_1(): +def test_apply_getitem_axis_1(engine, request): # GH 13427 + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba engine not supporting duplicate index values" + ) + request.node.add_marker(mark) df = DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]}) - result = df[["a", "a"]].apply(lambda x: x.iloc[0] + x.iloc[1], axis=1) + result = df[["a", "a"]].apply( + lambda x: x.iloc[0] + x.iloc[1], axis=1, engine=engine + ) expected = Series([0, 2, 4]) tm.assert_series_equal(result, expected) @@ -1532,10 +1630,10 @@ tm.assert_series_equal(result, expected) -def test_apply_on_empty_dataframe(): +def test_apply_on_empty_dataframe(engine): # GH 39111 df = DataFrame({"a": [1, 2], "b": [3, 0]}) - result = df.head(0).apply(lambda x: max(x["a"], x["b"]), axis=1) + result = df.head(0).apply(lambda x: max(x["a"], x["b"]), axis=1, engine=engine) expected = Series([], dtype=np.float64) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/apply/test_frame_transform.py pandas-2.2.2+dfsg/pandas/tests/apply/test_frame_transform.py --- pandas-2.1.4+dfsg/pandas/tests/apply/test_frame_transform.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/apply/test_frame_transform.py 2024-04-10 17:42:52.000000000 +0000 @@ -156,7 +156,7 @@ def test_transform_bad_dtype(op, frame_or_series, request): # GH 35964 if op == "ngroup": - request.node.add_marker( + request.applymarker( pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame") ) @@ -185,7 +185,7 @@ # GH 35964 if op == "ngroup": - request.node.add_marker( + request.applymarker( pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame") ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/apply/test_invalid_arg.py pandas-2.2.2+dfsg/pandas/tests/apply/test_invalid_arg.py --- pandas-2.1.4+dfsg/pandas/tests/apply/test_invalid_arg.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/apply/test_invalid_arg.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,15 +18,17 @@ DataFrame, Series, date_range, - notna, ) import pandas._testing as tm @pytest.mark.parametrize("result_type", ["foo", 1]) -def test_result_type_error(result_type, int_frame_const_col): +def test_result_type_error(result_type): # allowed result_type - df = int_frame_const_col + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) msg = ( "invalid value for result_type, must be one of " @@ -202,11 +204,6 @@ row["D"] = 7 return row - def transform2(row): - if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo": - row["D"] = 7 - return row - msg = "'float' object has no attribute 'startswith'" with pytest.raises(AttributeError, match=msg): data.apply(transform, axis=1) @@ -218,9 +215,14 @@ DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] ), ) -def test_agg_cython_table_raises_frame(df, func, expected, axis): +def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 - msg = "can't multiply sequence by non-int of type 'str'" + if using_infer_string: + import pyarrow as pa + + expected = (expected, pa.lib.ArrowNotImplementedError) + + msg = "can't multiply sequence by non-int of type 'str'|has no kernel" warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): with tm.assert_produces_warning(warn, match="using DataFrame.cumprod"): @@ -243,11 +245,18 @@ ) ), ) -def test_agg_cython_table_raises_series(series, func, expected): 
+def test_agg_cython_table_raises_series(series, func, expected, using_infer_string): # GH21224 msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" if func == "median" or func is np.nanmedian or func is np.median: msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" + + if using_infer_string: + import pyarrow as pa + + expected = (expected, pa.lib.ArrowNotImplementedError) + + msg = msg + "|does not support|has no kernel" warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): @@ -280,8 +289,11 @@ lambda x: Series([1, 2]), ], ) -def test_apply_broadcast_error(int_frame_const_col, func): - df = int_frame_const_col +def test_apply_broadcast_error(func): + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) # > 1 ndim msg = "too many dims to broadcast|cannot broadcast result" @@ -330,11 +342,8 @@ # we are trying to transform with an aggregator msg = "Function did not transform" - warn = RuntimeWarning if func[0] == "sqrt" else None - warn_msg = "invalid value encountered in sqrt" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False): - string_series.transform(func) + string_series.transform(func) @pytest.mark.parametrize( diff -Nru pandas-2.1.4+dfsg/pandas/tests/apply/test_numba.py pandas-2.2.2+dfsg/pandas/tests/apply/test_numba.py --- pandas-2.1.4+dfsg/pandas/tests/apply/test_numba.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/apply/test_numba.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,118 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + DataFrame, + Index, +) +import pandas._testing as tm + +pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu] + + +@pytest.fixture(params=[0, 1]) +def apply_axis(request): + return request.param + + +def test_numba_vs_python_noop(float_frame, apply_axis): + func = lambda x: x + result = float_frame.apply(func, engine="numba", axis=apply_axis) + expected = float_frame.apply(func, engine="python", axis=apply_axis) + tm.assert_frame_equal(result, expected) + + +def test_numba_vs_python_string_index(): + # GH#56189 + pytest.importorskip("pyarrow") + df = DataFrame( + 1, + index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + ) + func = lambda x: x + result = df.apply(func, engine="numba", axis=0) + expected = df.apply(func, engine="python", axis=0) + tm.assert_frame_equal( + result, expected, check_column_type=False, check_index_type=False + ) + + +def test_numba_vs_python_indexing(): + frame = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, + index=Index(["A", "B", "C"]), + ) + row_func = lambda x: x["c"] + result = frame.apply(row_func, engine="numba", axis=1) + expected = frame.apply(row_func, engine="python", axis=1) + tm.assert_series_equal(result, expected) + + col_func = lambda x: x["A"] + result = frame.apply(col_func, engine="numba", axis=0) + expected = frame.apply(col_func, engine="python", axis=0) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "reduction", + [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()], +) +def test_numba_vs_python_reductions(reduction, apply_axis): + df = DataFrame(np.ones((4, 4), dtype=np.float64)) + result = df.apply(reduction, engine="numba", axis=apply_axis) + expected = df.apply(reduction, 
engine="python", axis=apply_axis) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("colnames", [[1, 2, 3], [1.0, 2.0, 3.0]]) +def test_numba_numeric_colnames(colnames): + # Check that numeric column names lower properly and can be indxed on + df = DataFrame( + np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int64), columns=colnames + ) + first_col = colnames[0] + f = lambda x: x[first_col] # Get the first column + result = df.apply(f, engine="numba", axis=1) + expected = df.apply(f, engine="python", axis=1) + tm.assert_series_equal(result, expected) + + +def test_numba_parallel_unsupported(float_frame): + f = lambda x: x + with pytest.raises( + NotImplementedError, + match="Parallel apply is not supported when raw=False and engine='numba'", + ): + float_frame.apply(f, engine="numba", engine_kwargs={"parallel": True}) + + +def test_numba_nonunique_unsupported(apply_axis): + f = lambda x: x + df = DataFrame({"a": [1, 2]}, index=Index(["a", "a"])) + with pytest.raises( + NotImplementedError, + match="The index/columns must be unique when raw=False and engine='numba'", + ): + df.apply(f, engine="numba", axis=apply_axis) + + +def test_numba_unsupported_dtypes(apply_axis): + f = lambda x: x + df = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]}) + df["c"] = df["c"].astype("double[pyarrow]") + + with pytest.raises( + ValueError, + match="Column b must have a numeric dtype. Found 'object|string' instead", + ): + df.apply(f, engine="numba", axis=apply_axis) + + with pytest.raises( + ValueError, + match="Column c is backed by an extension array, " + "which is not supported by the numba engine.", + ): + df["c"].to_frame().apply(f, engine="numba", axis=apply_axis) diff -Nru pandas-2.1.4+dfsg/pandas/tests/apply/test_series_apply.py pandas-2.2.2+dfsg/pandas/tests/apply/test_series_apply.py --- pandas-2.1.4+dfsg/pandas/tests/apply/test_series_apply.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/apply/test_series_apply.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,7 @@ MultiIndex, Series, concat, + date_range, timedelta_range, ) import pandas._testing as tm @@ -134,7 +135,7 @@ def test_series_apply_map_box_timestamps(by_row): # GH#2689, GH#2627 - ser = Series(pd.date_range("1/1/2000", periods=10)) + ser = Series(date_range("1/1/2000", periods=10)) def func(x): return (x.hour, x.day, x.month) @@ -150,51 +151,55 @@ tm.assert_series_equal(result, expected) -def test_apply_box(): +def test_apply_box_dt64(): # ufunc will not be boxed. 
Same test cases as the test_map_box vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] - s = Series(vals) - assert s.dtype == "datetime64[ns]" + ser = Series(vals, dtype="M8[ns]") + assert ser.dtype == "datetime64[ns]" # boxed value must be Timestamp instance - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat") + res = ser.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat") exp = Series(["Timestamp_1_None", "Timestamp_2_None"]) tm.assert_series_equal(res, exp) + +def test_apply_box_dt64tz(): vals = [ pd.Timestamp("2011-01-01", tz="US/Eastern"), pd.Timestamp("2011-01-02", tz="US/Eastern"), ] - s = Series(vals) - assert s.dtype == "datetime64[ns, US/Eastern]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat") + ser = Series(vals, dtype="M8[ns, US/Eastern]") + assert ser.dtype == "datetime64[ns, US/Eastern]" + res = ser.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat") exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) tm.assert_series_equal(res, exp) + +def test_apply_box_td64(): # timedelta vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] - s = Series(vals) - assert s.dtype == "timedelta64[ns]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.days}", by_row="compat") + ser = Series(vals) + assert ser.dtype == "timedelta64[ns]" + res = ser.apply(lambda x: f"{type(x).__name__}_{x.days}", by_row="compat") exp = Series(["Timedelta_1", "Timedelta_2"]) tm.assert_series_equal(res, exp) + +def test_apply_box_period(): # period vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] - s = Series(vals) - assert s.dtype == "Period[M]" - res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}", by_row="compat") + ser = Series(vals) + assert ser.dtype == "Period[M]" + res = ser.apply(lambda x: f"{type(x).__name__}_{x.freqstr}", by_row="compat") exp = Series(["Period_M", "Period_M"]) tm.assert_series_equal(res, exp) def test_apply_datetimetz(by_row): - values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( - "Asia/Tokyo" - ) + values = date_range("2011-01-01", "2011-01-02", freq="h").tz_localize("Asia/Tokyo") s = Series(values, name="XX") result = s.apply(lambda x: x + pd.offsets.Day(), by_row=by_row) - exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + exp_values = date_range("2011-01-02", "2011-01-03", freq="h").tz_localize( "Asia/Tokyo" ) exp = Series(exp_values, name="XX") @@ -213,10 +218,10 @@ exp = Series(["Asia/Tokyo"] * 25, name="XX") tm.assert_series_equal(result, exp) else: - result == "Asia/Tokyo" + assert result == "Asia/Tokyo" -def test_apply_categorical(by_row): +def test_apply_categorical(by_row, using_infer_string): values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) ser = Series(values, name="XX", index=list("abcdefg")) @@ -239,7 +244,7 @@ result = ser.apply(lambda x: "A") exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object + assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]" @pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]]) @@ -261,7 +266,7 @@ def test_apply_empty_integer_series_with_datetime_index(by_row): # GH 21245 - s = Series([], index=pd.date_range(start="2018-01-01", periods=0), dtype=int) + s = Series([], index=date_range(start="2018-01-01", periods=0), dtype=int) result = s.apply(lambda x: x, by_row=by_row) 
tm.assert_series_equal(result, s) @@ -321,7 +326,7 @@ def test_transform_partial_failure(op, request): # GH 35964 if op in ("ffill", "bfill", "pad", "backfill", "shift"): - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason=f"{op} is successful on any dtype") ) @@ -504,8 +509,12 @@ DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype="int64"), ), ( - tm.makeTimeSeries(nper=30), - DataFrame(np.repeat([[1, 2]], 30, axis=0), dtype="int64"), + Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ), + DataFrame(np.repeat([[1, 2]], 10, axis=0), dtype="int64"), ), ], ) @@ -522,12 +531,15 @@ @pytest.mark.parametrize( - "by_row, expected", [("compat", Series(np.ones(30), dtype="int64")), (False, 1)] + "by_row, expected", [("compat", Series(np.ones(10), dtype="int64")), (False, 1)] ) def test_apply_scalar_on_date_time_index_aware_series(by_row, expected): # GH 25959 # Calling apply on a localized time series should not cause an error - series = tm.makeTimeSeries(nper=30).tz_localize("UTC") + series = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10, tz="UTC"), + ) result = Series(series.index).apply(lambda x: 1, by_row=by_row) tm.assert_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/apply/test_str.py pandas-2.2.2+dfsg/pandas/tests/apply/test_str.py --- pandas-2.1.4+dfsg/pandas/tests/apply/test_str.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/apply/test_str.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,7 +31,7 @@ @pytest.mark.parametrize("how", ["agg", "apply"]) def test_apply_with_string_funcs(request, float_frame, func, args, kwds, how): if len(args) > 1 and how == "agg": - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=TypeError, reason="agg/apply signature mismatch - agg passes 2nd " @@ -256,12 +256,16 @@ def test_transform_groupby_kernel_series(request, string_series, op): # GH 35964 if op == "ngroup": - request.node.add_marker( + request.applymarker( pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame") ) args = [0.0] if op == "fillna" else [] ones = np.ones(string_series.shape[0]) - expected = string_series.groupby(ones).transform(op, *args) + + warn = FutureWarning if op == "fillna" else None + msg = "SeriesGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + expected = string_series.groupby(ones).transform(op, *args) result = string_series.transform(op, 0, *args) tm.assert_series_equal(result, expected) @@ -269,7 +273,7 @@ @pytest.mark.parametrize("op", frame_transform_kernels) def test_transform_groupby_kernel_frame(request, axis, float_frame, op): if op == "ngroup": - request.node.add_marker( + request.applymarker( pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame") ) @@ -285,7 +289,12 @@ with tm.assert_produces_warning(FutureWarning, match=msg): gb = float_frame.groupby(ones, axis=axis) - expected = gb.transform(op, *args) + + warn = FutureWarning if op == "fillna" else None + op_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=op_msg): + expected = gb.transform(op, *args) + result = float_frame.transform(op, axis, *args) tm.assert_frame_equal(result, expected) @@ -300,7 +309,10 @@ ones = np.ones(float_frame.shape[1]) with tm.assert_produces_warning(FutureWarning, match=msg): gb2 = float_frame.groupby(ones, axis=axis) - expected2 = gb2.transform(op, *args) + warn = FutureWarning if op == 
"fillna" else None + op_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=op_msg): + expected2 = gb2.transform(op, *args) result2 = float_frame.transform(op, axis, *args) tm.assert_frame_equal(result2, expected2) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arithmetic/conftest.py pandas-2.2.2+dfsg/pandas/tests/arithmetic/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/arithmetic/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arithmetic/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,28 +2,9 @@ import pytest import pandas as pd -from pandas import ( - Index, - RangeIndex, -) -import pandas._testing as tm -from pandas.core.computation import expressions as expr - +from pandas import Index -@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) -def switch_numexpr_min_elements(request): - _MIN_ELEMENTS = expr._MIN_ELEMENTS - expr._MIN_ELEMENTS = request.param - yield request.param - expr._MIN_ELEMENTS = _MIN_ELEMENTS - -# ------------------------------------------------------------------ - - -# doctest with +SKIP for one fixture fails during setup with -# 'DoctestItem' object has no attribute 'callspec' -# due to switch_numexpr_min_elements fixture @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) def one(request): """ @@ -36,13 +17,13 @@ Examples -------- - dti = pd.date_range('2016-01-01', periods=2, freq='H') + dti = pd.date_range('2016-01-01', periods=2, freq='h') dti DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00'], - dtype='datetime64[ns]', freq='H') + dtype='datetime64[ns]', freq='h') dti + one DatetimeIndex(['2016-01-01 01:00:00', '2016-01-01 02:00:00'], - dtype='datetime64[ns]', freq='H') + dtype='datetime64[ns]', freq='h') """ return request.param @@ -58,9 +39,6 @@ zeros.extend([0, 0.0, -0.0]) -# doctest with +SKIP for zero fixture fails during setup with -# 'DoctestItem' object has no attribute 'callspec' -# due to switch_numexpr_min_elements fixture @pytest.fixture(params=zeros) def zero(request): """ @@ -82,27 +60,6 @@ # ------------------------------------------------------------------ -# Vector Fixtures - - -@pytest.fixture( - params=[ - # TODO: add more dtypes here - Index(np.arange(5, dtype="float64")), - Index(np.arange(5, dtype="int64")), - Index(np.arange(5, dtype="uint64")), - RangeIndex(5), - ], - ids=lambda x: type(x).__name__, -) -def numeric_idx(request): - """ - Several types of numeric-dtypes Index objects - """ - return request.param - - -# ------------------------------------------------------------------ # Scalar Fixtures @@ -168,22 +125,6 @@ @pytest.fixture( params=[ - pd.Timedelta(minutes=30).to_pytimedelta(), - np.timedelta64(30, "s"), - pd.Timedelta(seconds=30), - ] - + _common_mismatch -) -def not_hourly(request): - """ - Several timedelta-like and DateOffset instances that are _not_ - compatible with Hourly frequencies. - """ - return request.param - - -@pytest.fixture( - params=[ np.timedelta64(4, "h"), pd.Timedelta(hours=23).to_pytimedelta(), pd.Timedelta("23:00:00"), @@ -196,33 +137,3 @@ compatible with Daily frequencies. """ return request.param - - -@pytest.fixture( - params=[ - np.timedelta64(365, "D"), - pd.Timedelta(days=365).to_pytimedelta(), - pd.Timedelta(days=365), - ] - + _common_mismatch -) -def mismatched_freq(request): - """ - Several timedelta-like and DateOffset instances that are _not_ - compatible with Monthly or Annual frequencies. 
- """ - return request.param - - -# ------------------------------------------------------------------ - - -@pytest.fixture( - params=[Index, pd.Series, tm.to_array, np.array, list], ids=lambda x: x.__name__ -) -def box_1d_array(request): - """ - Fixture to test behavior for Index, Series, tm.to_array, numpy Array and list - classes - """ - return request.param diff -Nru pandas-2.1.4+dfsg/pandas/tests/arithmetic/test_datetime64.py pandas-2.2.2+dfsg/pandas/tests/arithmetic/test_datetime64.py --- pandas-2.1.4+dfsg/pandas/tests/arithmetic/test_datetime64.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arithmetic/test_datetime64.py 2024-04-10 17:42:52.000000000 +0000 @@ -394,7 +394,7 @@ class TestDatetimeIndexComparisons: # TODO: moved from tests.indexes.test_base; parametrize and de-duplicate def test_comparators(self, comparison_op): - index = tm.makeDateIndex(100) + index = date_range("2020-01-01", periods=10) element = index[len(index) // 2] element = Timestamp(element).to_datetime64() @@ -414,7 +414,7 @@ dti = date_range("2016-01-01", periods=2, tz=tz) if tz is not None: if isinstance(other, np.datetime64): - pytest.skip("no tzaware version available") + pytest.skip(f"{type(other).__name__} is not tz aware") other = localize_pydatetime(other, dti.tzinfo) result = dti == other @@ -905,7 +905,7 @@ dti = date_range("1994-04-01", periods=9, tz=tz, freq="QS") other = np.timedelta64("NaT") - expected = DatetimeIndex(["NaT"] * 9, tz=tz) + expected = DatetimeIndex(["NaT"] * 9, tz=tz).as_unit("ns") obj = tm.box_expected(dti, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -987,13 +987,13 @@ tm.assert_equal(ser - ts, expected) tm.assert_equal(ts - ser, -expected) - def test_dt64arr_sub_NaT(self, box_with_array): + def test_dt64arr_sub_NaT(self, box_with_array, unit): # GH#18808 - dti = DatetimeIndex([NaT, Timestamp("19900315")]) + dti = DatetimeIndex([NaT, Timestamp("19900315")]).as_unit(unit) ser = tm.box_expected(dti, box_with_array) result = ser - NaT - expected = Series([NaT, NaT], dtype="timedelta64[ns]") + expected = Series([NaT, NaT], dtype=f"timedelta64[{unit}]") expected = tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) @@ -1001,7 +1001,7 @@ ser_tz = tm.box_expected(dti_tz, box_with_array) result = ser_tz - NaT - expected = Series([NaT, NaT], dtype="timedelta64[ns]") + expected = Series([NaT, NaT], dtype=f"timedelta64[{unit}]") expected = tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) @@ -1076,24 +1076,22 @@ # Note: freq here includes both Tick and non-Tick offsets; this is # relevant because historically integer-addition was allowed if we had # a freq. 
- @pytest.mark.parametrize("freq", ["H", "D", "W", "M", "MS", "Q", "B", None]) + @pytest.mark.parametrize("freq", ["h", "D", "W", "2ME", "MS", "QE", "B", None]) @pytest.mark.parametrize("dtype", [None, "uint8"]) def test_dt64arr_addsub_intlike( - self, request, dtype, box_with_array, freq, tz_naive_fixture + self, request, dtype, index_or_series_or_array, freq, tz_naive_fixture ): # GH#19959, GH#19123, GH#19012 + # GH#55860 use index_or_series_or_array instead of box_with_array + # bc DataFrame alignment makes it inapplicable tz = tz_naive_fixture - if box_with_array is pd.DataFrame: - request.node.add_marker( - pytest.mark.xfail(raises=ValueError, reason="Axis alignment fails") - ) if freq is None: dti = DatetimeIndex(["NaT", "2017-04-05 06:07:08"], tz=tz) else: dti = date_range("2016-01-01", periods=2, freq=freq, tz=tz) - obj = box_with_array(dti) + obj = index_or_series_or_array(dti) other = np.array([4, -1]) if dtype is not None: other = other.astype(dtype) @@ -1144,7 +1142,7 @@ ) assert_invalid_addsub_type(dtarr, other, msg) - @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"]) @pytest.mark.parametrize("dti_freq", [None, "D"]) def test_dt64arr_add_sub_parr( self, dti_freq, pi_freq, box_with_array, box_with_array2 @@ -1223,13 +1221,16 @@ # Tick DateOffsets # TODO: parametrize over timezone? - def test_dt64arr_series_add_tick_DateOffset(self, box_with_array): + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + def test_dt64arr_series_add_tick_DateOffset(self, box_with_array, unit): # GH#4532 # operate with pd.offsets - ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) + ser = Series( + [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")] + ).dt.as_unit(unit) expected = Series( [Timestamp("20130101 9:01:05"), Timestamp("20130101 9:02:05")] - ) + ).dt.as_unit(unit) ser = tm.box_expected(ser, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1282,12 +1283,12 @@ offset = dates + pd.offsets.Hour(5) assert dates[0] + pd.offsets.Hour(5) == offset[0] - dates = date_range("2010-11-01 00:00", periods=3, tz=tz, freq="H") + dates = date_range("2010-11-01 00:00", periods=3, tz=tz, freq="h") expected = DatetimeIndex( ["2010-11-01 05:00", "2010-11-01 06:00", "2010-11-01 07:00"], - freq="H", + freq="h", tz=tz, - ) + ).as_unit("ns") dates = tm.box_expected(dates, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1310,7 +1311,8 @@ # ------------------------------------------------------------- # RelativeDelta DateOffsets - def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array): + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array, unit): # GH#10699 vec = DatetimeIndex( [ @@ -1323,7 +1325,7 @@ Timestamp("2000-05-15"), Timestamp("2001-06-15"), ] - ) + ).as_unit(unit) vec = tm.box_expected(vec, box_with_array) vec_items = vec.iloc[0] if box_with_array is pd.DataFrame else vec @@ -1337,24 +1339,29 @@ ("seconds", 2), ("microseconds", 5), ] - for i, (unit, value) in enumerate(relative_kwargs): - off = DateOffset(**{unit: value}) + for i, (offset_unit, value) in enumerate(relative_kwargs): + off = DateOffset(**{offset_unit: value}) + + exp_unit = unit + if offset_unit == "microseconds" and unit != "ns": + exp_unit = "us" - expected = DatetimeIndex([x + off for x in vec_items]) + # TODO(GH#55564): as_unit will be unnecessary + expected = DatetimeIndex([x + off for x in 
vec_items]).as_unit(exp_unit) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, vec + off) - expected = DatetimeIndex([x - off for x in vec_items]) + expected = DatetimeIndex([x - off for x in vec_items]).as_unit(exp_unit) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, vec - off) off = DateOffset(**dict(relative_kwargs[: i + 1])) - expected = DatetimeIndex([x + off for x in vec_items]) + expected = DatetimeIndex([x + off for x in vec_items]).as_unit(exp_unit) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, vec + off) - expected = DatetimeIndex([x - off for x in vec_items]) + expected = DatetimeIndex([x - off for x in vec_items]).as_unit(exp_unit) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, vec - off) msg = "(bad|unsupported) operand type for unary" @@ -1415,8 +1422,10 @@ ) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize("n", [0, 5]) + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_dt64arr_add_sub_DateOffsets( - self, box_with_array, n, normalize, cls_and_kwargs + self, box_with_array, n, normalize, cls_and_kwargs, unit, tz ): # GH#10699 # assert vectorized operation matches pointwise operations @@ -1438,102 +1447,45 @@ # passing n = 0 is invalid for these offset classes return - vec = DatetimeIndex( - [ - Timestamp("2000-01-05 00:15:00"), - Timestamp("2000-01-31 00:23:00"), - Timestamp("2000-01-01"), - Timestamp("2000-03-31"), - Timestamp("2000-02-29"), - Timestamp("2000-12-31"), - Timestamp("2000-05-15"), - Timestamp("2001-06-15"), - ] + vec = ( + DatetimeIndex( + [ + Timestamp("2000-01-05 00:15:00"), + Timestamp("2000-01-31 00:23:00"), + Timestamp("2000-01-01"), + Timestamp("2000-03-31"), + Timestamp("2000-02-29"), + Timestamp("2000-12-31"), + Timestamp("2000-05-15"), + Timestamp("2001-06-15"), + ] + ) + .as_unit(unit) + .tz_localize(tz) ) vec = tm.box_expected(vec, box_with_array) vec_items = vec.iloc[0] if box_with_array is pd.DataFrame else vec offset_cls = getattr(pd.offsets, cls_name) - - # pandas.errors.PerformanceWarning: Non-vectorized DateOffset being - # applied to Series or DatetimeIndex - # we aren't testing that here, so ignore. 
- offset = offset_cls(n, normalize=normalize, **kwargs) - expected = DatetimeIndex([x + offset for x in vec_items]) + # TODO(GH#55564): as_unit will be unnecessary + expected = DatetimeIndex([x + offset for x in vec_items]).as_unit(unit) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, vec + offset) + tm.assert_equal(expected, offset + vec) - expected = DatetimeIndex([x - offset for x in vec_items]) + expected = DatetimeIndex([x - offset for x in vec_items]).as_unit(unit) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, vec - offset) - expected = DatetimeIndex([offset + x for x in vec_items]) + expected = DatetimeIndex([offset + x for x in vec_items]).as_unit(unit) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, offset + vec) msg = "(bad|unsupported) operand type for unary" with pytest.raises(TypeError, match=msg): offset - vec - def test_dt64arr_add_sub_DateOffset(self, box_with_array): - # GH#10699 - s = date_range("2000-01-01", "2000-01-31", name="a") - s = tm.box_expected(s, box_with_array) - result = s + DateOffset(years=1) - result2 = DateOffset(years=1) + s - exp = date_range("2001-01-01", "2001-01-31", name="a")._with_freq(None) - exp = tm.box_expected(exp, box_with_array) - tm.assert_equal(result, exp) - tm.assert_equal(result2, exp) - - result = s - DateOffset(years=1) - exp = date_range("1999-01-01", "1999-01-31", name="a")._with_freq(None) - exp = tm.box_expected(exp, box_with_array) - tm.assert_equal(result, exp) - - s = DatetimeIndex( - [ - Timestamp("2000-01-15 00:15:00", tz="US/Central"), - Timestamp("2000-02-15", tz="US/Central"), - ], - name="a", - ) - s = tm.box_expected(s, box_with_array) - result = s + pd.offsets.Day() - result2 = pd.offsets.Day() + s - exp = DatetimeIndex( - [ - Timestamp("2000-01-16 00:15:00", tz="US/Central"), - Timestamp("2000-02-16", tz="US/Central"), - ], - name="a", - ) - exp = tm.box_expected(exp, box_with_array) - tm.assert_equal(result, exp) - tm.assert_equal(result2, exp) - - s = DatetimeIndex( - [ - Timestamp("2000-01-15 00:15:00", tz="US/Central"), - Timestamp("2000-02-15", tz="US/Central"), - ], - name="a", - ) - s = tm.box_expected(s, box_with_array) - result = s + pd.offsets.MonthEnd() - result2 = pd.offsets.MonthEnd() + s - exp = DatetimeIndex( - [ - Timestamp("2000-01-31 00:15:00", tz="US/Central"), - Timestamp("2000-02-29", tz="US/Central"), - ], - name="a", - ) - exp = tm.box_expected(exp, box_with_array) - tm.assert_equal(result, exp) - tm.assert_equal(result2, exp) - @pytest.mark.parametrize( "other", [ @@ -1594,7 +1546,7 @@ Timestamp("2016-04-01"), Timestamp("2017-04-01"), ], - "AS-APR", + "YS-APR", ), ( "__sub__", @@ -1616,7 +1568,7 @@ Timestamp("2015-10-01"), Timestamp("2016-10-01"), ], - "AS-OCT", + "YS-OCT", ), ], ) @@ -1625,28 +1577,60 @@ ): # GH 26258 tz = tz_aware_fixture - date = date_range(start="01 Jan 2014", end="01 Jan 2017", freq="AS", tz=tz) + date = date_range(start="01 Jan 2014", end="01 Jan 2017", freq="YS", tz=tz) date = tm.box_expected(date, box_with_array, False) mth = getattr(date, op) result = mth(offset) - expected = DatetimeIndex(exp, tz=tz) + expected = DatetimeIndex(exp, tz=tz).as_unit("ns") expected = tm.box_expected(expected, box_with_array, False) tm.assert_equal(result, expected) + def test_dt64arr_series_add_DateOffset_with_milli(self): + # GH 57529 + dti = DatetimeIndex( + [ + "2000-01-01 00:00:00.012345678", + "2000-01-31 00:00:00.012345678", + "2000-02-29 00:00:00.012345678", + ], + dtype="datetime64[ns]", 
+ ) + result = dti + DateOffset(milliseconds=4) + expected = DatetimeIndex( + [ + "2000-01-01 00:00:00.016345678", + "2000-01-31 00:00:00.016345678", + "2000-02-29 00:00:00.016345678", + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + + result = dti + DateOffset(days=1, milliseconds=4) + expected = DatetimeIndex( + [ + "2000-01-02 00:00:00.016345678", + "2000-02-01 00:00:00.016345678", + "2000-03-01 00:00:00.016345678", + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + class TestDatetime64OverflowHandling: # TODO: box + de-duplicate def test_dt64_overflow_masking(self, box_with_array): # GH#25317 - left = Series([Timestamp("1969-12-31")]) + left = Series([Timestamp("1969-12-31")], dtype="M8[ns]") right = Series([NaT]) left = tm.box_expected(left, box_with_array) right = tm.box_expected(right, box_with_array) - expected = TimedeltaIndex([NaT]) + expected = TimedeltaIndex([NaT], dtype="m8[ns]") expected = tm.box_expected(expected, box_with_array) result = left - right @@ -1656,7 +1640,7 @@ # GH#12534, fixed by GH#19024 dt = Timestamp("1700-01-31") td = Timedelta("20000 Days") - dti = date_range("1949-09-30", freq="100Y", periods=4) + dti = date_range("1949-09-30", freq="100YE", periods=4) ser = Series(dti) msg = "Overflow in int64 addition" with pytest.raises(OverflowError, match=msg): @@ -1685,8 +1669,8 @@ tm.assert_series_equal(res, -expected) def test_datetimeindex_sub_timestamp_overflow(self): - dtimax = pd.to_datetime(["2021-12-28 17:19", Timestamp.max]) - dtimin = pd.to_datetime(["2021-12-28 17:19", Timestamp.min]) + dtimax = pd.to_datetime(["2021-12-28 17:19", Timestamp.max]).as_unit("ns") + dtimin = pd.to_datetime(["2021-12-28 17:19", Timestamp.min]).as_unit("ns") tsneg = Timestamp("1950-01-01").as_unit("ns") ts_neg_variants = [ @@ -1724,11 +1708,11 @@ def test_datetimeindex_sub_datetimeindex_overflow(self): # GH#22492, GH#22508 - dtimax = pd.to_datetime(["2021-12-28 17:19", Timestamp.max]) - dtimin = pd.to_datetime(["2021-12-28 17:19", Timestamp.min]) + dtimax = pd.to_datetime(["2021-12-28 17:19", Timestamp.max]).as_unit("ns") + dtimin = pd.to_datetime(["2021-12-28 17:19", Timestamp.min]).as_unit("ns") - ts_neg = pd.to_datetime(["1950-01-01", "1950-01-01"]) - ts_pos = pd.to_datetime(["1980-01-01", "1980-01-01"]) + ts_neg = pd.to_datetime(["1950-01-01", "1950-01-01"]).as_unit("ns") + ts_pos = pd.to_datetime(["1980-01-01", "1980-01-01"]).as_unit("ns") # General tests expected = Timestamp.max._value - ts_pos[1]._value @@ -1804,12 +1788,16 @@ td1 + dt1 dt1 + td1 - def test_dt64ser_sub_datetime_dtype(self): + def test_dt64ser_sub_datetime_dtype(self, unit): ts = Timestamp(datetime(1993, 1, 7, 13, 30, 00)) dt = datetime(1993, 6, 22, 13, 30) - ser = Series([ts]) - result = pd.to_timedelta(np.abs(ser - dt)) - assert result.dtype == "timedelta64[ns]" + ser = Series([ts], dtype=f"M8[{unit}]") + result = ser - dt + + # the expected unit is the max of `unit` and the unit imputed to `dt`, + # which is "us" + exp_unit = tm.get_finest_unit(unit, "us") + assert result.dtype == f"timedelta64[{exp_unit}]" # ------------------------------------------------------------- # TODO: This next block of tests came from tests.series.test_operators, @@ -1861,15 +1849,15 @@ # Smoke test op(arg2) - def test_sub_single_tz(self): + def test_sub_single_tz(self, unit): # GH#12290 - s1 = Series([Timestamp("2016-02-10", tz="America/Sao_Paulo")]) - s2 = Series([Timestamp("2016-02-08", tz="America/Sao_Paulo")]) + s1 = Series([Timestamp("2016-02-10", 
tz="America/Sao_Paulo")]).dt.as_unit(unit) + s2 = Series([Timestamp("2016-02-08", tz="America/Sao_Paulo")]).dt.as_unit(unit) result = s1 - s2 - expected = Series([Timedelta("2days")]) + expected = Series([Timedelta("2days")]).dt.as_unit(unit) tm.assert_series_equal(result, expected) result = s2 - s1 - expected = Series([Timedelta("-2days")]) + expected = Series([Timedelta("-2days")]).dt.as_unit(unit) tm.assert_series_equal(result, expected) def test_dt64tz_series_sub_dtitz(self): @@ -1884,13 +1872,17 @@ res = ser - dti tm.assert_series_equal(res, expected) - def test_sub_datetime_compat(self): + def test_sub_datetime_compat(self, unit): # see GH#14088 - s = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), NaT]) + ser = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), NaT]).dt.as_unit(unit) dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc) - exp = Series([Timedelta("1 days"), NaT]) - tm.assert_series_equal(s - dt, exp) - tm.assert_series_equal(s - Timestamp(dt), exp) + # The datetime object has "us" so we upcast lower units + exp_unit = tm.get_finest_unit(unit, "us") + exp = Series([Timedelta("1 days"), NaT]).dt.as_unit(exp_unit) + result = ser - dt + tm.assert_series_equal(result, exp) + result2 = ser - Timestamp(dt) + tm.assert_series_equal(result2, exp) def test_dt64_series_add_mixed_tick_DateOffset(self): # GH#4532 @@ -1911,11 +1903,11 @@ ) tm.assert_series_equal(result, expected) - def test_datetime64_ops_nat(self): + def test_datetime64_ops_nat(self, unit): # GH#11349 - datetime_series = Series([NaT, Timestamp("19900315")]) - nat_series_dtype_timestamp = Series([NaT, NaT], dtype="datetime64[ns]") - single_nat_dtype_datetime = Series([NaT], dtype="datetime64[ns]") + datetime_series = Series([NaT, Timestamp("19900315")]).dt.as_unit(unit) + nat_series_dtype_timestamp = Series([NaT, NaT], dtype=f"datetime64[{unit}]") + single_nat_dtype_datetime = Series([NaT], dtype=f"datetime64[{unit}]") # subtraction tm.assert_series_equal(-NaT + datetime_series, nat_series_dtype_timestamp) @@ -1953,7 +1945,7 @@ dt2 = dt1.copy() dt2.iloc[2] = np.nan - td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="H")) + td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="h")) td2 = td1.copy() td2.iloc[1] = np.nan assert td2._values.freq is None @@ -2086,16 +2078,16 @@ with pytest.raises(TypeError, match=msg): tdi.values - dti - def test_dti_isub_tdi(self, tz_naive_fixture): + def test_dti_isub_tdi(self, tz_naive_fixture, unit): # GH#17558 tz = tz_naive_fixture - dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) - tdi = pd.timedelta_range("0 days", periods=10) - expected = date_range("2017-01-01", periods=10, tz=tz, freq="-1D") + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10).as_unit(unit) + tdi = pd.timedelta_range("0 days", periods=10, unit=unit) + expected = date_range("2017-01-01", periods=10, tz=tz, freq="-1D", unit=unit) expected = expected._with_freq(None) # isub with TimedeltaIndex - result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10).as_unit(unit) result -= tdi tm.assert_index_equal(result, expected) @@ -2113,7 +2105,7 @@ tdi -= dti # isub with timedelta64 array - result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10).as_unit(unit) result -= tdi.values tm.assert_index_equal(result, expected) @@ -2147,13 +2139,13 @@ expected = dti - tdi tm.assert_index_equal(result, expected) - def test_sub_dti_dti(self): + def 
test_sub_dti_dti(self, unit): # previously performed setop (deprecated in 0.16.0), now changed to # return subtraction -> TimeDeltaIndex (GH ...) - dti = date_range("20130101", periods=3) - dti_tz = date_range("20130101", periods=3).tz_localize("US/Eastern") - expected = TimedeltaIndex([0, 0, 0]) + dti = date_range("20130101", periods=3, unit=unit) + dti_tz = date_range("20130101", periods=3, unit=unit).tz_localize("US/Eastern") + expected = TimedeltaIndex([0, 0, 0]).as_unit(unit) result = dti - dti tm.assert_index_equal(result, expected) @@ -2172,16 +2164,16 @@ tm.assert_index_equal(dti, expected) # different length raises ValueError - dti1 = date_range("20130101", periods=3) - dti2 = date_range("20130101", periods=4) + dti1 = date_range("20130101", periods=3, unit=unit) + dti2 = date_range("20130101", periods=4, unit=unit) msg = "cannot add indices of unequal length" with pytest.raises(ValueError, match=msg): dti1 - dti2 # NaN propagation - dti1 = DatetimeIndex(["2012-01-01", np.nan, "2012-01-03"]) - dti2 = DatetimeIndex(["2012-01-02", "2012-01-03", np.nan]) - expected = TimedeltaIndex(["1 days", np.nan, np.nan]) + dti1 = DatetimeIndex(["2012-01-01", np.nan, "2012-01-03"]).as_unit(unit) + dti2 = DatetimeIndex(["2012-01-02", "2012-01-03", np.nan]).as_unit(unit) + expected = TimedeltaIndex(["1 days", np.nan, np.nan]).as_unit(unit) result = dti2 - dti1 tm.assert_index_equal(result, expected) @@ -2284,17 +2276,17 @@ nat_series_dtype_timestamp, ) - def test_ufunc_coercions(self): - idx = date_range("2011-01-01", periods=3, freq="2D", name="x") + def test_ufunc_coercions(self, unit): + idx = date_range("2011-01-01", periods=3, freq="2D", name="x", unit=unit) delta = np.timedelta64(1, "D") - exp = date_range("2011-01-02", periods=3, freq="2D", name="x") + exp = date_range("2011-01-02", periods=3, freq="2D", name="x", unit=unit) for result in [idx + delta, np.add(idx, delta)]: assert isinstance(result, DatetimeIndex) tm.assert_index_equal(result, exp) assert result.freq == "2D" - exp = date_range("2010-12-31", periods=3, freq="2D", name="x") + exp = date_range("2010-12-31", periods=3, freq="2D", name="x", unit=unit) for result in [idx - delta, np.subtract(idx, delta)]: assert isinstance(result, DatetimeIndex) @@ -2307,13 +2299,17 @@ delta = np.array( [np.timedelta64(1, "D"), np.timedelta64(2, "D"), np.timedelta64(3, "D")] ) - exp = DatetimeIndex(["2011-01-02", "2011-01-05", "2011-01-08"], name="x") + exp = DatetimeIndex( + ["2011-01-02", "2011-01-05", "2011-01-08"], name="x" + ).as_unit(unit) for result in [idx + delta, np.add(idx, delta)]: tm.assert_index_equal(result, exp) assert result.freq == exp.freq - exp = DatetimeIndex(["2010-12-31", "2011-01-01", "2011-01-02"], name="x") + exp = DatetimeIndex( + ["2010-12-31", "2011-01-01", "2011-01-02"], name="x" + ).as_unit(unit) for result in [idx - delta, np.subtract(idx, delta)]: assert isinstance(result, DatetimeIndex) tm.assert_index_equal(result, exp) @@ -2324,7 +2320,7 @@ tz = tz_naive_fixture index = DatetimeIndex( ["2016-06-28 05:30", "2016-06-28 05:31"], tz=tz, name=names[0] - ) + ).as_unit("ns") ser = Series([Timedelta(seconds=5)] * 2, index=index, name=names[1]) expected = Series(index + Timedelta(seconds=5), index=index, name=names[2]) @@ -2392,7 +2388,8 @@ @pytest.mark.parametrize("years", [-1, 0, 1]) @pytest.mark.parametrize("months", [-2, 0, 2]) -def test_shift_months(years, months): +@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) +def test_shift_months(years, months, unit): dti = DatetimeIndex( [ Timestamp("2000-01-05 
00:15:00"), @@ -2401,11 +2398,13 @@ Timestamp("2000-02-29"), Timestamp("2000-12-31"), ] - ) - actual = DatetimeIndex(shift_months(dti.asi8, years * 12 + months)) + ).as_unit(unit) + shifted = shift_months(dti.asi8, years * 12 + months, reso=dti._data._creso) + shifted_dt64 = shifted.view(f"M8[{dti.unit}]") + actual = DatetimeIndex(shifted_dt64) raw = [x + pd.offsets.DateOffset(years=years, months=months) for x in dti] - expected = DatetimeIndex(raw) + expected = DatetimeIndex(raw).as_unit(dti.unit) tm.assert_index_equal(actual, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arithmetic/test_numeric.py pandas-2.2.2+dfsg/pandas/tests/arithmetic/test_numeric.py --- pandas-2.1.4+dfsg/pandas/tests/arithmetic/test_numeric.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arithmetic/test_numeric.py 2024-04-10 17:42:52.000000000 +0000 @@ -19,6 +19,7 @@ Timedelta, TimedeltaIndex, array, + date_range, ) import pandas._testing as tm from pandas.core import ops @@ -29,6 +30,13 @@ ) +@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) +def switch_numexpr_min_elements(request, monkeypatch): + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", request.param) + yield request.param + + @pytest.fixture(params=[Index, Series, tm.to_array]) def box_pandas_1d_array(request): """ @@ -37,6 +45,34 @@ return request.param +@pytest.fixture( + params=[ + # TODO: add more dtypes here + Index(np.arange(5, dtype="float64")), + Index(np.arange(5, dtype="int64")), + Index(np.arange(5, dtype="uint64")), + RangeIndex(5), + ], + ids=lambda x: type(x).__name__, +) +def numeric_idx(request): + """ + Several types of numeric-dtypes Index objects + """ + return request.param + + +@pytest.fixture( + params=[Index, Series, tm.to_array, np.array, list], ids=lambda x: x.__name__ +) +def box_1d_array(request): + """ + Fixture to test behavior for Index, Series, tm.to_array, numpy Array and list + classes + """ + return request.param + + def adjust_negative_zero(zero, expected): """ Helper to adjust the expected result if we are dividing by -0.0 @@ -263,6 +299,12 @@ expected = expected.astype(dtype) elif type(three_days) is timedelta: expected = expected.astype("m8[us]") + elif isinstance( + three_days, + (pd.offsets.Day, pd.offsets.Hour, pd.offsets.Minute, pd.offsets.Second), + ): + # closest reso is Second + expected = expected.astype("m8[s]") index = tm.box_expected(index, box) expected = tm.box_expected(expected, box) @@ -377,7 +419,7 @@ def test_div_negative_zero(self, zero, numeric_idx, op): # Check that -1 / -0.0 returns np.inf, not -np.inf if numeric_idx.dtype == np.uint64: - pytest.skip(f"Not relevant for {numeric_idx.dtype}") + pytest.skip(f"Div by negative 0 not relevant for {numeric_idx.dtype}") idx = numeric_idx - 3 expected = Index([-np.inf, -np.inf, -np.inf, np.nan, np.inf], dtype=np.float64) @@ -698,7 +740,7 @@ idx = numeric_idx msg = "cannot perform __rmul__ with this index type" with pytest.raises(TypeError, match=msg): - idx * pd.date_range("20130101", periods=5) + idx * date_range("20130101", periods=5) def test_mul_size_mismatch_raises(self, numeric_idx): idx = numeric_idx @@ -785,7 +827,11 @@ # TODO: This came from series.test.test_operators, needs cleanup def test_operators_frame(self): # rpow does not work with DataFrame - ts = tm.makeTimeSeries() + ts = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) ts.name = "ts" df = pd.DataFrame({"A": ts}) @@ -881,7 +927,7 @@ # TODO: This came 
from series.test.test_operators, needs cleanup def test_series_frame_radd_bug(self, fixed_now_ts): # GH#353 - vals = Series(tm.makeStringIndex()) + vals = Series([str(i) for i in range(5)]) result = "foo_" + vals expected = vals.map(lambda x: "foo_" + x) tm.assert_series_equal(result, expected) @@ -891,8 +937,11 @@ expected = pd.DataFrame({"vals": vals.map(lambda x: "foo_" + x)}) tm.assert_frame_equal(result, expected) - ts = tm.makeTimeSeries() - ts.name = "ts" + ts = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) # really raise this time fix_now = fixed_now_ts.to_pydatetime() @@ -920,8 +969,8 @@ # GH#4629 # arithmetic datetime64 ops with an index ser = Series( - pd.date_range("20130101", periods=5), - index=pd.date_range("20130101", periods=5), + date_range("20130101", periods=5), + index=date_range("20130101", periods=5), ) expected = ser - ser.index.to_series() result = ser - ser.index @@ -934,7 +983,7 @@ df = pd.DataFrame( np.random.default_rng(2).standard_normal((5, 2)), - index=pd.date_range("20130101", periods=5), + index=date_range("20130101", periods=5), ) df["date"] = pd.Timestamp("20130102") df["expected"] = df["date"] - df.index.to_series() @@ -996,7 +1045,11 @@ ) def test_series_operators_arithmetic(self, all_arithmetic_functions, func): op = all_arithmetic_functions - series = tm.makeTimeSeries().rename("ts") + series = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) other = func(series) compare_op(series, other, op) @@ -1005,7 +1058,11 @@ ) def test_series_operators_compare(self, comparison_op, func): op = comparison_op - series = tm.makeTimeSeries().rename("ts") + series = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) other = func(series) compare_op(series, other, op) @@ -1015,7 +1072,11 @@ ids=["multiply", "slice", "constant"], ) def test_divmod(self, func): - series = tm.makeTimeSeries().rename("ts") + series = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) other = func(series) results = divmod(series, other) if isinstance(other, abc.Iterable) and len(series) != len(other): @@ -1046,7 +1107,11 @@ # -1/0 == -np.inf # 1/-0.0 == -np.inf # -1/-0.0 == np.inf - tser = tm.makeTimeSeries().rename("ts") + tser = Series( + np.arange(1, 11, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) other = tser * 0 result = divmod(tser, other) @@ -1386,6 +1451,18 @@ tm.assert_index_equal(index - index, 0 * index) assert not (index - index).empty + def test_pow_nan_with_zero(self, box_with_array): + left = Index([np.nan, np.nan, np.nan]) + right = Index([0, 0, 0]) + expected = Index([1.0, 1.0, 1.0]) + + left = tm.box_expected(left, box_with_array) + right = tm.box_expected(right, box_with_array) + expected = tm.box_expected(expected, box_with_array) + + result = left**right + tm.assert_equal(result, expected) + def test_fill_value_inf_masking(): # GH #27464 make sure we mask 0/1 with Inf and not NaN diff -Nru pandas-2.1.4+dfsg/pandas/tests/arithmetic/test_object.py pandas-2.2.2+dfsg/pandas/tests/arithmetic/test_object.py --- pandas-2.1.4+dfsg/pandas/tests/arithmetic/test_object.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arithmetic/test_object.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import 
pandas.util._test_decorators as td import pandas as pd @@ -174,12 +176,16 @@ # invalid ops box = box_with_array - obj_ser = tm.makeObjectSeries() - obj_ser.name = "objects" + obj_ser = Series(list("abc"), dtype=object, name="objects") obj_ser = tm.box_expected(obj_ser, box) msg = "|".join( - ["can only concatenate str", "unsupported operand type", "must be str"] + [ + "can only concatenate str", + "unsupported operand type", + "must be str", + "has no kernel", + ] ) with pytest.raises(Exception, match=msg): op(obj_ser, 1) @@ -297,8 +303,9 @@ index += "_x" assert "a_x" in index + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="add doesn't work") def test_add(self): - index = tm.makeStringIndex(100) + index = pd.Index([str(i) for i in range(10)]) expected = pd.Index(index.values * 2) tm.assert_index_equal(index + index, expected) tm.assert_index_equal(index + index.tolist(), expected) @@ -311,17 +318,24 @@ expected = pd.Index(["1a", "1b", "1c"]) tm.assert_index_equal("1" + index, expected) - def test_sub_fail(self): - index = tm.makeStringIndex(100) + def test_sub_fail(self, using_infer_string): + index = pd.Index([str(i) for i in range(10)]) - msg = "unsupported operand type|Cannot broadcast" - with pytest.raises(TypeError, match=msg): + if using_infer_string: + import pyarrow as pa + + err = pa.lib.ArrowNotImplementedError + msg = "has no kernel" + else: + err = TypeError + msg = "unsupported operand type|Cannot broadcast" + with pytest.raises(err, match=msg): index - "a" - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): index - index - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): index - index.tolist() - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): index.tolist() - index def test_sub_object(self): diff -Nru pandas-2.1.4+dfsg/pandas/tests/arithmetic/test_period.py pandas-2.2.2+dfsg/pandas/tests/arithmetic/test_period.py --- pandas-2.1.4+dfsg/pandas/tests/arithmetic/test_period.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arithmetic/test_period.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,6 +31,45 @@ get_upcast_box, ) +_common_mismatch = [ + pd.offsets.YearBegin(2), + pd.offsets.MonthBegin(1), + pd.offsets.Minute(), +] + + +@pytest.fixture( + params=[ + Timedelta(minutes=30).to_pytimedelta(), + np.timedelta64(30, "s"), + Timedelta(seconds=30), + ] + + _common_mismatch +) +def not_hourly(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Hourly frequencies. + """ + return request.param + + +@pytest.fixture( + params=[ + np.timedelta64(365, "D"), + Timedelta(days=365).to_pytimedelta(), + Timedelta(days=365), + ] + + _common_mismatch +) +def mismatched_freq(request): + """ + Several timedelta-like and DateOffset instances that are _not_ + compatible with Monthly or Annual frequencies. + """ + return request.param + + # ------------------------------------------------------------------ # Comparisons @@ -286,14 +325,14 @@ msg = rf"Invalid comparison between dtype=period\[{freq}\] and Period" with pytest.raises(TypeError, match=msg): - base <= Period("2011", freq="A") + base <= Period("2011", freq="Y") with pytest.raises(TypeError, match=msg): - Period("2011", freq="A") >= base + Period("2011", freq="Y") >= base # TODO: Could parametrize over boxes for idx? 
- idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="A") - rev_msg = r"Invalid comparison between dtype=period\[A-DEC\] and PeriodArray" + idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="Y") + rev_msg = r"Invalid comparison between dtype=period\[Y-DEC\] and PeriodArray" idx_msg = rev_msg if box_with_array in [tm.to_array, pd.array] else msg with pytest.raises(TypeError, match=idx_msg): base <= idx @@ -405,18 +444,18 @@ # GH#13200 base = Series( [ - Period("2011", freq="A"), + Period("2011", freq="Y"), Period("2011-02", freq="M"), - Period("2013", freq="A"), + Period("2013", freq="Y"), Period("2011-04", freq="M"), ] ) ser = Series( [ - Period("2012", freq="A"), + Period("2012", freq="Y"), Period("2011-01", freq="M"), - Period("2013", freq="A"), + Period("2013", freq="Y"), Period("2011-05", freq="M"), ] ) @@ -637,11 +676,11 @@ def test_parr_sub_pi_mismatched_freq(self, box_with_array, box_with_array2): rng = period_range("1/1/2000", freq="D", periods=5) - other = period_range("1/6/2000", freq="H", periods=5) + other = period_range("1/6/2000", freq="h", periods=5) rng = tm.box_expected(rng, box_with_array) other = tm.box_expected(other, box_with_array2) - msg = r"Input has different freq=[HD] from PeriodArray\(freq=[DH]\)" + msg = r"Input has different freq=[hD] from PeriodArray\(freq=[Dh]\)" with pytest.raises(IncompatibleFrequency, match=msg): rng - other @@ -696,9 +735,9 @@ Timestamp("2016-01-01").to_pydatetime(), Timestamp("2016-01-01").to_datetime64(), # datetime-like arrays - pd.date_range("2016-01-01", periods=3, freq="H"), + pd.date_range("2016-01-01", periods=3, freq="h"), pd.date_range("2016-01-01", periods=3, tz="Europe/Brussels"), - pd.date_range("2016-01-01", periods=3, freq="S")._data, + pd.date_range("2016-01-01", periods=3, freq="s")._data, pd.date_range("2016-01-01", periods=3, tz="Asia/Tokyo")._data, # Miscellaneous invalid types 3.14, @@ -779,8 +818,8 @@ with pytest.raises(TypeError, match=msg): tdi - rng - @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) - @pytest.mark.parametrize("tdi_freq", [None, "H"]) + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"]) + @pytest.mark.parametrize("tdi_freq", [None, "h"]) def test_parr_sub_td64array(self, box_with_array, tdi_freq, pi_freq): box = box_with_array xbox = box if box not in [pd.array, tm.to_array] else pd.Index @@ -792,15 +831,15 @@ # TODO: parametrize over box for pi? td64obj = tm.box_expected(tdi, box) - if pi_freq == "H": + if pi_freq == "h": result = pi - td64obj - expected = (pi.to_timestamp("S") - tdi).to_period(pi_freq) + expected = (pi.to_timestamp("s") - tdi).to_period(pi_freq) expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) # Subtract from scalar result = pi[0] - td64obj - expected = (pi[0].to_timestamp("S") - tdi).to_period(pi_freq) + expected = (pi[0].to_timestamp("s") - tdi).to_period(pi_freq) expected = tm.box_expected(expected, box) tm.assert_equal(result, expected) @@ -891,9 +930,9 @@ def test_pi_add_iadd_int(self, one): # Variants of `one` for #19012 - rng = period_range("2000-01-01 09:00", freq="H", periods=10) + rng = period_range("2000-01-01 09:00", freq="h", periods=10) result = rng + one - expected = period_range("2000-01-01 10:00", freq="H", periods=10) + expected = period_range("2000-01-01 10:00", freq="h", periods=10) tm.assert_index_equal(result, expected) rng += one tm.assert_index_equal(rng, expected) @@ -903,9 +942,9 @@ PeriodIndex.__sub__ and __isub__ with several representations of the integer 1, e.g. int, np.int64, np.uint8, ... 
""" - rng = period_range("2000-01-01 09:00", freq="H", periods=10) + rng = period_range("2000-01-01 09:00", freq="h", periods=10) result = rng - one - expected = period_range("2000-01-01 08:00", freq="H", periods=10) + expected = period_range("2000-01-01 08:00", freq="h", periods=10) tm.assert_index_equal(result, expected) rng -= one tm.assert_index_equal(rng, expected) @@ -934,9 +973,9 @@ def test_pi_sub_isub_offset(self): # offset # DateOffset - rng = period_range("2014", "2024", freq="A") + rng = period_range("2014", "2024", freq="Y") result = rng - pd.offsets.YearEnd(5) - expected = period_range("2009", "2019", freq="A") + expected = period_range("2009", "2019", freq="Y") tm.assert_index_equal(result, expected) rng -= pd.offsets.YearEnd(5) tm.assert_index_equal(rng, expected) @@ -977,10 +1016,10 @@ pi = tm.box_expected(pi, box_with_array) expected = tm.box_expected(expected, box_with_array) - result = pi + to_offset("3M") + result = pi + to_offset("3ME") tm.assert_equal(result, expected) - result = to_offset("3M") + pi + result = to_offset("3ME") + pi tm.assert_equal(result, expected) # --------------------------------------------------------------- @@ -1048,7 +1087,7 @@ with pytest.raises(TypeError, match=msg): other - rng - @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5T", "5h", "5d"]) + @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5min", "5h", "5d"]) def test_parr_add_timedeltalike_tick_gt1(self, three_days, freqstr, box_with_array): # GH#23031 adding a time-delta-like offset to a PeriodArray that has # tick-like frequency with n != 1 @@ -1131,8 +1170,8 @@ def test_pi_add_iadd_timedeltalike_hourly(self, two_hours): other = two_hours - rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") - expected = period_range("2014-01-01 12:00", "2014-01-05 12:00", freq="H") + rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h") + expected = period_range("2014-01-01 12:00", "2014-01-05 12:00", freq="h") result = rng + other tm.assert_index_equal(result, expected) @@ -1144,12 +1183,12 @@ self, not_hourly, box_with_array ): other = not_hourly - rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") + rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h") rng = tm.box_expected(rng, box_with_array) msg = "|".join( [ # non-timedelta-like DateOffset - "Input has different freq(=.+)? from Period.*?\\(freq=H\\)", + "Input has different freq(=.+)? 
from Period.*?\\(freq=h\\)", # timedelta/td64/Timedelta but not a multiple of 24H "Cannot add/subtract timedelta-like from PeriodArray that is " "not an integer multiple of the PeriodArray's freq.", @@ -1164,8 +1203,8 @@ def test_pi_sub_isub_timedeltalike_hourly(self, two_hours): other = two_hours - rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") - expected = period_range("2014-01-01 08:00", "2014-01-05 08:00", freq="H") + rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h") + expected = period_range("2014-01-01 08:00", "2014-01-05 08:00", freq="h") result = rng - other tm.assert_index_equal(result, expected) @@ -1176,17 +1215,17 @@ def test_add_iadd_timedeltalike_annual(self): # offset # DateOffset - rng = period_range("2014", "2024", freq="A") + rng = period_range("2014", "2024", freq="Y") result = rng + pd.offsets.YearEnd(5) - expected = period_range("2019", "2029", freq="A") + expected = period_range("2019", "2029", freq="Y") tm.assert_index_equal(result, expected) rng += pd.offsets.YearEnd(5) tm.assert_index_equal(rng, expected) def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, mismatched_freq): other = mismatched_freq - rng = period_range("2014", "2024", freq="A") - msg = "Input has different freq(=.+)? from Period.*?\\(freq=A-DEC\\)" + rng = period_range("2014", "2024", freq="Y") + msg = "Input has different freq(=.+)? from Period.*?\\(freq=Y-DEC\\)" with pytest.raises(IncompatibleFrequency, match=msg): rng + other with pytest.raises(IncompatibleFrequency, match=msg): @@ -1243,7 +1282,7 @@ "other", [ np.array(["NaT"] * 9, dtype="m8[ns]"), - TimedeltaArray._from_sequence(["NaT"] * 9), + TimedeltaArray._from_sequence(["NaT"] * 9, dtype="m8[ns]"), ], ) def test_parr_add_sub_tdt64_nat_array(self, box_with_array, other): @@ -1310,6 +1349,42 @@ expected = PeriodIndex(["2000-12-30"] * 3, freq="D")._data.astype(object) tm.assert_equal(result, expected) + def test_period_add_timestamp_raises(self, box_with_array): + # GH#17983 + ts = Timestamp("2017") + per = Period("2017", freq="M") + + arr = pd.Index([per], dtype="Period[M]") + arr = tm.box_expected(arr, box_with_array) + + msg = "cannot add PeriodArray and Timestamp" + with pytest.raises(TypeError, match=msg): + arr + ts + with pytest.raises(TypeError, match=msg): + ts + arr + msg = "cannot add PeriodArray and DatetimeArray" + with pytest.raises(TypeError, match=msg): + arr + Series([ts]) + with pytest.raises(TypeError, match=msg): + Series([ts]) + arr + with pytest.raises(TypeError, match=msg): + arr + pd.Index([ts]) + with pytest.raises(TypeError, match=msg): + pd.Index([ts]) + arr + + if box_with_array is pd.DataFrame: + msg = "cannot add PeriodArray and DatetimeArray" + else: + msg = r"unsupported operand type\(s\) for \+: 'Period' and 'DatetimeArray" + with pytest.raises(TypeError, match=msg): + arr + pd.DataFrame([ts]) + if box_with_array is pd.DataFrame: + msg = "cannot add PeriodArray and DatetimeArray" + else: + msg = r"unsupported operand type\(s\) for \+: 'DatetimeArray' and 'Period'" + with pytest.raises(TypeError, match=msg): + pd.DataFrame([ts]) + arr + class TestPeriodSeriesArithmetic: def test_parr_add_timedeltalike_scalar(self, three_days, box_with_array): diff -Nru pandas-2.1.4+dfsg/pandas/tests/arithmetic/test_timedelta64.py pandas-2.2.2+dfsg/pandas/tests/arithmetic/test_timedelta64.py --- pandas-2.1.4+dfsg/pandas/tests/arithmetic/test_timedelta64.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arithmetic/test_timedelta64.py 2024-04-10 
17:42:52.000000000 +0000 @@ -70,7 +70,7 @@ box = box_with_array xbox = box_with_array if box_with_array not in [Index, pd.array] else np.ndarray - tdi = timedelta_range("2H", periods=4) + tdi = timedelta_range("2h", periods=4) other = np.array(tdi.to_numpy()[0]) tdi = tm.box_expected(tdi, box) @@ -276,32 +276,32 @@ def test_ufunc_coercions(self): # normal ops are also tested in tseries/test_timedeltas.py - idx = TimedeltaIndex(["2H", "4H", "6H", "8H", "10H"], freq="2H", name="x") + idx = TimedeltaIndex(["2h", "4h", "6h", "8h", "10h"], freq="2h", name="x") for result in [idx * 2, np.multiply(idx, 2)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(["4H", "8H", "12H", "16H", "20H"], freq="4H", name="x") + exp = TimedeltaIndex(["4h", "8h", "12h", "16h", "20h"], freq="4h", name="x") tm.assert_index_equal(result, exp) - assert result.freq == "4H" + assert result.freq == "4h" for result in [idx / 2, np.divide(idx, 2)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(["1H", "2H", "3H", "4H", "5H"], freq="H", name="x") + exp = TimedeltaIndex(["1h", "2h", "3h", "4h", "5h"], freq="h", name="x") tm.assert_index_equal(result, exp) - assert result.freq == "H" + assert result.freq == "h" for result in [-idx, np.negative(idx)]: assert isinstance(result, TimedeltaIndex) exp = TimedeltaIndex( - ["-2H", "-4H", "-6H", "-8H", "-10H"], freq="-2H", name="x" + ["-2h", "-4h", "-6h", "-8h", "-10h"], freq="-2h", name="x" ) tm.assert_index_equal(result, exp) - assert result.freq == "-2H" + assert result.freq == "-2h" - idx = TimedeltaIndex(["-2H", "-1H", "0H", "1H", "2H"], freq="H", name="x") + idx = TimedeltaIndex(["-2h", "-1h", "0h", "1h", "2h"], freq="h", name="x") for result in [abs(idx), np.absolute(idx)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(["2H", "1H", "0H", "1H", "2H"], freq=None, name="x") + exp = TimedeltaIndex(["2h", "1h", "0h", "1h", "2h"], freq=None, name="x") tm.assert_index_equal(result, exp) assert result.freq is None @@ -336,20 +336,22 @@ result = tdi - td expected = TimedeltaIndex(["0 days", NaT, "1 days"], name="foo") - tm.assert_index_equal(result, expected, check_names=False) + tm.assert_index_equal(result, expected) result = td - tdi expected = TimedeltaIndex(["0 days", NaT, "-1 days"], name="foo") - tm.assert_index_equal(result, expected, check_names=False) + tm.assert_index_equal(result, expected) result = dti - td expected = DatetimeIndex( - ["20121231", "20130101", "20130102"], freq="D", name="bar" + ["20121231", "20130101", "20130102"], dtype="M8[ns]", freq="D", name="bar" ) - tm.assert_index_equal(result, expected, check_names=False) + tm.assert_index_equal(result, expected) result = dt - tdi - expected = DatetimeIndex(["20121231", NaT, "20121230"], name="foo") + expected = DatetimeIndex( + ["20121231", NaT, "20121230"], dtype="M8[ns]", name="foo" + ) tm.assert_index_equal(result, expected) def test_subtraction_ops_with_tz(self, box_with_array): @@ -432,7 +434,9 @@ _check(result, expected) result = dti_tz - td - expected = DatetimeIndex(["20121231", "20130101", "20130102"], tz="US/Eastern") + expected = DatetimeIndex( + ["20121231", "20130101", "20130102"], tz="US/Eastern" + ).as_unit("ns") expected = tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) @@ -450,7 +454,7 @@ tm.assert_index_equal(result, expected) result = dti - tdi # name will be reset - expected = DatetimeIndex(["20121231", NaT, "20130101"]) + expected = DatetimeIndex(["20121231", NaT, "20130101"], dtype="M8[ns]") 
tm.assert_index_equal(result, expected) def test_addition_ops(self): @@ -461,11 +465,15 @@ dt = Timestamp("20130101") result = tdi + dt - expected = DatetimeIndex(["20130102", NaT, "20130103"], name="foo") + expected = DatetimeIndex( + ["20130102", NaT, "20130103"], dtype="M8[ns]", name="foo" + ) tm.assert_index_equal(result, expected) result = dt + tdi - expected = DatetimeIndex(["20130102", NaT, "20130103"], name="foo") + expected = DatetimeIndex( + ["20130102", NaT, "20130103"], dtype="M8[ns]", name="foo" + ) tm.assert_index_equal(result, expected) result = td + tdi @@ -489,14 +497,15 @@ tdi + Index([1, 2, 3], dtype=np.int64) # this is a union! + # FIXME: don't leave commented-out # pytest.raises(TypeError, lambda : Index([1,2,3]) + tdi) result = tdi + dti # name will be reset - expected = DatetimeIndex(["20130102", NaT, "20130105"]) + expected = DatetimeIndex(["20130102", NaT, "20130105"], dtype="M8[ns]") tm.assert_index_equal(result, expected) result = dti + tdi # name will be reset - expected = DatetimeIndex(["20130102", NaT, "20130105"]) + expected = DatetimeIndex(["20130102", NaT, "20130105"], dtype="M8[ns]") tm.assert_index_equal(result, expected) result = dt + td @@ -738,20 +747,10 @@ s1 = pd.to_timedelta(Series(["00:00:01"])) s2 = pd.to_timedelta(Series(["00:00:02"])) - msg = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" - with pytest.raises(TypeError, match=msg): - # Passing datetime64-dtype data to TimedeltaIndex is no longer - # supported GH#29794 - pd.to_timedelta(Series([NaT])) # TODO: belongs elsewhere? - sn = pd.to_timedelta(Series([NaT], dtype="m8[ns]")) df1 = DataFrame(["00:00:01"]).apply(pd.to_timedelta) df2 = DataFrame(["00:00:02"]).apply(pd.to_timedelta) - with pytest.raises(TypeError, match=msg): - # Passing datetime64-dtype data to TimedeltaIndex is no longer - # supported GH#29794 - DataFrame([NaT]).apply(pd.to_timedelta) # TODO: belongs elsewhere? 
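# Illustrative sketch, not part of the upstream patch: the expected indexes in the hunks above now
# pass dtype="M8[ns]" explicitly because DatetimeIndex can carry second/milli/micro/nanosecond
# resolutions in pandas 2.x, so the tests pin the unit they mean. The operation being exercised,
# assuming nanosecond-resolution inputs:
import pandas as pd

tdi = pd.TimedeltaIndex(["1 days", pd.NaT, "2 days"])
tdi + pd.Timestamp("2013-01-01")
# -> DatetimeIndex(['2013-01-02', 'NaT', '2013-01-03'], dtype='datetime64[ns]')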
dfn = DataFrame([NaT._value]).apply(pd.to_timedelta) @@ -869,7 +868,7 @@ # timestamp on lhs result = resultb + df["A"] values = [Timestamp("20111230"), Timestamp("20120101"), Timestamp("20120103")] - expected = Series(values, name="A") + expected = Series(values, dtype="M8[ns]", name="A") tm.assert_series_equal(result, expected) # datetimes on rhs @@ -1031,7 +1030,7 @@ other = np.datetime64("NaT") tdi = timedelta_range("1 day", periods=3) - expected = DatetimeIndex(["NaT", "NaT", "NaT"]) + expected = DatetimeIndex(["NaT", "NaT", "NaT"], dtype="M8[ns]") tdser = tm.box_expected(tdi, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1073,8 +1072,8 @@ # ------------------------------------------------------------------ # Invalid __add__/__sub__ operations - @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) - @pytest.mark.parametrize("tdi_freq", [None, "H"]) + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"]) + @pytest.mark.parametrize("tdi_freq", [None, "h"]) def test_td64arr_sub_periodlike( self, box_with_array, box_with_array2, tdi_freq, pi_freq ): @@ -1133,7 +1132,7 @@ def test_td64arr_add_sub_int(self, box_with_array, one): # Variants of `one` for #19012, deprecated GH#22535 - rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) + rng = timedelta_range("1 days 09:00:00", freq="h", periods=10) tdarr = tm.box_expected(rng, box_with_array) msg = "Addition/subtraction of integers" @@ -1152,7 +1151,7 @@ box = box_with_array xbox = np.ndarray if box is pd.array else box - rng = timedelta_range("1 days 09:00:00", freq="H", periods=3) + rng = timedelta_range("1 days 09:00:00", freq="h", periods=3) tdarr = tm.box_expected(rng, box) other = tm.box_expected([4, 3, 2], xbox) @@ -2011,7 +2010,7 @@ tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(any_real_numpy_dtype) - expected = Series(["2.95D", "1D 23H 12m", "NaT"], dtype="timedelta64[ns]") + expected = Series(["2.95D", "1D 23h 12m", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) xbox = get_upcast_box(tdser, vector) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/boolean/test_arithmetic.py pandas-2.2.2+dfsg/pandas/tests/arrays/boolean/test_arithmetic.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/boolean/test_arithmetic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/boolean/test_arithmetic.py 2024-04-10 17:42:52.000000000 +0000 @@ -90,9 +90,16 @@ # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators): +def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): # invalid ops + if using_infer_string: + import pyarrow as pa + + err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + err = TypeError + op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) @@ -110,9 +117,10 @@ [ r"unsupported operand type\(s\) for", "Concatenation operation is not implemented for NumPy arrays", + "has no kernel", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes @@ -123,7 +131,9 @@ r"unsupported operand type\(s\) for", "can only concatenate str", "not all arguments converted during string formatting", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): ops(pd.Series("foo", index=s.index)) diff -Nru 
pandas-2.1.4+dfsg/pandas/tests/arrays/boolean/test_construction.py pandas-2.2.2+dfsg/pandas/tests/arrays/boolean/test_construction.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/boolean/test_construction.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/boolean/test_construction.py 2024-04-10 17:42:52.000000000 +0000 @@ -223,7 +223,7 @@ # also with no missing values -> object dtype arr = pd.array([True, False, True], dtype="boolean") result = np.array(arr) - expected = np.array([True, False, True], dtype="object") + expected = np.array([True, False, True], dtype="bool") tm.assert_numpy_array_equal(result, expected) # force bool dtype @@ -242,7 +242,8 @@ def test_to_boolean_array_from_strings(): result = BooleanArray._from_sequence_of_strings( - np.array(["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object) + np.array(["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object), + dtype="boolean", ) expected = BooleanArray( np.array([True, False, True, True, False, False, False]), @@ -254,7 +255,7 @@ def test_to_boolean_array_from_strings_invalid_string(): with pytest.raises(ValueError, match="cannot be cast"): - BooleanArray._from_sequence_of_strings(["donkey"]) + BooleanArray._from_sequence_of_strings(["donkey"], dtype="boolean") @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) @@ -263,7 +264,7 @@ # default (with or without missing values) -> object dtype arr = con([True, False, True], dtype="boolean") result = arr.to_numpy() - expected = np.array([True, False, True], dtype="object") + expected = np.array([True, False, True], dtype="bool") tm.assert_numpy_array_equal(result, expected) arr = con([True, False, None], dtype="boolean") @@ -307,8 +308,6 @@ # converting to int or float without specifying na_value raises with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): arr.to_numpy(dtype="int64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - arr.to_numpy(dtype="float64") def test_to_numpy_copy(): diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/conftest.py pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/conftest.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -import pytest - -from pandas import Categorical - - -@pytest.fixture(params=[True, False]) -def allow_fill(request): - """Boolean 'allow_fill' parameter for Categorical.take""" - return request.param - - -@pytest.fixture -def factor(): - """Fixture returning a Categorical object""" - return Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_api.py pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_api.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_api.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_api.py 2024-04-10 17:42:52.000000000 +0000 @@ -385,7 +385,8 @@ class TestCategoricalAPIWithFactor: - def test_describe(self, factor): + def test_describe(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) # string type desc = factor.describe() assert factor.ordered diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_astype.py pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_astype.py --- 
pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_astype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_astype.py 2024-04-10 17:42:52.000000000 +0000 @@ -32,7 +32,7 @@ [ array(["2019", "2020"], dtype="datetime64[ns, UTC]"), array([0, 0], dtype="timedelta64[ns]"), - array([Period("2019"), Period("2020")], dtype="period[A-DEC]"), + array([Period("2019"), Period("2020")], dtype="period[Y-DEC]"), array([Interval(0, 1), Interval(1, 2)], dtype="interval"), array([1, np.nan], dtype="Int64"), ], @@ -89,7 +89,7 @@ expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = r"Cannot cast object dtype to float64" + msg = r"Cannot cast object|string dtype to float64" with pytest.raises(ValueError, match=msg): cat.astype(float) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -447,6 +449,7 @@ with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings") def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) @@ -742,7 +745,9 @@ def test_categorical_extension_array_nullable(self, nulls_fixture): # GH: - arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2) + arr = pd.arrays.StringArray._from_sequence( + [nulls_fixture] * 2, dtype=pd.StringDtype() + ) result = Categorical(arr) assert arr.dtype == result.categories.dtype expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype)) @@ -750,12 +755,12 @@ def test_from_sequence_copy(self): cat = Categorical(np.arange(5).repeat(2)) - result = Categorical._from_sequence(cat, dtype=None, copy=False) + result = Categorical._from_sequence(cat, dtype=cat.dtype, copy=False) # more generally, we'd be OK with a view assert result._codes is cat._codes - result = Categorical._from_sequence(cat, dtype=None, copy=True) + result = Categorical._from_sequence(cat, dtype=cat.dtype, copy=True) assert not tm.shares_memory(result, cat) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_indexing.py pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_indexing.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -21,7 +21,8 @@ class TestCategoricalIndexingWithFactor: - def test_getitem(self, factor): + def test_getitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) assert factor[0] == "a" assert factor[-1] == "c" @@ -31,7 +32,8 @@ subf = factor[np.asarray(factor) == "c"] tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8)) - def test_setitem(self, factor): + def test_setitem(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) # int/positional c = factor.copy() c[0] = "b" @@ -141,7 +143,8 @@ def 
test_periodindex(self): idx1 = PeriodIndex( - ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M" + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], + freq="M", ) cat1 = Categorical(idx1) @@ -152,7 +155,8 @@ tm.assert_index_equal(cat1.categories, exp_idx) idx2 = PeriodIndex( - ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M" + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], + freq="M", ) cat2 = Categorical(idx2, ordered=True) str(cat2) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_operators.py pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_operators.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_operators.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_operators.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,7 +17,8 @@ factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) tm.assert_categorical_equal(factor, factor) - def test_comparisons(self, factor): + def test_comparisons(self): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) result = factor[factor == "a"] expected = factor[np.asarray(factor) == "a"] tm.assert_categorical_equal(result, expected) @@ -92,7 +93,7 @@ cat > cat_unordered # comparison (in both directions) with Series will raise - s = Series(["b", "b", "b"]) + s = Series(["b", "b", "b"], dtype=object) msg = ( "Cannot compare a Categorical for op __gt__ with type " r"<class 'numpy\.ndarray'>" @@ -108,7 +109,7 @@ # comparison with numpy.array will raise in both direction, but only on # newer numpy versions - a = np.array(["b", "b", "b"]) + a = np.array(["b", "b", "b"], dtype=object) with pytest.raises(TypeError, match=msg): cat > a with pytest.raises(TypeError, match=msg): @@ -248,7 +249,7 @@ cat_base = Series( Categorical(base, categories=cat.cat.categories, ordered=True) ) - s = Series(base) + s = Series(base, dtype=object if base == list("bbb") else None) a = np.array(base) # comparisons need to take categories ordering into account diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_replace.py pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_replace.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_replace.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_replace.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,6 +31,9 @@ ([1, 2, "3"], "5", ["5", "5", 3], True), ], ) +@pytest.mark.filterwarnings( + "ignore:.*with CategoricalDtype is deprecated:FutureWarning" +) def test_replace_categorical_series(to_replace, value, expected, flip_categories): # GH 31720 @@ -60,7 +63,13 @@ # GH#26988 cat = Categorical(["a", "b"]) expected = Categorical(result) - result = pd.Series(cat, copy=False).replace(to_replace, value)._values + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if expected_error_msg is not None else None + with tm.assert_produces_warning(warn, match=msg): + result = pd.Series(cat, copy=False).replace(to_replace, value)._values tm.assert_categorical_equal(result, expected) if to_replace == "b": # the "c" test is supposed to be unchanged @@ -69,14 +78,20 @@ tm.assert_categorical_equal(cat,
expected) def test_replace_categorical_ea_dtype(): # GH49404 cat = Categorical(pd.array(["a", "b"], dtype="string")) - result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values expected = Categorical(pd.array(["c", pd.NA], dtype="string")) tm.assert_categorical_equal(result, expected) @@ -85,7 +100,12 @@ # GH51016 dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) ser = pd.Series([0, 1, 2], dtype=dtype) - result = ser.replace(0, 2) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.replace(0, 2) expected_dtype = pd.CategoricalDtype([1, 2], ordered=True) expected = pd.Series([2, 1, 2], dtype=expected_dtype) tm.assert_series_equal(expected, result, check_category_order=True) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_repr.py pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_repr.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_repr.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_repr.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,9 +1,13 @@ import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype from pandas import ( Categorical, CategoricalDtype, CategoricalIndex, + Index, Series, date_range, option_context, @@ -13,11 +17,18 @@ class TestCategoricalReprWithFactor: - def test_print(self, factor): - expected = [ - "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", - "Categories (3, object): ['a' < 'b' < 'c']", - ] + def test_print(self, using_infer_string): + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) + if using_infer_string: + expected = [ + "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", + "Categories (3, string): [a < b < c]", + ] + else: + expected = [ + "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", + "Categories (3, object): ['a' < 'b' < 'c']", + ] expected = "\n".join(expected) actual = repr(factor) assert actual == expected @@ -26,7 +37,7 @@ class TestCategoricalRepr: def test_big_print(self): codes = np.array([0, 1, 2, 0, 1, 2] * 100) - dtype = CategoricalDtype(categories=["a", "b", "c"]) + dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object)) factor = Categorical.from_codes(codes, dtype=dtype) expected = [ "['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']", @@ -40,13 +51,13 @@ assert actual == expected def test_empty_print(self): - factor = Categorical([], ["a", "b", "c"]) + factor = Categorical([], Index(["a", "b", "c"], dtype=object)) expected = "[], Categories (3, object): ['a', 'b', 'c']" actual = repr(factor) assert actual == expected assert expected == actual - factor = Categorical([], ["a", "b", "c"], ordered=True) + factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True) expected = "[], Categories (3, object): ['a' < 'b' < 'c']" actual = repr(factor) assert expected == actual @@ -66,6 +77,10 @@ with option_context("display.width", None): assert exp == repr(a) + @pytest.mark.skipif( + using_pyarrow_string_dtype(), + reason="Change once infer_string is set to True by default", + ) def test_unicode_print(self): c = Categorical(["aaaaa", "bb", "cccc"] * 20) expected = """\ @@ -148,7 +163,7 @@ assert repr(c) == exp def 
test_categorical_repr_datetime(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx) exp = ( @@ -176,7 +191,7 @@ assert repr(c) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") c = Categorical(idx) exp = ( "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, " @@ -210,7 +225,7 @@ assert repr(c) == exp def test_categorical_repr_datetime_ordered(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < @@ -225,7 +240,7 @@ assert repr(c) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < @@ -254,17 +269,17 @@ assert repr(s) == s_exp def test_categorical_repr_period(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, +Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]""" # noqa: E501 assert repr(c) == exp c = Categorical(idx.append(idx), categories=idx) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, +Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]""" # noqa: E501 assert repr(c) == exp @@ -283,17 +298,17 @@ assert repr(c) == exp def test_categorical_repr_period_ordered(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < +Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < 2011-01-01 13:00]""" # noqa: E501 assert repr(c) == exp c = Categorical(idx.append(idx), categories=idx, ordered=True) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < +Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 
2011-01-01 12:00 < 2011-01-01 13:00]""" # noqa: E501 assert repr(c) == exp @@ -396,7 +411,7 @@ assert repr(i) == exp def test_categorical_index_repr_datetime(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', @@ -405,7 +420,7 @@ assert repr(i) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', @@ -415,7 +430,7 @@ assert repr(i) == exp def test_categorical_index_repr_datetime_ordered(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) + idx = date_range("2011-01-01 09:00", freq="h", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', @@ -424,7 +439,7 @@ assert repr(i) == exp - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', @@ -445,22 +460,22 @@ def test_categorical_index_repr_period(self): # test all length - idx = period_range("2011-01-01 09:00", freq="H", periods=1) + idx = period_range("2011-01-01 09:00", freq="h", periods=1) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa: E501 assert repr(i) == exp - idx = period_range("2011-01-01 09:00", freq="H", periods=2) + idx = period_range("2011-01-01 09:00", freq="h", periods=2) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa: E501 assert repr(i) == exp - idx = period_range("2011-01-01 09:00", freq="H", periods=3) + idx = period_range("2011-01-01 09:00", freq="h", periods=3) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa: E501 assert repr(i) == exp - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], @@ -483,7 +498,7 @@ assert repr(i) == exp def test_categorical_index_repr_period_ordered(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) + idx = period_range("2011-01-01 09:00", freq="h", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_subclass.py 
pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_subclass.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_subclass.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_subclass.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,21 +2,25 @@ import pandas._testing as tm +class SubclassedCategorical(Categorical): + pass + + class TestCategoricalSubclassing: def test_constructor(self): - sc = tm.SubclassedCategorical(["a", "b", "c"]) - assert isinstance(sc, tm.SubclassedCategorical) + sc = SubclassedCategorical(["a", "b", "c"]) + assert isinstance(sc, SubclassedCategorical) tm.assert_categorical_equal(sc, Categorical(["a", "b", "c"])) def test_from_codes(self): - sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ["a", "b", "c"]) - assert isinstance(sc, tm.SubclassedCategorical) + sc = SubclassedCategorical.from_codes([1, 0, 2], ["a", "b", "c"]) + assert isinstance(sc, SubclassedCategorical) exp = Categorical.from_codes([1, 0, 2], ["a", "b", "c"]) tm.assert_categorical_equal(sc, exp) def test_map(self): - sc = tm.SubclassedCategorical(["a", "b", "c"]) + sc = SubclassedCategorical(["a", "b", "c"]) res = sc.map(lambda x: x.upper(), na_action=None) - assert isinstance(res, tm.SubclassedCategorical) + assert isinstance(res, SubclassedCategorical) exp = Categorical(["A", "B", "C"]) tm.assert_categorical_equal(res, exp) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_take.py pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_take.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/categorical/test_take.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/categorical/test_take.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,12 @@ import pandas._testing as tm +@pytest.fixture(params=[True, False]) +def allow_fill(request): + """Boolean 'allow_fill' parameter for Categorical.take""" + return request.param + + class TestTake: # https://github.com/pandas-dev/pandas/issues/20664 diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/datetimes/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/arrays/datetimes/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/datetimes/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/datetimes/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,25 +8,27 @@ import pandas as pd import pandas._testing as tm from pandas.core.arrays import DatetimeArray -from pandas.core.arrays.datetimes import _sequence_to_dt64ns class TestDatetimeArrayConstructor: def test_from_sequence_invalid_type(self): mi = pd.MultiIndex.from_product([np.arange(5), np.arange(5)]) with pytest.raises(TypeError, match="Cannot create a DatetimeArray"): - DatetimeArray._from_sequence(mi) + DatetimeArray._from_sequence(mi, dtype="M8[ns]") def test_only_1dim_accepted(self): arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]") - with pytest.raises(ValueError, match="Only 1-dimensional"): - # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 - DatetimeArray(arr.reshape(2, 2, 1)) - - with pytest.raises(ValueError, match="Only 1-dimensional"): - # 0-dim - DatetimeArray(arr[[0]].squeeze()) + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 + DatetimeArray(arr.reshape(2, 2, 1)) + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + 
with pytest.raises(ValueError, match="Only 1-dimensional"): + # 0-dim + DatetimeArray(arr[[0]].squeeze()) def test_freq_validation(self): # GH#24623 check that invalid instances cannot be created with the @@ -34,17 +36,18 @@ arr = np.arange(5, dtype=np.int64) * 3600 * 10**9 msg = ( - "Inferred frequency H from passed values does not " + "Inferred frequency h from passed values does not " "conform to passed frequency W-SUN" ) - with pytest.raises(ValueError, match=msg): - DatetimeArray(arr, freq="W") + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr, freq="W") @pytest.mark.parametrize( "meth", [ DatetimeArray._from_sequence, - _sequence_to_dt64ns, pd.to_datetime, pd.DatetimeIndex, ], @@ -68,44 +71,50 @@ def test_from_pandas_array(self): arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9 - result = DatetimeArray._from_sequence(arr)._with_freq("infer") + result = DatetimeArray._from_sequence(arr, dtype="M8[ns]")._with_freq("infer") - expected = pd.date_range("1970-01-01", periods=5, freq="H")._data + expected = pd.date_range("1970-01-01", periods=5, freq="h")._data tm.assert_datetime_array_equal(result, expected) def test_mismatched_timezone_raises(self): - arr = DatetimeArray( - np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"), - dtype=DatetimeTZDtype(tz="US/Central"), - ) + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + arr = DatetimeArray( + np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"), + dtype=DatetimeTZDtype(tz="US/Central"), + ) dtype = DatetimeTZDtype(tz="US/Eastern") msg = r"dtype=datetime64\[ns.*\] does not match data dtype datetime64\[ns.*\]" - with pytest.raises(TypeError, match=msg): - DatetimeArray(arr, dtype=dtype) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(TypeError, match=msg): + DatetimeArray(arr, dtype=dtype) # also with mismatched tzawareness - with pytest.raises(TypeError, match=msg): - DatetimeArray(arr, dtype=np.dtype("M8[ns]")) - with pytest.raises(TypeError, match=msg): - DatetimeArray(arr.tz_localize(None), dtype=arr.dtype) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(TypeError, match=msg): + DatetimeArray(arr, dtype=np.dtype("M8[ns]")) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(TypeError, match=msg): + DatetimeArray(arr.tz_localize(None), dtype=arr.dtype) def test_non_array_raises(self): - with pytest.raises(ValueError, match="list"): - DatetimeArray([1, 2, 3]) + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="list"): + DatetimeArray([1, 2, 3]) def test_bool_dtype_raises(self): arr = np.array([1, 2, 3], dtype="bool") + depr_msg = "DatetimeArray.__init__ is deprecated" msg = "Unexpected value for 'dtype': 'bool'. 
Must be" - with pytest.raises(ValueError, match=msg): - DatetimeArray(arr) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr) msg = r"dtype bool cannot be converted to datetime64\[ns\]" with pytest.raises(TypeError, match=msg): - DatetimeArray._from_sequence(arr) - - with pytest.raises(TypeError, match=msg): - _sequence_to_dt64ns(arr) + DatetimeArray._from_sequence(arr, dtype="M8[ns]") with pytest.raises(TypeError, match=msg): pd.DatetimeIndex(arr) @@ -114,25 +123,52 @@ pd.to_datetime(arr) def test_incorrect_dtype_raises(self): - with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): - DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category") + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category") + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="m8[s]") + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="M8[D]") + + def test_mismatched_values_dtype_units(self): + arr = np.array([1, 2, 3], dtype="M8[s]") + dtype = np.dtype("M8[ns]") + msg = "Values resolution does not match dtype." + depr_msg = "DatetimeArray.__init__ is deprecated" + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr, dtype=dtype) + + dtype2 = DatetimeTZDtype(tz="UTC", unit="ns") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + DatetimeArray(arr, dtype=dtype2) def test_freq_infer_raises(self): - with pytest.raises(ValueError, match="Frequency inference"): - DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer") + depr_msg = "DatetimeArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Frequency inference"): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer") def test_copy(self): data = np.array([1, 2, 3], dtype="M8[ns]") - arr = DatetimeArray(data, copy=False) + arr = DatetimeArray._from_sequence(data, copy=False) assert arr._ndarray is data - arr = DatetimeArray(data, copy=True) + arr = DatetimeArray._from_sequence(data, copy=True) assert arr._ndarray is not data @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_numpy_datetime_unit(self, unit): data = np.array([1, 2, 3], dtype=f"M8[{unit}]") - arr = DatetimeArray(data) + arr = DatetimeArray._from_sequence(data) assert arr.unit == unit assert arr[0].unit == unit @@ -143,14 +179,12 @@ ["2000"], dtype=DatetimeTZDtype(tz="US/Central") ) with pytest.raises(TypeError, match="data is already tz-aware"): - DatetimeArray._from_sequence_not_strict( - arr, dtype=DatetimeTZDtype(tz="UTC") - ) + DatetimeArray._from_sequence(arr, dtype=DatetimeTZDtype(tz="UTC")) def test_tz_dtype_matches(self): dtype = DatetimeTZDtype(tz="US/Central") arr = DatetimeArray._from_sequence(["2000"], dtype=dtype) - result = DatetimeArray._from_sequence_not_strict(arr, dtype=dtype) + result = DatetimeArray._from_sequence(arr, dtype=dtype) tm.assert_equal(arr, 
result) @pytest.mark.parametrize("order", ["F", "C"]) @@ -160,15 +194,10 @@ if order == "F": arr = arr.T - res = _sequence_to_dt64ns(arr) - expected = _sequence_to_dt64ns(arr.ravel()) - - tm.assert_numpy_array_equal(res[0].ravel(), expected[0]) - assert res[1] == expected[1] - assert res[2] == expected[2] - - res = DatetimeArray._from_sequence(arr) - expected = DatetimeArray._from_sequence(arr.ravel()).reshape(arr.shape) + res = DatetimeArray._from_sequence(arr, dtype=dti.dtype) + expected = DatetimeArray._from_sequence(arr.ravel(), dtype=dti.dtype).reshape( + arr.shape + ) tm.assert_datetime_array_equal(res, expected) @@ -194,7 +223,7 @@ ("s", "ns", "US/Central", "Asia/Kolkata", COARSE_TO_FINE_SAFE), ], ) -def test_from_arrowtest_from_arrow_with_different_units_and_timezones_with_( +def test_from_arrow_with_different_units_and_timezones_with( pa_unit, pd_unit, pa_tz, pd_tz, data ): pa = pytest.importorskip("pyarrow") @@ -204,9 +233,8 @@ dtype = DatetimeTZDtype(unit=pd_unit, tz=pd_tz) result = dtype.__from_arrow__(arr) - expected = DatetimeArray( - np.array(data, dtype=f"datetime64[{pa_unit}]").astype(f"datetime64[{pd_unit}]"), - dtype=dtype, + expected = DatetimeArray._from_sequence(data, dtype=f"M8[{pa_unit}, UTC]").astype( + dtype, copy=False ) tm.assert_extension_array_equal(result, expected) @@ -232,7 +260,7 @@ dtype = DatetimeTZDtype(unit=unit, tz=tz) result = dtype.__from_arrow__(arr) - expected = DatetimeArray(np.array(data, dtype=f"datetime64[{unit}]")) + expected = DatetimeArray._from_sequence(np.array(data, dtype=f"datetime64[{unit}]")) expected = expected.tz_localize(tz=tz) tm.assert_extension_array_equal(result, expected) @@ -248,7 +276,7 @@ dtype = DatetimeTZDtype(unit="ns", tz="UTC") result = dtype.__from_arrow__(arr) - expected = DatetimeArray(np.array(data, dtype="datetime64[ns]")) + expected = DatetimeArray._from_sequence(np.array(data, dtype="datetime64[ns]")) expected = expected.tz_localize("UTC") tm.assert_extension_array_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/datetimes/test_cumulative.py pandas-2.2.2+dfsg/pandas/tests/arrays/datetimes/test_cumulative.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/datetimes/test_cumulative.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/datetimes/test_cumulative.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,40 +7,38 @@ class TestAccumulator: def test_accumulators_freq(self): # GH#50297 - arr = DatetimeArray._from_sequence_not_strict( + arr = DatetimeArray._from_sequence( [ "2000-01-01", "2000-01-02", "2000-01-03", ], - freq="D", - ) + dtype="M8[ns]", + )._with_freq("infer") result = arr._accumulate("cummin") - expected = DatetimeArray._from_sequence_not_strict( - ["2000-01-01"] * 3, freq=None - ) + expected = DatetimeArray._from_sequence(["2000-01-01"] * 3, dtype="M8[ns]") tm.assert_datetime_array_equal(result, expected) result = arr._accumulate("cummax") - expected = DatetimeArray._from_sequence_not_strict( + expected = DatetimeArray._from_sequence( [ "2000-01-01", "2000-01-02", "2000-01-03", ], - freq=None, + dtype="M8[ns]", ) tm.assert_datetime_array_equal(result, expected) @pytest.mark.parametrize("func", ["cumsum", "cumprod"]) def test_accumulators_disallowed(self, func): # GH#50297 - arr = DatetimeArray._from_sequence_not_strict( + arr = DatetimeArray._from_sequence( [ "2000-01-01", "2000-01-02", ], - freq="D", - ) + dtype="M8[ns]", + )._with_freq("infer") with pytest.raises(TypeError, match=f"Accumulation {func}"): arr._accumulate(func) diff -Nru 
pandas-2.1.4+dfsg/pandas/tests/arrays/datetimes/test_reductions.py pandas-2.2.2+dfsg/pandas/tests/arrays/datetimes/test_reductions.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/datetimes/test_reductions.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/datetimes/test_reductions.py 2024-04-10 17:42:52.000000000 +0000 @@ -124,7 +124,7 @@ # axis = 1 result = arr.median(axis=1) - expected = type(arr)._from_sequence([arr1d.median()]) + expected = type(arr)._from_sequence([arr1d.median()], dtype=arr.dtype) tm.assert_equal(result, expected) result = arr.median(axis=1, skipna=False) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/floating/test_arithmetic.py pandas-2.2.2+dfsg/pandas/tests/arrays/floating/test_arithmetic.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/floating/test_arithmetic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/floating/test_arithmetic.py 2024-04-10 17:42:52.000000000 +0000 @@ -122,11 +122,18 @@ # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators): +def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + errs = TypeError + # invalid scalars msg = "|".join( [ @@ -140,15 +147,17 @@ "ufunc '.*' not supported for the input types, and the inputs could not", "ufunc '.*' did not contain a loop with signature matching types", "Concatenation operation is not implemented for NumPy arrays", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops("foo") - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Series("foo", index=s.index)) msg = "|".join( @@ -167,9 +176,11 @@ ), r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)", "cannot subtract DatetimeArray from ndarray", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/floating/test_to_numpy.py pandas-2.2.2+dfsg/pandas/tests/arrays/floating/test_to_numpy.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/floating/test_to_numpy.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/floating/test_to_numpy.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,12 +13,12 @@ # default (with or without missing values) -> object dtype arr = con([0.1, 0.2, 0.3], dtype="Float64") result = arr.to_numpy() - expected = np.array([0.1, 0.2, 0.3], dtype="object") + expected = np.array([0.1, 0.2, 0.3], dtype="float64") tm.assert_numpy_array_equal(result, expected) arr = con([0.1, 0.2, None], dtype="Float64") result = arr.to_numpy() - expected = np.array([0.1, 0.2, pd.NA], dtype="object") + expected = np.array([0.1, 0.2, np.nan], dtype="float64") tm.assert_numpy_array_equal(result, expected) @@ -33,10 +33,10 @@ tm.assert_numpy_array_equal(result, expected) arr = con([0.1, 0.2, None], dtype="Float64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - result = arr.to_numpy(dtype="float64") + result 
= arr.to_numpy(dtype="float64") + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) - # need to explicitly specify na_value result = arr.to_numpy(dtype="float64", na_value=np.nan) expected = np.array([0.1, 0.2, np.nan], dtype="float64") tm.assert_numpy_array_equal(result, expected) @@ -100,7 +100,7 @@ tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"]) +@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) def test_to_numpy_na_raises(box, dtype): con = pd.Series if box else pd.array diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/integer/test_arithmetic.py pandas-2.2.2+dfsg/pandas/tests/arrays/integer/test_arithmetic.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/integer/test_arithmetic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/integer/test_arithmetic.py 2024-04-10 17:42:52.000000000 +0000 @@ -172,11 +172,18 @@ # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators): +def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + errs = TypeError + # invalid scalars msg = "|".join( [ @@ -188,20 +195,27 @@ "ufunc '.*' not supported for the input types, and the inputs could not", "ufunc '.*' did not contain a loop with signature matching types", "Addition/subtraction of integers and integer-arrays with Timestamp", + "has no kernel", + "not implemented", + "The 'out' kwarg is necessary. Use numpy.strings.multiply without it.", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops("foo") - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes str_ser = pd.Series("foo", index=s.index) # with pytest.raises(TypeError, match=msg): - if all_arithmetic_operators in [ - "__mul__", - "__rmul__", - ]: # (data[~data.isna()] >= 0).all(): + if ( + all_arithmetic_operators + in [ + "__mul__", + "__rmul__", + ] + and not using_infer_string + ): # (data[~data.isna()] >= 0).all(): res = ops(str_ser) expected = pd.Series(["foo" * x for x in data], index=s.index) expected = expected.fillna(np.nan) @@ -210,7 +224,7 @@ # more-correct than np.nan here. 
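# Illustrative sketch, not part of the upstream patch: the floating/test_to_numpy.py hunks above
# track a pandas 2.2 behaviour change for masked arrays -- to_numpy() now defaults to the natural
# NumPy dtype (inserting np.nan for missing values) instead of object, and converting to float64
# with missing values no longer raises:
import numpy as np
import pandas as pd

arr = pd.array([0.1, 0.2, None], dtype="Float64")
arr.to_numpy()                 # array([0.1, 0.2, nan]) with dtype float64 (was object with pd.NA)
arr.to_numpy(dtype="float64")  # same result; previously this required an explicit na_value
np.array(pd.array([True, False, True], dtype="boolean"))  # dtype bool (was object)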
tm.assert_series_equal(res, expected) else: - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(str_ser) msg = "|".join( @@ -223,9 +237,11 @@ r"can only concatenate str \(not \"int\"\) to str", "not all arguments converted during string", "cannot subtract DatetimeArray from ndarray", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/integer/test_construction.py pandas-2.2.2+dfsg/pandas/tests/arrays/integer/test_construction.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/integer/test_construction.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/integer/test_construction.py 2024-04-10 17:42:52.000000000 +0000 @@ -175,32 +175,34 @@ def test_to_integer_array_float(): - result = IntegerArray._from_sequence([1.0, 2.0]) + result = IntegerArray._from_sequence([1.0, 2.0], dtype="Int64") expected = pd.array([1, 2], dtype="Int64") tm.assert_extension_array_equal(result, expected) with pytest.raises(TypeError, match="cannot safely cast non-equivalent"): - IntegerArray._from_sequence([1.5, 2.0]) + IntegerArray._from_sequence([1.5, 2.0], dtype="Int64") # for float dtypes, the itemsize is not preserved - result = IntegerArray._from_sequence(np.array([1.0, 2.0], dtype="float32")) + result = IntegerArray._from_sequence( + np.array([1.0, 2.0], dtype="float32"), dtype="Int64" + ) assert result.dtype == Int64Dtype() def test_to_integer_array_str(): - result = IntegerArray._from_sequence(["1", "2", None]) + result = IntegerArray._from_sequence(["1", "2", None], dtype="Int64") expected = pd.array([1, 2, np.nan], dtype="Int64") tm.assert_extension_array_equal(result, expected) with pytest.raises( ValueError, match=r"invalid literal for int\(\) with base 10: .*" ): - IntegerArray._from_sequence(["1", "2", ""]) + IntegerArray._from_sequence(["1", "2", ""], dtype="Int64") with pytest.raises( ValueError, match=r"invalid literal for int\(\) with base 10: .*" ): - IntegerArray._from_sequence(["1.5", "2.0"]) + IntegerArray._from_sequence(["1.5", "2.0"], dtype="Int64") @pytest.mark.parametrize( diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/integer/test_dtypes.py pandas-2.2.2+dfsg/pandas/tests/arrays/integer/test_dtypes.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/integer/test_dtypes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/integer/test_dtypes.py 2024-04-10 17:42:52.000000000 +0000 @@ -23,7 +23,6 @@ @pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) def test_preserve_dtypes(op): - # TODO(#22346): preserve Int64 dtype # for ops that enable (mean would actually work here # but generally it is a float return value) df = pd.DataFrame( @@ -142,7 +141,7 @@ # coerce to object s = pd.Series(mixed) result = s.astype("object") - expected = pd.Series(np.asarray(mixed)) + expected = pd.Series(np.asarray(mixed, dtype=object)) tm.assert_series_equal(result, expected) @@ -272,7 +271,7 @@ tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) +@pytest.mark.parametrize("dtype", ["int64", "bool"]) def test_to_numpy_na_raises(dtype): a = pd.array([0, 1, None], dtype="Int64") with pytest.raises(ValueError, match=dtype): diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/integer/test_reduction.py pandas-2.2.2+dfsg/pandas/tests/arrays/integer/test_reduction.py --- 
pandas-2.1.4+dfsg/pandas/tests/arrays/integer/test_reduction.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/integer/test_reduction.py 2024-04-10 17:42:52.000000000 +0000 @@ -102,7 +102,9 @@ ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], ], ) -def test_mixed_reductions(op, expected): +def test_mixed_reductions(op, expected, using_infer_string): + if op in ["any", "all"] and using_infer_string: + expected = expected.astype("bool") df = DataFrame( { "A": ["a", "b", "b"], diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/interval/test_formats.py pandas-2.2.2+dfsg/pandas/tests/arrays/interval/test_formats.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/interval/test_formats.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/interval/test_formats.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,13 @@ +from pandas.core.arrays import IntervalArray + + +def test_repr(): + # GH#25022 + arr = IntervalArray.from_tuples([(0, 1), (1, 2)]) + result = repr(arr) + expected = ( + "<IntervalArray>\n" + "[(0, 1], (1, 2]]\n" + "Length: 2, dtype: interval[int64, right]" + ) + assert result == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/interval/test_interval.py pandas-2.2.2+dfsg/pandas/tests/arrays/interval/test_interval.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/interval/test_interval.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/interval/test_interval.py 2024-04-10 17:42:52.000000000 +0000 @@ -166,18 +166,6 @@ tm.assert_interval_array_equal(arr, orig) -def test_repr(): - # GH 25022 - arr = IntervalArray.from_tuples([(0, 1), (1, 2)]) - result = repr(arr) - expected = ( - "<IntervalArray>\n" - "[(0, 1], (1, 2]]\n" - "Length: 2, dtype: interval[int64, right]" - ) - assert result == expected - - class TestReductions: def test_min_max_invalid_axis(self, left_right_dtypes): left, right = left_right_dtypes @@ -241,172 +229,3 @@ res = arr_na.max(skipna=True) assert res == MAX assert type(res) == type(MAX) - - -# ---------------------------------------------------------------------------- -# Arrow interaction - - -def test_arrow_extension_type(): - pa = pytest.importorskip("pyarrow") - - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - - p1 = ArrowIntervalType(pa.int64(), "left") - p2 = ArrowIntervalType(pa.int64(), "left") - p3 = ArrowIntervalType(pa.int64(), "right") - - assert p1.closed == "left" - assert p1 == p2 - assert p1 != p3 - assert hash(p1) == hash(p2) - assert hash(p1) != hash(p3) - - -def test_arrow_array(): - pa = pytest.importorskip("pyarrow") - - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - - intervals = pd.interval_range(1, 5, freq=1).array - - result = pa.array(intervals) - assert isinstance(result.type, ArrowIntervalType) - assert result.type.closed == intervals.closed - assert result.type.subtype == pa.int64() - assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) - assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64")) - - expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)]) - assert result.storage.equals(expected) - - # convert to its storage type - result = pa.array(intervals, type=expected.type) - assert result.equals(expected) - - # unsupported conversions - with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): - pa.array(intervals, type="float64") - - with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): -
pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) - - -def test_arrow_array_missing(): - pa = pytest.importorskip("pyarrow") - - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - - arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) - arr[1] = None - - result = pa.array(arr) - assert isinstance(result.type, ArrowIntervalType) - assert result.type.closed == arr.closed - assert result.type.subtype == pa.float64() - - # fields have missing values (not NaN) - left = pa.array([0.0, None, 2.0], type="float64") - right = pa.array([1.0, None, 3.0], type="float64") - assert result.storage.field("left").equals(left) - assert result.storage.field("right").equals(right) - - # structarray itself also has missing values on the array level - vals = [ - {"left": 0.0, "right": 1.0}, - {"left": None, "right": None}, - {"left": 2.0, "right": 3.0}, - ] - expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) - assert result.storage.equals(expected) - - -@pytest.mark.parametrize( - "breaks", - [[0.0, 1.0, 2.0, 3.0], date_range("2017", periods=4, freq="D")], - ids=["float", "datetime64[ns]"], -) -def test_arrow_table_roundtrip(breaks): - pa = pytest.importorskip("pyarrow") - - from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - - arr = IntervalArray.from_breaks(breaks) - arr[1] = None - df = pd.DataFrame({"a": arr}) - - table = pa.table(df) - assert isinstance(table.field("a").type, ArrowIntervalType) - result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.IntervalDtype) - tm.assert_frame_equal(result, df) - - table2 = pa.concat_tables([table, table]) - result = table2.to_pandas() - expected = pd.concat([df, df], ignore_index=True) - tm.assert_frame_equal(result, expected) - - # GH-41040 - table = pa.table( - [pa.chunked_array([], type=table.column(0).type)], schema=table.schema - ) - result = table.to_pandas() - tm.assert_frame_equal(result, expected[0:0]) - - -@pytest.mark.parametrize( - "breaks", - [[0.0, 1.0, 2.0, 3.0], date_range("2017", periods=4, freq="D")], - ids=["float", "datetime64[ns]"], -) -def test_arrow_table_roundtrip_without_metadata(breaks): - pa = pytest.importorskip("pyarrow") - - arr = IntervalArray.from_breaks(breaks) - arr[1] = None - df = pd.DataFrame({"a": arr}) - - table = pa.table(df) - # remove the metadata - table = table.replace_schema_metadata() - assert table.schema.metadata is None - - result = table.to_pandas() - assert isinstance(result["a"].dtype, pd.IntervalDtype) - tm.assert_frame_equal(result, df) - - -def test_from_arrow_from_raw_struct_array(): - # in case pyarrow lost the Interval extension type (eg on parquet roundtrip - # with datetime64[ns] subtype, see GH-45881), still allow conversion - # from arrow to IntervalArray - pa = pytest.importorskip("pyarrow") - - arr = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}]) - dtype = pd.IntervalDtype(np.dtype("int64"), closed="neither") - - result = dtype.__from_arrow__(arr) - expected = IntervalArray.from_breaks( - np.array([0, 1, 2], dtype="int64"), closed="neither" - ) - tm.assert_extension_array_equal(result, expected) - - result = dtype.__from_arrow__(pa.chunked_array([arr])) - tm.assert_extension_array_equal(result, expected) - - -@pytest.mark.parametrize("timezone", ["UTC", "US/Pacific", "GMT"]) -def test_interval_index_subtype(timezone, inclusive_endpoints_fixture): - # GH 46999 - dates = date_range("2022", periods=3, tz=timezone) - dtype = f"interval[datetime64[ns, {timezone}], 
{inclusive_endpoints_fixture}]" - result = IntervalIndex.from_arrays( - ["2022-01-01", "2022-01-02"], - ["2022-01-02", "2022-01-03"], - closed=inclusive_endpoints_fixture, - dtype=dtype, - ) - expected = IntervalIndex.from_arrays( - dates[:-1], dates[1:], closed=inclusive_endpoints_fixture - ) - tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/interval/test_interval_pyarrow.py pandas-2.2.2+dfsg/pandas/tests/arrays/interval/test_interval_pyarrow.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/interval/test_interval_pyarrow.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/interval/test_interval_pyarrow.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,160 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +def test_arrow_extension_type(): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + p1 = ArrowIntervalType(pa.int64(), "left") + p2 = ArrowIntervalType(pa.int64(), "left") + p3 = ArrowIntervalType(pa.int64(), "right") + + assert p1.closed == "left" + assert p1 == p2 + assert p1 != p3 + assert hash(p1) == hash(p2) + assert hash(p1) != hash(p3) + + +def test_arrow_array(): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + intervals = pd.interval_range(1, 5, freq=1).array + + result = pa.array(intervals) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == intervals.closed + assert result.type.subtype == pa.int64() + assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) + assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64")) + + expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)]) + assert result.storage.equals(expected) + + # convert to its storage type + result = pa.array(intervals, type=expected.type) + assert result.equals(expected) + + # unsupported conversions + with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): + pa.array(intervals, type="float64") + + with pytest.raises(TypeError, match="Not supported to convert IntervalArray"): + pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) + + +def test_arrow_array_missing(): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) + arr[1] = None + + result = pa.array(arr) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == arr.closed + assert result.type.subtype == pa.float64() + + # fields have missing values (not NaN) + left = pa.array([0.0, None, 2.0], type="float64") + right = pa.array([1.0, None, 3.0], type="float64") + assert result.storage.field("left").equals(left) + assert result.storage.field("right").equals(right) + + # structarray itself also has missing values on the array level + vals = [ + {"left": 0.0, "right": 1.0}, + {"left": None, "right": None}, + {"left": 2.0, "right": 3.0}, + ] + expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) + assert result.storage.equals(expected) + + +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +@pytest.mark.parametrize( + "breaks", + [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")], + ids=["float", "datetime64[ns]"], +) 
+def test_arrow_table_roundtrip(breaks): + pa = pytest.importorskip("pyarrow") + + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowIntervalType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) + + # GH#41040 + table = pa.table( + [pa.chunked_array([], type=table.column(0).type)], schema=table.schema + ) + result = table.to_pandas() + tm.assert_frame_equal(result, expected[0:0]) + + +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +@pytest.mark.parametrize( + "breaks", + [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")], + ids=["float", "datetime64[ns]"], +) +def test_arrow_table_roundtrip_without_metadata(breaks): + pa = pytest.importorskip("pyarrow") + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + # remove the metadata + table = table.replace_schema_metadata() + assert table.schema.metadata is None + + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + +def test_from_arrow_from_raw_struct_array(): + # in case pyarrow lost the Interval extension type (eg on parquet roundtrip + # with datetime64[ns] subtype, see GH-45881), still allow conversion + # from arrow to IntervalArray + pa = pytest.importorskip("pyarrow") + + arr = pa.array([{"left": 0, "right": 1}, {"left": 1, "right": 2}]) + dtype = pd.IntervalDtype(np.dtype("int64"), closed="neither") + + result = dtype.__from_arrow__(arr) + expected = IntervalArray.from_breaks( + np.array([0, 1, 2], dtype="int64"), closed="neither" + ) + tm.assert_extension_array_equal(result, expected) + + result = dtype.__from_arrow__(pa.chunked_array([arr])) + tm.assert_extension_array_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/interval/test_ops.py pandas-2.2.2+dfsg/pandas/tests/arrays/interval/test_ops.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/interval/test_ops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/interval/test_ops.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,93 +0,0 @@ -"""Tests for Interval-Interval operations, such as overlaps, contains, etc.""" -import numpy as np -import pytest - -from pandas import ( - Interval, - IntervalIndex, - Timedelta, - Timestamp, -) -import pandas._testing as tm -from pandas.core.arrays import IntervalArray - - -@pytest.fixture(params=[IntervalArray, IntervalIndex]) -def constructor(request): - """ - Fixture for testing both interval container classes. - """ - return request.param - - -@pytest.fixture( - params=[ - (Timedelta("0 days"), Timedelta("1 day")), - (Timestamp("2018-01-01"), Timedelta("1 day")), - (0, 1), - ], - ids=lambda x: type(x[0]).__name__, -) -def start_shift(request): - """ - Fixture for generating intervals of different types from a start value - and a shift value that can be added to start to generate an endpoint. 
- """ - return request.param - - -class TestOverlaps: - def test_overlaps_interval(self, constructor, start_shift, closed, other_closed): - start, shift = start_shift - interval = Interval(start, start + 3 * shift, other_closed) - - # intervals: identical, nested, spanning, partial, adjacent, disjoint - tuples = [ - (start, start + 3 * shift), - (start + shift, start + 2 * shift), - (start - shift, start + 4 * shift), - (start + 2 * shift, start + 4 * shift), - (start + 3 * shift, start + 4 * shift), - (start + 4 * shift, start + 5 * shift), - ] - interval_container = constructor.from_tuples(tuples, closed) - - adjacent = interval.closed_right and interval_container.closed_left - expected = np.array([True, True, True, True, adjacent, False]) - result = interval_container.overlaps(interval) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("other_constructor", [IntervalArray, IntervalIndex]) - def test_overlaps_interval_container(self, constructor, other_constructor): - # TODO: modify this test when implemented - interval_container = constructor.from_breaks(range(5)) - other_container = other_constructor.from_breaks(range(5)) - with pytest.raises(NotImplementedError, match="^$"): - interval_container.overlaps(other_container) - - def test_overlaps_na(self, constructor, start_shift): - """NA values are marked as False""" - start, shift = start_shift - interval = Interval(start, start + shift) - - tuples = [ - (start, start + shift), - np.nan, - (start + 2 * shift, start + 3 * shift), - ] - interval_container = constructor.from_tuples(tuples) - - expected = np.array([True, False, False]) - result = interval_container.overlaps(interval) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize( - "other", - [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")], - ids=lambda x: type(x).__name__, - ) - def test_overlaps_invalid_type(self, constructor, other): - interval_container = constructor.from_breaks(range(5)) - msg = f"`other` must be Interval-like, got {type(other).__name__}" - with pytest.raises(TypeError, match=msg): - interval_container.overlaps(other) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/interval/test_overlaps.py pandas-2.2.2+dfsg/pandas/tests/arrays/interval/test_overlaps.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/interval/test_overlaps.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/interval/test_overlaps.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,93 @@ +"""Tests for Interval-Interval operations, such as overlaps, contains, etc.""" +import numpy as np +import pytest + +from pandas import ( + Interval, + IntervalIndex, + Timedelta, + Timestamp, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +@pytest.fixture(params=[IntervalArray, IntervalIndex]) +def constructor(request): + """ + Fixture for testing both interval container classes. + """ + return request.param + + +@pytest.fixture( + params=[ + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timedelta("1 day")), + (0, 1), + ], + ids=lambda x: type(x[0]).__name__, +) +def start_shift(request): + """ + Fixture for generating intervals of different types from a start value + and a shift value that can be added to start to generate an endpoint. 
+ """ + return request.param + + +class TestOverlaps: + def test_overlaps_interval(self, constructor, start_shift, closed, other_closed): + start, shift = start_shift + interval = Interval(start, start + 3 * shift, other_closed) + + # intervals: identical, nested, spanning, partial, adjacent, disjoint + tuples = [ + (start, start + 3 * shift), + (start + shift, start + 2 * shift), + (start - shift, start + 4 * shift), + (start + 2 * shift, start + 4 * shift), + (start + 3 * shift, start + 4 * shift), + (start + 4 * shift, start + 5 * shift), + ] + interval_container = constructor.from_tuples(tuples, closed) + + adjacent = interval.closed_right and interval_container.closed_left + expected = np.array([True, True, True, True, adjacent, False]) + result = interval_container.overlaps(interval) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("other_constructor", [IntervalArray, IntervalIndex]) + def test_overlaps_interval_container(self, constructor, other_constructor): + # TODO: modify this test when implemented + interval_container = constructor.from_breaks(range(5)) + other_container = other_constructor.from_breaks(range(5)) + with pytest.raises(NotImplementedError, match="^$"): + interval_container.overlaps(other_container) + + def test_overlaps_na(self, constructor, start_shift): + """NA values are marked as False""" + start, shift = start_shift + interval = Interval(start, start + shift) + + tuples = [ + (start, start + shift), + np.nan, + (start + 2 * shift, start + 3 * shift), + ] + interval_container = constructor.from_tuples(tuples) + + expected = np.array([True, False, False]) + result = interval_container.overlaps(interval) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")], + ids=lambda x: type(x).__name__, + ) + def test_overlaps_invalid_type(self, constructor, other): + interval_container = constructor.from_breaks(range(5)) + msg = f"`other` must be Interval-like, got {type(other).__name__}" + with pytest.raises(TypeError, match=msg): + interval_container.overlaps(other) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/masked/test_arrow_compat.py pandas-2.2.2+dfsg/pandas/tests/arrays/masked/test_arrow_compat.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/masked/test_arrow_compat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/masked/test_arrow_compat.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,7 +4,11 @@ import pandas as pd import pandas._testing as tm -pa = pytest.importorskip("pyarrow", minversion="1.0.1") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +pa = pytest.importorskip("pyarrow") from pandas.core.arrays.arrow._arrow_utils import pyarrow_array_to_numpy_and_mask @@ -35,6 +39,7 @@ df = pd.DataFrame({"a": data}) table = pa.table(df) assert table.field("a").type == str(data.dtype.numpy_dtype) + result = table.to_pandas() assert result["a"].dtype == data.dtype tm.assert_frame_equal(result, df) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/masked/test_function.py pandas-2.2.2+dfsg/pandas/tests/arrays/masked/test_function.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/masked/test_function.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/masked/test_function.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import BaseMaskedArray arrays = 
[pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES] arrays += [ @@ -55,3 +56,19 @@ result = data.tolist() expected = list(data) tm.assert_equal(result, expected) + + +def test_to_numpy(): + # GH#56991 + + class MyStringArray(BaseMaskedArray): + dtype = pd.StringDtype() + _dtype_cls = pd.StringDtype + _internal_fill_value = pd.NA + + arr = MyStringArray( + values=np.array(["a", "b", "c"]), mask=np.array([False, True, False]) + ) + result = arr.to_numpy() + expected = np.array(["a", pd.NA, "c"]) + tm.assert_numpy_array_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/numpy_/test_numpy.py pandas-2.2.2+dfsg/pandas/tests/arrays/numpy_/test_numpy.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/numpy_/test_numpy.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/numpy_/test_numpy.py 2024-04-10 17:42:52.000000000 +0000 @@ -87,7 +87,7 @@ assert result == expected -def test_dtype_univalent(any_numpy_dtype): +def test_dtype_idempotent(any_numpy_dtype): dtype = NumpyEADtype(any_numpy_dtype) result = NumpyEADtype(dtype) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/period/test_arrow_compat.py pandas-2.2.2+dfsg/pandas/tests/arrays/period/test_arrow_compat.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/period/test_arrow_compat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/period/test_arrow_compat.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,6 @@ import pytest -from pandas.compat.pyarrow import pa_version_under10p0 +from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.dtypes import PeriodDtype @@ -11,7 +11,12 @@ period_array, ) -pa = pytest.importorskip("pyarrow", minversion="1.0.1") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +pa = pytest.importorskip("pyarrow") def test_arrow_extension_type(): @@ -28,12 +33,12 @@ assert hash(p1) != hash(p3) -@pytest.mark.xfail(not pa_version_under10p0, reason="Wrong behavior with pyarrow 10") +@pytest.mark.xfail(not pa_version_under10p1, reason="Wrong behavior with pyarrow 10") @pytest.mark.parametrize( "data, freq", [ (pd.date_range("2017", periods=3), "D"), - (pd.date_range("2017", periods=3, freq="A"), "A-DEC"), + (pd.date_range("2017", periods=3, freq="YE"), "Y-DEC"), ], ) def test_arrow_array(data, freq): @@ -104,13 +109,14 @@ table = pa.table( [pa.chunked_array([], type=table.column(0).type)], schema=table.schema ) + result = table.to_pandas() assert isinstance(result["a"].dtype, PeriodDtype) tm.assert_frame_equal(result, df) def test_arrow_table_roundtrip_without_metadata(): - arr = PeriodArray([1, 2, 3], dtype="period[H]") + arr = PeriodArray([1, 2, 3], dtype="period[h]") arr[1] = pd.NaT df = pd.DataFrame({"a": arr}) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/period/test_astype.py pandas-2.2.2+dfsg/pandas/tests/arrays/period/test_astype.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/period/test_astype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/period/test_astype.py 2024-04-10 17:42:52.000000000 +0000 @@ -52,16 +52,16 @@ tm.assert_period_array_equal(result, expected) -@pytest.mark.parametrize("other", ["datetime64[ns]", "timedelta64[ns]"]) -def test_astype_datetime(other): +@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) +def test_astype_datetime(dtype): arr = period_array(["2000", "2001", None], freq="D") # slice off the [ns] so that the regex matches. 
- if other == "timedelta64[ns]": - with pytest.raises(TypeError, match=other[:-4]): - arr.astype(other) + if dtype == "timedelta64[ns]": + with pytest.raises(TypeError, match=dtype[:-4]): + arr.astype(dtype) else: # GH#45038 allow period->dt64 because we allow dt64->period - result = arr.astype(other) - expected = pd.DatetimeIndex(["2000", "2001", pd.NaT])._data + result = arr.astype(dtype) + expected = pd.DatetimeIndex(["2000", "2001", pd.NaT], dtype=dtype)._data tm.assert_datetime_array_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/period/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/arrays/period/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/period/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/period/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -71,11 +71,11 @@ "data, freq, msg", [ ( - [pd.Period("2017", "D"), pd.Period("2017", "A")], + [pd.Period("2017", "D"), pd.Period("2017", "Y")], None, "Input has different freq", ), - ([pd.Period("2017", "D")], "A", "Input has different freq"), + ([pd.Period("2017", "D")], "Y", "Input has different freq"), ], ) def test_period_array_raises(data, freq, msg): @@ -144,3 +144,13 @@ expected = PeriodArray(data, dtype="period[M]") tm.assert_equal(res, expected) + + +def test_period_array_from_datetime64(): + arr = np.array( + ["2020-01-01T00:00:00", "2020-02-02T00:00:00"], dtype="datetime64[ns]" + ) + result = PeriodArray._from_datetime64(arr, freq=MonthEnd(2)) + + expected = period_array(["2020-01-01", "2020-02-01"], freq=MonthEnd(2)) + tm.assert_period_array_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/sparse/test_indexing.py pandas-2.2.2+dfsg/pandas/tests/arrays/sparse/test_indexing.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/sparse/test_indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/sparse/test_indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -166,9 +166,16 @@ tm.assert_sp_array_equal(arr.take([0, 1, 2]), exp) def test_take_all_empty(self): - a = pd.array([0, 0], dtype=SparseDtype("int64")) - result = a.take([0, 1], allow_fill=True, fill_value=np.nan) - tm.assert_sp_array_equal(a, result) + sparse = pd.array([0, 0], dtype=SparseDtype("int64")) + result = sparse.take([0, 1], allow_fill=True, fill_value=np.nan) + tm.assert_sp_array_equal(sparse, result) + + def test_take_different_fill_value(self): + # Take with a different fill value shouldn't overwrite the original + sparse = pd.array([0.0], dtype=SparseDtype("float64", fill_value=0.0)) + result = sparse.take([0, -1], allow_fill=True, fill_value=np.nan) + expected = pd.array([0, np.nan], dtype=sparse.dtype) + tm.assert_sp_array_equal(expected, result) def test_take_fill_value(self): data = np.array([1, np.nan, 0, 3, 0]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/string_/test_string.py pandas-2.2.2+dfsg/pandas/tests/arrays/string_/test_string.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/string_/test_string.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/string_/test_string.py 2024-04-10 17:42:52.000000000 +0000 @@ -64,14 +64,14 @@ assert repr(df.A.array) == expected -def test_none_to_nan(cls): - a = cls._from_sequence(["a", None, "b"]) +def test_none_to_nan(cls, dtype): + a = cls._from_sequence(["a", None, "b"], dtype=dtype) assert a[1] is not None assert a[1] is na_val(a.dtype) -def test_setitem_validates(cls): - arr = cls._from_sequence(["a", "b"]) +def 
test_setitem_validates(cls, dtype): + arr = cls._from_sequence(["a", "b"], dtype=dtype) if cls is pd.arrays.StringArray: msg = "Cannot set non-string value '10' into a StringArray." @@ -120,6 +120,14 @@ result = casted.astype("datetime64[ns]") tm.assert_series_equal(result, ser) + # GH#38509 same thing for timedelta64 + ser2 = ser - ser.iloc[-1] + casted2 = ser2.astype(dtype) + assert is_dtype_equal(casted2.dtype, dtype) + + result2 = casted2.astype(ser2.dtype) + tm.assert_series_equal(result2, ser2) + def test_add(dtype): a = pd.Series(["a", "b", "c", None, None], dtype=dtype) @@ -145,7 +153,7 @@ if dtype.storage in arrow_string_storage: reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(raises=None, reason=reason) - request.node.add_marker(mark) + request.applymarker(mark) a = pd.array(["a", "b", "c"], dtype=dtype) b = np.array([["a", "b", "c"]], dtype=object) @@ -170,12 +178,7 @@ tm.assert_extension_array_equal(result, expected) -def test_mul(dtype, request, arrow_string_storage): - if dtype.storage in arrow_string_storage: - reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" - mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason) - request.node.add_marker(mark) - +def test_mul(dtype): a = pd.array(["a", "b", None], dtype=dtype) result = a * 2 expected = pd.array(["aa", "bb", None], dtype=dtype) @@ -188,7 +191,7 @@ @pytest.mark.xfail(reason="GH-28527") def test_add_strings(dtype): arr = pd.array(["a", "b", "c", "d"], dtype=dtype) - df = pd.DataFrame([["t", "y", "v", "w"]]) + df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object) assert arr.__add__(df) is NotImplemented result = arr + df @@ -260,7 +263,7 @@ other = 42 if op_name not in ["__eq__", "__ne__"]: - with pytest.raises(TypeError, match="not supported between"): + with pytest.raises(TypeError, match="Invalid comparison|not supported between"): getattr(a, op_name)(other) return @@ -358,12 +361,12 @@ @pytest.mark.parametrize("copy", [True, False]) -def test_from_sequence_no_mutate(copy, cls, request): +def test_from_sequence_no_mutate(copy, cls, dtype): nan_arr = np.array(["a", np.nan], dtype=object) expected_input = nan_arr.copy() na_arr = np.array(["a", pd.NA], dtype=object) - result = cls._from_sequence(nan_arr, copy=copy) + result = cls._from_sequence(nan_arr, dtype=dtype, copy=copy) if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics): import pyarrow as pa @@ -383,8 +386,16 @@ tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - msg = r"int\(\) argument must be a string, a bytes-like object or a( real)? number" - with pytest.raises(TypeError, match=msg): + if dtype.storage == "pyarrow_numpy": + err = ValueError + msg = "cannot convert float NaN to integer" + else: + err = TypeError + msg = ( + r"int\(\) argument must be a string, a bytes-like " + r"object or a( real)? 
number" + ) + with pytest.raises(err, match=msg): arr.astype("int64") @@ -425,7 +436,7 @@ @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("skipna", [True, False]) -def test_min_max(method, skipna, dtype, request): +def test_min_max(method, skipna, dtype): arr = pd.Series(["a", "b", "c", None], dtype=dtype) result = getattr(arr, method)(skipna=skipna) if skipna: @@ -444,7 +455,7 @@ else: reason = "'ArrowStringArray' object has no attribute 'max'" mark = pytest.mark.xfail(raises=TypeError, reason=reason) - request.node.add_marker(mark) + request.applymarker(mark) arr = box(["a", "b", "c", None], dtype=dtype) result = getattr(np, method)(arr) @@ -452,7 +463,7 @@ assert result == expected -def test_fillna_args(dtype, request, arrow_string_storage): +def test_fillna_args(dtype, arrow_string_storage): # GH 37987 arr = pd.array(["a", pd.NA], dtype=dtype) @@ -476,24 +487,37 @@ def test_arrow_array(dtype): # protocol added in 0.15.0 pa = pytest.importorskip("pyarrow") + import pyarrow.compute as pc data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) - expected = pa.array(list(data), type=pa.string(), from_pandas=True) + expected = pa.array(list(data), type=pa.large_string(), from_pandas=True) if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) - + if dtype.storage == "python": + expected = pc.cast(expected, pa.string()) assert arr.equals(expected) -def test_arrow_roundtrip(dtype, string_storage2): +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") + if using_infer_string and string_storage2 != "pyarrow_numpy": + request.applymarker( + pytest.mark.xfail( + reason="infer_string takes precedence over string storage" + ) + ) + data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - assert table.field("a").type == "string" + if dtype.storage == "python": + assert table.field("a").type == "string" + else: + assert table.field("a").type == "large_string" with pd.option_context("string_storage", string_storage2): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) @@ -503,14 +527,27 @@ assert result.loc[2, "a"] is na_val(result["a"].dtype) -def test_arrow_load_from_zero_chunks(dtype, string_storage2): +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_arrow_load_from_zero_chunks( + dtype, string_storage2, request, using_infer_string +): # GH-41040 pa = pytest.importorskip("pyarrow") + if using_infer_string and string_storage2 != "pyarrow_numpy": + request.applymarker( + pytest.mark.xfail( + reason="infer_string takes precedence over string storage" + ) + ) + data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) - assert table.field("a").type == "string" + if dtype.storage == "python": + assert table.field("a").type == "string" + else: + assert table.field("a").type == "large_string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) with pd.option_context("string_storage", string_storage2): diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/string_/test_string_arrow.py pandas-2.2.2+dfsg/pandas/tests/arrays/string_/test_string_arrow.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/string_/test_string_arrow.py 
2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/string_/test_string_arrow.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,7 +4,7 @@ import numpy as np import pytest -from pandas.compat import pa_version_under7p0 +import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -17,29 +17,25 @@ ArrowStringArrayNumpySemantics, ) -skip_if_no_pyarrow = pytest.mark.skipif( - pa_version_under7p0, - reason="pyarrow>=7.0.0 is required for PyArrow backed StringArray", -) - -@skip_if_no_pyarrow def test_eq_all_na(): + pytest.importorskip("pyarrow") a = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow")) result = a == a expected = pd.array([pd.NA, pd.NA], dtype="boolean[pyarrow]") tm.assert_extension_array_equal(result, expected) -def test_config(string_storage): +def test_config(string_storage, request, using_infer_string): + if using_infer_string and string_storage != "pyarrow_numpy": + request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - expected = ( - StringDtype(string_storage).construct_array_type()._from_sequence(["a", "b"]) - ) + dtype = StringDtype(string_storage) + expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) @@ -49,11 +45,10 @@ pd.options.mode.string_storage = "foo" -@skip_if_no_pyarrow @pytest.mark.parametrize("chunked", [True, False]) @pytest.mark.parametrize("array", ["numpy", "pyarrow"]) def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage): - import pyarrow as pa + pa = pytest.importorskip("pyarrow") array = pa if array in arrow_string_storage else np @@ -66,7 +61,7 @@ msg = "Unsupported type '' for ArrowExtensionArray" else: msg = re.escape( - "ArrowStringArray requires a PyArrow (chunked) array of string type" + "ArrowStringArray requires a PyArrow (chunked) array of large_string type" ) with pytest.raises(ValueError, match=msg): ArrowStringArray(arr) @@ -81,17 +76,20 @@ arr = pa.chunked_array(arr) msg = re.escape( - "ArrowStringArray requires a PyArrow (chunked) array of string type" + "ArrowStringArray requires a PyArrow (chunked) array of large_string type" ) with pytest.raises(ValueError, match=msg): ArrowStringArray(arr) +@pytest.mark.xfail( + reason="dict conversion does not seem to be implemented for large string in arrow" +) @pytest.mark.parametrize("chunked", [True, False]) def test_constructor_valid_string_type_value_dictionary(chunked): pa = pytest.importorskip("pyarrow") - arr = pa.array(["1", "2", "3"], pa.dictionary(pa.int32(), pa.utf8())) + arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode() if chunked: arr = pa.chunked_array(arr) @@ -101,14 +99,14 @@ def test_constructor_from_list(): # GH#27673 - pytest.importorskip("pyarrow", minversion="1.0.0") + pytest.importorskip("pyarrow") result = pd.Series(["E"], dtype=StringDtype(storage="pyarrow")) assert isinstance(result.dtype, StringDtype) assert result.dtype.storage == "pyarrow" -@skip_if_no_pyarrow -def test_from_sequence_wrong_dtype_raises(): +def test_from_sequence_wrong_dtype_raises(using_infer_string): + pytest.importorskip("pyarrow") with pd.option_context("string_storage", "python"): ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") @@ -120,15 +118,19 @@ ArrowStringArray._from_sequence(["a", None, "c"], 
dtype="string[pyarrow]") - with pytest.raises(AssertionError, match=None): - with pd.option_context("string_storage", "python"): - ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) with pd.option_context("string_storage", "pyarrow"): ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) - with pytest.raises(AssertionError, match=None): - ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + ArrowStringArray._from_sequence( + ["a", None, "c"], dtype=StringDtype("python") + ) ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) @@ -143,25 +145,24 @@ with pytest.raises(AssertionError, match=None): StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") - with pd.option_context("string_storage", "python"): - StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) - - with pytest.raises(AssertionError, match=None): - with pd.option_context("string_storage", "pyarrow"): + if not using_infer_string: + with pd.option_context("string_storage", "python"): StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "pyarrow"): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) with pytest.raises(AssertionError, match=None): StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) -@pytest.mark.skipif( - not pa_version_under7p0, - reason="pyarrow is installed", -) +@td.skip_if_installed("pyarrow") def test_pyarrow_not_installed_raises(): - msg = re.escape("pyarrow>=7.0.0 is required for PyArrow backed") + msg = re.escape("pyarrow>=10.0.1 is required for PyArrow backed") with pytest.raises(ImportError, match=msg): StringDtype(storage="pyarrow") @@ -176,7 +177,6 @@ ArrowStringArray._from_sequence(["a", None, "b"]) -@skip_if_no_pyarrow @pytest.mark.parametrize("multiple_chunks", [False, True]) @pytest.mark.parametrize( "key, value, expected", @@ -199,7 +199,7 @@ ], ) def test_setitem(multiple_chunks, key, value, expected): - import pyarrow as pa + pa = pytest.importorskip("pyarrow") result = pa.array(list("abcde")) expected = pa.array(expected) @@ -215,9 +215,8 @@ tm.assert_equal(result, expected) -@skip_if_no_pyarrow def test_setitem_invalid_indexer_raises(): - import pyarrow as pa + pa = pytest.importorskip("pyarrow") arr = ArrowStringArray(pa.array(list("abcde"))) @@ -240,10 +239,10 @@ arr[[0, 1]] = ["foo", "bar", "baz"] -@skip_if_no_pyarrow @pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) def test_pickle_roundtrip(dtype): # GH 42600 + pytest.importorskip("pyarrow") expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) @@ -258,9 +257,9 @@ tm.assert_series_equal(result_sliced, expected_sliced) -@skip_if_no_pyarrow def test_string_dtype_error_message(): # GH#55051 + pytest.importorskip("pyarrow") msg = "Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'." 
with pytest.raises(ValueError, match=msg): StringDtype("bla") diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/test_array.py pandas-2.2.2+dfsg/pandas/tests/arrays/test_array.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/test_array.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/test_array.py 2024-04-10 17:42:52.000000000 +0000 @@ -47,8 +47,8 @@ "data, dtype, expected", [ # Basic NumPy defaults. - ([], None, FloatingArray._from_sequence([])), - ([1, 2], None, IntegerArray._from_sequence([1, 2])), + ([], None, FloatingArray._from_sequence([], dtype="Float64")), + ([1, 2], None, IntegerArray._from_sequence([1, 2], dtype="Int64")), ([1, 2], object, NumpyExtensionArray(np.array([1, 2], dtype=object))), ( [1, 2], @@ -60,11 +60,15 @@ None, NumpyExtensionArray(np.array([], dtype=object)), ), - (np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2])), + ( + np.array([1, 2], dtype="int64"), + None, + IntegerArray._from_sequence([1, 2], dtype="Int64"), + ), ( np.array([1.0, 2.0], dtype="float64"), None, - FloatingArray._from_sequence([1.0, 2.0]), + FloatingArray._from_sequence([1.0, 2.0], dtype="Float64"), ), # String alias passes through to NumPy ([1, 2], "float32", NumpyExtensionArray(np.array([1, 2], dtype="float32"))), @@ -98,32 +102,38 @@ ( [1, 2], np.dtype("datetime64[ns]"), - DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")), + DatetimeArray._from_sequence( + np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]" + ), ), ( [1, 2], np.dtype("datetime64[s]"), - DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[s]")), + DatetimeArray._from_sequence( + np.array([1, 2], dtype="M8[s]"), dtype="M8[s]" + ), ), ( np.array([1, 2], dtype="datetime64[ns]"), None, - DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")), + DatetimeArray._from_sequence( + np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]" + ), ), ( pd.DatetimeIndex(["2000", "2001"]), np.dtype("datetime64[ns]"), - DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), ), ( pd.DatetimeIndex(["2000", "2001"]), None, - DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), ), ( ["2000", "2001"], np.dtype("datetime64[ns]"), - DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), ), # Datetime (tz-aware) ( @@ -135,24 +145,26 @@ ), # Timedelta ( - ["1H", "2H"], + ["1h", "2h"], np.dtype("timedelta64[ns]"), - TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"), ), ( - pd.TimedeltaIndex(["1H", "2H"]), + pd.TimedeltaIndex(["1h", "2h"]), np.dtype("timedelta64[ns]"), - TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"), ), ( np.array([1, 2], dtype="m8[s]"), np.dtype("timedelta64[s]"), - TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[s]")), + TimedeltaArray._from_sequence( + np.array([1, 2], dtype="m8[s]"), dtype="m8[s]" + ), ), ( - pd.TimedeltaIndex(["1H", "2H"]), + pd.TimedeltaIndex(["1h", "2h"]), None, - TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"), ), ( # preserve non-nano, i.e. 
don't cast to NumpyExtensionArray @@ -200,16 +212,28 @@ ( ["a", None], "string", - pd.StringDtype().construct_array_type()._from_sequence(["a", None]), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype()), ), ( ["a", None], pd.StringDtype(), - pd.StringDtype().construct_array_type()._from_sequence(["a", None]), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype()), ), # Boolean - ([True, None], "boolean", BooleanArray._from_sequence([True, None])), - ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])), + ( + [True, None], + "boolean", + BooleanArray._from_sequence([True, None], dtype="boolean"), + ), + ( + [True, None], + pd.BooleanDtype(), + BooleanArray._from_sequence([True, None], dtype="boolean"), + ), # Index (pd.Index([1, 2]), None, NumpyExtensionArray(np.array([1, 2], dtype=np.int64))), # Series[EA] returns the EA @@ -264,15 +288,15 @@ # datetime ( [pd.Timestamp("2000"), pd.Timestamp("2001")], - DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), ), ( [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], - DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), ), ( np.array([1, 2], dtype="M8[ns]"), - DatetimeArray(np.array([1, 2], dtype="M8[ns]")), + DatetimeArray._from_sequence(np.array([1, 2], dtype="M8[ns]")), ), ( np.array([1, 2], dtype="M8[us]"), @@ -284,7 +308,7 @@ ( [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET") + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns") ), ), ( @@ -293,52 +317,59 @@ datetime.datetime(2001, 1, 1, tzinfo=cet), ], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet) + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns") ), ), # timedelta ( - [pd.Timedelta("1H"), pd.Timedelta("2H")], - TimedeltaArray._from_sequence(["1H", "2H"]), + [pd.Timedelta("1h"), pd.Timedelta("2h")], + TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"), ), ( np.array([1, 2], dtype="m8[ns]"), - TimedeltaArray(np.array([1, 2], dtype="m8[ns]")), + TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")), ), ( np.array([1, 2], dtype="m8[us]"), - TimedeltaArray(np.array([1, 2], dtype="m8[us]")), + TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")), ), # integer - ([1, 2], IntegerArray._from_sequence([1, 2])), - ([1, None], IntegerArray._from_sequence([1, None])), - ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA])), - ([1, np.nan], IntegerArray._from_sequence([1, np.nan])), + ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")), + ([1, None], IntegerArray._from_sequence([1, None], dtype="Int64")), + ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA], dtype="Int64")), + ([1, np.nan], IntegerArray._from_sequence([1, np.nan], dtype="Int64")), # float - ([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2])), - ([0.1, None], FloatingArray._from_sequence([0.1, pd.NA])), - ([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA])), - ([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA])), + ([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2], dtype="Float64")), + ([0.1, None], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")), + ([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")), + ([0.1, pd.NA], 
FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")), # integer-like float - ([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0])), - ([1.0, None], FloatingArray._from_sequence([1.0, pd.NA])), - ([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA])), - ([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA])), + ([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")), + ([1.0, None], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")), + ([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")), + ([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")), # mixed-integer-float - ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])), - ([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])), + ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")), + ( + [1, np.nan, 2.0], + FloatingArray._from_sequence([1.0, None, 2.0], dtype="Float64"), + ), # string ( ["a", "b"], - pd.StringDtype().construct_array_type()._from_sequence(["a", "b"]), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), ), ( ["a", None], - pd.StringDtype().construct_array_type()._from_sequence(["a", None]), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype()), ), # Boolean - ([True, False], BooleanArray._from_sequence([True, False])), - ([True, None], BooleanArray._from_sequence([True, None])), + ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")), + ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")), ], ) def test_array_inference(data, expected): @@ -350,7 +381,7 @@ "data", [ # mix of frequencies - [pd.Period("2000", "D"), pd.Period("2001", "A")], + [pd.Period("2000", "D"), pd.Period("2001", "Y")], # mix of closed [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")], # Mix of timezones @@ -416,7 +447,7 @@ class DecimalArray2(DecimalArray): @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): + def _from_sequence(cls, scalars, *, dtype=None, copy=False): if isinstance(scalars, (pd.Series, pd.Index)): raise TypeError("scalars should not be of type pd.Series or pd.Index") @@ -427,20 +458,21 @@ box = index_or_series data = box([decimal.Decimal("1"), decimal.Decimal("2")]) + dtype = DecimalDtype2() # make sure it works with pytest.raises( TypeError, match="scalars should not be of type pd.Series or pd.Index" ): - DecimalArray2._from_sequence(data) + DecimalArray2._from_sequence(data, dtype=dtype) result = pd.array(data, dtype="decimal2") - expected = DecimalArray2._from_sequence(data.values) + expected = DecimalArray2._from_sequence(data.values, dtype=dtype) tm.assert_equal(result, expected) def test_array_to_numpy_na(): # GH#40638 - arr = pd.array([pd.NA, 1], dtype="string") + arr = pd.array([pd.NA, 1], dtype="string[python]") result = arr.to_numpy(na_value=True, dtype=bool) expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/test_datetimelike.py pandas-2.2.2+dfsg/pandas/tests/arrays/test_datetimelike.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/test_datetimelike.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/test_datetimelike.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,6 +11,8 @@ OutOfBoundsDatetime, Timestamp, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr +from pandas.compat.numpy import np_version_gt2 import pandas as 
pd from pandas import ( @@ -26,12 +28,10 @@ PeriodArray, TimedeltaArray, ) -from pandas.core.arrays.datetimes import _sequence_to_dt64ns -from pandas.core.arrays.timedeltas import sequence_to_td64ns # TODO: more freq variants -@pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"]) +@pytest.fixture(params=["D", "B", "W", "ME", "QE", "YE"]) def freqstr(request): """Fixture returning parametrized frequency in string format.""" return request.param @@ -52,6 +52,7 @@ warnings.filterwarnings( "ignore", message="Period with BDay freq", category=FutureWarning ) + freqstr = freq_to_period_freqstr(1, freqstr) pi = pd.period_range(start=Timestamp("2000-01-01"), periods=100, freq=freqstr) return pi @@ -89,7 +90,10 @@ def arr1d(self): """Fixture returning DatetimeArray with daily frequency.""" data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq="D") + if self.array_cls is PeriodArray: + arr = self.array_cls(data, freq="D") + else: + arr = self.index_cls(data, freq="D")._data return arr def test_compare_len1_raises(self, arr1d): @@ -161,7 +165,7 @@ if self.array_cls is PeriodArray: arr = PeriodArray(data, dtype="period[D]") else: - arr = self.array_cls(data) + arr = self.index_cls(data)._data idx = self.index_cls._simple_new(arr) takers = [1, 4, 94] @@ -183,9 +187,7 @@ arr1d.take([0, 1], allow_fill=True, fill_value=fill_value) def test_take_fill(self, arr1d): - np.arange(10, dtype="i8") * 24 * 3600 * 10**9 - - arr = arr1d # self.array_cls(data, freq="D") + arr = arr1d result = arr.take([-1, 1], allow_fill=True, fill_value=None) assert result[0] is NaT @@ -213,11 +215,11 @@ arr = arr1d idx = self.index_cls(arr) idx = idx.insert(0, NaT) - arr = self.array_cls(idx) + arr = arr1d result = arr._concat_same_type([arr[:-1], arr[1:], arr]) arr2 = arr.astype(object) - expected = self.index_cls(np.concatenate([arr2[:-1], arr2[1:], arr2]), None) + expected = self.index_cls(np.concatenate([arr2[:-1], arr2[1:], arr2])) tm.assert_index_equal(self.index_cls(result), expected) @@ -253,7 +255,7 @@ if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls(data) + arr = self.array_cls._from_sequence(data) arr[4] = NaT fill_value = arr[3] if method == "pad" else arr[5] @@ -269,7 +271,7 @@ if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls(data) + arr = self.array_cls._from_sequence(data) # scalar result = arr.searchsorted(arr[1]) @@ -324,19 +326,12 @@ ): arr.searchsorted("foo") - if string_storage == "python": - arr_type = "StringArray" - elif string_storage == "pyarrow": - arr_type = "ArrowStringArray" - else: - arr_type = "ArrowStringArrayNumpySemantics" - with pd.option_context("string_storage", string_storage): with pytest.raises( TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - f"or array of those. Got '{arr_type}' instead." + "or array of those. Got string array instead." 
), ): arr.searchsorted([str(arr[1]), "baz"]) @@ -348,7 +343,7 @@ if self.array_cls is PeriodArray: arr = self.array_cls(i8vals, dtype="period[ns]") else: - arr = self.array_cls(i8vals, freq="ns") + arr = self.index_cls(i8vals, freq="ns")._data arr[0] # should not raise OutOfBoundsDatetime index = pd.Index(arr) @@ -359,13 +354,15 @@ def test_getitem_2d(self, arr1d): # 2d slicing on a 1D array - expected = type(arr1d)(arr1d._ndarray[:, np.newaxis], dtype=arr1d.dtype) + expected = type(arr1d)._simple_new( + arr1d._ndarray[:, np.newaxis], dtype=arr1d.dtype + ) result = arr1d[:, np.newaxis] tm.assert_equal(result, expected) # Lookup on a 2D array arr2d = expected - expected = type(arr2d)(arr2d._ndarray[:3, 0], dtype=arr2d.dtype) + expected = type(arr2d)._simple_new(arr2d._ndarray[:3, 0], dtype=arr2d.dtype) result = arr2d[:3, 0] tm.assert_equal(result, expected) @@ -418,7 +415,7 @@ if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls(data, freq="D") + arr = self.index_cls(data, freq="D")._data arr[0] = arr[1] expected = np.arange(10, dtype="i8") * 24 * 3600 * 10**9 @@ -533,7 +530,7 @@ if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls(data, freq="D") + arr = self.index_cls(data, freq="D")._data expected = arr + pd.Timedelta(days=1) arr += pd.Timedelta(days=1) @@ -598,10 +595,13 @@ def test_from_integer_array(self): arr = np.array([1, 2, 3], dtype=np.int64) - expected = self.array_cls(arr, dtype=self.example_dtype) - data = pd.array(arr, dtype="Int64") - result = self.array_cls(data, dtype=self.example_dtype) + if self.array_cls is PeriodArray: + expected = self.array_cls(arr, dtype=self.example_dtype) + result = self.array_cls(data, dtype=self.example_dtype) + else: + expected = self.array_cls._from_sequence(arr, dtype=self.example_dtype) + result = self.array_cls._from_sequence(data, dtype=self.example_dtype) tm.assert_extension_array_equal(result, expected) @@ -627,25 +627,26 @@ # GH#24064 dti = self.index_cls(arr1d) - result = dti.round(freq="2T") + result = dti.round(freq="2min") expected = dti - pd.Timedelta(minutes=1) expected = expected._with_freq(None) tm.assert_index_equal(result, expected) dta = dti._data - result = dta.round(freq="2T") + result = dta.round(freq="2min") expected = expected._data._with_freq(None) tm.assert_datetime_array_equal(result, expected) def test_array_interface(self, datetime_index): - arr = DatetimeArray(datetime_index) + arr = datetime_index._data + copy_false = None if np_version_gt2 else False # default asarray gives the same underlying data (for tz naive) result = np.asarray(arr) expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, copy=False) + result = np.array(arr, copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) @@ -654,7 +655,7 @@ expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype="datetime64[ns]", copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="datetime64[ns]") @@ -697,6 +698,7 @@ # GH#23524 arr = arr1d dti = self.index_cls(arr1d) + copy_false = None if np_version_gt2 else False expected = dti.asi8.view("M8[ns]") result = np.array(arr, dtype="M8[ns]") @@ -705,17 +707,18 @@ result = np.array(arr, dtype="datetime64[ns]") 
tm.assert_numpy_array_equal(result, expected) - # check that we are not making copies when setting copy=False - result = np.array(arr, dtype="M8[ns]", copy=False) + # check that we are not making copies when setting copy=copy_false + result = np.array(arr, dtype="M8[ns]", copy=copy_false) assert result.base is expected.base assert result.base is not None - result = np.array(arr, dtype="datetime64[ns]", copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=copy_false) assert result.base is expected.base assert result.base is not None def test_array_i8_dtype(self, arr1d): arr = arr1d dti = self.index_cls(arr1d) + copy_false = None if np_version_gt2 else False expected = dti.asi8 result = np.array(arr, dtype="i8") @@ -724,18 +727,18 @@ result = np.array(arr, dtype=np.int64) tm.assert_numpy_array_equal(result, expected) - # check that we are still making copies when setting copy=False - result = np.array(arr, dtype="i8", copy=False) + # check that we are still making copies when setting copy=copy_false + result = np.array(arr, dtype="i8", copy=copy_false) assert result.base is not expected.base assert result.base is None def test_from_array_keeps_base(self): # Ensure that DatetimeArray._ndarray.base isn't lost. arr = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") - dta = DatetimeArray(arr) + dta = DatetimeArray._from_sequence(arr) assert dta._ndarray is arr - dta = DatetimeArray(arr[:0]) + dta = DatetimeArray._from_sequence(arr[:0]) assert dta._ndarray.base is arr def test_from_dti(self, arr1d): @@ -760,8 +763,9 @@ @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_to_period(self, datetime_index, freqstr): dti = datetime_index - arr = DatetimeArray(dti) + arr = dti._data + freqstr = freq_to_period_freqstr(1, freqstr) expected = dti.to_period(freq=freqstr) result = arr.to_period(freq=freqstr) assert isinstance(result, PeriodArray) @@ -859,12 +863,12 @@ with pytest.raises(ValueError, match="to_concat must have the same"): arr._concat_same_type([arr, other]) - def test_concat_same_type_different_freq(self): + def test_concat_same_type_different_freq(self, unit): # we *can* concatenate DTI with different freqs. 
- a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central")) - b = DatetimeArray(pd.date_range("2000", periods=2, freq="H", tz="US/Central")) + a = pd.date_range("2000", periods=2, freq="D", tz="US/Central", unit=unit)._data + b = pd.date_range("2000", periods=2, freq="h", tz="US/Central", unit=unit)._data result = DatetimeArray._concat_same_type([a, b]) - expected = DatetimeArray( + expected = ( pd.to_datetime( [ "2000-01-01 00:00:00", @@ -872,7 +876,10 @@ "2000-01-01 00:00:00", "2000-01-01 01:00:00", ] - ).tz_localize("US/Central") + ) + .tz_localize("US/Central") + .as_unit(unit) + ._data ) tm.assert_datetime_array_equal(result, expected) @@ -886,7 +893,7 @@ def test_strftime_nat(self): # GH 29578 - arr = DatetimeArray(DatetimeIndex(["2019-01-01", NaT])) + arr = DatetimeIndex(["2019-01-01", NaT])._data result = arr.strftime("%Y-%m-%d") expected = np.array(["2019-01-01", np.nan], dtype=object) @@ -901,7 +908,7 @@ def test_from_tdi(self): tdi = TimedeltaIndex(["1 Day", "3 Hours"]) - arr = TimedeltaArray(tdi) + arr = tdi._data assert list(arr) == list(tdi) # Check that Index.__new__ knows what to do with TimedeltaArray @@ -911,7 +918,7 @@ def test_astype_object(self): tdi = TimedeltaIndex(["1 Day", "3 Hours"]) - arr = TimedeltaArray(tdi) + arr = tdi._data asobj = arr.astype("O") assert isinstance(asobj, np.ndarray) assert asobj.dtype == "O" @@ -919,7 +926,7 @@ def test_to_pytimedelta(self, timedelta_index): tdi = timedelta_index - arr = TimedeltaArray(tdi) + arr = tdi._data expected = tdi.to_pytimedelta() result = arr.to_pytimedelta() @@ -928,7 +935,7 @@ def test_total_seconds(self, timedelta_index): tdi = timedelta_index - arr = TimedeltaArray(tdi) + arr = tdi._data expected = tdi.total_seconds() result = arr.total_seconds() @@ -938,7 +945,7 @@ @pytest.mark.parametrize("propname", TimedeltaArray._field_ops) def test_int_properties(self, timedelta_index, propname): tdi = timedelta_index - arr = TimedeltaArray(tdi) + arr = tdi._data result = getattr(arr, propname) expected = np.array(getattr(tdi, propname), dtype=result.dtype) @@ -946,14 +953,15 @@ tm.assert_numpy_array_equal(result, expected) def test_array_interface(self, timedelta_index): - arr = TimedeltaArray(timedelta_index) + arr = timedelta_index._data + copy_false = None if np_version_gt2 else False # default asarray gives the same underlying data result = np.asarray(arr) expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, copy=False) + result = np.array(arr, copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) @@ -962,7 +970,7 @@ expected = arr._ndarray assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype="timedelta64[ns]", copy=False) + result = np.array(arr, dtype="timedelta64[ns]", copy=copy_false) assert result is expected tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype="timedelta64[ns]") @@ -989,7 +997,7 @@ def test_take_fill_valid(self, timedelta_index, fixed_now_ts): tdi = timedelta_index - arr = TimedeltaArray(tdi) + arr = tdi._data td1 = pd.Timedelta(days=1) result = arr.take([-1, 1], allow_fill=True, fill_value=td1) @@ -1064,7 +1072,7 @@ pi = self.index_cls(arr1d) arr = arr1d - expected = DatetimeArray(pi.to_timestamp(how=how)) + expected = DatetimeIndex(pi.to_timestamp(how=how))._data result = arr.to_timestamp(how=how) assert isinstance(result, DatetimeArray) @@ -1176,7 +1184,7 @@ ids=lambda x: type(x).__name__, ) def 
test_casting_nat_setitem_array(arr, casting_nats): - expected = type(arr)._from_sequence([NaT, arr[1], arr[2]]) + expected = type(arr)._from_sequence([NaT, arr[1], arr[2]], dtype=arr.dtype) for nat in casting_nats: arr = arr.copy() @@ -1247,7 +1255,7 @@ "values", [ pd.to_datetime(["2020-01-01", "2020-02-01"]), - TimedeltaIndex([1, 2], unit="D"), + pd.to_timedelta([1, 2], unit="D"), PeriodIndex(["2020-01-01", "2020-02-01"], freq="D"), ], ) @@ -1278,7 +1286,7 @@ "values", [ pd.to_datetime(["2020-01-01", "2020-02-01"]), - TimedeltaIndex([1, 2], unit="D"), + pd.to_timedelta([1, 2], unit="D"), PeriodIndex(["2020-01-01", "2020-02-01"], freq="D"), ], ) @@ -1310,19 +1318,16 @@ cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype] - result = cls(arr) - expected = cls(data) + depr_msg = f"{cls.__name__}.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = cls(arr) + expected = cls(data) tm.assert_extension_array_equal(result, expected) - result = cls._from_sequence(arr) - expected = cls._from_sequence(data) + result = cls._from_sequence(arr, dtype=dtype) + expected = cls._from_sequence(data, dtype=dtype) tm.assert_extension_array_equal(result, expected) - func = {"M8[ns]": _sequence_to_dt64ns, "m8[ns]": sequence_to_td64ns}[dtype] - result = func(arr)[0] - expected = func(data)[0] - tm.assert_equal(result, expected) - func = {"M8[ns]": pd.to_datetime, "m8[ns]": pd.to_timedelta}[dtype] result = func(arr).array expected = func(data).array diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/test_datetimes.py pandas-2.2.2+dfsg/pandas/tests/arrays/test_datetimes.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/test_datetimes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/test_datetimes.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,10 +15,7 @@ import numpy as np import pytest -from pandas._libs.tslibs import ( - npy_unit_to_abbrev, - tz_compare, -) +from pandas._libs.tslibs import tz_compare from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -235,8 +232,7 @@ dta, dti = dta_dti td = pd.Timedelta(scalar) - exp_reso = max(dta._creso, td._creso) - exp_unit = npy_unit_to_abbrev(exp_reso) + exp_unit = tm.get_finest_unit(dta.unit, td.unit) expected = (dti + td)._data.as_unit(exp_unit) result = dta + scalar @@ -285,7 +281,7 @@ op = comparison_op dti = pd.date_range("2016-01-1", freq="MS", periods=9, tz=None) - arr = DatetimeArray(dti) + arr = dti._data assert arr.freq == dti.freq assert arr.tz == dti.tz @@ -313,6 +309,22 @@ class TestDatetimeArray: + def test_astype_ns_to_ms_near_bounds(self): + # GH#55979 + ts = pd.Timestamp("1677-09-21 00:12:43.145225") + target = ts.as_unit("ms") + + dta = DatetimeArray._from_sequence([ts], dtype="M8[ns]") + assert (dta.view("i8") == ts.as_unit("ns").value).all() + + result = dta.astype("M8[ms]") + assert result[0] == target + + expected = DatetimeArray._from_sequence([ts], dtype="M8[ms]") + assert (expected.view("i8") == target._value).all() + + tm.assert_datetime_array_equal(result, expected) + def test_astype_non_nano_tznaive(self): dti = pd.date_range("2016-01-01", periods=3) @@ -378,7 +390,9 @@ @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype_int(self, dtype): - arr = DatetimeArray._from_sequence([pd.Timestamp("2000"), pd.Timestamp("2001")]) + arr = DatetimeArray._from_sequence( + [pd.Timestamp("2000"), pd.Timestamp("2001")], dtype="M8[ns]" + ) if np.dtype(dtype) != np.int64: with pytest.raises(TypeError, match=r"Do 
obj.astype\('int64'\)"): @@ -412,7 +426,7 @@ data = np.array([1, 2, 3], dtype="M8[ns]") dtype = data.dtype if tz is None else DatetimeTZDtype(tz=tz) - arr = DatetimeArray(data, dtype=dtype) + arr = DatetimeArray._from_sequence(data, dtype=dtype) expected = arr.copy() ts = pd.Timestamp("2020-09-08 16:50").tz_localize(tz) @@ -432,7 +446,9 @@ # pre-2.0 we required exact tz match, in 2.0 we require only # tzawareness-match data = np.array([1, 2, 3], dtype="M8[ns]") - arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central")) + arr = DatetimeArray._from_sequence( + data, copy=False, dtype=DatetimeTZDtype(tz="US/Central") + ) with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): arr[0] = pd.Timestamp("2000") @@ -441,7 +457,7 @@ assert arr[0] == ts.tz_convert("US/Central") def test_setitem_clears_freq(self): - a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central")) + a = pd.date_range("2000", periods=2, freq="D", tz="US/Central")._data a[0] = pd.Timestamp("2000", tz="US/Central") assert a.freq is None @@ -463,17 +479,17 @@ def test_repeat_preserves_tz(self): dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central") - arr = DatetimeArray(dti) + arr = dti._data repeated = arr.repeat([1, 1]) # preserves tz and values, but not freq - expected = DatetimeArray(arr.asi8, freq=None, dtype=arr.dtype) + expected = DatetimeArray._from_sequence(arr.asi8, dtype=arr.dtype) tm.assert_equal(repeated, expected) def test_value_counts_preserves_tz(self): dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central") - arr = DatetimeArray(dti).repeat([4, 3]) + arr = dti._data.repeat([4, 3]) result = arr.value_counts() @@ -488,7 +504,7 @@ @pytest.mark.parametrize("method", ["pad", "backfill"]) def test_fillna_preserves_tz(self, method): dti = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central") - arr = DatetimeArray(dti, copy=True) + arr = DatetimeArray._from_sequence(dti, copy=True) arr[2] = pd.NaT fill_val = dti[1] if method == "pad" else dti[3] @@ -547,7 +563,7 @@ def test_array_interface_tz(self): tz = "US/Central" - data = DatetimeArray(pd.date_range("2017", periods=2, tz=tz)) + data = pd.date_range("2017", periods=2, tz=tz)._data result = np.asarray(data) expected = np.array( @@ -570,7 +586,7 @@ tm.assert_numpy_array_equal(result, expected) def test_array_interface(self): - data = DatetimeArray(pd.date_range("2017", periods=2)) + data = pd.date_range("2017", periods=2)._data expected = np.array( ["2017-01-01T00:00:00", "2017-01-02T00:00:00"], dtype="datetime64[ns]" ) @@ -588,7 +604,7 @@ @pytest.mark.parametrize("index", [True, False]) def test_searchsorted_different_tz(self, index): data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9 - arr = DatetimeArray(data, freq="D").tz_localize("Asia/Tokyo") + arr = pd.DatetimeIndex(data, freq="D")._data.tz_localize("Asia/Tokyo") if index: arr = pd.Index(arr) @@ -603,7 +619,7 @@ @pytest.mark.parametrize("index", [True, False]) def test_searchsorted_tzawareness_compat(self, index): data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9 - arr = DatetimeArray(data, freq="D") + arr = pd.DatetimeIndex(data, freq="D")._data if index: arr = pd.Index(arr) @@ -637,7 +653,7 @@ @pytest.mark.parametrize("index", [True, False]) def test_searchsorted_invalid_types(self, other, index): data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9 - arr = DatetimeArray(data, freq="D") + arr = pd.DatetimeIndex(data, freq="D")._data if index: arr = pd.Index(arr) @@ -654,7 +670,7 @@ dti = 
pd.date_range("2016-01-01", periods=3) dta = dti._data - expected = DatetimeArray(np.roll(dta._ndarray, 1)) + expected = DatetimeArray._from_sequence(np.roll(dta._ndarray, 1)) fv = dta[-1] for fill_value in [fv, fv.to_pydatetime(), fv.to_datetime64()]: @@ -727,7 +743,7 @@ ) utc_vals *= 1_000_000_000 - dta = DatetimeArray(utc_vals).tz_localize("UTC").tz_convert(tz) + dta = DatetimeArray._from_sequence(utc_vals).tz_localize("UTC").tz_convert(tz) left = dta[2] right = list(dta)[2] @@ -746,9 +762,73 @@ assert str(left) == str(right2) assert left.utcoffset() == right2.utcoffset() + @pytest.mark.parametrize( + "freq, freq_depr", + [ + ("2ME", "2M"), + ("2SME", "2SM"), + ("2SME", "2sm"), + ("2QE", "2Q"), + ("2QE-SEP", "2Q-SEP"), + ("1YE", "1Y"), + ("2YE-MAR", "2Y-MAR"), + ("1YE", "1A"), + ("2YE-MAR", "2A-MAR"), + ("2ME", "2m"), + ("2QE-SEP", "2q-sep"), + ("2YE-MAR", "2a-mar"), + ("2YE", "2y"), + ], + ) + def test_date_range_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): + # GH#9586, GH#54275 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + f"in a future version, please use '{freq[1:]}' instead." + + expected = pd.date_range("1/1/2000", periods=4, freq=freq) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("freq_depr", ["2H", "2CBH", "2MIN", "2S", "2mS", "2Us"]) + def test_date_range_uppercase_frequency_deprecated(self, freq_depr): + # GH#9586, GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_depr.lower()[1:]}' instead." + + expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.lower()) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq_depr", + [ + "2ye-mar", + "2ys", + "2qe", + "2qs-feb", + "2bqs", + "2sms", + "2bms", + "2cbme", + "2me", + "2w", + ], + ) + def test_date_range_lowercase_frequency_deprecated(self, freq_depr): + # GH#9586, GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.upper()[1:]}' instead." 
+ + expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.upper()) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = pd.date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + def test_factorize_sort_without_freq(): - dta = DatetimeArray._from_sequence([0, 2, 1]) + dta = DatetimeArray._from_sequence([0, 2, 1], dtype="M8[ns]") msg = r"call pd.factorize\(obj, sort=True\) instead" with pytest.raises(NotImplementedError, match=msg): diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/test_period.py pandas-2.2.2+dfsg/pandas/tests/arrays/test_period.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/test_period.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/test_period.py 2024-04-10 17:42:52.000000000 +0000 @@ -82,9 +82,9 @@ def test_setitem_raises_incompatible_freq(): arr = PeriodArray(np.arange(3), dtype="period[D]") with pytest.raises(IncompatibleFrequency, match="freq"): - arr[0] = pd.Period("2000", freq="A") + arr[0] = pd.Period("2000", freq="Y") - other = PeriodArray._from_sequence(["2000", "2001"], dtype="period[A]") + other = PeriodArray._from_sequence(["2000", "2001"], dtype="period[Y]") with pytest.raises(IncompatibleFrequency, match="freq"): arr[[0, 1]] = other @@ -133,8 +133,8 @@ @pytest.mark.parametrize( "other", [ - pd.Period("2000", freq="H"), - PeriodArray._from_sequence(["2000", "2001", "2000"], dtype="period[H]"), + pd.Period("2000", freq="h"), + PeriodArray._from_sequence(["2000", "2001", "2000"], dtype="period[h]"), ], ) def test_where_different_freq_raises(other): diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/test_timedeltas.py pandas-2.2.2+dfsg/pandas/tests/arrays/test_timedeltas.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/test_timedeltas.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/test_timedeltas.py 2024-04-10 17:42:52.000000000 +0000 @@ -196,7 +196,9 @@ class TestTimedeltaArray: @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype_int(self, dtype): - arr = TimedeltaArray._from_sequence([Timedelta("1H"), Timedelta("2H")]) + arr = TimedeltaArray._from_sequence( + [Timedelta("1h"), Timedelta("2h")], dtype="m8[ns]" + ) if np.dtype(dtype) != np.int64: with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): @@ -208,8 +210,8 @@ tm.assert_numpy_array_equal(result, expected) def test_setitem_clears_freq(self): - a = TimedeltaArray(pd.timedelta_range("1H", periods=2, freq="H")) - a[0] = Timedelta("1H") + a = pd.timedelta_range("1h", periods=2, freq="h")._data + a[0] = Timedelta("1h") assert a.freq is None @pytest.mark.parametrize( @@ -222,8 +224,8 @@ ) def test_setitem_objects(self, obj): # make sure we accept timedelta64 and timedelta in addition to Timedelta - tdi = pd.timedelta_range("2 Days", periods=4, freq="H") - arr = TimedeltaArray(tdi, freq=tdi.freq) + tdi = pd.timedelta_range("2 Days", periods=4, freq="h") + arr = tdi._data arr[0] = obj assert arr[0] == Timedelta(seconds=1) @@ -245,7 +247,7 @@ @pytest.mark.parametrize("index", [True, False]) def test_searchsorted_invalid_types(self, other, index): data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9 - arr = TimedeltaArray(data, freq="D") + arr = pd.TimedeltaIndex(data, freq="D")._data if index: arr = pd.Index(arr) @@ -262,10 +264,10 @@ class TestUnaryOps: def test_abs(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray(vals) + arr = TimedeltaArray._from_sequence(vals) evals = 
np.array([3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - expected = TimedeltaArray(evals) + expected = TimedeltaArray._from_sequence(evals) result = abs(arr) tm.assert_timedelta_array_equal(result, expected) @@ -275,7 +277,7 @@ def test_pos(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray(vals) + arr = TimedeltaArray._from_sequence(vals) result = +arr tm.assert_timedelta_array_equal(result, arr) @@ -287,10 +289,10 @@ def test_neg(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray(vals) + arr = TimedeltaArray._from_sequence(vals) evals = np.array([3600 * 10**9, "NaT", -7200 * 10**9], dtype="m8[ns]") - expected = TimedeltaArray(evals) + expected = TimedeltaArray._from_sequence(evals) result = -arr tm.assert_timedelta_array_equal(result, expected) @@ -299,10 +301,10 @@ tm.assert_timedelta_array_equal(result2, expected) def test_neg_freq(self): - tdi = pd.timedelta_range("2 Days", periods=4, freq="H") - arr = TimedeltaArray(tdi, freq=tdi.freq) + tdi = pd.timedelta_range("2 Days", periods=4, freq="h") + arr = tdi._data - expected = TimedeltaArray(-tdi._data, freq=-tdi.freq) + expected = -tdi._data result = -arr tm.assert_timedelta_array_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/timedeltas/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/arrays/timedeltas/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/timedeltas/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/timedeltas/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas._testing as tm from pandas.core.arrays import TimedeltaArray @@ -9,13 +10,16 @@ # GH#25282 arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]") - with pytest.raises(ValueError, match="Only 1-dimensional"): - # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 - TimedeltaArray(arr.reshape(2, 2, 1)) - - with pytest.raises(ValueError, match="Only 1-dimensional"): - # 0-dim - TimedeltaArray(arr[[0]].squeeze()) + depr_msg = "TimedeltaArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 + TimedeltaArray(arr.reshape(2, 2, 1)) + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="Only 1-dimensional"): + # 0-dim + TimedeltaArray(arr[[0]].squeeze()) def test_freq_validation(self): # ensure that the public constructor cannot create an invalid instance @@ -25,39 +29,75 @@ "Inferred frequency None from passed values does not " "conform to passed frequency D" ) - with pytest.raises(ValueError, match=msg): - TimedeltaArray(arr.view("timedelta64[ns]"), freq="D") + depr_msg = "TimedeltaArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + TimedeltaArray(arr.view("timedelta64[ns]"), freq="D") def test_non_array_raises(self): - with pytest.raises(ValueError, match="list"): - TimedeltaArray([1, 2, 3]) + depr_msg = "TimedeltaArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match="list"): + TimedeltaArray([1, 2, 3]) def test_other_type_raises(self): - with pytest.raises(ValueError, match="dtype bool cannot be converted"): - 
TimedeltaArray(np.array([1, 2, 3], dtype="bool")) + msg = r"dtype bool cannot be converted to timedelta64\[ns\]" + with pytest.raises(TypeError, match=msg): + TimedeltaArray._from_sequence(np.array([1, 2, 3], dtype="bool")) def test_incorrect_dtype_raises(self): - # TODO: why TypeError for 'category' but ValueError for i8? - with pytest.raises( - ValueError, match=r"category cannot be converted to timedelta64\[ns\]" - ): - TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="category") - - with pytest.raises( - ValueError, match=r"dtype int64 cannot be converted to timedelta64\[ns\]" - ): - TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64")) + msg = "dtype 'category' is invalid, should be np.timedelta64 dtype" + with pytest.raises(ValueError, match=msg): + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype="category" + ) + + msg = "dtype 'int64' is invalid, should be np.timedelta64 dtype" + with pytest.raises(ValueError, match=msg): + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64") + ) + + msg = r"dtype 'datetime64\[ns\]' is invalid, should be np.timedelta64 dtype" + with pytest.raises(ValueError, match=msg): + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("M8[ns]") + ) + + msg = ( + r"dtype 'datetime64\[us, UTC\]' is invalid, should be np.timedelta64 dtype" + ) + with pytest.raises(ValueError, match=msg): + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype="M8[us, UTC]" + ) + + msg = "Supported timedelta64 resolutions are 's', 'ms', 'us', 'ns'" + with pytest.raises(ValueError, match=msg): + TimedeltaArray._from_sequence( + np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("m8[Y]") + ) + + def test_mismatched_values_dtype_units(self): + arr = np.array([1, 2, 3], dtype="m8[s]") + dtype = np.dtype("m8[ns]") + msg = r"Values resolution does not match dtype" + depr_msg = "TimedeltaArray.__init__ is deprecated" + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + with pytest.raises(ValueError, match=msg): + TimedeltaArray(arr, dtype=dtype) def test_copy(self): data = np.array([1, 2, 3], dtype="m8[ns]") - arr = TimedeltaArray(data, copy=False) + arr = TimedeltaArray._from_sequence(data, copy=False) assert arr._ndarray is data - arr = TimedeltaArray(data, copy=True) + arr = TimedeltaArray._from_sequence(data, copy=True) assert arr._ndarray is not data assert arr._ndarray.base is not data def test_from_sequence_dtype(self): - msg = "dtype .*object.* cannot be converted to timedelta64" + msg = "dtype 'object' is invalid, should be np.timedelta64 dtype" with pytest.raises(ValueError, match=msg): TimedeltaArray._from_sequence([], dtype=object) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/timedeltas/test_cumulative.py pandas-2.2.2+dfsg/pandas/tests/arrays/timedeltas/test_cumulative.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/timedeltas/test_cumulative.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/timedeltas/test_cumulative.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,13 +7,14 @@ class TestAccumulator: def test_accumulators_disallowed(self): # GH#50297 - arr = TimedeltaArray._from_sequence_not_strict(["1D", "2D"]) + arr = TimedeltaArray._from_sequence(["1D", "2D"], dtype="m8[ns]") with pytest.raises(TypeError, match="cumprod not supported"): arr._accumulate("cumprod") - def test_cumsum(self): + def test_cumsum(self, unit): # GH#50297 - arr = TimedeltaArray._from_sequence_not_strict(["1D", "2D"]) + dtype = 
f"m8[{unit}]" + arr = TimedeltaArray._from_sequence(["1D", "2D"], dtype=dtype) result = arr._accumulate("cumsum") - expected = TimedeltaArray._from_sequence_not_strict(["1D", "3D"]) + expected = TimedeltaArray._from_sequence(["1D", "3D"], dtype=dtype) tm.assert_timedelta_array_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/arrays/timedeltas/test_reductions.py pandas-2.2.2+dfsg/pandas/tests/arrays/timedeltas/test_reductions.py --- pandas-2.1.4+dfsg/pandas/tests/arrays/timedeltas/test_reductions.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/arrays/timedeltas/test_reductions.py 2024-04-10 17:42:52.000000000 +0000 @@ -34,15 +34,18 @@ assert isinstance(result, Timedelta) assert result == Timedelta(0) - def test_min_max(self): - arr = TimedeltaArray._from_sequence(["3H", "3H", "NaT", "2H", "5H", "4H"]) + def test_min_max(self, unit): + dtype = f"m8[{unit}]" + arr = TimedeltaArray._from_sequence( + ["3h", "3h", "NaT", "2h", "5h", "4h"], dtype=dtype + ) result = arr.min() - expected = Timedelta("2H") + expected = Timedelta("2h") assert result == expected result = arr.max() - expected = Timedelta("5H") + expected = Timedelta("5h") assert result == expected result = arr.min(skipna=False) @@ -52,7 +55,7 @@ assert result is pd.NaT def test_sum(self): - tdi = pd.TimedeltaIndex(["3H", "3H", "NaT", "2H", "5H", "4H"]) + tdi = pd.TimedeltaIndex(["3h", "3h", "NaT", "2h", "5h", "4h"]) arr = tdi.array result = arr.sum(skipna=True) @@ -86,7 +89,7 @@ def test_npsum(self): # GH#25282, GH#25335 np.sum should return a Timedelta, not timedelta64 - tdi = pd.TimedeltaIndex(["3H", "3H", "2H", "5H", "4H"]) + tdi = pd.TimedeltaIndex(["3h", "3h", "2h", "5h", "4h"]) arr = tdi.array result = np.sum(tdi) @@ -102,7 +105,7 @@ arr = np.arange(8).astype(np.int64).view("m8[s]").astype("m8[ns]").reshape(4, 2) arr[-1, -1] = "Nat" - tda = TimedeltaArray(arr) + tda = TimedeltaArray._from_sequence(arr) result = tda.sum(skipna=False) assert result is pd.NaT @@ -133,7 +136,7 @@ ], ) def test_std(self, add): - tdi = pd.TimedeltaIndex(["0H", "4H", "NaT", "4H", "0H", "2H"]) + add + tdi = pd.TimedeltaIndex(["0h", "4h", "NaT", "4h", "0h", "2h"]) + add arr = tdi.array result = arr.std(skipna=True) @@ -162,7 +165,7 @@ assert np.isnat(result) def test_median(self): - tdi = pd.TimedeltaIndex(["0H", "3H", "NaT", "5H06m", "0H", "2H"]) + tdi = pd.TimedeltaIndex(["0h", "3h", "NaT", "5h06m", "0h", "2h"]) arr = tdi.array result = arr.median(skipna=True) @@ -181,7 +184,7 @@ assert result is pd.NaT def test_mean(self): - tdi = pd.TimedeltaIndex(["0H", "3H", "NaT", "5H06m", "0H", "2H"]) + tdi = pd.TimedeltaIndex(["0h", "3h", "NaT", "5h06m", "0h", "2h"]) arr = tdi._data # manually verified result diff -Nru pandas-2.1.4+dfsg/pandas/tests/base/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/base/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/base/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/base/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -138,7 +138,9 @@ "object-string", ], ) - def test_constructor_datetime_outofbound(self, a, constructor): + def test_constructor_datetime_outofbound( + self, a, constructor, request, using_infer_string + ): # GH-26853 (+ bug GH-26206 out of bound non-ns unit) # No dtype specified (dtype inference) @@ -150,7 +152,10 @@ assert result.dtype == "M8[s]" else: result = constructor(a) - assert result.dtype == "object" + if using_infer_string and "object-string" in request.node.callspec.id: + assert 
result.dtype == "string" + else: + assert result.dtype == "object" tm.assert_numpy_array_equal(result.to_numpy(), a) # Explicit dtype specified diff -Nru pandas-2.1.4+dfsg/pandas/tests/base/test_conversion.py pandas-2.2.2+dfsg/pandas/tests/base/test_conversion.py --- pandas-2.1.4+dfsg/pandas/tests/base/test_conversion.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/base/test_conversion.py 2024-04-10 17:42:52.000000000 +0000 @@ -20,6 +20,7 @@ SparseArray, TimedeltaArray, ) +from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics class TestToIterable: @@ -141,42 +142,48 @@ result = method(i)[0] assert isinstance(result, Timestamp) - def test_iter_box(self): + def test_iter_box_dt64(self, unit): vals = [Timestamp("2011-01-01"), Timestamp("2011-01-02")] - s = Series(vals) - assert s.dtype == "datetime64[ns]" - for res, exp in zip(s, vals): + ser = Series(vals).dt.as_unit(unit) + assert ser.dtype == f"datetime64[{unit}]" + for res, exp in zip(ser, vals): assert isinstance(res, Timestamp) assert res.tz is None assert res == exp + assert res.unit == unit + def test_iter_box_dt64tz(self, unit): vals = [ Timestamp("2011-01-01", tz="US/Eastern"), Timestamp("2011-01-02", tz="US/Eastern"), ] - s = Series(vals) + ser = Series(vals).dt.as_unit(unit) - assert s.dtype == "datetime64[ns, US/Eastern]" - for res, exp in zip(s, vals): + assert ser.dtype == f"datetime64[{unit}, US/Eastern]" + for res, exp in zip(ser, vals): assert isinstance(res, Timestamp) assert res.tz == exp.tz assert res == exp + assert res.unit == unit + def test_iter_box_timedelta64(self, unit): # timedelta vals = [Timedelta("1 days"), Timedelta("2 days")] - s = Series(vals) - assert s.dtype == "timedelta64[ns]" - for res, exp in zip(s, vals): + ser = Series(vals).dt.as_unit(unit) + assert ser.dtype == f"timedelta64[{unit}]" + for res, exp in zip(ser, vals): assert isinstance(res, Timedelta) assert res == exp + assert res.unit == unit + def test_iter_box_period(self): # period vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] s = Series(vals) assert s.dtype == "Period[M]" for res, exp in zip(s, vals): assert isinstance(res, pd.Period) - assert res.freq == "M" + assert res.freq == "ME" assert res == exp @@ -192,9 +199,9 @@ "datetime64[ns, US/Central]", ), ( - pd.PeriodIndex([2018, 2019], freq="A"), + pd.PeriodIndex([2018, 2019], freq="Y"), PeriodArray, - pd.core.dtypes.dtypes.PeriodDtype("A-DEC"), + pd.core.dtypes.dtypes.PeriodDtype("Y-DEC"), ), (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"), ( @@ -209,7 +216,9 @@ ), ], ) -def test_values_consistent(arr, expected_type, dtype): +def test_values_consistent(arr, expected_type, dtype, using_infer_string): + if using_infer_string and dtype == "object": + expected_type = ArrowStringArrayNumpySemantics l_values = Series(arr)._values r_values = pd.Index(arr)._values assert type(l_values) is expected_type @@ -245,10 +254,13 @@ (pd.array([0, np.nan], dtype="Int64"), "_data"), (IntervalArray.from_breaks([0, 1]), "_left"), (SparseArray([0, 1]), "_sparse_values"), - (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_ndarray"), + ( + DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")), + "_ndarray", + ), # tz-aware Datetime ( - DatetimeArray( + DatetimeArray._from_sequence( np.array( ["2000-01-01T12:00:00", "2000-01-02T12:00:00"], dtype="M8[ns]" ), @@ -286,7 +298,7 @@ pd.core.arrays.period_array(["2000", "2001"], freq="D"), np.array([pd.Period("2000", freq="D"), pd.Period("2001", 
freq="D")]), ), - (pd.array([0, np.nan], dtype="Int64"), np.array([0, pd.NA], dtype=object)), + (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])), ( IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), @@ -294,17 +306,16 @@ (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), # tz-naive datetime ( - DatetimeArray(np.array(["2000", "2001"], dtype="M8[ns]")), + DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")), np.array(["2000", "2001"], dtype="M8[ns]"), ), # tz-aware stays tz-aware ( - DatetimeArray( - np.array( - ["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]" - ), - dtype=DatetimeTZDtype(tz="US/Central"), - ), + DatetimeArray._from_sequence( + np.array(["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]") + ) + .tz_localize("UTC") + .tz_convert("US/Central"), np.array( [ Timestamp("2000-01-01", tz="US/Central"), @@ -314,7 +325,9 @@ ), # Timedelta ( - TimedeltaArray(np.array([0, 3600000000000], dtype="i8"), freq="H"), + TimedeltaArray._from_sequence( + np.array([0, 3600000000000], dtype="i8").view("m8[ns]") + ), np.array([0, 3600000000000], dtype="m8[ns]"), ), # GH#26406 tz is preserved in Categorical[dt64tz] @@ -335,10 +348,6 @@ with tm.assert_produces_warning(None): thing = box(arr) - if arr.dtype.name == "int64" and box is pd.array: - mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object") - request.node.add_marker(mark) - result = thing.to_numpy() tm.assert_numpy_array_equal(result, expected) @@ -350,17 +359,23 @@ @pytest.mark.parametrize( "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)] ) -def test_to_numpy_copy(arr, as_series): +def test_to_numpy_copy(arr, as_series, using_infer_string): obj = pd.Index(arr, copy=False) if as_series: obj = Series(obj.values, copy=False) # no copy by default result = obj.to_numpy() - assert np.shares_memory(arr, result) is True + if using_infer_string and arr.dtype == object: + assert np.shares_memory(arr, result) is False + else: + assert np.shares_memory(arr, result) is True result = obj.to_numpy(copy=False) - assert np.shares_memory(arr, result) is True + if using_infer_string and arr.dtype == object: + assert np.shares_memory(arr, result) is False + else: + assert np.shares_memory(arr, result) is True # copy=True result = obj.to_numpy(copy=True) @@ -368,7 +383,7 @@ @pytest.mark.parametrize("as_series", [True, False]) -def test_to_numpy_dtype(as_series): +def test_to_numpy_dtype(as_series, unit): tz = "US/Eastern" obj = pd.DatetimeIndex(["2000", "2001"], tz=tz) if as_series: diff -Nru pandas-2.1.4+dfsg/pandas/tests/base/test_misc.py pandas-2.2.2+dfsg/pandas/tests/base/test_misc.py --- pandas-2.1.4+dfsg/pandas/tests/base/test_misc.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/base/test_misc.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import PYPY from pandas.core.dtypes.common import ( @@ -80,7 +82,10 @@ assert Series([1]).item() == 1 -@pytest.mark.skipif(PYPY, reason="not relevant for PyPy") +@pytest.mark.skipif( + PYPY or using_pyarrow_string_dtype(), + reason="not relevant for PyPy doesn't work properly for arrow strings", +) def test_memory_usage(index_or_series_memory_obj): obj = index_or_series_memory_obj # Clear index caches so that len(obj) == 0 report 0 memory usage @@ -127,7 +132,7 @@ @pytest.mark.parametrize("dtype",
tm.NARROW_NP_DTYPES) def test_memory_usage_components_narrow_series(dtype): - series = tm.make_rand_series(name="a", dtype=dtype) + series = Series(range(5), dtype=dtype, index=[f"i-{i}" for i in range(5)], name="a") total_usage = series.memory_usage(index=True) non_index_usage = series.memory_usage(index=False) index_usage = series.index.memory_usage() @@ -141,7 +146,7 @@ if isinstance(obj, pd.MultiIndex): # See gh-14833 - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="np.searchsorted doesn't work on pd.MultiIndex: GH 14833" ) @@ -150,7 +155,7 @@ # TODO: Should Series cases also raise? Looks like they use numpy # comparison semantics https://github.com/numpy/numpy/issues/15981 mark = pytest.mark.xfail(reason="complex objects are not comparable") - request.node.add_marker(mark) + request.applymarker(mark) max_obj = max(obj, default=0) index = np.searchsorted(obj, max_obj) @@ -175,7 +180,9 @@ assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - if is_dtype_equal(index.dtype, "string[pyarrow]"): + if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal( + index.dtype, "string[pyarrow_numpy]" + ): msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] diff -Nru pandas-2.1.4+dfsg/pandas/tests/base/test_unique.py pandas-2.2.2+dfsg/pandas/tests/base/test_unique.py --- pandas-2.1.4+dfsg/pandas/tests/base/test_unique.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/base/test_unique.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd import pandas._testing as tm from pandas.tests.base.common import allow_na_ops @@ -98,6 +100,7 @@ @pytest.mark.single_cpu +@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails") def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji diff -Nru pandas-2.1.4+dfsg/pandas/tests/base/test_value_counts.py pandas-2.2.2+dfsg/pandas/tests/base/test_value_counts.py --- pandas-2.1.4+dfsg/pandas/tests/base/test_value_counts.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/base/test_value_counts.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,6 +14,7 @@ Series, Timedelta, TimedeltaIndex, + array, ) import pandas._testing as tm from pandas.tests.base.common import allow_na_ops @@ -113,7 +114,7 @@ tm.assert_series_equal(result, expected) -def test_value_counts_inferred(index_or_series): +def test_value_counts_inferred(index_or_series, using_infer_string): klass = index_or_series s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) @@ -125,7 +126,9 @@ tm.assert_index_equal(s.unique(), exp) else: exp = np.unique(np.array(s_values, dtype=np.object_)) - tm.assert_numpy_array_equal(s.unique(), exp) + if using_infer_string: + exp = array(exp) + tm.assert_equal(s.unique(), exp) assert s.nunique() == 4 # don't sort, have to sort after the fact as not sorting is @@ -147,7 +150,7 @@ tm.assert_series_equal(hist, expected) -def test_value_counts_bins(index_or_series): +def test_value_counts_bins(index_or_series, using_infer_string): klass = index_or_series s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) @@ -201,7 +204,9 @@ tm.assert_index_equal(s.unique(), exp) else: exp = np.array(["a", "b", np.nan, "d"], dtype=object) - tm.assert_numpy_array_equal(s.unique(), exp) + if using_infer_string: 
+ exp = array(exp) + tm.assert_equal(s.unique(), exp) assert s.nunique() == 3 s = klass({}) if klass is dict else klass({}, dtype=object) @@ -216,7 +221,7 @@ assert s.nunique() == 0 -def test_value_counts_datetime64(index_or_series): +def test_value_counts_datetime64(index_or_series, unit): klass = index_or_series # GH 3002, datetime64[ns] @@ -233,7 +238,7 @@ "2008-09-09", "2008-09-09", ] - ), + ).as_unit(unit), "food": ["PIE", "GUM", "EGG", "EGG", "PIE", "GUM"], } ) @@ -242,44 +247,52 @@ s.name = None idx = pd.to_datetime( ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] - ) + ).as_unit(unit) expected_s = Series([3, 2, 1], index=idx, name="count") tm.assert_series_equal(s.value_counts(), expected_s) - expected = pd.array( + expected = array( np.array( ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"], - dtype="datetime64[ns]", + dtype=f"datetime64[{unit}]", ) ) + result = s.unique() if isinstance(s, Index): - tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) + tm.assert_index_equal(result, DatetimeIndex(expected)) else: - tm.assert_extension_array_equal(s.unique(), expected) + tm.assert_extension_array_equal(result, expected) assert s.nunique() == 3 # with NaT s = df["dt"].copy() s = klass(list(s.values) + [pd.NaT] * 4) + if klass is Series: + s = s.dt.as_unit(unit) + else: + s = s.as_unit(unit) result = s.value_counts() - assert result.index.dtype == "datetime64[ns]" + assert result.index.dtype == f"datetime64[{unit}]" tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) expected_s = pd.concat( - [Series([4], index=DatetimeIndex([pd.NaT]), name="count"), expected_s] + [ + Series([4], index=DatetimeIndex([pd.NaT]).as_unit(unit), name="count"), + expected_s, + ] ) tm.assert_series_equal(result, expected_s) - assert s.dtype == "datetime64[ns]" + assert s.dtype == f"datetime64[{unit}]" unique = s.unique() - assert unique.dtype == "datetime64[ns]" + assert unique.dtype == f"datetime64[{unit}]" # numpy_array_equal cannot compare pd.NaT if isinstance(s, Index): - exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]) + exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]).as_unit(unit) tm.assert_index_equal(unique, exp_idx) else: tm.assert_extension_array_equal(unique[:3], expected) @@ -288,21 +301,29 @@ assert s.nunique() == 3 assert s.nunique(dropna=False) == 4 + +def test_value_counts_timedelta64(index_or_series, unit): # timedelta64[ns] - td = df.dt - df.dt + timedelta(1) - td = klass(td, name="dt") + klass = index_or_series + + day = Timedelta(timedelta(1)).as_unit(unit) + tdi = TimedeltaIndex([day], name="dt").as_unit(unit) + + tdvals = np.zeros(6, dtype=f"m8[{unit}]") + day + td = klass(tdvals, name="dt") result = td.value_counts() - expected_s = Series([6], index=Index([Timedelta("1day")], name="dt"), name="count") + expected_s = Series([6], index=tdi, name="count") tm.assert_series_equal(result, expected_s) - expected = TimedeltaIndex(["1 days"], name="dt") + expected = tdi + result = td.unique() if isinstance(td, Index): - tm.assert_index_equal(td.unique(), expected) + tm.assert_index_equal(result, expected) else: - tm.assert_extension_array_equal(td.unique(), expected._values) + tm.assert_extension_array_equal(result, expected._values) - td2 = timedelta(1) + (df.dt - df.dt) + td2 = day + np.zeros(6, dtype=f"m8[{unit}]") td2 = klass(td2, name="dt") result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s) @@ -320,3 +341,16 @@ else: expected = Series([1, 1, 1], index=[True, pd.NA, np.nan], 
name="count") tm.assert_series_equal(res, expected) + + +def test_value_counts_object_inference_deprecated(): + # GH#56161 + dti = pd.date_range("2016-01-01", periods=3, tz="UTC") + + idx = dti.astype(object) + msg = "The behavior of value_counts with object-dtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = idx.value_counts() + + exp = dti.value_counts() + tm.assert_series_equal(res, exp) diff -Nru pandas-2.1.4+dfsg/pandas/tests/computation/test_eval.py pandas-2.2.2+dfsg/pandas/tests/computation/test_eval.py --- pandas-2.1.4+dfsg/pandas/tests/computation/test_eval.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/computation/test_eval.py 2024-04-10 17:42:52.000000000 +0000 @@ -25,8 +25,11 @@ import pandas as pd from pandas import ( DataFrame, + Index, Series, date_range, + period_range, + timedelta_range, ) import pandas._testing as tm from pandas.core.computation import ( @@ -63,7 +66,7 @@ reason=f"numexpr enabled->{USE_NUMEXPR}, " f"installed->{NUMEXPR_INSTALLED}", ), - td.skip_if_no_ne, + td.skip_if_no("numexpr"), ], ) for engine in ENGINES @@ -115,6 +118,18 @@ midhs = lhs +@pytest.fixture +def idx_func_dict(): + return { + "i": lambda n: Index(np.arange(n), dtype=np.int64), + "f": lambda n: Index(np.arange(n), dtype=np.float64), + "s": lambda n: Index([f"{i}_{chr(i)}" for i in range(97, 97 + n)]), + "dt": lambda n: date_range("2020-01-01", periods=n), + "td": lambda n: timedelta_range("1 day", periods=n), + "p": lambda n: period_range("2020-01-01", periods=n, freq="D"), + } + + class TestEval: @pytest.mark.parametrize( "cmp1", @@ -194,7 +209,7 @@ reason="Looks like expected is negative, unclear whether " "expected is incorrect or result is incorrect" ) - request.node.add_marker(mark) + request.applymarker(mark) skip_these = ["in", "not in"] ex = f"~(lhs {op} rhs)" @@ -724,9 +739,6 @@ assert pd.eval(f"{event.str.match('hello').a and event.str.match('hello').a}") -f = lambda *args, **kwargs: np.random.default_rng(2).standard_normal() - - # ------------------------------------- # gh-12388: Typecasting rules consistency with python @@ -738,7 +750,7 @@ @pytest.mark.parametrize("dt", [np.float32, np.float64]) @pytest.mark.parametrize("left_right", [("df", "3"), ("3", "df")]) def test_binop_typecasting(self, engine, parser, op, dt, left_right): - df = tm.makeCustomDataframe(5, 3, data_gen_f=f, dtype=dt) + df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)), dtype=dt) left, right = left_right s = f"{left} {op} {right}" res = pd.eval(s, engine=engine, parser=parser) @@ -765,7 +777,7 @@ def test_align_nested_unary_op(self, engine, parser): s = "df * ~2" - df = tm.makeCustomDataframe(5, 3, data_gen_f=f) + df = DataFrame(np.random.default_rng(2).standard_normal((5, 3))) res = pd.eval(s, engine=engine, parser=parser) tm.assert_frame_equal(res, df * ~2) @@ -774,13 +786,17 @@ @pytest.mark.parametrize("rr_idx_type", index_types) @pytest.mark.parametrize("c_idx_type", index_types) def test_basic_frame_alignment( - self, engine, parser, lr_idx_type, rr_idx_type, c_idx_type + self, engine, parser, lr_idx_type, rr_idx_type, c_idx_type, idx_func_dict ): - df = tm.makeCustomDataframe( - 10, 10, data_gen_f=f, r_idx_type=lr_idx_type, c_idx_type=c_idx_type - ) - df2 = tm.makeCustomDataframe( - 20, 10, data_gen_f=f, r_idx_type=rr_idx_type, c_idx_type=c_idx_type + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 10)), + index=idx_func_dict[lr_idx_type](10), + columns=idx_func_dict[c_idx_type](10), + ) + df2 = DataFrame( + 
np.random.default_rng(2).standard_normal((20, 10)), + index=idx_func_dict[rr_idx_type](20), + columns=idx_func_dict[c_idx_type](10), ) # only warns if not monotonic and not sortable if should_warn(df.index, df2.index): @@ -792,9 +808,13 @@ @pytest.mark.parametrize("r_idx_type", lhs_index_types) @pytest.mark.parametrize("c_idx_type", lhs_index_types) - def test_frame_comparison(self, engine, parser, r_idx_type, c_idx_type): - df = tm.makeCustomDataframe( - 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + def test_frame_comparison( + self, engine, parser, r_idx_type, c_idx_type, idx_func_dict + ): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 10)), + index=idx_func_dict[r_idx_type](10), + columns=idx_func_dict[c_idx_type](10), ) res = pd.eval("df < 2", engine=engine, parser=parser) tm.assert_frame_equal(res, df < 2) @@ -812,10 +832,24 @@ @pytest.mark.parametrize("c1", index_types) @pytest.mark.parametrize("r2", index_types) @pytest.mark.parametrize("c2", index_types) - def test_medium_complex_frame_alignment(self, engine, parser, r1, c1, r2, c2): - df = tm.makeCustomDataframe(3, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) - df2 = tm.makeCustomDataframe(4, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) - df3 = tm.makeCustomDataframe(5, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + def test_medium_complex_frame_alignment( + self, engine, parser, r1, c1, r2, c2, idx_func_dict + ): + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 2)), + index=idx_func_dict[r1](3), + columns=idx_func_dict[c1](2), + ) + df2 = DataFrame( + np.random.default_rng(2).standard_normal((4, 2)), + index=idx_func_dict[r2](4), + columns=idx_func_dict[c2](2), + ) + df3 = DataFrame( + np.random.default_rng(2).standard_normal((5, 2)), + index=idx_func_dict[r2](5), + columns=idx_func_dict[c2](2), + ) if should_warn(df.index, df2.index, df3.index): with tm.assert_produces_warning(RuntimeWarning): res = pd.eval("df + df2 + df3", engine=engine, parser=parser) @@ -828,10 +862,12 @@ @pytest.mark.parametrize("c_idx_type", index_types) @pytest.mark.parametrize("r_idx_type", lhs_index_types) def test_basic_frame_series_alignment( - self, engine, parser, index_name, r_idx_type, c_idx_type + self, engine, parser, index_name, r_idx_type, c_idx_type, idx_func_dict ): - df = tm.makeCustomDataframe( - 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 10)), + index=idx_func_dict[r_idx_type](10), + columns=idx_func_dict[c_idx_type](10), ) index = getattr(df, index_name) s = Series(np.random.default_rng(2).standard_normal(5), index[:5]) @@ -855,7 +891,7 @@ ) @pytest.mark.filterwarnings("ignore::RuntimeWarning") def test_basic_series_frame_alignment( - self, request, engine, parser, index_name, r_idx_type, c_idx_type + self, request, engine, parser, index_name, r_idx_type, c_idx_type, idx_func_dict ): if ( engine == "numexpr" @@ -869,9 +905,11 @@ f"parser={parser}, index_name={index_name}, " f"r_idx_type={r_idx_type}, c_idx_type={c_idx_type}" ) - request.node.add_marker(pytest.mark.xfail(reason=reason, strict=False)) - df = tm.makeCustomDataframe( - 10, 7, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + request.applymarker(pytest.mark.xfail(reason=reason, strict=False)) + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 7)), + index=idx_func_dict[r_idx_type](10), + columns=idx_func_dict[c_idx_type](7), ) index = getattr(df, index_name) s = 
Series(np.random.default_rng(2).standard_normal(5), index[:5]) @@ -893,10 +931,12 @@ @pytest.mark.parametrize("index_name", ["index", "columns"]) @pytest.mark.parametrize("op", ["+", "*"]) def test_series_frame_commutativity( - self, engine, parser, index_name, op, r_idx_type, c_idx_type + self, engine, parser, index_name, op, r_idx_type, c_idx_type, idx_func_dict ): - df = tm.makeCustomDataframe( - 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 10)), + index=idx_func_dict[r_idx_type](10), + columns=idx_func_dict[c_idx_type](10), ) index = getattr(df, index_name) s = Series(np.random.default_rng(2).standard_normal(5), index[:5]) @@ -921,17 +961,23 @@ @pytest.mark.parametrize("c1", index_types) @pytest.mark.parametrize("r2", index_types) @pytest.mark.parametrize("c2", index_types) - def test_complex_series_frame_alignment(self, engine, parser, r1, c1, r2, c2): + def test_complex_series_frame_alignment( + self, engine, parser, r1, c1, r2, c2, idx_func_dict + ): n = 3 m1 = 5 m2 = 2 * m1 - - index_name = np.random.default_rng(2).choice(["index", "columns"]) - obj_name = np.random.default_rng(2).choice(["df", "df2"]) - - df = tm.makeCustomDataframe(m1, n, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) - df2 = tm.makeCustomDataframe(m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) - index = getattr(locals().get(obj_name), index_name) + df = DataFrame( + np.random.default_rng(2).standard_normal((m1, n)), + index=idx_func_dict[r1](m1), + columns=idx_func_dict[c1](n), + ) + df2 = DataFrame( + np.random.default_rng(2).standard_normal((m2, n)), + index=idx_func_dict[r2](m2), + columns=idx_func_dict[c2](n), + ) + index = df2.columns ser = Series(np.random.default_rng(2).standard_normal(n), index[:n]) if r2 == "dt" or c2 == "dt": @@ -1243,7 +1289,7 @@ expected["c"] = expected["a"] + expected["b"] tm.assert_frame_equal(df, expected) - def test_multi_line_expression(self): + def test_multi_line_expression(self, warn_copy_on_write): # GH 11149 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() @@ -1417,8 +1463,10 @@ self.eval(expression, target=target, inplace=True) def test_basic_period_index_boolean_expression(self): - df = tm.makeCustomDataframe(2, 2, data_gen_f=f, c_idx_type="p", r_idx_type="i") - + df = DataFrame( + np.random.default_rng(2).standard_normal((2, 2)), + columns=period_range("2020-01-01", freq="D", periods=2), + ) e = df < 2 r = self.eval("df < 2", local_dict={"df": df}) x = df < 2 @@ -1427,13 +1475,19 @@ tm.assert_frame_equal(x, e) def test_basic_period_index_subscript_expression(self): - df = tm.makeCustomDataframe(2, 2, data_gen_f=f, c_idx_type="p", r_idx_type="i") + df = DataFrame( + np.random.default_rng(2).standard_normal((2, 2)), + columns=period_range("2020-01-01", freq="D", periods=2), + ) r = self.eval("df[df < 2 + 3]", local_dict={"df": df}) e = df[df < 2 + 3] tm.assert_frame_equal(r, e) def test_nested_period_index_subscript_expression(self): - df = tm.makeCustomDataframe(2, 2, data_gen_f=f, c_idx_type="p", r_idx_type="i") + df = DataFrame( + np.random.default_rng(2).standard_normal((2, 2)), + columns=period_range("2020-01-01", freq="D", periods=2), + ) r = self.eval("df[df[df < 2] < 2] + df * 2", local_dict={"df": df}) e = df[df[df < 2] < 2] + df * 2 tm.assert_frame_equal(r, e) @@ -1695,14 +1749,14 @@ pd.eval(e, engine=engine, parser=parser, global_dict={}) -@td.skip_if_no_ne +@td.skip_if_no("numexpr") def test_invalid_engine(): msg = "Invalid engine 'asdf' passed" with 
pytest.raises(KeyError, match=msg): pd.eval("x + y", local_dict={"x": 1, "y": 2}, engine="asdf") -@td.skip_if_no_ne +@td.skip_if_no("numexpr") @pytest.mark.parametrize( ("use_numexpr", "expected"), ( @@ -1719,7 +1773,7 @@ assert result == expected -@td.skip_if_no_ne +@td.skip_if_no("numexpr") def test_numexpr_option_incompatible_op(): # GH 32556 with pd.option_context("compute.use_numexpr", False): @@ -1731,7 +1785,7 @@ tm.assert_frame_equal(result, expected) -@td.skip_if_no_ne +@td.skip_if_no("numexpr") def test_invalid_parser(): msg = "Invalid parser 'asdf' passed" with pytest.raises(KeyError, match=msg): @@ -1840,7 +1894,7 @@ ], ) def test_equals_various(other): - df = DataFrame({"A": ["a", "b", "c"]}) + df = DataFrame({"A": ["a", "b", "c"]}, dtype=object) result = df.eval(f"A == {other}") expected = Series([False, False, False], name="A") if USE_NUMEXPR: @@ -1892,7 +1946,7 @@ def test_eval_no_support_column_name(request, column): # GH 44603 if column in ["True", "False", "inf", "Inf"]: - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=KeyError, reason=f"GH 47859 DataFrame eval not supported with {column}", @@ -1909,14 +1963,15 @@ tm.assert_frame_equal(result, expected) -def test_set_inplace(using_copy_on_write): +def test_set_inplace(using_copy_on_write, warn_copy_on_write): # https://github.com/pandas-dev/pandas/issues/47449 # Ensure we don't only update the DataFrame inplace, but also the actual # column values, such that references to this column also get updated df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) result_view = df[:] ser = df["A"] - df.eval("A = B + C", inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.eval("A = B + C", inplace=True) expected = DataFrame({"A": [11, 13, 15], "B": [4, 5, 6], "C": [7, 8, 9]}) tm.assert_frame_equal(df, expected) if not using_copy_on_write: diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/index/test_datetimeindex.py pandas-2.2.2+dfsg/pandas/tests/copy_view/index/test_datetimeindex.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/index/test_datetimeindex.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/index/test_datetimeindex.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,10 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + @pytest.mark.parametrize( "cons", diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/index/test_index.py pandas-2.2.2+dfsg/pandas/tests/copy_view/index/test_index.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/index/test_index.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/index/test_index.py 2024-04-10 17:42:52.000000000 +0000 @@ -19,11 +19,12 @@ return idx, view -def test_set_index_update_column(using_copy_on_write): +def test_set_index_update_column(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1}) df = df.set_index("a", drop=False) expected = df.index.copy(deep=True) - df.iloc[0, 0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 if using_copy_on_write: tm.assert_index_equal(df.index, expected) else: @@ -39,49 +40,53 @@ tm.assert_index_equal(df.index, expected) -def test_set_index_series(using_copy_on_write): +def test_set_index_series(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1.5}) ser = Series([10, 11]) df = df.set_index(ser) expected = df.index.copy(deep=True) - ser.iloc[0] = 100 + with 
tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 if using_copy_on_write: tm.assert_index_equal(df.index, expected) else: tm.assert_index_equal(df.index, Index([100, 11])) -def test_assign_index_as_series(using_copy_on_write): +def test_assign_index_as_series(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1.5}) ser = Series([10, 11]) df.index = ser expected = df.index.copy(deep=True) - ser.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 if using_copy_on_write: tm.assert_index_equal(df.index, expected) else: tm.assert_index_equal(df.index, Index([100, 11])) -def test_assign_index_as_index(using_copy_on_write): +def test_assign_index_as_index(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1.5}) ser = Series([10, 11]) rhs_index = Index(ser) df.index = rhs_index rhs_index = None # overwrite to clear reference expected = df.index.copy(deep=True) - ser.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 if using_copy_on_write: tm.assert_index_equal(df.index, expected) else: tm.assert_index_equal(df.index, Index([100, 11])) -def test_index_from_series(using_copy_on_write): +def test_index_from_series(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2]) idx = Index(ser) expected = idx.copy(deep=True) - ser.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 if using_copy_on_write: tm.assert_index_equal(idx, expected) else: @@ -96,12 +101,13 @@ assert np.shares_memory(get_array(ser), arr) -def test_index_from_index(using_copy_on_write): +def test_index_from_index(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2]) idx = Index(ser) idx = Index(idx) expected = idx.copy(deep=True) - ser.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 100 if using_copy_on_write: tm.assert_index_equal(idx, expected) else: diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/index/test_periodindex.py pandas-2.2.2+dfsg/pandas/tests/copy_view/index/test_periodindex.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/index/test_periodindex.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/index/test_periodindex.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,10 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + @pytest.mark.parametrize( "cons", diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/index/test_timedeltaindex.py pandas-2.2.2+dfsg/pandas/tests/copy_view/index/test_timedeltaindex.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/index/test_timedeltaindex.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/index/test_timedeltaindex.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,10 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Setting a value on a view:FutureWarning" +) + @pytest.mark.parametrize( "cons", diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/test_array.py pandas-2.2.2+dfsg/pandas/tests/copy_view/test_array.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/test_array.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/test_array.py 2024-04-10 17:42:52.000000000 +0000 @@ -116,7 +116,8 @@ @pytest.mark.parametrize("order", ["F", "C"]) def test_ravel_read_only(using_copy_on_write, order): ser = Series([1, 2, 3]) - arr = ser.ravel(order=order) + with 
tm.assert_produces_warning(FutureWarning, match="is deprecated"): + arr = ser.ravel(order=order) if using_copy_on_write: assert arr.flags.writeable is False assert np.shares_memory(get_array(ser), arr) @@ -132,21 +133,25 @@ assert arr.flags.writeable is True arr = np.asarray(ser) - assert not np.shares_memory(arr, get_array(ser)) - assert arr.flags.writeable is True + assert np.shares_memory(arr, get_array(ser)) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True def test_dataframe_array_ea_dtypes(using_copy_on_write): df = DataFrame({"a": [1, 2, 3]}, dtype="Int64") arr = np.asarray(df, dtype="int64") - # TODO: This should be able to share memory, but we are roundtripping - # through object - assert not np.shares_memory(arr, get_array(df, "a")) - assert arr.flags.writeable is True + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True arr = np.asarray(df) + assert np.shares_memory(arr, get_array(df, "a")) if using_copy_on_write: - # TODO(CoW): This should be True assert arr.flags.writeable is False else: assert arr.flags.writeable is True diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/test_astype.py pandas-2.2.2+dfsg/pandas/tests/copy_view/test_astype.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/test_astype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/test_astype.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,8 @@ +import pickle + import numpy as np import pytest -from pandas.compat import pa_version_under7p0 from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td @@ -43,8 +44,8 @@ @pytest.mark.parametrize("dtype", ["int64", "Int64"]) @pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"]) def test_astype_avoids_copy(using_copy_on_write, dtype, new_dtype): - if new_dtype == "int64[pyarrow]" and pa_version_under7p0: - pytest.skip("pyarrow not installed") + if new_dtype == "int64[pyarrow]": + pytest.importorskip("pyarrow") df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) df_orig = df.copy() df2 = df.astype(new_dtype) @@ -68,8 +69,8 @@ @pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"]) def test_astype_different_target_dtype(using_copy_on_write, dtype): - if dtype == "int32[pyarrow]" and pa_version_under7p0: - pytest.skip("pyarrow not installed") + if dtype == "int32[pyarrow]": + pytest.importorskip("pyarrow") df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() df2 = df.astype(dtype) @@ -131,6 +132,15 @@ tm.assert_frame_equal(df2, df_orig) +def test_astype_string_copy_on_pickle_roundrip(): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy.astype(str) + tm.assert_series_equal(base, base_copy) + + def test_astype_dict_dtypes(using_copy_on_write): df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")} @@ -187,8 +197,8 @@ assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) -@pytest.mark.skipif(pa_version_under7p0, reason="pyarrow not installed") def test_astype_arrow_timestamp(using_copy_on_write): + pytest.importorskip("pyarrow") df = DataFrame( { "a": [ diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/test_chained_assignment_deprecation.py 
pandas-2.2.2+dfsg/pandas/tests/copy_view/test_chained_assignment_deprecation.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/test_chained_assignment_deprecation.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/test_chained_assignment_deprecation.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,174 @@ +import numpy as np +import pytest + +from pandas.compat import PY311 +from pandas.errors import ( + ChainedAssignmentError, + SettingWithCopyWarning, +) + +from pandas import ( + DataFrame, + option_context, +) +import pandas._testing as tm + + +def test_methods_iloc_warn(using_copy_on_write): + if not using_copy_on_write: + df = DataFrame({"a": [1, 2, 3], "b": 1}) + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].replace(1, 5, inplace=True) + + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].fillna(1, inplace=True) + + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].interpolate(inplace=True) + + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].ffill(inplace=True) + + with tm.assert_cow_warning(match="A value"): + df.iloc[:, 0].bfill(inplace=True) + + +@pytest.mark.parametrize( + "func, args", + [ + ("replace", (4, 5)), + ("fillna", (1,)), + ("interpolate", ()), + ("bfill", ()), + ("ffill", ()), + ], +) +def test_methods_iloc_getitem_item_cache( + func, args, using_copy_on_write, warn_copy_on_write +): + # ensure we don't incorrectly raise chained assignment warning because + # of the item cache / iloc not setting the item cache + df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) + + df = df_orig.copy() + ser = df.iloc[:, 0] + getattr(ser, func)(*args, inplace=True) + + # parent that holds item_cache is dead, so don't increase ref count + df = df_orig.copy() + ser = df.copy()["a"] + getattr(ser, func)(*args, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df.iloc[:, 0] # iloc creates a new object + getattr(ser, func)(*args, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df["a"] + getattr(ser, func)(*args, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + # TODO(CoW-warn) because of the usage of *args, this doesn't warn on Py3.11+ + if using_copy_on_write: + with tm.raises_chained_assignment_error(not PY311): + getattr(df["a"], func)(*args, inplace=True) + else: + with tm.assert_cow_warning(not PY311, match="A value"): + getattr(df["a"], func)(*args, inplace=True) + + df = df_orig.copy() + ser = df["a"] # populate the item_cache and keep ref + if using_copy_on_write: + with tm.raises_chained_assignment_error(not PY311): + getattr(df["a"], func)(*args, inplace=True) + else: + # ideally also warns on the default mode, but the ser' _cacher + # messes up the refcount + even in warning mode this doesn't trigger + # the warning of Py3.1+ (see above) + with tm.assert_cow_warning(warn_copy_on_write and not PY311, match="A value"): + getattr(df["a"], func)(*args, inplace=True) + + +def test_methods_iloc_getitem_item_cache_fillna( + using_copy_on_write, warn_copy_on_write +): + # ensure we don't incorrectly raise chained assignment warning because + # of the item cache / iloc not setting the item cache + df_orig = DataFrame({"a": [1, 2, 3], "b": 1}) + + df = df_orig.copy() + ser = df.iloc[:, 0] + ser.fillna(1, inplace=True) + + # parent that holds item_cache is dead, so don't increase ref count + df = df_orig.copy() + ser = df.copy()["a"] + ser.fillna(1, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + 
ser = df.iloc[:, 0] # iloc creates a new object + ser.fillna(1, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + ser = df["a"] + ser.fillna(1, inplace=True) + + df = df_orig.copy() + df["a"] # populate the item_cache + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].fillna(1, inplace=True) + else: + with tm.assert_cow_warning(match="A value"): + df["a"].fillna(1, inplace=True) + + df = df_orig.copy() + ser = df["a"] # populate the item_cache and keep ref + if using_copy_on_write: + with tm.raises_chained_assignment_error(): + df["a"].fillna(1, inplace=True) + else: + # TODO(CoW-warn) ideally also warns on the default mode, but the ser' _cacher + # messes up the refcount + with tm.assert_cow_warning(warn_copy_on_write, match="A value"): + df["a"].fillna(1, inplace=True) + + +# TODO(CoW-warn) expand the cases +@pytest.mark.parametrize( + "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])] +) +def test_series_setitem(indexer, using_copy_on_write, warn_copy_on_write): + # ensure we only get a single warning for those typical cases of chained + # assignment + df = DataFrame({"a": [1, 2, 3], "b": 1}) + + # using custom check instead of tm.assert_produces_warning because that doesn't + # fail if multiple warnings are raised + with pytest.warns() as record: + df["a"][indexer] = 0 + assert len(record) == 1 + if using_copy_on_write: + assert record[0].category == ChainedAssignmentError + else: + assert record[0].category == FutureWarning + assert "ChainedAssignmentError" in record[0].message.args[0] + + +@pytest.mark.filterwarnings("ignore::pandas.errors.SettingWithCopyWarning") +@pytest.mark.parametrize( + "indexer", ["a", ["a", "b"], slice(0, 2), np.array([True, False, True])] +) +def test_frame_setitem(indexer, using_copy_on_write): + df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1}) + + extra_warnings = () if using_copy_on_write else (SettingWithCopyWarning,) + + with option_context("chained_assignment", "warn"): + with tm.raises_chained_assignment_error(extra_warnings=extra_warnings): + df[0:3][indexer] = 10 diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/test_clip.py pandas-2.2.2+dfsg/pandas/tests/copy_view/test_clip.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/test_clip.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/test_clip.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,16 +1,23 @@ import numpy as np -from pandas import DataFrame +from pandas import ( + DataFrame, + option_context, +) import pandas._testing as tm from pandas.tests.copy_view.util import get_array -def test_clip_inplace_reference(using_copy_on_write): +def test_clip_inplace_reference(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1.5, 2, 3]}) df_copy = df.copy() arr_a = get_array(df, "a") view = df[:] - df.clip(lower=2, inplace=True) + if warn_copy_on_write: + with tm.assert_cow_warning(): + df.clip(lower=2, inplace=True) + else: + df.clip(lower=2, inplace=True) if using_copy_on_write: assert not np.shares_memory(get_array(df, "a"), arr_a) @@ -81,3 +88,14 @@ with tm.raises_chained_assignment_error(): df[["a"]].clip(1, 2, inplace=True) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].clip(1, 2, inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[["a"]].clip(1, 2, inplace=True) + + with tm.assert_produces_warning(None): + with 
option_context("mode.chained_assignment", None): + df[df["a"] > 1].clip(1, 2, inplace=True) diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/copy_view/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -21,7 +21,7 @@ @pytest.mark.parametrize("dtype", [None, "int64"]) -def test_series_from_series(dtype, using_copy_on_write): +def test_series_from_series(dtype, using_copy_on_write, warn_copy_on_write): # Case: constructing a Series from another Series object follows CoW rules: # a new object is returned and thus mutations are not propagated ser = Series([1, 2, 3], name="name") @@ -43,7 +43,8 @@ assert not np.shares_memory(get_array(ser), get_array(result)) else: # mutating shallow copy does mutate original - result.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0] = 0 assert ser.iloc[0] == 0 # and still shares memory assert np.shares_memory(get_array(ser), get_array(result)) @@ -57,11 +58,12 @@ assert result.iloc[0] == 1 else: # mutating original does mutate shallow copy - ser.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 0 assert result.iloc[0] == 0 -def test_series_from_series_with_reindex(using_copy_on_write): +def test_series_from_series_with_reindex(using_copy_on_write, warn_copy_on_write): # Case: constructing a Series from another Series with specifying an index # that potentially requires a reindex of the values ser = Series([1, 2, 3], name="name") @@ -76,7 +78,8 @@ ]: result = Series(ser, index=index) assert np.shares_memory(ser.values, result.values) - result.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0] = 0 if using_copy_on_write: assert ser.iloc[0] == 1 else: @@ -99,7 +102,9 @@ def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr): if idx is None or dtype is not None: fastpath = False - ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath) + msg = "The 'fastpath' keyword in pd.Series is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath) ser_orig = ser.copy() data = getattr(arr, "_data", arr) if using_copy_on_write: @@ -151,13 +156,16 @@ assert ser._mgr._has_no_reference(0) +@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") @pytest.mark.parametrize("fastpath", [False, True]) @pytest.mark.parametrize("dtype", [None, "int64"]) @pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)]) def test_series_from_block_manager(using_copy_on_write, idx, dtype, fastpath): ser = Series([1, 2, 3], dtype="int64") ser_orig = ser.copy() - ser2 = Series(ser._mgr, dtype=dtype, fastpath=fastpath, index=idx) + msg = "The 'fastpath' keyword in pd.Series is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + ser2 = Series(ser._mgr, dtype=dtype, fastpath=fastpath, index=idx) assert np.shares_memory(get_array(ser), get_array(ser2)) if using_copy_on_write: assert not ser2._mgr._has_no_reference(0) @@ -172,22 +180,35 @@ def test_series_from_block_manager_different_dtype(using_copy_on_write): ser = Series([1, 2, 3], dtype="int64") - ser2 = Series(ser._mgr, dtype="int32") + msg = "Passing a SingleBlockManager to Series" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + ser2 = 
Series(ser._mgr, dtype="int32") assert not np.shares_memory(get_array(ser), get_array(ser2)) if using_copy_on_write: assert ser2._mgr._has_no_reference(0) -@pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr]) +@pytest.mark.parametrize("use_mgr", [True, False]) @pytest.mark.parametrize("columns", [None, ["a"]]) -def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, func): +def test_dataframe_constructor_mgr_or_df( + using_copy_on_write, warn_copy_on_write, columns, use_mgr +): df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() - new_df = DataFrame(func(df)) + if use_mgr: + data = df._mgr + warn = DeprecationWarning + else: + data = df + warn = None + msg = "Passing a BlockManager to DataFrame" + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): + new_df = DataFrame(data) assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) - new_df.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write and not use_mgr): + new_df.iloc[0] = 100 if using_copy_on_write: assert not np.shares_memory(get_array(df, "a"), get_array(new_df, "a")) @@ -201,7 +222,7 @@ @pytest.mark.parametrize("index", [None, [0, 1, 2]]) @pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]]) def test_dataframe_from_dict_of_series( - request, using_copy_on_write, columns, index, dtype + request, using_copy_on_write, warn_copy_on_write, columns, index, dtype ): # Case: constructing a DataFrame from Series objects with copy=False # has to do a lazy following CoW rules @@ -221,7 +242,8 @@ assert np.shares_memory(get_array(result, "a"), get_array(s1)) # mutating the new dataframe doesn't mutate original - result.iloc[0, 0] = 10 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0, 0] = 10 if using_copy_on_write: assert not np.shares_memory(get_array(result, "a"), get_array(s1)) tm.assert_series_equal(s1, s1_orig) @@ -234,7 +256,8 @@ result = DataFrame( {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False ) - s1.iloc[0] = 10 + with tm.assert_cow_warning(warn_copy_on_write): + s1.iloc[0] = 10 if using_copy_on_write: assert not np.shares_memory(get_array(result, "a"), get_array(s1)) tm.assert_frame_equal(result, expected) @@ -264,7 +287,9 @@ @pytest.mark.parametrize( "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)] ) -def test_dataframe_from_series_or_index(using_copy_on_write, data, dtype, cons): +def test_dataframe_from_series_or_index( + using_copy_on_write, warn_copy_on_write, data, dtype, cons +): obj = cons(data, dtype=dtype) obj_orig = obj.copy() df = DataFrame(obj, dtype=dtype) @@ -272,7 +297,8 @@ if using_copy_on_write: assert not df._mgr._has_no_reference(0) - df.iloc[0, 0] = data[-1] + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = data[-1] if using_copy_on_write: tm.assert_equal(obj, obj_orig) @@ -288,7 +314,8 @@ def test_dataframe_from_series_infer_datetime(using_copy_on_write): ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object) - df = DataFrame(ser) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + df = DataFrame(ser) assert not np.shares_memory(get_array(ser), get_array(df, 0)) if using_copy_on_write: assert df._mgr._has_no_reference(0) @@ -327,7 +354,7 @@ assert np.shares_memory(get_array(df, 0), arr) -def test_dataframe_from_records_with_dataframe(using_copy_on_write): +def test_dataframe_from_records_with_dataframe(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3]}) df_orig = 
df.copy() with tm.assert_produces_warning(FutureWarning): @@ -335,7 +362,8 @@ if using_copy_on_write: assert not df._mgr._has_no_reference(0) assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) - df2.iloc[0, 0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + df2.iloc[0, 0] = 100 if using_copy_on_write: tm.assert_frame_equal(df, df_orig) else: diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/test_core_functionalities.py pandas-2.2.2+dfsg/pandas/tests/copy_view/test_core_functionalities.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/test_core_functionalities.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/test_core_functionalities.py 2024-04-10 17:42:52.000000000 +0000 @@ -28,27 +28,33 @@ assert np.shares_memory(arr, get_array(df, "a")) -def test_setitem_with_view_copies(using_copy_on_write): +def test_setitem_with_view_copies(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) view = df[:] expected = df.copy() df["b"] = 100 arr = get_array(df, "a") - df.iloc[0, 0] = 100 # Check that we correctly track reference + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 # Check that we correctly track reference if using_copy_on_write: assert not np.shares_memory(arr, get_array(df, "a")) tm.assert_frame_equal(view, expected) -def test_setitem_with_view_invalidated_does_not_copy(using_copy_on_write, request): +def test_setitem_with_view_invalidated_does_not_copy( + using_copy_on_write, warn_copy_on_write, request +): df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1}) view = df[:] df["b"] = 100 arr = get_array(df, "a") view = None # noqa: F841 - df.iloc[0, 0] = 100 + # TODO(CoW-warn) false positive? -> block gets split because of `df["b"] = 100` + # which introduces additional refs, even when those of `view` go out of scopes + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 if using_copy_on_write: # Setitem split the block. Since the old block shared data with view # all the new blocks are referencing view and each other. 
When view @@ -57,7 +63,7 @@ mark = pytest.mark.xfail( reason="blk.delete does not track references correctly" ) - request.node.add_marker(mark) + request.applymarker(mark) assert np.shares_memory(arr, get_array(df, "a")) diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/test_indexing.py pandas-2.2.2+dfsg/pandas/tests/copy_view/test_indexing.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/test_indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/test_indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -101,7 +101,7 @@ tm.assert_frame_equal(subset, expected) -def test_subset_row_slice(backend, using_copy_on_write): +def test_subset_row_slice(backend, using_copy_on_write, warn_copy_on_write): # Case: taking a subset of the rows of a DataFrame using a slice # + afterwards modifying the subset _, DataFrame, _ = backend @@ -121,7 +121,8 @@ # INFO this no longer raise warning since pandas 1.4 # with pd.option_context("chained_assignment", "warn"): # with tm.assert_produces_warning(SettingWithCopyWarning): - subset.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0, 0] = 0 subset._mgr._verify_integrity() @@ -139,7 +140,9 @@ @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_subset_column_slice(backend, using_copy_on_write, using_array_manager, dtype): +def test_subset_column_slice( + backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype +): # Case: taking a subset of the columns of a DataFrame using a slice # + afterwards modifying the subset dtype_backend, DataFrame, _ = backend @@ -159,7 +162,9 @@ subset.iloc[0, 0] = 0 assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b")) - + elif warn_copy_on_write: + with tm.assert_cow_warning(single_block): + subset.iloc[0, 0] = 0 else: # we only get a warning in case of a single block warn = SettingWithCopyWarning if single_block else None @@ -198,6 +203,7 @@ column_indexer, using_array_manager, using_copy_on_write, + warn_copy_on_write, ): # Case: taking a subset of the rows+columns of a DataFrame using .loc # + afterwards modifying the subset @@ -213,16 +219,9 @@ subset = df.loc[row_indexer, column_indexer] - # modifying the subset never modifies the parent - subset.iloc[0, 0] = 0 - - expected = DataFrame( - {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3) - ) - tm.assert_frame_equal(subset, expected) # a few corner cases _do_ actually modify the parent (with both row and column # slice, and in case of ArrayManager or BlockManager with single block) - if ( + mutate_parent = ( isinstance(row_indexer, slice) and isinstance(column_indexer, slice) and ( @@ -233,7 +232,17 @@ and not using_copy_on_write ) ) - ): + ) + + # modifying the subset never modifies the parent + with tm.assert_cow_warning(warn_copy_on_write and mutate_parent): + subset.iloc[0, 0] = 0 + + expected = DataFrame( + {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + if mutate_parent: df_orig.iloc[1, 1] = 0 tm.assert_frame_equal(df, df_orig) @@ -258,6 +267,7 @@ column_indexer, using_array_manager, using_copy_on_write, + warn_copy_on_write, ): # Case: taking a subset of the rows+columns of a DataFrame using .iloc # + afterwards modifying the subset @@ -273,16 +283,9 @@ subset = df.iloc[row_indexer, column_indexer] - # modifying the subset never modifies the parent - subset.iloc[0, 0] = 0 - - expected = DataFrame( - {"b": [0, 6], "c": np.array([8, 
9], dtype=dtype)}, index=range(1, 3) - ) - tm.assert_frame_equal(subset, expected) # a few corner cases _do_ actually modify the parent (with both row and column # slice, and in case of ArrayManager or BlockManager with single block) - if ( + mutate_parent = ( isinstance(row_indexer, slice) and isinstance(column_indexer, slice) and ( @@ -293,7 +296,17 @@ and not using_copy_on_write ) ) - ): + ) + + # modifying the subset never modifies the parent + with tm.assert_cow_warning(warn_copy_on_write and mutate_parent): + subset.iloc[0, 0] = 0 + + expected = DataFrame( + {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + if mutate_parent: df_orig.iloc[1, 1] = 0 tm.assert_frame_equal(df, df_orig) @@ -303,7 +316,9 @@ [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], ids=["slice", "mask", "array"], ) -def test_subset_set_with_row_indexer(backend, indexer_si, indexer, using_copy_on_write): +def test_subset_set_with_row_indexer( + backend, indexer_si, indexer, using_copy_on_write, warn_copy_on_write +): # Case: setting values with a row indexer on a viewing subset # subset[indexer] = value and subset.iloc[indexer] = value _, DataFrame, _ = backend @@ -320,6 +335,9 @@ if using_copy_on_write: indexer_si(subset)[indexer] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + indexer_si(subset)[indexer] = 0 else: # INFO iloc no longer raises warning since pandas 1.4 warn = SettingWithCopyWarning if indexer_si is tm.setitem else None @@ -340,7 +358,7 @@ tm.assert_frame_equal(df, df_orig) -def test_subset_set_with_mask(backend, using_copy_on_write): +def test_subset_set_with_mask(backend, using_copy_on_write, warn_copy_on_write): # Case: setting values with a mask on a viewing subset: subset[mask] = value _, DataFrame, _ = backend df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) @@ -351,6 +369,9 @@ if using_copy_on_write: subset[mask] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + subset[mask] = 0 else: with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning(SettingWithCopyWarning): @@ -370,7 +391,7 @@ tm.assert_frame_equal(df, df_orig) -def test_subset_set_column(backend, using_copy_on_write): +def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write): # Case: setting a single column on a viewing subset -> subset[col] = value dtype_backend, DataFrame, _ = backend df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) @@ -382,7 +403,7 @@ else: arr = pd.array([10, 11], dtype="Int64") - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: subset["a"] = arr else: with pd.option_context("chained_assignment", "warn"): @@ -401,7 +422,7 @@ "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) def test_subset_set_column_with_loc( - backend, using_copy_on_write, using_array_manager, dtype + backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype ): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value @@ -414,6 +435,9 @@ if using_copy_on_write: subset.loc[:, "a"] = np.array([10, 11], dtype="int64") + elif warn_copy_on_write: + with tm.assert_cow_warning(): + subset.loc[:, "a"] = np.array([10, 11], dtype="int64") else: with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning( @@ -437,7 +461,9 @@ tm.assert_frame_equal(df, df_orig) -def test_subset_set_column_with_loc2(backend, 
using_copy_on_write, using_array_manager): +def test_subset_set_column_with_loc2( + backend, using_copy_on_write, warn_copy_on_write, using_array_manager +): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value # separate test for case of DataFrame of a single column -> takes a separate @@ -449,6 +475,9 @@ if using_copy_on_write: subset.loc[:, "a"] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + subset.loc[:, "a"] = 0 else: with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning( @@ -472,7 +501,7 @@ @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_subset_set_columns(backend, using_copy_on_write, dtype): +def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dtype): # Case: setting multiple columns on a viewing subset # -> subset[[col1, col2]] = value dtype_backend, DataFrame, _ = backend @@ -482,7 +511,7 @@ df_orig = df.copy() subset = df[1:3] - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: subset[["a", "c"]] = 0 else: with pd.option_context("chained_assignment", "warn"): @@ -509,7 +538,9 @@ [slice("a", "b"), np.array([True, True, False]), ["a", "b"]], ids=["slice", "mask", "array"], ) -def test_subset_set_with_column_indexer(backend, indexer, using_copy_on_write): +def test_subset_set_with_column_indexer( + backend, indexer, using_copy_on_write, warn_copy_on_write +): # Case: setting multiple columns with a column indexer on a viewing subset # -> subset.loc[:, [col1, col2]] = value _, DataFrame, _ = backend @@ -519,6 +550,9 @@ if using_copy_on_write: subset.loc[:, indexer] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + subset.loc[:, indexer] = 0 else: with pd.option_context("chained_assignment", "warn"): # As of 2.0, this setitem attempts (successfully) to set values @@ -561,7 +595,13 @@ "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) def test_subset_chained_getitem( - request, backend, method, dtype, using_copy_on_write, using_array_manager + request, + backend, + method, + dtype, + using_copy_on_write, + using_array_manager, + warn_copy_on_write, ): # Case: creating a subset using multiple, chained getitem calls using views # still needs to guarantee proper CoW behaviour @@ -588,7 +628,9 @@ # modify subset -> don't modify parent subset = method(df) - subset.iloc[0, 0] = 0 + + with tm.assert_cow_warning(warn_copy_on_write and subset_is_view): + subset.iloc[0, 0] = 0 if using_copy_on_write or (not subset_is_view): tm.assert_frame_equal(df, df_orig) else: @@ -596,7 +638,8 @@ # modify parent -> don't modify subset subset = method(df) - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write and subset_is_view): + df.iloc[0, 0] = 0 expected = DataFrame({"a": [1, 2], "b": [4, 5]}) if using_copy_on_write or not subset_is_view: tm.assert_frame_equal(subset, expected) @@ -607,10 +650,12 @@ @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_subset_chained_getitem_column(backend, dtype, using_copy_on_write): +def test_subset_chained_getitem_column( + backend, dtype, using_copy_on_write, warn_copy_on_write +): # Case: creating a subset using multiple, chained getitem calls using views # still needs to guarantee proper CoW behaviour - _, DataFrame, Series = backend + dtype_backend, DataFrame, Series = backend df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} ) @@ -619,7 
+664,8 @@ # modify subset -> don't modify parent subset = df[:]["a"][0:2] df._clear_item_cache() - subset.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 if using_copy_on_write: tm.assert_frame_equal(df, df_orig) else: @@ -628,7 +674,8 @@ # modify parent -> don't modify subset subset = df[:]["a"][0:2] df._clear_item_cache() - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 expected = Series([1, 2], name="a") if using_copy_on_write: tm.assert_series_equal(subset, expected) @@ -650,7 +697,9 @@ ], ids=["getitem", "iloc", "loc", "long-chain"], ) -def test_subset_chained_getitem_series(backend, method, using_copy_on_write): +def test_subset_chained_getitem_series( + backend, method, using_copy_on_write, warn_copy_on_write +): # Case: creating a subset using multiple, chained getitem calls using views # still needs to guarantee proper CoW behaviour _, _, Series = backend @@ -659,7 +708,8 @@ # modify subset -> don't modify parent subset = method(s) - subset.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 if using_copy_on_write: tm.assert_series_equal(s, s_orig) else: @@ -667,7 +717,8 @@ # modify parent -> don't modify subset subset = s.iloc[0:3].iloc[0:2] - s.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + s.iloc[0] = 0 expected = Series([1, 2], index=["a", "b"]) if using_copy_on_write: tm.assert_series_equal(subset, expected) @@ -675,14 +726,17 @@ assert subset.iloc[0] == 0 -def test_subset_chained_single_block_row(using_copy_on_write, using_array_manager): +def test_subset_chained_single_block_row( + using_copy_on_write, using_array_manager, warn_copy_on_write +): # not parametrizing this for dtype backend, since this explicitly tests single block df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) df_orig = df.copy() # modify subset -> don't modify parent subset = df[:].iloc[0].iloc[0:2] - subset.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 if using_copy_on_write or using_array_manager: tm.assert_frame_equal(df, df_orig) else: @@ -690,7 +744,8 @@ # modify parent -> don't modify subset subset = df[:].iloc[0].iloc[0:2] - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 expected = Series([1, 4], index=["a", "b"], name=0) if using_copy_on_write or using_array_manager: tm.assert_series_equal(subset, expected) @@ -709,10 +764,10 @@ ], ids=["getitem", "loc", "loc-rows", "iloc", "iloc-rows"], ) -def test_null_slice(backend, method, using_copy_on_write): +def test_null_slice(backend, method, using_copy_on_write, warn_copy_on_write): # Case: also all variants of indexing with a null slice (:) should return # new objects to ensure we correctly use CoW for the results - _, DataFrame, _ = backend + dtype_backend, DataFrame, _ = backend df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) df_orig = df.copy() @@ -722,7 +777,8 @@ assert df2 is not df # and those trigger CoW when mutated - df2.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df2.iloc[0, 0] = 0 if using_copy_on_write: tm.assert_frame_equal(df, df_orig) else: @@ -738,7 +794,7 @@ ], ids=["getitem", "loc", "iloc"], ) -def test_null_slice_series(backend, method, using_copy_on_write): +def test_null_slice_series(backend, method, using_copy_on_write, warn_copy_on_write): _, _, Series = backend s = Series([1, 2, 3], index=["a", "b", "c"]) s_orig = s.copy() @@ -749,7 +805,8 @@ assert s2 is not s # and those 
trigger CoW when mutated - s2.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + s2.iloc[0] = 0 if using_copy_on_write: tm.assert_series_equal(s, s_orig) else: @@ -763,7 +820,7 @@ # Series -- Indexing operations taking subset + modifying the subset/parent -def test_series_getitem_slice(backend, using_copy_on_write): +def test_series_getitem_slice(backend, using_copy_on_write, warn_copy_on_write): # Case: taking a slice of a Series + afterwards modifying the subset _, _, Series = backend s = Series([1, 2, 3], index=["a", "b", "c"]) @@ -772,7 +829,8 @@ subset = s[:] assert np.shares_memory(get_array(subset), get_array(s)) - subset.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(subset), get_array(s)) @@ -788,13 +846,38 @@ assert s.iloc[0] == 0 +def test_series_getitem_ellipsis(using_copy_on_write, warn_copy_on_write): + # Case: taking a view of a Series using Ellipsis + afterwards modifying the subset + s = Series([1, 2, 3]) + s_orig = s.copy() + + subset = s[...] + assert np.shares_memory(get_array(subset), get_array(s)) + + with tm.assert_cow_warning(warn_copy_on_write): + subset.iloc[0] = 0 + + if using_copy_on_write: + assert not np.shares_memory(get_array(subset), get_array(s)) + + expected = Series([0, 2, 3]) + tm.assert_series_equal(subset, expected) + + if using_copy_on_write: + # original parent series is not modified (CoW) + tm.assert_series_equal(s, s_orig) + else: + # original parent series is actually updated + assert s.iloc[0] == 0 + + @pytest.mark.parametrize( "indexer", [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], ids=["slice", "mask", "array"], ) def test_series_subset_set_with_indexer( - backend, indexer_si, indexer, using_copy_on_write + backend, indexer_si, indexer, using_copy_on_write, warn_copy_on_write ): # Case: setting values in a viewing Series with an indexer _, _, Series = backend @@ -810,9 +893,12 @@ and indexer.dtype.kind == "i" ): warn = FutureWarning - - with tm.assert_produces_warning(warn, match=msg): - indexer_si(subset)[indexer] = 0 + if warn_copy_on_write: + with tm.assert_cow_warning(raise_on_extra_warnings=warn is not None): + indexer_si(subset)[indexer] = 0 + else: + with tm.assert_produces_warning(warn, match=msg): + indexer_si(subset)[indexer] = 0 expected = Series([0, 0, 3], index=["a", "b", "c"]) tm.assert_series_equal(subset, expected) @@ -826,10 +912,10 @@ # del operator -def test_del_frame(backend, using_copy_on_write): +def test_del_frame(backend, using_copy_on_write, warn_copy_on_write): # Case: deleting a column with `del` on a viewing child dataframe should # not modify parent + update the references - _, DataFrame, _ = backend + dtype_backend, DataFrame, _ = backend df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df[:] @@ -843,11 +929,13 @@ tm.assert_frame_equal(df2, df_orig[["a", "c"]]) df2._mgr._verify_integrity() - df.loc[0, "b"] = 200 + with tm.assert_cow_warning(warn_copy_on_write and dtype_backend == "numpy"): + df.loc[0, "b"] = 200 assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) df_orig = df.copy() - df2.loc[0, "a"] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + df2.loc[0, "a"] = 100 if using_copy_on_write: # modifying child after deleting a column still doesn't update parent tm.assert_frame_equal(df, df_orig) @@ -879,7 +967,9 @@ # Accessing column as Series -def test_column_as_series(backend, using_copy_on_write, 
using_array_manager): +def test_column_as_series( + backend, using_copy_on_write, warn_copy_on_write, using_array_manager +): # Case: selecting a single column now also uses Copy-on-Write dtype_backend, DataFrame, Series = backend df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) @@ -892,10 +982,14 @@ if using_copy_on_write or using_array_manager: s[0] = 0 else: - warn = SettingWithCopyWarning if dtype_backend == "numpy" else None - with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(warn): + if warn_copy_on_write: + with tm.assert_cow_warning(): s[0] = 0 + else: + warn = SettingWithCopyWarning if dtype_backend == "numpy" else None + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(warn): + s[0] = 0 expected = Series([0, 2, 3], name="a") tm.assert_series_equal(s, expected) @@ -910,7 +1004,7 @@ def test_column_as_series_set_with_upcast( - backend, using_copy_on_write, using_array_manager + backend, using_copy_on_write, using_array_manager, warn_copy_on_write ): # Case: selecting a single column now also uses Copy-on-Write -> when # setting a value causes an upcast, we don't need to update the parent @@ -921,10 +1015,12 @@ s = df["a"] if dtype_backend == "nullable": - with pytest.raises(TypeError, match="Invalid value"): - s[0] = "foo" + with tm.assert_cow_warning(warn_copy_on_write): + with pytest.raises(TypeError, match="Invalid value"): + s[0] = "foo" expected = Series([1, 2, 3], name="a") - elif using_copy_on_write or using_array_manager: + elif using_copy_on_write or warn_copy_on_write or using_array_manager: + # TODO(CoW-warn) assert the FutureWarning for CoW is also raised with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): s[0] = "foo" expected = Series(["foo", 2, 3], dtype=object, name="a") @@ -962,7 +1058,12 @@ ids=["getitem", "loc", "iloc"], ) def test_column_as_series_no_item_cache( - request, backend, method, using_copy_on_write, using_array_manager + request, + backend, + method, + using_copy_on_write, + warn_copy_on_write, + using_array_manager, ): # Case: selecting a single column (which now also uses Copy-on-Write to protect # the view) should always give a new object (i.e. 
not make use of a cache) @@ -974,13 +1075,16 @@ s2 = method(df) is_iloc = "iloc" in request.node.name - if using_copy_on_write or is_iloc: + if using_copy_on_write or warn_copy_on_write or is_iloc: assert s1 is not s2 else: assert s1 is s2 if using_copy_on_write or using_array_manager: s1.iloc[0] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + s1.iloc[0] = 0 else: warn = SettingWithCopyWarning if dtype_backend == "numpy" else None with pd.option_context("chained_assignment", "warn"): @@ -1032,7 +1136,7 @@ "col", [[0.1, 0.2, 0.3], [7, 8, 9]], ids=["mixed-block", "single-block"] ) def test_set_value_copy_only_necessary_column( - using_copy_on_write, indexer_func, indexer, val, col + using_copy_on_write, warn_copy_on_write, indexer_func, indexer, val, col ): # When setting inplace, only copy column that is modified instead of the whole # block (by splitting the block) @@ -1040,13 +1144,19 @@ df_orig = df.copy() view = df[:] - if val == "a" and indexer[0] != slice(None): + if val == "a" and not warn_copy_on_write: with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype is deprecated" ): indexer_func(df)[indexer] = val + if val == "a" and warn_copy_on_write: + with tm.assert_produces_warning( + FutureWarning, match="incompatible dtype|Setting a value on a view" + ): + indexer_func(df)[indexer] = val else: - indexer_func(df)[indexer] = val + with tm.assert_cow_warning(warn_copy_on_write and val == 100): + indexer_func(df)[indexer] = val if using_copy_on_write: assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) @@ -1060,19 +1170,25 @@ assert np.shares_memory(get_array(df, "a"), get_array(view, "a")) -def test_series_midx_slice(using_copy_on_write): +def test_series_midx_slice(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2, 3], index=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]])) + ser_orig = ser.copy() result = ser[1] assert np.shares_memory(get_array(ser), get_array(result)) - result.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0] = 100 if using_copy_on_write: + tm.assert_series_equal(ser, ser_orig) + else: expected = Series( - [1, 2, 3], index=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]]) + [100, 2, 3], index=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]]) ) tm.assert_series_equal(ser, expected) -def test_getitem_midx_slice(using_copy_on_write, using_array_manager): +def test_getitem_midx_slice( + using_copy_on_write, warn_copy_on_write, using_array_manager +): df = DataFrame({("a", "x"): [1, 2], ("a", "y"): 1, ("b", "x"): 2}) df_orig = df.copy() new_df = df[("a",)] @@ -1085,16 +1201,26 @@ if using_copy_on_write: new_df.iloc[0, 0] = 100 tm.assert_frame_equal(df_orig, df) + else: + if warn_copy_on_write: + with tm.assert_cow_warning(): + new_df.iloc[0, 0] = 100 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(SettingWithCopyWarning): + new_df.iloc[0, 0] = 100 + assert df.iloc[0, 0] == 100 -def test_series_midx_tuples_slice(using_copy_on_write): +def test_series_midx_tuples_slice(using_copy_on_write, warn_copy_on_write): ser = Series( [1, 2, 3], index=pd.MultiIndex.from_tuples([((1, 2), 3), ((1, 2), 4), ((2, 3), 4)]), ) result = ser[(1, 2)] assert np.shares_memory(get_array(ser), get_array(result)) - result.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[0] = 100 if using_copy_on_write: expected = Series( [1, 2, 3], @@ -1103,6 +1229,27 @@ tm.assert_series_equal(ser, expected) +def 
test_midx_read_only_bool_indexer(): + # GH#56635 + def mklbl(prefix, n): + return [f"{prefix}{i}" for i in range(n)] + + idx = pd.MultiIndex.from_product( + [mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)] + ) + cols = pd.MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"] + ) + df = DataFrame(1, index=idx, columns=cols).sort_index().sort_index(axis=1) + + mask = df[("a", "foo")] == 1 + expected_mask = mask.copy() + result = df.loc[pd.IndexSlice[mask, :, ["C1", "C3"]], :] + expected = df.loc[pd.IndexSlice[:, :, ["C1", "C3"]], :] + tm.assert_frame_equal(result, expected) + tm.assert_series_equal(mask, expected_mask) + + def test_loc_enlarging_with_dataframe(using_copy_on_write): df = DataFrame({"a": [1, 2, 3]}) rhs = DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]}) diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/test_interp_fillna.py pandas-2.2.2+dfsg/pandas/tests/copy_view/test_interp_fillna.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/test_interp_fillna.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/test_interp_fillna.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,6 +10,7 @@ Series, Timestamp, interval_range, + option_context, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -90,12 +91,13 @@ @pytest.mark.parametrize( "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]] ) -def test_interpolate_inplace_with_refs(using_copy_on_write, vals): +def test_interpolate_inplace_with_refs(using_copy_on_write, vals, warn_copy_on_write): df = DataFrame({"a": [1, np.nan, 2]}) df_orig = df.copy() arr = get_array(df, "a") view = df[:] - df.interpolate(method="linear", inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.interpolate(method="linear", inplace=True) if using_copy_on_write: # Check that copy was triggered in interpolate and that we don't @@ -108,6 +110,31 @@ assert np.shares_memory(arr, get_array(df, "a")) +@pytest.mark.parametrize("func", ["ffill", "bfill"]) +@pytest.mark.parametrize("dtype", ["float64", "Float64"]) +def test_interp_fill_functions_inplace( + using_copy_on_write, func, warn_copy_on_write, dtype +): + # Check that these takes the same code paths as interpolate + df = DataFrame({"a": [1, np.nan, 2]}, dtype=dtype) + df_orig = df.copy() + arr = get_array(df, "a") + view = df[:] + + with tm.assert_cow_warning(warn_copy_on_write and dtype == "float64"): + getattr(df, func)(inplace=True) + + if using_copy_on_write: + # Check that copy was triggered in interpolate and that we don't + # have any references left + assert not np.shares_memory(arr, get_array(df, "a")) + tm.assert_frame_equal(df_orig, view) + assert df._mgr._has_no_reference(0) + assert view._mgr._has_no_reference(0) + else: + assert np.shares_memory(arr, get_array(df, "a")) is (dtype == "float64") + + def test_interpolate_cleaned_fill_method(using_copy_on_write): # Check that "method is set to None" case works correctly df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) @@ -228,14 +255,15 @@ assert df._mgr._has_no_reference(1) -def test_fillna_inplace_reference(using_copy_on_write): +def test_fillna_inplace_reference(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1.5, np.nan], "b": 1}) df_orig = df.copy() arr_a = get_array(df, "a") arr_b = get_array(df, "b") view = df[:] - df.fillna(5.5, inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.fillna(5.5, inplace=True) if using_copy_on_write: assert not 
np.shares_memory(get_array(df, "a"), arr_a) assert np.shares_memory(get_array(df, "b"), arr_b) @@ -249,7 +277,7 @@ tm.assert_frame_equal(df, expected) -def test_fillna_interval_inplace_reference(using_copy_on_write): +def test_fillna_interval_inplace_reference(using_copy_on_write, warn_copy_on_write): # Set dtype explicitly to avoid implicit cast when setting nan ser = Series( interval_range(start=0, end=5), name="a", dtype="interval[float64, right]" @@ -258,7 +286,8 @@ ser_orig = ser.copy() view = ser[:] - ser.fillna(value=Interval(left=0, right=5), inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + ser.fillna(value=Interval(left=0, right=5), inplace=True) if using_copy_on_write: assert not np.shares_memory( @@ -324,12 +353,13 @@ def test_fillna_inplace_ea_noop_shares_memory( - using_copy_on_write, any_numeric_ea_and_arrow_dtype + using_copy_on_write, warn_copy_on_write, any_numeric_ea_and_arrow_dtype ): df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype) df_orig = df.copy() view = df[:] - df.fillna(100, inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.fillna(100, inplace=True) if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write: assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) @@ -342,7 +372,10 @@ assert not df._mgr._has_no_reference(1) assert not view._mgr._has_no_reference(1) - df.iloc[0, 1] = 100 + with tm.assert_cow_warning( + warn_copy_on_write and "pyarrow" not in any_numeric_ea_and_arrow_dtype + ): + df.iloc[0, 1] = 100 if isinstance(df["a"].dtype, ArrowDtype) or using_copy_on_write: tm.assert_frame_equal(df_orig, view) else: @@ -361,6 +394,17 @@ with tm.raises_chained_assignment_error(): df[["a"]].fillna(100, inplace=True) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[["a"]].fillna(100, inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[df.a > 5].fillna(100, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].fillna(100, inplace=True) @pytest.mark.parametrize("func", ["interpolate", "ffill", "bfill"]) @@ -375,3 +419,14 @@ with tm.raises_chained_assignment_error(): getattr(df[["a"]], func)(inplace=True) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + getattr(df["a"], func)(inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + getattr(df[["a"]], func)(inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + getattr(df[df["a"] > 1], func)(inplace=True) diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/test_methods.py pandas-2.2.2+dfsg/pandas/tests/copy_view/test_methods.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/test_methods.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/test_methods.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,6 +12,7 @@ Series, Timestamp, date_range, + option_context, period_range, ) import pandas._testing as tm @@ -39,7 +40,7 @@ assert df.iloc[0, 0] == 1 -def test_copy_shallow(using_copy_on_write): +def test_copy_shallow(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_copy = df.copy(deep=False) @@ -69,7 +70,8 @@ assert np.shares_memory(get_array(df_copy, "c"), 
get_array(df, "c")) else: # mutating shallow copy does mutate original - df_copy.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df_copy.iloc[0, 0] = 0 assert df.iloc[0, 0] == 0 # and still shares memory assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) @@ -278,6 +280,17 @@ tm.assert_series_equal(ser, ser_orig) +def test_groupby_column_index_in_references(): + df = DataFrame( + {"A": ["a", "b", "c", "d"], "B": [1, 2, 3, 4], "C": ["a", "a", "b", "b"]} + ) + df = df.set_index("A") + key = df["C"] + result = df.groupby(key, observed=True).sum() + expected = df.groupby("C", observed=True).sum() + tm.assert_frame_equal(result, expected) + + def test_rename_columns(using_copy_on_write): # Case: renaming columns returns a new dataframe # + afterwards modifying the result @@ -526,14 +539,15 @@ tm.assert_frame_equal(df2, df_orig) -def test_shift_columns(using_copy_on_write): +def test_shift_columns(using_copy_on_write, warn_copy_on_write): df = DataFrame( [[1, 2], [3, 4], [5, 6]], columns=date_range("2020-01-01", "2020-01-02") ) df2 = df.shift(periods=1, axis=1) assert np.shares_memory(get_array(df2, "2020-01-02"), get_array(df, "2020-01-01")) - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 if using_copy_on_write: assert not np.shares_memory( get_array(df2, "2020-01-02"), get_array(df, "2020-01-01") @@ -545,7 +559,7 @@ tm.assert_frame_equal(df2, expected) -def test_pop(using_copy_on_write): +def test_pop(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() view_original = df[:] @@ -557,7 +571,8 @@ if using_copy_on_write: result.iloc[0] = 0 assert not np.shares_memory(result.values, get_array(view_original, "a")) - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(df, "b"), get_array(view_original, "b")) tm.assert_frame_equal(view_original, df_orig) @@ -648,7 +663,7 @@ tm.assert_series_equal(ser, ser_orig) # Original is unchanged -def test_to_frame(using_copy_on_write): +def test_to_frame(using_copy_on_write, warn_copy_on_write): # Case: converting a Series to a DataFrame with to_frame ser = Series([1, 2, 3]) ser_orig = ser.copy() @@ -658,7 +673,8 @@ # currently this always returns a "view" assert np.shares_memory(ser.values, get_array(df, 0)) - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 0 if using_copy_on_write: # mutating df triggers a copy-on-write for that column @@ -672,7 +688,8 @@ # modify original series -> don't modify dataframe df = ser[:].to_frame() - ser.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 0 if using_copy_on_write: tm.assert_frame_equal(df, ser_orig.to_frame()) @@ -741,7 +758,7 @@ ], ids=["shallow-copy", "reset_index", "rename", "select_dtypes"], ) -def test_chained_methods(request, method, idx, using_copy_on_write): +def test_chained_methods(request, method, idx, using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() @@ -750,13 +767,15 @@ # modify df2 -> don't modify df df2 = method(df) - df2.iloc[0, idx] = 0 + with tm.assert_cow_warning(warn_copy_on_write and df2_is_view): + df2.iloc[0, idx] = 0 if not df2_is_view: tm.assert_frame_equal(df, df_orig) # modify df -> don't modify df2 df2 = method(df) - df.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write and 
df2_is_view): + df.iloc[0, 0] = 0 if not df2_is_view: tm.assert_frame_equal(df2.iloc[:, idx:], df_orig) @@ -905,7 +924,7 @@ lambda df: df.tail(3), ], ) -def test_head_tail(method, using_copy_on_write): +def test_head_tail(method, using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = method(df) @@ -918,14 +937,16 @@ assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) # modify df2 to trigger CoW for that block - df2.iloc[0, 0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + df2.iloc[0, 0] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: # without CoW enabled, head and tail return views. Mutating df2 also mutates df. assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - df2.iloc[0, 0] = 1 + with tm.assert_cow_warning(warn_copy_on_write): + df2.iloc[0, 0] = 1 tm.assert_frame_equal(df, df_orig) @@ -1139,7 +1160,7 @@ "obj, kwargs", [(Series([1, 2, 3], name="a"), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})], ) -def test_sort_values_inplace(using_copy_on_write, obj, kwargs, using_array_manager): +def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_write): obj_orig = obj.copy() view = obj[:] obj.sort_values(inplace=True, **kwargs) @@ -1147,7 +1168,8 @@ assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) # mutating obj triggers a copy-on-write for the column / block - obj.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + obj.iloc[0] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(obj, "a"), get_array(view, "a")) tm.assert_equal(view, obj_orig) @@ -1156,7 +1178,7 @@ @pytest.mark.parametrize("decimals", [-1, 0, 1]) -def test_round(using_copy_on_write, decimals): +def test_round(using_copy_on_write, warn_copy_on_write, decimals): df = DataFrame({"a": [1, 2], "b": "c"}) df_orig = df.copy() df2 = df.round(decimals=decimals) @@ -1171,6 +1193,7 @@ assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) df2.iloc[0, 1] = "d" df2.iloc[0, 0] = 4 @@ -1270,7 +1293,7 @@ tm.assert_series_equal(ser, ser_orig) -def test_set_flags(using_copy_on_write): +def test_set_flags(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2, 3]) ser_orig = ser.copy() ser2 = ser.set_flags(allows_duplicate_labels=False) @@ -1278,7 +1301,8 @@ assert np.shares_memory(ser, ser2) # mutating ser triggers a copy-on-write for the column / block - ser2.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + ser2.iloc[0] = 0 if using_copy_on_write: assert not np.shares_memory(ser2, ser) tm.assert_series_equal(ser, ser_orig) @@ -1311,7 +1335,7 @@ def test_tz_convert_localize(using_copy_on_write, func, tz): # GH 49473 ser = Series( - [1, 2], index=date_range(start="2014-08-01 09:00", freq="H", periods=2, tz=tz) + [1, 2], index=date_range(start="2014-08-01 09:00", freq="h", periods=2, tz=tz) ) ser_orig = ser.copy() ser2 = getattr(ser, func)("US/Central") @@ -1351,7 +1375,7 @@ tm.assert_frame_equal(df, df_orig) -def test_squeeze(using_copy_on_write): +def test_squeeze(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() series = df.squeeze() @@ -1360,7 +1384,8 @@ assert np.shares_memory(series.values, get_array(df, "a")) # mutating 
squeezed df triggers a copy-on-write for that column/block - series.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + series.iloc[0] = 0 if using_copy_on_write: assert not np.shares_memory(series.values, get_array(df, "a")) tm.assert_frame_equal(df, df_orig) @@ -1370,7 +1395,7 @@ assert df.loc[0, "a"] == 0 -def test_items(using_copy_on_write): +def test_items(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) df_orig = df.copy() @@ -1381,7 +1406,8 @@ assert np.shares_memory(get_array(ser, name), get_array(df, name)) # mutating df triggers a copy-on-write for that column / block - ser.iloc[0] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + ser.iloc[0] = 0 if using_copy_on_write: assert not np.shares_memory(get_array(ser, name), get_array(df, name)) @@ -1392,11 +1418,12 @@ @pytest.mark.parametrize("dtype", ["int64", "Int64"]) -def test_putmask(using_copy_on_write, dtype): +def test_putmask(using_copy_on_write, dtype, warn_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype) view = df[:] df_orig = df.copy() - df[df == df] = 5 + with tm.assert_cow_warning(warn_copy_on_write): + df[df == df] = 5 if using_copy_on_write: assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) @@ -1430,15 +1457,21 @@ @pytest.mark.parametrize( "val, exp, warn", [(5.5, True, FutureWarning), (5, False, None)] ) -def test_putmask_dont_copy_some_blocks(using_copy_on_write, val, exp, warn): +def test_putmask_dont_copy_some_blocks( + using_copy_on_write, val, exp, warn, warn_copy_on_write +): df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5}) view = df[:] df_orig = df.copy() indexer = DataFrame( [[True, False, False], [True, False, False]], columns=list("abc") ) - with tm.assert_produces_warning(warn, match="incompatible dtype"): - df[indexer] = val + if warn_copy_on_write: + with tm.assert_cow_warning(): + df[indexer] = val + else: + with tm.assert_produces_warning(warn, match="incompatible dtype"): + df[indexer] = val if using_copy_on_write: assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) @@ -1536,15 +1569,26 @@ with tm.raises_chained_assignment_error(): getattr(df[["a"]], func)(df["a"] > 2, 5, inplace=True) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + getattr(df["a"], func)(df["a"] > 2, 5, inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + getattr(df[["a"]], func)(df["a"] > 2, 5, inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + getattr(df[df["a"] > 1], func)(df["a"] > 2, 5, inplace=True) def test_asfreq_noop(using_copy_on_write): df = DataFrame( {"a": [0.0, None, 2.0, 3.0]}, - index=date_range("1/1/2000", periods=4, freq="T"), + index=date_range("1/1/2000", periods=4, freq="min"), ) df_orig = df.copy() - df2 = df.asfreq(freq="T") + df2 = df.asfreq(freq="min") if using_copy_on_write: assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) @@ -1568,14 +1612,16 @@ tm.assert_frame_equal(df, df_orig) -def test_interpolate_creates_copy(using_copy_on_write): +def test_interpolate_creates_copy(using_copy_on_write, warn_copy_on_write): # GH#51126 df = DataFrame({"a": [1.5, np.nan, 3]}) view = df[:] expected = df.copy() - df.ffill(inplace=True) - df.iloc[0, 0] = 100.5 + with tm.assert_cow_warning(warn_copy_on_write): + df.ffill(inplace=True) + with 
tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100.5 if using_copy_on_write: tm.assert_frame_equal(view, expected) @@ -1651,7 +1697,7 @@ @pytest.mark.parametrize("key", ["a", ["a"]]) -def test_get(using_copy_on_write, key): +def test_get(using_copy_on_write, warn_copy_on_write, key): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() @@ -1665,8 +1711,11 @@ else: # for non-CoW it depends on whether we got a Series or DataFrame if it # is a view or copy or triggers a warning or not - warn = SettingWithCopyWarning if isinstance(key, list) else None - with pd.option_context("chained_assignment", "warn"): + if warn_copy_on_write: + warn = FutureWarning if isinstance(key, str) else None + else: + warn = SettingWithCopyWarning if isinstance(key, list) else None + with option_context("chained_assignment", "warn"): with tm.assert_produces_warning(warn): result.iloc[0] = 0 @@ -1680,7 +1729,9 @@ @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_xs(using_copy_on_write, using_array_manager, axis, key, dtype): +def test_xs( + using_copy_on_write, warn_copy_on_write, using_array_manager, axis, key, dtype +): single_block = (dtype == "int64") and not using_array_manager is_view = single_block or (using_array_manager and axis == 1) df = DataFrame( @@ -1695,10 +1746,13 @@ elif using_copy_on_write: assert result._mgr._has_no_reference(0) - if using_copy_on_write or is_view: + if using_copy_on_write or (is_view and not warn_copy_on_write): result.iloc[0] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(single_block or axis == 1): + result.iloc[0] = 0 else: - with pd.option_context("chained_assignment", "warn"): + with option_context("chained_assignment", "warn"): with tm.assert_produces_warning(SettingWithCopyWarning): result.iloc[0] = 0 @@ -1710,7 +1764,9 @@ @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("key, level", [("l1", 0), (2, 1)]) -def test_xs_multiindex(using_copy_on_write, using_array_manager, key, level, axis): +def test_xs_multiindex( + using_copy_on_write, warn_copy_on_write, using_array_manager, key, level, axis +): arr = np.arange(18).reshape(6, 3) index = MultiIndex.from_product([["l1", "l2"], [1, 2, 3]], names=["lev1", "lev2"]) df = DataFrame(arr, index=index, columns=list("abc")) @@ -1725,25 +1781,28 @@ get_array(df, df.columns[0]), get_array(result, result.columns[0]) ) - warn = ( - SettingWithCopyWarning - if not using_copy_on_write and not using_array_manager - else None - ) - with pd.option_context("chained_assignment", "warn"): + if warn_copy_on_write: + warn = FutureWarning if level == 0 else None + elif not using_copy_on_write and not using_array_manager: + warn = SettingWithCopyWarning + else: + warn = None + with option_context("chained_assignment", "warn"): with tm.assert_produces_warning(warn): result.iloc[0, 0] = 0 tm.assert_frame_equal(df, df_orig) -def test_update_frame(using_copy_on_write): +def test_update_frame(using_copy_on_write, warn_copy_on_write): df1 = DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) df2 = DataFrame({"b": [100.0]}, index=[1]) df1_orig = df1.copy() view = df1[:] - df1.update(df2) + # TODO(CoW) better warning message? 
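# A self-contained sketch of the behaviour that the tm.assert_cow_warning
# wrappers in these hunks exercise (a sketch, not pandas' own test code):
# pandas 2.2 adds an interim Copy-on-Write "warning mode", enabled with
# pd.set_option("mode.copy_on_write", "warn"), in which mutating an object
# that still shares its data with another object emits a FutureWarning
# ("Setting a value on a view ...") instead of silently mutating both.
import pandas as pd
import pandas._testing as tm

pd.set_option("mode.copy_on_write", "warn")
df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
view = df[:]  # still shares its data with df
with tm.assert_cow_warning():  # checks for the FutureWarning of the warn mode
    df.iloc[0, 0] = 100.0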
+ with tm.assert_cow_warning(warn_copy_on_write): + df1.update(df2) expected = DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 100.0, 6.0]}) tm.assert_frame_equal(df1, expected) @@ -1756,13 +1815,17 @@ tm.assert_frame_equal(view, expected) -def test_update_series(using_copy_on_write): +def test_update_series(using_copy_on_write, warn_copy_on_write): ser1 = Series([1.0, 2.0, 3.0]) ser2 = Series([100.0], index=[1]) ser1_orig = ser1.copy() view = ser1[:] - ser1.update(ser2) + if warn_copy_on_write: + with tm.assert_cow_warning(): + ser1.update(ser2) + else: + ser1.update(ser2) expected = Series([1.0, 100.0, 3.0]) tm.assert_series_equal(ser1, expected) @@ -1785,21 +1848,45 @@ with tm.raises_chained_assignment_error(): df[["a"]].update(ser2.to_frame()) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].update(ser2) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[["a"]].update(ser2.to_frame()) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[df["a"] > 1].update(ser2.to_frame()) -def test_inplace_arithmetic_series(): +def test_inplace_arithmetic_series(using_copy_on_write): ser = Series([1, 2, 3]) + ser_orig = ser.copy() data = get_array(ser) ser *= 2 - assert np.shares_memory(get_array(ser), data) - tm.assert_numpy_array_equal(data, get_array(ser)) + if using_copy_on_write: + # https://github.com/pandas-dev/pandas/pull/55745 + # changed to NOT update inplace because there is no benefit (actual + # operation already done non-inplace). This was only for the optics + # of updating the backing array inplace, but we no longer want to make + # that guarantee + assert not np.shares_memory(get_array(ser), data) + tm.assert_numpy_array_equal(data, get_array(ser_orig)) + else: + assert np.shares_memory(get_array(ser), data) + tm.assert_numpy_array_equal(data, get_array(ser)) -def test_inplace_arithmetic_series_with_reference(using_copy_on_write): +def test_inplace_arithmetic_series_with_reference( + using_copy_on_write, warn_copy_on_write +): ser = Series([1, 2, 3]) ser_orig = ser.copy() view = ser[:] - ser *= 2 + with tm.assert_cow_warning(warn_copy_on_write): + ser *= 2 if using_copy_on_write: assert not np.shares_memory(get_array(ser), get_array(view)) tm.assert_series_equal(ser_orig, view) @@ -1841,7 +1928,7 @@ assert not np.shares_memory(get_array(df, "a"), get_array(result, 0)) -def test_transform_frame(using_copy_on_write): +def test_transform_frame(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": 1}) df_orig = df.copy() @@ -1849,12 +1936,13 @@ ser.iloc[0] = 100 return ser - df.transform(func) + with tm.assert_cow_warning(warn_copy_on_write): + df.transform(func) if using_copy_on_write: tm.assert_frame_equal(df, df_orig) -def test_transform_series(using_copy_on_write): +def test_transform_series(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2, 3]) ser_orig = ser.copy() @@ -1862,7 +1950,8 @@ ser.iloc[0] = 100 return ser - ser.transform(func) + with tm.assert_cow_warning(warn_copy_on_write): + ser.transform(func) if using_copy_on_write: tm.assert_series_equal(ser, ser_orig) @@ -1875,16 +1964,18 @@ tm.assert_series_equal(result, expected) -def test_series_view(using_copy_on_write): +def test_series_view(using_copy_on_write, warn_copy_on_write): ser = Series([1, 2, 3]) ser_orig = ser.copy() - ser2 = ser.view() + with tm.assert_produces_warning(FutureWarning, match="is 
deprecated"): + ser2 = ser.view() assert np.shares_memory(get_array(ser), get_array(ser2)) if using_copy_on_write: assert not ser2._mgr._has_no_reference(0) - ser2.iloc[0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + ser2.iloc[0] = 100 if using_copy_on_write: tm.assert_series_equal(ser_orig, ser) @@ -1922,7 +2013,7 @@ tm.assert_frame_equal(df, df_orig) -def test_eval_inplace(using_copy_on_write): +def test_eval_inplace(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": 1}) df_orig = df.copy() df_view = df[:] @@ -1930,6 +2021,35 @@ df.eval("c = a+b", inplace=True) assert np.shares_memory(get_array(df, "a"), get_array(df_view, "a")) - df.iloc[0, 0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 if using_copy_on_write: tm.assert_frame_equal(df_view, df_orig) + + +def test_apply_modify_row(using_copy_on_write, warn_copy_on_write): + # Case: applying a function on each row as a Series object, where the + # function mutates the row object (which needs to trigger CoW if row is a view) + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + df_orig = df.copy() + + def transform(row): + row["B"] = 100 + return row + + with tm.assert_cow_warning(warn_copy_on_write): + df.apply(transform, axis=1) + + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + else: + assert df.loc[0, "B"] == 100 + + # row Series is a copy + df = DataFrame({"A": [1, 2], "B": ["b", "c"]}) + df_orig = df.copy() + + with tm.assert_produces_warning(None): + df.apply(transform, axis=1) + + tm.assert_frame_equal(df, df_orig) diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/test_replace.py pandas-2.2.2+dfsg/pandas/tests/copy_view/test_replace.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/test_replace.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/test_replace.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,6 +4,7 @@ from pandas import ( Categorical, DataFrame, + option_context, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -47,12 +48,13 @@ tm.assert_frame_equal(df, df_orig) -def test_replace_regex_inplace_refs(using_copy_on_write): +def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": ["aaa", "bbb"]}) df_orig = df.copy() view = df[:] arr = get_array(df, "a") - df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) if using_copy_on_write: assert not np.shares_memory(arr, get_array(df, "a")) assert df._mgr._has_no_reference(0) @@ -160,13 +162,19 @@ def test_replace_list_categorical(using_copy_on_write): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") arr = get_array(df, "a") - df.replace(["c"], value="a", inplace=True) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.replace(["c"], value="a", inplace=True) assert np.shares_memory(arr.codes, get_array(df, "a").codes) if using_copy_on_write: assert df._mgr._has_no_reference(0) df_orig = df.copy() - df2 = df.replace(["b"], value="a") + with tm.assert_produces_warning(FutureWarning, match=msg): + df2 = df.replace(["b"], value="a") assert not np.shares_memory(arr.codes, get_array(df2, "a").codes) tm.assert_frame_equal(df, df_orig) @@ -176,7 +184,12 @@ df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") view = df[:] df_orig = 
df.copy() - df.replace(["c"], value="a", inplace=True) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.replace(["c"], value="a", inplace=True) if using_copy_on_write: assert not np.shares_memory( get_array(view, "a").codes, get_array(df, "a").codes @@ -201,11 +214,12 @@ @pytest.mark.parametrize("to_replace", [1.5, [1.5]]) -def test_replace_inplace_reference(using_copy_on_write, to_replace): +def test_replace_inplace_reference(using_copy_on_write, to_replace, warn_copy_on_write): df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") view = df[:] - df.replace(to_replace=to_replace, value=15.5, inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.replace(to_replace=to_replace, value=15.5, inplace=True) if using_copy_on_write: assert not np.shares_memory(get_array(df, "a"), arr_a) @@ -235,7 +249,13 @@ df_orig = df.copy() arr_a = get_array(df, "a") view = df[:] - df.replace(to_replace=to_replace, value=val, inplace=True) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if val == 1.5 else None + with tm.assert_produces_warning(warn, match=msg): + df.replace(to_replace=to_replace, value=val, inplace=True) if using_copy_on_write: assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) @@ -250,7 +270,13 @@ def test_replace_categorical_inplace(using_copy_on_write, val): df = DataFrame({"a": Categorical([1, 2, 3])}) arr_a = get_array(df, "a") - df.replace(to_replace=1, value=val, inplace=True) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if val == 1.5 else None + with tm.assert_produces_warning(warn, match=msg): + df.replace(to_replace=1, value=val, inplace=True) assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) if using_copy_on_write: @@ -264,7 +290,13 @@ def test_replace_categorical(using_copy_on_write, val): df = DataFrame({"a": Categorical([1, 2, 3])}) df_orig = df.copy() - df2 = df.replace(to_replace=1, value=val) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + warn = FutureWarning if val == 1.5 else None + with tm.assert_produces_warning(warn, match=msg): + df2 = df.replace(to_replace=1, value=val) if using_copy_on_write: assert df._mgr._has_no_reference(0) @@ -278,14 +310,18 @@ @pytest.mark.parametrize("method", ["where", "mask"]) -def test_masking_inplace(using_copy_on_write, method): +def test_masking_inplace(using_copy_on_write, method, warn_copy_on_write): df = DataFrame({"a": [1.5, 2, 3]}) df_orig = df.copy() arr_a = get_array(df, "a") view = df[:] method = getattr(df, method) - method(df["a"] > 1.6, -1, inplace=True) + if warn_copy_on_write: + with tm.assert_cow_warning(): + method(df["a"] > 1.6, -1, inplace=True) + else: + method(df["a"] > 1.6, -1, inplace=True) if using_copy_on_write: assert not np.shares_memory(get_array(df, "a"), arr_a) @@ -349,12 +385,13 @@ assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) -def test_replace_list_none_inplace_refs(using_copy_on_write): +def test_replace_list_none_inplace_refs(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": ["a", "b", "c"]}) arr = get_array(df, "a") df_orig = df.copy() view = df[:] - df.replace(["a"], value=None, inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.replace(["a"], value=None, inplace=True) 
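# The assertions that follow spell out the Copy-on-Write contract for an
# inplace replace: when another object still references the same data, the
# write must copy first, so that other object keeps the original values.
# A minimal illustration of the user-facing guarantee (a sketch assuming
# pandas 2.x with CoW enabled; get_array and _has_no_reference used in the
# tests above are internal helpers and are not needed here):
import pandas as pd

pd.set_option("mode.copy_on_write", True)
df = pd.DataFrame({"a": ["a", "b", "c"]})
view = df[:]
df.replace(to_replace="a", value="x", inplace=True)
assert df.loc[0, "a"] == "x"    # df itself reflects the inplace replace
assert view.loc[0, "a"] == "a"  # the view kept the original data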
if using_copy_on_write: assert df._mgr._has_no_reference(0) assert not np.shares_memory(arr, get_array(df, "a")) @@ -395,6 +432,17 @@ with tm.raises_chained_assignment_error(): df[["a"]].replace(1, 100, inplace=True) tm.assert_frame_equal(df, df_orig) + else: + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[["a"]].replace(1, 100, inplace=True) + + with tm.assert_produces_warning(None): + with option_context("mode.chained_assignment", None): + df[df.a > 5].replace(1, 100, inplace=True) + + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].replace(1, 100, inplace=True) def test_replace_listlike(using_copy_on_write): @@ -415,7 +463,7 @@ tm.assert_frame_equal(df, df_orig) -def test_replace_listlike_inplace(using_copy_on_write): +def test_replace_listlike_inplace(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) arr = get_array(df, "a") df.replace([200, 2], [10, 11], inplace=True) @@ -423,7 +471,8 @@ view = df[:] df_orig = df.copy() - df.replace([200, 3], [10, 11], inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.replace([200, 3], [10, 11], inplace=True) if using_copy_on_write: assert not np.shares_memory(get_array(df, "a"), arr) tm.assert_frame_equal(view, df_orig) diff -Nru pandas-2.1.4+dfsg/pandas/tests/copy_view/test_setitem.py pandas-2.2.2+dfsg/pandas/tests/copy_view/test_setitem.py --- pandas-2.1.4+dfsg/pandas/tests/copy_view/test_setitem.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/copy_view/test_setitem.py 2024-04-10 17:42:52.000000000 +0000 @@ -140,3 +140,17 @@ assert not np.shares_memory(get_array(rhs), df._get_column_array(0)) if using_copy_on_write: assert df._mgr._has_no_reference(0) + + +def test_set_column_with_inplace_operator(using_copy_on_write, warn_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + # this should not raise any warning + with tm.assert_produces_warning(None): + df["a"] += 1 + + # when it is not in a chain, then it should produce a warning + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + ser = df["a"] + with tm.assert_cow_warning(warn_copy_on_write): + ser += 1 diff -Nru pandas-2.1.4+dfsg/pandas/tests/dtypes/cast/test_construct_ndarray.py pandas-2.2.2+dfsg/pandas/tests/dtypes/cast/test_construct_ndarray.py --- pandas-2.1.4+dfsg/pandas/tests/dtypes/cast/test_construct_ndarray.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/dtypes/cast/test_construct_ndarray.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd import pandas._testing as tm from pandas.core.construction import sanitize_array @@ -15,9 +16,14 @@ ([1, 2, None], np.dtype("str"), np.array(["1", "2", None])), ], ) -def test_construct_1d_ndarray_preserving_na(values, dtype, expected): +def test_construct_1d_ndarray_preserving_na( + values, dtype, expected, using_infer_string +): result = sanitize_array(values, index=None, dtype=dtype) - tm.assert_numpy_array_equal(result, expected) + if using_infer_string and expected.dtype == object and dtype is None: + tm.assert_extension_array_equal(result, pd.array(expected)) + else: + tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]"]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/dtypes/cast/test_downcast.py pandas-2.2.2+dfsg/pandas/tests/dtypes/cast/test_downcast.py --- pandas-2.1.4+dfsg/pandas/tests/dtypes/cast/test_downcast.py 2023-12-08 
14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/dtypes/cast/test_downcast.py 2024-04-10 17:42:52.000000000 +0000 @@ -56,8 +56,8 @@ ser = Series([True, True, False]) result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) - expected = ser - tm.assert_series_equal(result, expected) + expected = ser.values + tm.assert_numpy_array_equal(result, expected) def test_downcast_conversion_no_nan(any_real_numpy_dtype): diff -Nru pandas-2.1.4+dfsg/pandas/tests/dtypes/cast/test_find_common_type.py pandas-2.2.2+dfsg/pandas/tests/dtypes/cast/test_find_common_type.py --- pandas-2.1.4+dfsg/pandas/tests/dtypes/cast/test_find_common_type.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/dtypes/cast/test_find_common_type.py 2024-04-10 17:42:52.000000000 +0000 @@ -122,7 +122,7 @@ [ DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), PeriodDtype(freq="2D"), - PeriodDtype(freq="H"), + PeriodDtype(freq="h"), np.dtype("datetime64[ns]"), object, np.int64, diff -Nru pandas-2.1.4+dfsg/pandas/tests/dtypes/cast/test_infer_dtype.py pandas-2.2.2+dfsg/pandas/tests/dtypes/cast/test_infer_dtype.py --- pandas-2.1.4+dfsg/pandas/tests/dtypes/cast/test_infer_dtype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/dtypes/cast/test_infer_dtype.py 2024-04-10 17:42:52.000000000 +0000 @@ -159,8 +159,10 @@ (Timestamp("20160101", tz="UTC"), "datetime64[s, UTC]"), ], ) -def test_infer_dtype_from_scalar(value, expected): +def test_infer_dtype_from_scalar(value, expected, using_infer_string): dtype, _ = infer_dtype_from_scalar(value) + if using_infer_string and value == "foo": + expected = "string" assert is_dtype_equal(dtype, expected) with pytest.raises(TypeError, match="must be list-like"): @@ -189,8 +191,14 @@ ), ], ) -def test_infer_dtype_from_array(arr, expected): +def test_infer_dtype_from_array(arr, expected, using_infer_string): dtype, _ = infer_dtype_from_array(arr) + if ( + using_infer_string + and isinstance(arr, Series) + and arr.tolist() == ["a", "b", "c"] + ): + expected = "string" assert is_dtype_equal(dtype, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/dtypes/cast/test_promote.py pandas-2.2.2+dfsg/pandas/tests/dtypes/cast/test_promote.py --- pandas-2.1.4+dfsg/pandas/tests/dtypes/cast/test_promote.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/dtypes/cast/test_promote.py 2024-04-10 17:42:52.000000000 +0000 @@ -229,24 +229,24 @@ [ # float filled with float ("float32", 1, "float32"), - ("float32", np.finfo("float32").max * 1.1, "float64"), + ("float32", float(np.finfo("float32").max) * 1.1, "float64"), ("float64", 1, "float64"), - ("float64", np.finfo("float32").max * 1.1, "float64"), + ("float64", float(np.finfo("float32").max) * 1.1, "float64"), # complex filled with float ("complex64", 1, "complex64"), - ("complex64", np.finfo("float32").max * 1.1, "complex128"), + ("complex64", float(np.finfo("float32").max) * 1.1, "complex128"), ("complex128", 1, "complex128"), - ("complex128", np.finfo("float32").max * 1.1, "complex128"), + ("complex128", float(np.finfo("float32").max) * 1.1, "complex128"), # float filled with complex ("float32", 1 + 1j, "complex64"), - ("float32", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ("float32", float(np.finfo("float32").max) * (1.1 + 1j), "complex128"), ("float64", 1 + 1j, "complex128"), - ("float64", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ("float64", float(np.finfo("float32").max) * (1.1 + 1j), "complex128"), # complex filled with complex ("complex64", 1 + 1j, "complex64"), - 
("complex64", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ("complex64", float(np.finfo("float32").max) * (1.1 + 1j), "complex128"), ("complex128", 1 + 1j, "complex128"), - ("complex128", np.finfo("float32").max * (1.1 + 1j), "complex128"), + ("complex128", float(np.finfo("float32").max) * (1.1 + 1j), "complex128"), ], ) def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype): diff -Nru pandas-2.1.4+dfsg/pandas/tests/dtypes/test_common.py pandas-2.2.2+dfsg/pandas/tests/dtypes/test_common.py --- pandas-2.1.4+dfsg/pandas/tests/dtypes/test_common.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/dtypes/test_common.py 2024-04-10 17:42:52.000000000 +0000 @@ -94,10 +94,10 @@ [ "period[D]", "period[3M]", - "period[U]", + "period[us]", "Period[D]", "Period[3M]", - "Period[U]", + "Period[us]", ], ) def test_period_dtype(self, dtype): @@ -198,7 +198,7 @@ @pytest.mark.parametrize( - "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))] ) def test_is_sparse(check_scipy): msg = "is_sparse is deprecated" @@ -276,7 +276,7 @@ assert not com.is_period_dtype(pd.Period("2017-01-01")) assert com.is_period_dtype(PeriodDtype(freq="D")) - assert com.is_period_dtype(pd.PeriodIndex([], freq="A")) + assert com.is_period_dtype(pd.PeriodIndex([], freq="Y")) def test_is_interval_dtype(): @@ -303,14 +303,23 @@ assert com.is_categorical_dtype(pd.CategoricalIndex([1, 2, 3])) -def test_is_string_dtype(): - assert not com.is_string_dtype(int) - assert not com.is_string_dtype(pd.Series([1, 2])) - - assert com.is_string_dtype(str) - assert com.is_string_dtype(object) - assert com.is_string_dtype(np.array(["a", "b"])) - assert com.is_string_dtype(pd.StringDtype()) +@pytest.mark.parametrize( + "dtype, expected", + [ + (int, False), + (pd.Series([1, 2]), False), + (str, True), + (object, True), + (np.array(["a", "b"]), True), + (pd.StringDtype(), True), + (pd.Index([], dtype="O"), True), + ], +) +def test_is_string_dtype(dtype, expected): + # GH#54661 + + result = com.is_string_dtype(dtype) + assert result is expected @pytest.mark.parametrize( @@ -625,7 +634,7 @@ @pytest.mark.parametrize( - "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))] ) def test_is_extension_array_dtype(check_scipy): assert not com.is_extension_array_dtype([1, 2, 3]) @@ -669,9 +678,9 @@ (np.dtype("float64"), np.dtype("float64")), (str, np.dtype(str)), (pd.Series([1, 2], dtype=np.dtype("int16")), np.dtype("int16")), - (pd.Series(["a", "b"]), np.dtype(object)), + (pd.Series(["a", "b"], dtype=object), np.dtype(object)), (pd.Index([1, 2]), np.dtype("int64")), - (pd.Index(["a", "b"]), np.dtype(object)), + (pd.Index(["a", "b"], dtype=object), np.dtype(object)), ("category", "category"), (pd.Categorical(["a", "b"]).dtype, CategoricalDtype(["a", "b"])), (pd.Categorical(["a", "b"]), CategoricalDtype(["a", "b"])), @@ -720,9 +729,9 @@ (np.dtype("float64"), np.float64), (str, np.dtype(str).type), (pd.Series([1, 2], dtype=np.dtype("int16")), np.int16), - (pd.Series(["a", "b"]), np.object_), + (pd.Series(["a", "b"], dtype=object), np.object_), (pd.Index([1, 2], dtype="int64"), np.int64), - (pd.Index(["a", "b"]), np.object_), + (pd.Index(["a", "b"], dtype=object), np.object_), ("category", CategoricalDtypeType), (pd.Categorical(["a", "b"]).dtype, CategoricalDtypeType), (pd.Categorical(["a", "b"]), CategoricalDtypeType), diff -Nru 
pandas-2.1.4+dfsg/pandas/tests/dtypes/test_dtypes.py pandas-2.2.2+dfsg/pandas/tests/dtypes/test_dtypes.py --- pandas-2.1.4+dfsg/pandas/tests/dtypes/test_dtypes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/dtypes/test_dtypes.py 2024-04-10 17:42:52.000000000 +0000 @@ -200,7 +200,8 @@ def test_dtype_specific_categorical_dtype(self): expected = "datetime64[ns]" - result = str(Categorical(DatetimeIndex([])).categories.dtype) + dti = DatetimeIndex([], dtype=expected) + result = str(Categorical(dti).categories.dtype) assert result == expected def test_not_string(self): @@ -432,24 +433,24 @@ assert dt.freq == pd.tseries.offsets.Day(3) for s in [ - "period[26H]", - "Period[26H]", - "26H", - "period[1D2H]", - "Period[1D2H]", - "1D2H", + "period[26h]", + "Period[26h]", + "26h", + "period[1D2h]", + "Period[1D2h]", + "1D2h", ]: dt = PeriodDtype(s) assert dt.freq == pd.tseries.offsets.Hour(26) def test_cannot_use_custom_businessday(self): # GH#52534 - msg = "CustomBusinessDay cannot be used with Period or PeriodDtype" + msg = "C is not supported as period frequency" + msg1 = " is not supported as period frequency" msg2 = r"PeriodDtype\[B\] is deprecated" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - PeriodDtype("C") - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): + PeriodDtype("C") + with pytest.raises(ValueError, match=msg1): with tm.assert_produces_warning(FutureWarning, match=msg2): PeriodDtype(pd.offsets.CustomBusinessDay()) @@ -467,8 +468,8 @@ assert PeriodDtype("period[3D]") == PeriodDtype("period[3D]") assert PeriodDtype("period[3D]") is not PeriodDtype("period[3D]") - assert PeriodDtype("period[1S1U]") == PeriodDtype("period[1000001U]") - assert PeriodDtype("period[1S1U]") is not PeriodDtype("period[1000001U]") + assert PeriodDtype("period[1s1us]") == PeriodDtype("period[1000001us]") + assert PeriodDtype("period[1s1us]") is not PeriodDtype("period[1000001us]") def test_compat(self, dtype): assert not is_datetime64_ns_dtype(dtype) @@ -505,15 +506,15 @@ assert PeriodDtype.is_dtype("period[D]") assert PeriodDtype.is_dtype("period[3D]") assert PeriodDtype.is_dtype(PeriodDtype("3D")) - assert PeriodDtype.is_dtype("period[U]") - assert PeriodDtype.is_dtype("period[S]") - assert PeriodDtype.is_dtype(PeriodDtype("U")) - assert PeriodDtype.is_dtype(PeriodDtype("S")) + assert PeriodDtype.is_dtype("period[us]") + assert PeriodDtype.is_dtype("period[s]") + assert PeriodDtype.is_dtype(PeriodDtype("us")) + assert PeriodDtype.is_dtype(PeriodDtype("s")) assert not PeriodDtype.is_dtype("D") assert not PeriodDtype.is_dtype("3D") assert not PeriodDtype.is_dtype("U") - assert not PeriodDtype.is_dtype("S") + assert not PeriodDtype.is_dtype("s") assert not PeriodDtype.is_dtype("foo") assert not PeriodDtype.is_dtype(np.object_) assert not PeriodDtype.is_dtype(np.int64) @@ -533,7 +534,7 @@ with tm.assert_produces_warning(DeprecationWarning, match=msg): assert is_period_dtype(dtype) - pidx = pd.period_range("2013-01-01 09:00", periods=5, freq="H") + pidx = pd.period_range("2013-01-01 09:00", periods=5, freq="h") assert is_period_dtype(pidx.dtype) assert is_period_dtype(pidx) @@ -728,7 +729,7 @@ assert not IntervalDtype.is_dtype("D") assert not IntervalDtype.is_dtype("3D") - assert not IntervalDtype.is_dtype("U") + assert not IntervalDtype.is_dtype("us") assert not IntervalDtype.is_dtype("S") assert not IntervalDtype.is_dtype("foo") assert not IntervalDtype.is_dtype("IntervalA") @@ -917,6 
+918,24 @@ assert c1 is not c2 assert c1 != c2 + def test_equal_but_different_mixed_dtypes(self): + c1 = CategoricalDtype([1, 2, "3"]) + c2 = CategoricalDtype(["3", 1, 2]) + assert c1 is not c2 + assert c1 == c2 + + def test_equal_empty_ordered(self): + c1 = CategoricalDtype([], ordered=True) + c2 = CategoricalDtype([], ordered=True) + assert c1 is not c2 + assert c1 == c2 + + def test_equal_empty_unordered(self): + c1 = CategoricalDtype([]) + c2 = CategoricalDtype([]) + assert c1 is not c2 + assert c1 == c2 + @pytest.mark.parametrize("v1, v2", [([1, 2, 3], [1, 2, 3]), ([1, 2, 3], [3, 2, 1])]) def test_order_hashes_different(self, v1, v2): c1 = CategoricalDtype(v1, ordered=False) @@ -1036,13 +1055,14 @@ ) assert result == CategoricalDtype([1, 2], ordered=False) - def test_str_vs_repr(self, ordered): + def test_str_vs_repr(self, ordered, using_infer_string): c1 = CategoricalDtype(["a", "b"], ordered=ordered) assert str(c1) == "category" # Py2 will have unicode prefixes + dtype = "string" if using_infer_string else "object" pat = ( r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, " - r"categories_dtype=object\)" + rf"categories_dtype={dtype}\)" ) assert re.match(pat.format(ordered=ordered), repr(c1)) diff -Nru pandas-2.1.4+dfsg/pandas/tests/dtypes/test_generic.py pandas-2.2.2+dfsg/pandas/tests/dtypes/test_generic.py --- pandas-2.1.4+dfsg/pandas/tests/dtypes/test_generic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/dtypes/test_generic.py 2024-04-10 17:42:52.000000000 +0000 @@ -19,8 +19,9 @@ categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical) df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index) sparse_array = pd.arrays.SparseArray(np.random.default_rng(2).standard_normal(10)) - datetime_array = pd.core.arrays.DatetimeArray(datetime_index) - timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index) + + datetime_array = pd.core.arrays.DatetimeArray._from_sequence(datetime_index) + timedelta_array = pd.core.arrays.TimedeltaArray._from_sequence(timedelta_index) abc_pairs = [ ("ABCMultiIndex", multi_index), diff -Nru pandas-2.1.4+dfsg/pandas/tests/dtypes/test_inference.py pandas-2.2.2+dfsg/pandas/tests/dtypes/test_inference.py --- pandas-2.1.4+dfsg/pandas/tests/dtypes/test_inference.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/dtypes/test_inference.py 2024-04-10 17:42:52.000000000 +0000 @@ -33,8 +33,10 @@ missing as libmissing, ops as libops, ) +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes import inference +from pandas.core.dtypes.cast import find_result_type from pandas.core.dtypes.common import ( ensure_int32, is_bool, @@ -110,8 +112,8 @@ def __len__(self) -> int: return len(self._values) - def __array__(self, t=None): - return np.asarray(self._values, dtype=t) + def __array__(self, dtype=None, copy=None): + return np.asarray(self._values, dtype=dtype) @property def ndim(self): @@ -1622,11 +1624,23 @@ tm.assert_numpy_array_equal(out, expected) def test_is_period(self): - assert lib.is_period(Period("2011-01", freq="M")) - assert not lib.is_period(PeriodIndex(["2011-01"], freq="M")) - assert not lib.is_period(Timestamp("2011-01")) - assert not lib.is_period(1) - assert not lib.is_period(np.nan) + # GH#55264 + msg = "is_period is deprecated and will be removed in a future version" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert lib.is_period(Period("2011-01", freq="M")) + assert not lib.is_period(PeriodIndex(["2011-01"], freq="M")) + assert not 
lib.is_period(Timestamp("2011-01")) + assert not lib.is_period(1) + assert not lib.is_period(np.nan) + + def test_is_interval(self): + # GH#55264 + msg = "is_interval is deprecated and will be removed in a future version" + item = Interval(1, 2) + with tm.assert_produces_warning(FutureWarning, match=msg): + assert lib.is_interval(item) + assert not lib.is_interval(pd.IntervalIndex([item])) + assert not lib.is_interval(pd.IntervalIndex([item])._engine) def test_categorical(self): # GH 8974 @@ -1983,3 +1997,51 @@ values = np.arange(10, dtype=np.int64) result = ensure_int32(values) assert result.dtype == np.int32 + + +@pytest.mark.parametrize( + "right,result", + [ + (0, np.uint8), + (-1, np.int16), + (300, np.uint16), + # For floats, we just upcast directly to float64 instead of trying to + # find a smaller floating dtype + (300.0, np.uint16), # for integer floats, we convert them to ints + (300.1, np.float64), + (np.int16(300), np.int16 if np_version_gt2 else np.uint16), + ], +) +def test_find_result_type_uint_int(right, result): + left_dtype = np.dtype("uint8") + assert find_result_type(left_dtype, right) == result + + +@pytest.mark.parametrize( + "right,result", + [ + (0, np.int8), + (-1, np.int8), + (300, np.int16), + # For floats, we just upcast directly to float64 instead of trying to + # find a smaller floating dtype + (300.0, np.int16), # for integer floats, we convert them to ints + (300.1, np.float64), + (np.int16(300), np.int16), + ], +) +def test_find_result_type_int_int(right, result): + left_dtype = np.dtype("int8") + assert find_result_type(left_dtype, right) == result + + +@pytest.mark.parametrize( + "right,result", + [ + (300.0, np.float64), + (np.float32(300), np.float32), + ], +) +def test_find_result_type_floats(right, result): + left_dtype = np.dtype("float16") + assert find_result_type(left_dtype, right) == result diff -Nru pandas-2.1.4+dfsg/pandas/tests/dtypes/test_missing.py pandas-2.2.2+dfsg/pandas/tests/dtypes/test_missing.py --- pandas-2.1.4+dfsg/pandas/tests/dtypes/test_missing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/dtypes/test_missing.py 2024-04-10 17:42:52.000000000 +0000 @@ -40,6 +40,7 @@ Series, TimedeltaIndex, date_range, + period_range, ) import pandas._testing as tm @@ -77,11 +78,13 @@ @pytest.mark.parametrize( "ser", [ - tm.makeFloatSeries(), - tm.makeStringSeries(), - tm.makeObjectSeries(), - tm.makeTimeSeries(), - tm.makePeriodSeries(), + Series( + [str(i) for i in range(5)], + index=Index([str(i) for i in range(5)], dtype=object), + dtype=object, + ), + Series(range(5), date_range("2020-01-01", periods=5)), + Series(range(5), period_range("2020-01-01", periods=5)), ], ) def test_null_check_is_series(null_func, ser): @@ -124,15 +127,25 @@ @pytest.mark.parametrize("isna_f", [isna, isnull]) @pytest.mark.parametrize( - "df", + "data", [ - tm.makeTimeDataFrame(), - tm.makePeriodFrame(), - tm.makeMixedDataFrame(), + np.arange(4, dtype=float), + [0.0, 1.0, 0.0, 1.0], + Series(list("abcd"), dtype=object), + date_range("2020-01-01", periods=4), ], ) - def test_isna_isnull_frame(self, isna_f, df): + @pytest.mark.parametrize( + "index", + [ + date_range("2020-01-01", periods=4), + range(4), + period_range("2020-01-01", periods=4), + ], + ) + def test_isna_isnull_frame(self, isna_f, data, index): # frame + df = pd.DataFrame(data, index=index) result = isna_f(df) expected = df.apply(isna_f) tm.assert_frame_equal(result, expected) @@ -418,12 +431,10 @@ assert not array_equivalent( Index([0, np.nan]), Index([1, np.nan]), 
dtype_equal=dtype_equal ) - assert array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan]), dtype_equal=dtype_equal - ) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]), dtype_equal=dtype_equal - ) + + +@pytest.mark.parametrize("dtype_equal", [True, False]) +def test_array_equivalent_tdi(dtype_equal): assert array_equivalent( TimedeltaIndex([0, np.nan]), TimedeltaIndex([0, np.nan]), @@ -435,6 +446,16 @@ dtype_equal=dtype_equal, ) + +@pytest.mark.parametrize("dtype_equal", [True, False]) +def test_array_equivalent_dti(dtype_equal): + assert array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan]), dtype_equal=dtype_equal + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]), dtype_equal=dtype_equal + ) + dti1 = DatetimeIndex([0, np.nan], tz="US/Eastern") dti2 = DatetimeIndex([0, np.nan], tz="CET") dti3 = DatetimeIndex([1, np.nan], tz="US/Eastern") @@ -552,9 +573,7 @@ ) -@pytest.mark.parametrize( - "strict_nan", [pytest.param(True, marks=pytest.mark.xfail), False] -) +@pytest.mark.parametrize("strict_nan", [True, False]) def test_array_equivalent_nested(strict_nan): # reached in groupby aggregations, make sure we use np.any when checking # if the comparison is truthy @@ -577,9 +596,7 @@ @pytest.mark.filterwarnings("ignore:elementwise comparison failed:DeprecationWarning") -@pytest.mark.parametrize( - "strict_nan", [pytest.param(True, marks=pytest.mark.xfail), False] -) +@pytest.mark.parametrize("strict_nan", [True, False]) def test_array_equivalent_nested2(strict_nan): # more than one level of nesting left = np.array( @@ -604,9 +621,7 @@ assert not array_equivalent(left, right, strict_nan=strict_nan) -@pytest.mark.parametrize( - "strict_nan", [pytest.param(True, marks=pytest.mark.xfail), False] -) +@pytest.mark.parametrize("strict_nan", [True, False]) def test_array_equivalent_nested_list(strict_nan): left = np.array([[50, 70, 90], [20, 30]], dtype=object) right = np.array([[50, 70, 90], [20, 30]], dtype=object) diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/array_with_attr/array.py pandas-2.2.2+dfsg/pandas/tests/extension/array_with_attr/array.py --- pandas-2.1.4+dfsg/pandas/tests/extension/array_with_attr/array.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/array_with_attr/array.py 2024-04-10 17:42:52.000000000 +0000 @@ -48,8 +48,11 @@ self.attr = attr @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): - data = np.array(scalars, dtype="float64", copy=copy) + def _from_sequence(cls, scalars, *, dtype=None, copy=False): + if not copy: + data = np.asarray(scalars, dtype="float64") + else: + data = np.array(scalars, dtype="float64", copy=copy) return cls(data) def __getitem__(self, item): diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/__init__.py pandas-2.2.2+dfsg/pandas/tests/extension/base/__init__.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -56,12 +56,7 @@ BaseUnaryOpsTests, ) from pandas.tests.extension.base.printing import BasePrintingTests -from pandas.tests.extension.base.reduce import ( # noqa: F401 - BaseBooleanReduceTests, - BaseNoReduceTests, - BaseNumericReduceTests, - BaseReduceTests, -) +from pandas.tests.extension.base.reduce import BaseReduceTests from pandas.tests.extension.base.reshaping import BaseReshapingTests from 
pandas.tests.extension.base.setitem import BaseSetitemTests @@ -90,5 +85,47 @@ BaseReduceTests, BaseReshapingTests, BaseSetitemTests, + Dim2CompatTests, ): pass + + +def __getattr__(name: str): + import warnings + + if name == "BaseNoReduceTests": + warnings.warn( + "BaseNoReduceTests is deprecated and will be removed in a " + "future version. Use BaseReduceTests and override " + "`_supports_reduction` instead.", + FutureWarning, + ) + from pandas.tests.extension.base.reduce import BaseNoReduceTests + + return BaseNoReduceTests + + elif name == "BaseNumericReduceTests": + warnings.warn( + "BaseNumericReduceTests is deprecated and will be removed in a " + "future version. Use BaseReduceTests and override " + "`_supports_reduction` instead.", + FutureWarning, + ) + from pandas.tests.extension.base.reduce import BaseNumericReduceTests + + return BaseNumericReduceTests + + elif name == "BaseBooleanReduceTests": + warnings.warn( + "BaseBooleanReduceTests is deprecated and will be removed in a " + "future version. Use BaseReduceTests and override " + "`_supports_reduction` instead.", + FutureWarning, + ) + from pandas.tests.extension.base.reduce import BaseBooleanReduceTests + + return BaseBooleanReduceTests + + raise AttributeError( + f"module 'pandas.tests.extension.base' has no attribute '{name}'" + ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/accumulate.py pandas-2.2.2+dfsg/pandas/tests/extension/base/accumulate.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/accumulate.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/accumulate.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,16 +16,13 @@ return False def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): - alt = ser.astype("float64") - result = getattr(ser, op_name)(skipna=skipna) - - if result.dtype == pd.Float32Dtype() and op_name == "cumprod" and skipna: - # TODO: avoid special-casing here - pytest.skip( - f"Float32 precision lead to large differences with op {op_name} " - f"and skipna={skipna}" - ) + try: + alt = ser.astype("float64") + except TypeError: + # e.g. Period can't be cast to float64 + alt = ser.astype(object) + result = getattr(ser, op_name)(skipna=skipna) expected = getattr(alt, op_name)(skipna=skipna) tm.assert_series_equal(result, expected, check_dtype=False) @@ -37,5 +34,6 @@ if self._supports_accumulation(ser, op_name): self.check_accumulate(ser, op_name, skipna) else: - with pytest.raises(NotImplementedError): + with pytest.raises((NotImplementedError, TypeError)): + # TODO: require TypeError for things that will _never_ work? 
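# (Context for the relaxed exception check a couple of lines below: an
# ExtensionArray that cannot accumulate may either leave _accumulate
# unimplemented, which surfaces as NotImplementedError, or reject the
# operation for its dtype with a TypeError. A hedged, standalone example of
# the kind of failure being allowed for, using period data:)
import pytest
import pandas as pd

ser = pd.Series(pd.period_range("2020", periods=3, freq="Y"))
with pytest.raises((TypeError, NotImplementedError)):
    ser.cumsum()  # numeric accumulation is not defined for Period dtype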
getattr(ser, op_name)(skipna=skipna) diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/constructors.py pandas-2.2.2+dfsg/pandas/tests/extension/base/constructors.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,7 +18,7 @@ def test_array_from_scalars(self, data): scalars = [data[0], data[1], data[2]] - result = data._from_sequence(scalars) + result = data._from_sequence(scalars, dtype=data.dtype) assert isinstance(result, type(data)) def test_series_constructor(self, data): diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/dim2.py pandas-2.2.2+dfsg/pandas/tests/extension/base/dim2.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/dim2.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/dim2.py 2024-04-10 17:42:52.000000000 +0000 @@ -20,6 +20,17 @@ # Note: these are ONLY for ExtensionArray subclasses that support 2D arrays. # i.e. not for pyarrow-backed EAs. + @pytest.fixture(autouse=True) + def skip_if_doesnt_support_2d(self, dtype, request): + if not dtype._supports_2d: + node = request.node + # In cases where we are mixed in to ExtensionTests, we only want to + # skip tests that are defined in Dim2CompatTests + test_func = node._obj + if test_func.__qualname__.startswith("Dim2CompatTests"): + # TODO: is there a less hacky way of checking this? + pytest.skip(f"{dtype} does not support 2D.") + def test_transpose(self, data): arr2d = data.repeat(2).reshape(-1, 2) shape = arr2d.shape diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/dtype.py pandas-2.2.2+dfsg/pandas/tests/extension/base/dtype.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/dtype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/dtype.py 2024-04-10 17:42:52.000000000 +0000 @@ -59,7 +59,12 @@ # check equivalency for using .dtypes df = pd.DataFrame( - {"A": pd.Series(data, dtype=dtype), "B": data, "C": "foo", "D": 1} + { + "A": pd.Series(data, dtype=dtype), + "B": data, + "C": pd.Series(["foo"] * len(data), dtype=object), + "D": 1, + } ) result = df.dtypes == str(dtype) assert np.dtype("int64") != "Int64" diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/groupby.py pandas-2.2.2+dfsg/pandas/tests/extension/base/groupby.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/groupby.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/groupby.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,15 +13,23 @@ import pandas._testing as tm +@pytest.mark.filterwarnings( + "ignore:The default of observed=False is deprecated:FutureWarning" +) class BaseGroupbyTests: """Groupby-specific tests.""" def test_grouping_grouper(self, data_for_grouping): df = pd.DataFrame( - {"A": ["B", "B", None, None, "A", "A", "B", "C"], "B": data_for_grouping} + { + "A": pd.Series( + ["B", "B", None, None, "A", "A", "B", "C"], dtype=object + ), + "B": data_for_grouping, + } ) - gr1 = df.groupby("A").grouper.groupings[0] - gr2 = df.groupby("B").grouper.groupings[0] + gr1 = df.groupby("A")._grouper.groupings[0] + gr2 = df.groupby("B")._grouper.groupings[0] tm.assert_numpy_array_equal(gr1.grouping_vector, df.A.values) tm.assert_extension_array_equal(gr2.grouping_vector, data_for_grouping) @@ -105,10 +113,14 @@ def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - 
df.groupby("B", group_keys=False).apply(groupby_apply_op) - df.groupby("B", group_keys=False).A.apply(groupby_apply_op) - df.groupby("A", group_keys=False).apply(groupby_apply_op) - df.groupby("A", group_keys=False).B.apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/interface.py pandas-2.2.2+dfsg/pandas/tests/extension/base/interface.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/interface.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/interface.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import ExtensionDtype @@ -65,6 +66,9 @@ result = np.array(data, dtype=object) expected = np.array(list(data), dtype=object) + if expected.ndim > 1: + # nested data, explicitly construct as 1D + expected = construct_1d_object_array_from_listlike(list(data)) tm.assert_numpy_array_equal(result, expected) def test_is_extension_array_dtype(self, data): @@ -103,7 +107,7 @@ result = data.copy() if data.dtype._is_immutable: - pytest.skip("test_copy assumes mutability") + pytest.skip(f"test_copy assumes mutability and {data.dtype} is immutable") data[1] = data[0] assert result[1] != result[0] @@ -118,7 +122,7 @@ assert type(result) == type(data) if data.dtype._is_immutable: - pytest.skip("test_view assumes mutability") + pytest.skip(f"test_view assumes mutability and {data.dtype} is immutable") result[1] = result[0] assert data[1] == data[0] diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/io.py pandas-2.2.2+dfsg/pandas/tests/extension/base/io.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/io.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/io.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,11 +5,31 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import ExtensionArray class BaseParsingTests: @pytest.mark.parametrize("engine", ["c", "python"]) - def test_EA_types(self, engine, data): + def test_EA_types(self, engine, data, request): + if isinstance(data.dtype, pd.CategoricalDtype): + # in parsers.pyx _convert_with_dtype there is special-casing for + # Categorical that pre-empts _from_sequence_of_strings + pass + elif isinstance(data.dtype, pd.core.dtypes.dtypes.NumpyEADtype): + # These get unwrapped internally so are treated as numpy dtypes + # in the parsers.pyx code + pass + elif ( + type(data)._from_sequence_of_strings.__func__ + is ExtensionArray._from_sequence_of_strings.__func__ + ): + # i.e. 
the EA hasn't overridden _from_sequence_of_strings + mark = pytest.mark.xfail( + reason="_from_sequence_of_strings not implemented", + raises=NotImplementedError, + ) + request.node.add_marker(mark) + df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) csv_output = df.to_csv(index=False, na_rep=np.nan) result = pd.read_csv( diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/methods.py pandas-2.2.2+dfsg/pandas/tests/extension/base/methods.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/methods.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/methods.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,6 +7,7 @@ from pandas._typing import Dtype from pandas.core.dtypes.common import is_bool_dtype +from pandas.core.dtypes.dtypes import NumpyEADtype from pandas.core.dtypes.missing import na_value_for_dtype import pandas as pd @@ -247,10 +248,22 @@ ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("keep", ["first", "last", False]) + def test_duplicated(self, data, keep): + arr = data.take([0, 1, 0, 1]) + result = arr.duplicated(keep=keep) + if keep == "first": + expected = np.array([False, False, True, True]) + elif keep == "last": + expected = np.array([True, True, False, False]) + else: + expected = np.array([True, True, True, True]) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) def test_unique(self, data, box, method): - duplicated = box(data._from_sequence([data[0], data[0]])) + duplicated = box(data._from_sequence([data[0], data[0]], dtype=data.dtype)) result = method(duplicated) @@ -319,7 +332,7 @@ data_missing.fillna(data_missing.take([1])) # Subclasses can override if we expect e.g Sparse[bool], boolean, pyarrow[bool] - _combine_le_expected_dtype: Dtype = np.dtype(bool) + _combine_le_expected_dtype: Dtype = NumpyEADtype("bool") def test_combine_le(self, data_repeated): # GH 20825 @@ -329,16 +342,20 @@ s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 <= x2) expected = pd.Series( - [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], - dtype=self._combine_le_expected_dtype, + pd.array( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + dtype=self._combine_le_expected_dtype, + ) ) tm.assert_series_equal(result, expected) val = s1.iloc[0] result = s1.combine(val, lambda x1, x2: x1 <= x2) expected = pd.Series( - [a <= val for a in list(orig_data1)], - dtype=self._combine_le_expected_dtype, + pd.array( + [a <= val for a in list(orig_data1)], + dtype=self._combine_le_expected_dtype, + ) ) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/missing.py pandas-2.2.2+dfsg/pandas/tests/extension/base/missing.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/missing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/missing.py 2024-04-10 17:42:52.000000000 +0000 @@ -44,7 +44,7 @@ tm.assert_series_equal(result, expected) def test_dropna_frame(self, data_missing): - df = pd.DataFrame({"A": data_missing}) + df = pd.DataFrame({"A": data_missing}, columns=pd.Index(["A"], dtype=object)) # defaults result = df.dropna() @@ -77,6 +77,28 @@ expected = pd.Series(data_missing.take([1, 1, 1, 0, 1])) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "limit_area, input_ilocs, expected_ilocs", + [ + ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), + 
("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], + ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + arr = data_missing.take(input_ilocs) + result = pd.Series(arr).ffill(limit_area=limit_area) + expected = pd.Series(data_missing.take(expected_ilocs)) + tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings( "ignore:Series.fillna with 'method' is deprecated:FutureWarning" ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/ops.py pandas-2.2.2+dfsg/pandas/tests/extension/base/ops.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/ops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/ops.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,10 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + +from pandas.core.dtypes.common import is_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core import ops @@ -25,13 +29,23 @@ # The self.obj_bar_exc pattern isn't great in part because it can depend # on op_name or dtypes, but we use it here for backward-compatibility. if op_name in ["__divmod__", "__rdivmod__"]: - return self.divmod_exc - if isinstance(obj, pd.Series) and isinstance(other, pd.Series): - return self.series_array_exc + result = self.divmod_exc + elif isinstance(obj, pd.Series) and isinstance(other, pd.Series): + result = self.series_array_exc elif isinstance(obj, pd.Series): - return self.series_scalar_exc + result = self.series_scalar_exc else: - return self.frame_scalar_exc + result = self.frame_scalar_exc + + if using_pyarrow_string_dtype() and result is not None: + import pyarrow as pa + + result = ( # type: ignore[assignment] + result, + pa.lib.ArrowNotImplementedError, + NotImplementedError, + ) + return result def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): # In _check_op we check that the result of a pointwise operation @@ -128,12 +142,18 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar + if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): + pytest.skip("Skip testing Python string formatting") + op_name = all_arithmetic_operators ser = pd.Series(data) self.check_opname(ser, op_name, ser.iloc[0]) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar + if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): + pytest.skip("Skip testing Python string formatting") + op_name = all_arithmetic_operators df = pd.DataFrame({"A": data}) self.check_opname(df, op_name, data[0]) @@ -239,9 +259,23 @@ class BaseUnaryOpsTests(BaseOpsUtil): def test_invert(self, data): ser = pd.Series(data, name="name") - result = ~ser - expected = pd.Series(~data, name="name") - tm.assert_series_equal(result, expected) + try: + # 10 is an arbitrary choice here, just avoid iterating over + # the whole array to trim test runtime + [~x for x in data[:10]] + except TypeError: + # scalars don't support invert -> we don't expect the vectorized + # operation to succeed + with pytest.raises(TypeError): + ~ser + with pytest.raises(TypeError): + ~data + else: + # Note we do not reuse the pointwise 
result to construct expected + # because python semantics for negating bools are weird see GH#54569 + result = ~ser + expected = pd.Series(~data, name="name") + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("ufunc", [np.positive, np.negative, np.abs]) def test_unary_ufunc_dunder_equivalence(self, data, ufunc): diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/reduce.py pandas-2.2.2+dfsg/pandas/tests/extension/base/reduce.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/reduce.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/reduce.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,22 +13,23 @@ make sense for numeric/boolean operations. """ - def _supports_reduction(self, obj, op_name: str) -> bool: + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: # Specify if we expect this reduction to succeed. return False - def check_reduce(self, s, op_name, skipna): + def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): # We perform the same operation on the np.float64 data and check # that the results match. Override if you need to cast to something # other than float64. - res_op = getattr(s, op_name) + res_op = getattr(ser, op_name) try: - alt = s.astype("float64") + alt = ser.astype("float64") except (TypeError, ValueError): - # e.g. Interval can't cast, so let's cast to object and do + # e.g. Interval can't cast (TypeError), StringArray can't cast + # (ValueError), so let's cast to object and do # the reduction pointwise - alt = s.astype(object) + alt = ser.astype(object) exp_op = getattr(alt, op_name) if op_name == "count": @@ -79,63 +80,66 @@ @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): op_name = all_boolean_reductions - s = pd.Series(data) + ser = pd.Series(data) - if not self._supports_reduction(s, op_name): + if not self._supports_reduction(ser, op_name): + # TODO: the message being checked here isn't actually checking anything msg = ( "[Cc]annot perform|Categorical is not ordered for operation|" "does not support reduction|" ) with pytest.raises(TypeError, match=msg): - getattr(s, op_name)(skipna=skipna) + getattr(ser, op_name)(skipna=skipna) else: - self.check_reduce(s, op_name, skipna) + self.check_reduce(ser, op_name, skipna) @pytest.mark.filterwarnings("ignore::RuntimeWarning") @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions - s = pd.Series(data) + ser = pd.Series(data) - if not self._supports_reduction(s, op_name): + if not self._supports_reduction(ser, op_name): + # TODO: the message being checked here isn't actually checking anything msg = ( "[Cc]annot perform|Categorical is not ordered for operation|" "does not support reduction|" ) with pytest.raises(TypeError, match=msg): - getattr(s, op_name)(skipna=skipna) + getattr(ser, op_name)(skipna=skipna) else: # min/max with empty produce numpy warnings - self.check_reduce(s, op_name, skipna) + self.check_reduce(ser, op_name, skipna) @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_frame(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions - s = pd.Series(data) - if not is_numeric_dtype(s.dtype): - pytest.skip("not numeric dtype") + ser = pd.Series(data) + if not is_numeric_dtype(ser.dtype): + pytest.skip(f"{ser.dtype} is not numeric dtype") if op_name in ["count", "kurt", "sem"]: pytest.skip(f"{op_name} not an 
array method") - if not self._supports_reduction(s, op_name): + if not self._supports_reduction(ser, op_name): pytest.skip(f"Reduction {op_name} not supported for this dtype") - self.check_reduce_frame(s, op_name, skipna) + self.check_reduce_frame(ser, op_name, skipna) -# TODO: deprecate BaseNoReduceTests, BaseNumericReduceTests, BaseBooleanReduceTests +# TODO(3.0): remove BaseNoReduceTests, BaseNumericReduceTests, +# BaseBooleanReduceTests class BaseNoReduceTests(BaseReduceTests): """we don't define any reductions""" class BaseNumericReduceTests(BaseReduceTests): # For backward compatibility only, this only runs the numeric reductions - def _supports_reduction(self, obj, op_name: str) -> bool: + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: if op_name in ["any", "all"]: pytest.skip("These are tested in BaseBooleanReduceTests") return True @@ -143,7 +147,7 @@ class BaseBooleanReduceTests(BaseReduceTests): # For backward compatibility only, this only runs the numeric reductions - def _supports_reduction(self, obj, op_name: str) -> bool: + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: if op_name not in ["any", "all"]: pytest.skip("These are tested in BaseNumericReduceTests") return True diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/reshaping.py pandas-2.2.2+dfsg/pandas/tests/extension/base/reshaping.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/reshaping.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/reshaping.py 2024-04-10 17:42:52.000000000 +0000 @@ -236,13 +236,16 @@ result = pd.merge(df1, df2, on="key") expected = pd.DataFrame( { - "key": key.take([0, 0, 0, 0, 1]), - "val_x": [1, 1, 3, 3, 2], - "val_y": [1, 3, 1, 3, 2], + "key": key.take([0, 0, 1, 2, 2]), + "val_x": [1, 1, 2, 3, 3], + "val_y": [1, 3, 2, 1, 3], } ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) @pytest.mark.parametrize( "columns", [ @@ -334,7 +337,7 @@ assert type(result) == type(data) if data.dtype._is_immutable: - pytest.skip("test_ravel assumes mutability") + pytest.skip(f"test_ravel assumes mutability and {data.dtype} is immutable") # Check that we have a view, not a copy result[0] = result[1] @@ -351,7 +354,9 @@ assert result.shape == data.shape[::-1] if data.dtype._is_immutable: - pytest.skip("test_transpose assumes mutability") + pytest.skip( + f"test_transpose assumes mutability and {data.dtype} is immutable" + ) # Check that we have a view, not a copy result[0] = result[1] diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/base/setitem.py pandas-2.2.2+dfsg/pandas/tests/extension/base/setitem.py --- pandas-2.1.4+dfsg/pandas/tests/extension/base/setitem.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/base/setitem.py 2024-04-10 17:42:52.000000000 +0000 @@ -43,7 +43,13 @@ # This fixture is auto-used, but we want to not-skip # test_is_immutable. return - pytest.skip("__setitem__ test not applicable with immutable dtype") + + # When BaseSetitemTests is mixed into ExtensionTests, we only + # want this fixture to operate on the tests defined in this + # class/file. 
+ defined_in = node.function.__qualname__.split(".")[0] + if defined_in == "BaseSetitemTests": + pytest.skip("__setitem__ test not applicable with immutable dtype") def test_is_immutable(self, data): if data.dtype._is_immutable: @@ -73,7 +79,7 @@ original = ser.copy() value = [data[0]] if as_array: - value = data._from_sequence(value) + value = data._from_sequence(value, dtype=data.dtype) xpr = "cannot set using a {} indexer with a different length" with pytest.raises(ValueError, match=xpr.format("list-like")): @@ -351,11 +357,11 @@ def test_setitem_with_expansion_dataframe_column(self, data, full_indexer): # https://github.com/pandas-dev/pandas/issues/32395 - df = expected = pd.DataFrame({"data": pd.Series(data)}) + df = expected = pd.DataFrame({0: pd.Series(data)}) result = pd.DataFrame(index=df.index) key = full_indexer(df) - result.loc[key, "data"] = df["data"] + result.loc[key, 0] = df[0] tm.assert_frame_equal(result, expected) @@ -401,10 +407,10 @@ orig = df.copy() - df.iloc[:] = df + df.iloc[:] = df.copy() tm.assert_frame_equal(df, orig) - df.iloc[:-1] = df.iloc[:-1] + df.iloc[:-1] = df.iloc[:-1].copy() tm.assert_frame_equal(df, orig) df.iloc[:] = df.values diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/conftest.py pandas-2.2.2+dfsg/pandas/tests/extension/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/extension/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,6 +2,8 @@ import pytest +from pandas._config.config import _get_option + from pandas import ( Series, options, @@ -35,7 +37,7 @@ if not (dtype._is_numeric or dtype.kind == "m"): # Object-dtypes may want to allow this, but for the most part # only numeric and timedelta-like dtypes will need to implement this. - pytest.skip("Not a numeric dtype") + pytest.skip(f"{dtype} is not a numeric dtype") raise NotImplementedError @@ -118,7 +120,11 @@ @pytest.fixture def na_value(dtype): - """The scalar missing value for this type. Default dtype.na_value""" + """ + The scalar missing value for this type. Default dtype.na_value. + + TODO: can be removed in 3.x (see https://github.com/pandas-dev/pandas/pull/54930) + """ return dtype.na_value @@ -218,4 +224,7 @@ """ Fixture to check if Copy-on-Write is enabled. 
""" - return options.mode.copy_on_write and options.mode.data_manager == "block" + return ( + options.mode.copy_on_write is True + and _get_option("mode.data_manager", silent=True) == "block" + ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/date/array.py pandas-2.2.2+dfsg/pandas/tests/extension/date/array.py --- pandas-2.1.4+dfsg/pandas/tests/extension/date/array.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/date/array.py 2024-04-10 17:42:52.000000000 +0000 @@ -176,9 +176,13 @@ @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): if isinstance(scalars, dt.date): - pass + raise TypeError elif isinstance(scalars, DateArray): - pass + if dtype is not None: + return scalars.astype(dtype, copy=copy) + if copy: + return scalars.copy() + return scalars[:] elif isinstance(scalars, np.ndarray): scalars = scalars.astype("U10") # 10 chars for yyyy-mm-dd return DateArray(scalars) diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/decimal/array.py pandas-2.2.2+dfsg/pandas/tests/extension/decimal/array.py --- pandas-2.1.4+dfsg/pandas/tests/extension/decimal/array.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/decimal/array.py 2024-04-10 17:42:52.000000000 +0000 @@ -97,12 +97,14 @@ return self._dtype @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): + def _from_sequence(cls, scalars, *, dtype=None, copy=False): return cls(scalars) @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): - return cls._from_sequence([decimal.Decimal(x) for x in strings], dtype, copy) + return cls._from_sequence( + [decimal.Decimal(x) for x in strings], dtype=dtype, copy=copy + ) @classmethod def _from_factorized(cls, values, original): @@ -155,7 +157,7 @@ if isinstance(x, (decimal.Decimal, numbers.Number)): return x else: - return DecimalArray._from_sequence(x) + return type(self)._from_sequence(x, dtype=self.dtype) if ufunc.nout > 1: return tuple(reconstruct(x) for x in result) @@ -178,7 +180,7 @@ fill_value = self.dtype.na_value result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill) - return self._from_sequence(result) + return self._from_sequence(result, dtype=self.dtype) def copy(self): return type(self)(self._data.copy(), dtype=self.dtype) diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/decimal/test_decimal.py pandas-2.2.2+dfsg/pandas/tests/extension/decimal/test_decimal.py --- pandas-2.1.4+dfsg/pandas/tests/extension/decimal/test_decimal.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/decimal/test_decimal.py 2024-04-10 17:42:52.000000000 +0000 @@ -71,28 +71,28 @@ ) -> type[Exception] | None: return None - def _supports_reduction(self, obj, op_name: str) -> bool: + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return True - def check_reduce(self, s, op_name, skipna): + def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): if op_name == "count": - return super().check_reduce(s, op_name, skipna) + return super().check_reduce(ser, op_name, skipna) else: - result = getattr(s, op_name)(skipna=skipna) - expected = getattr(np.asarray(s), op_name)() + result = getattr(ser, op_name)(skipna=skipna) + expected = getattr(np.asarray(ser), op_name)() tm.assert_almost_equal(result, expected) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request): if all_numeric_reductions in ["kurt", "skew", "sem", "median"]: mark = 
pytest.mark.xfail(raises=NotImplementedError) - request.node.add_marker(mark) + request.applymarker(mark) super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) def test_reduce_frame(self, data, all_numeric_reductions, skipna, request): op_name = all_numeric_reductions if op_name in ["skew", "median"]: mark = pytest.mark.xfail(raises=NotImplementedError) - request.node.add_marker(mark) + request.applymarker(mark) return super().test_reduce_frame(data, all_numeric_reductions, skipna) @@ -156,6 +156,36 @@ ): super().test_fillna_limit_pad(data_missing) + @pytest.mark.parametrize( + "limit_area, input_ilocs, expected_ilocs", + [ + ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), + ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], + ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + msg = "ExtensionArray.fillna 'method' keyword is deprecated" + with tm.assert_produces_warning( + DeprecationWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + msg = "DecimalArray does not implement limit_area" + with pytest.raises(NotImplementedError, match=msg): + super().test_ffill_limit_area( + data_missing, limit_area, input_ilocs, expected_ilocs + ) + def test_fillna_limit_backfill(self, data_missing): msg = "Series.fillna with 'method' is deprecated" with tm.assert_produces_warning( @@ -254,12 +284,6 @@ assert data.dtype.name in repr(ser) assert "Decimal: " in repr(ser) - @pytest.mark.xfail( - reason="Looks like the test (incorrectly) implicitly assumes int/bool dtype" - ) - def test_invert(self, data): - super().test_invert(data) - @pytest.mark.xfail(reason="Inconsistent array-vs-scalar behavior") @pytest.mark.parametrize("ufunc", [np.positive, np.negative, np.abs]) def test_unary_ufunc_dunder_equivalence(self, data, ufunc): @@ -334,7 +358,7 @@ """Helper class for testing error handling in _from_sequence.""" @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): + def _from_sequence(cls, scalars, *, dtype=None, copy=False): raise KeyError("For the test") diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/json/array.py pandas-2.2.2+dfsg/pandas/tests/extension/json/array.py --- pandas-2.1.4+dfsg/pandas/tests/extension/json/array.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/json/array.py 2024-04-10 17:42:52.000000000 +0000 @@ -83,7 +83,7 @@ # self._values = self.values = self.data @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): + def _from_sequence(cls, scalars, *, dtype=None, copy=False): return cls(scalars) @classmethod @@ -112,7 +112,9 @@ else: item = pd.api.indexers.check_array_indexer(self, item) if is_bool_dtype(item.dtype): - return self._from_sequence([x for x, m in zip(self, item) if m]) + return type(self)._from_sequence( + [x for x, m in zip(self, item) if m], dtype=self.dtype + ) # integer return type(self)([self.data[i] for i in item]) @@ -144,7 +146,7 @@ def __ne__(self, other): return NotImplemented - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): if dtype is None: dtype = object if dtype == object: @@ -187,7 +189,7 @@ except IndexError as err: raise IndexError(msg) from err - return 
self._from_sequence(output) + return type(self)._from_sequence(output, dtype=self.dtype) def copy(self): return type(self)(self.data[:]) @@ -206,9 +208,12 @@ return self elif isinstance(dtype, StringDtype): value = self.astype(str) # numpy doesn't like nested dicts - return dtype.construct_array_type()._from_sequence(value, copy=False) - - return np.array([dict(x) for x in self], dtype=dtype, copy=copy) + arr_cls = dtype.construct_array_type() + return arr_cls._from_sequence(value, dtype=dtype, copy=False) + elif not copy: + return np.asarray([dict(x) for x in self], dtype=dtype) + else: + return np.array([dict(x) for x in self], dtype=dtype, copy=copy) def unique(self): # Parent method doesn't work since np.array will try to infer @@ -232,6 +237,10 @@ frozen = [tuple(x.items()) for x in self] return construct_1d_object_array_from_listlike(frozen) + def _pad_or_backfill(self, *, method, limit=None, copy=True): + # GH#56616 - test EA method without limit_area argument + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) + def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/json/test_json.py pandas-2.2.2+dfsg/pandas/tests/extension/json/test_json.py --- pandas-2.1.4+dfsg/pandas/tests/extension/json/test_json.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/json/test_json.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,6 +2,7 @@ import operator import sys +import numpy as np import pytest import pandas as pd @@ -13,6 +14,10 @@ make_data, ) +# We intentionally don't run base.BaseSetitemTests because pandas' +# internals has trouble setting sequences of values into scalar positions. +unhashable = pytest.mark.xfail(reason="Unhashable") + @pytest.fixture def dtype(): @@ -73,15 +78,7 @@ ) -class BaseJSON: - pass - - -class TestDtype(BaseJSON, base.BaseDtypeTests): - pass - - -class TestInterface(BaseJSON, base.BaseInterfaceTests): +class TestJSONArray(base.ExtensionTests): @pytest.mark.xfail( reason="comparison method not implemented for JSONArray (GH-37867)" ) @@ -89,8 +86,6 @@ # GH-37867 super().test_contains(data) - -class TestConstructors(BaseJSON, base.BaseConstructorsTests): @pytest.mark.xfail(reason="not implemented constructor from dtype") def test_from_dtype(self, data): # construct from our dtype & string dtype @@ -129,8 +124,6 @@ finally: sys.setrecursionlimit(rec_limit) - -class TestReshaping(BaseJSON, base.BaseReshapingTests): @pytest.mark.xfail(reason="Different definitions of NA") def test_stack(self): """ @@ -146,16 +139,6 @@ # this matches otherwise return super().test_unstack(data, index) - -class TestGetitem(BaseJSON, base.BaseGetitemTests): - pass - - -class TestIndex(BaseJSON, base.BaseIndexTests): - pass - - -class TestMissing(BaseJSON, base.BaseMissingTests): @pytest.mark.xfail(reason="Setting a dict as a scalar") def test_fillna_series(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" @@ -166,15 +149,29 @@ """We treat dictionaries as a mapping in fillna, not a scalar.""" super().test_fillna_frame() + @pytest.mark.parametrize( + "limit_area, input_ilocs, expected_ilocs", + [ + ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), + ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), + ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), + ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), + ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), + ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), + 
("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), + ], + ) + def test_ffill_limit_area( + self, data_missing, limit_area, input_ilocs, expected_ilocs + ): + # GH#56616 + msg = "JSONArray does not implement limit_area" + with pytest.raises(NotImplementedError, match=msg): + super().test_ffill_limit_area( + data_missing, limit_area, input_ilocs, expected_ilocs + ) -unhashable = pytest.mark.xfail(reason="Unhashable") - - -class TestReduce(base.BaseReduceTests): - pass - - -class TestMethods(BaseJSON, base.BaseMethodsTests): @unhashable def test_value_counts(self, all_data, dropna): super().test_value_counts(all_data, dropna) @@ -237,11 +234,9 @@ ): if using_copy_on_write: mark = pytest.mark.xfail(reason="Fails with CoW") - request.node.add_marker(mark) + request.applymarker(mark) super().test_equals_same_data_different_object(data) - -class TestCasting(BaseJSON, base.BaseCastingTests): @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)") def test_astype_str(self): """This currently fails in NumPy on np.array(self, dtype=str) with @@ -250,12 +245,6 @@ """ super().test_astype_str() - -# We intentionally don't run base.BaseSetitemTests because pandas' -# internals has trouble setting sequences of values into scalar positions. - - -class TestGroupby(BaseJSON, base.BaseGroupbyTests): @unhashable def test_groupby_extension_transform(self): """ @@ -295,25 +284,147 @@ """ super().test_groupby_extension_no_sort() - -class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests): def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): if len(data[0]) != 1: mark = pytest.mark.xfail(reason="raises in coercing to Series") - request.node.add_marker(mark) + request.applymarker(mark) super().test_arith_frame_with_scalar(data, all_arithmetic_operators) - -class TestComparisonOps(BaseJSON, base.BaseComparisonOpsTests): def test_compare_array(self, data, comparison_op, request): if comparison_op.__name__ in ["eq", "ne"]: mark = pytest.mark.xfail(reason="Comparison methods not implemented") - request.node.add_marker(mark) + request.applymarker(mark) super().test_compare_array(data, comparison_op) + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_loc_scalar_mixed(self, data): + super().test_setitem_loc_scalar_mixed(data) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_loc_scalar_multiple_homogoneous(self, data): + super().test_setitem_loc_scalar_multiple_homogoneous(data) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_iloc_scalar_mixed(self, data): + super().test_setitem_iloc_scalar_mixed(data) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_iloc_scalar_multiple_homogoneous(self, data): + super().test_setitem_iloc_scalar_multiple_homogoneous(data) + + @pytest.mark.parametrize( + "mask", + [ + np.array([True, True, True, False, False]), + pd.array([True, True, True, False, False], dtype="boolean"), + pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"), + ], + ids=["numpy-array", "boolean-array", "boolean-array-na"], + ) + def test_setitem_mask(self, data, mask, box_in_series, request): + if box_in_series: + mark = pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + request.applymarker(mark) + elif not isinstance(mask, np.ndarray): + mark = pytest.mark.xfail(reason="Issues unwanted DeprecationWarning") + request.applymarker(mark) + 
super().test_setitem_mask(data, mask, box_in_series) + + def test_setitem_mask_raises(self, data, box_in_series, request): + if not box_in_series: + mark = pytest.mark.xfail(reason="Fails to raise") + request.applymarker(mark) + + super().test_setitem_mask_raises(data, box_in_series) + + @pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): + super().test_setitem_mask_boolean_array_with_na(data, box_in_series) + + @pytest.mark.parametrize( + "idx", + [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_setitem_integer_array(self, data, idx, box_in_series, request): + if box_in_series: + mark = pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + request.applymarker(mark) + super().test_setitem_integer_array(data, idx, box_in_series) + + @pytest.mark.xfail(reason="list indices must be integers or slices, not NAType") + @pytest.mark.parametrize( + "idx, box_in_series", + [ + ([0, 1, 2, pd.NA], False), + pytest.param( + [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948") + ), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + ], + ids=["list-False", "list-True", "integer-array-False", "integer-array-True"], + ) + def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): + super().test_setitem_integer_with_missing_raises(data, idx, box_in_series) + + @pytest.mark.xfail(reason="Fails to raise") + def test_setitem_scalar_key_sequence_raise(self, data): + super().test_setitem_scalar_key_sequence_raise(data) + + def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request): + if "full_slice" in request.node.name: + mark = pytest.mark.xfail(reason="slice is not iterable") + request.applymarker(mark) + super().test_setitem_with_expansion_dataframe_column(data, full_indexer) + + @pytest.mark.xfail(reason="slice is not iterable") + def test_setitem_frame_2d_values(self, data): + super().test_setitem_frame_2d_values(data) + + @pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + @pytest.mark.parametrize("setter", ["loc", None]) + def test_setitem_mask_broadcast(self, data, setter): + super().test_setitem_mask_broadcast(data, setter) + + @pytest.mark.xfail( + reason="cannot set using a slice indexer with a different length" + ) + def test_setitem_slice(self, data, box_in_series): + super().test_setitem_slice(data, box_in_series) -class TestPrinting(BaseJSON, base.BasePrintingTests): - pass + @pytest.mark.xfail(reason="slice object is not iterable") + def test_setitem_loc_iloc_slice(self, data): + super().test_setitem_loc_iloc_slice(data) + + @pytest.mark.xfail(reason="slice object is not iterable") + def test_setitem_slice_mismatch_length_raises(self, data): + super().test_setitem_slice_mismatch_length_raises(data) + + @pytest.mark.xfail(reason="slice object is not iterable") + def test_setitem_slice_array(self, data): + super().test_setitem_slice_array(data) + + @pytest.mark.xfail(reason="Fail to raise") + def test_setitem_invalid(self, data, invalid_scalar): + super().test_setitem_invalid(data, invalid_scalar) + + @pytest.mark.xfail(reason="only integer scalar arrays can be converted") + def test_setitem_2d_values(self, data): + super().test_setitem_2d_values(data) + + @pytest.mark.xfail(reason="data type 
'json' not understood") + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) def custom_assert_series_equal(left, right, *args, **kwargs): diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/list/array.py pandas-2.2.2+dfsg/pandas/tests/extension/list/array.py --- pandas-2.1.4+dfsg/pandas/tests/extension/list/array.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/list/array.py 2024-04-10 17:42:52.000000000 +0000 @@ -54,7 +54,7 @@ self.data = values @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): + def _from_sequence(cls, scalars, *, dtype=None, copy=False): data = np.empty(len(scalars), dtype=object) data[:] = scalars return cls(data) @@ -115,7 +115,10 @@ elif is_string_dtype(dtype) and not is_object_dtype(dtype): # numpy has problems with astype(str) for nested elements return np.array([str(x) for x in self.data], dtype=dtype) - return np.array(self.data, dtype=dtype, copy=copy) + elif not copy: + return np.asarray(self.data, dtype=dtype) + else: + return np.array(self.data, dtype=dtype, copy=copy) @classmethod def _concat_same_type(cls, to_concat): diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/list/test_list.py pandas-2.2.2+dfsg/pandas/tests/extension/list/test_list.py --- pandas-2.1.4+dfsg/pandas/tests/extension/list/test_list.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/list/test_list.py 2024-04-10 17:42:52.000000000 +0000 @@ -27,7 +27,7 @@ def test_to_csv(data): # https://github.com/pandas-dev/pandas/issues/28840 # array with list-likes fail when doing astype(str) on the numpy array - # which was done in to_native_types + # which was done in get_values_for_csv df = pd.DataFrame({"a": data}) res = df.to_csv() assert str(data[0]) in res diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/test_arrow.py pandas-2.2.2+dfsg/pandas/tests/extension/test_arrow.py --- pandas-2.1.4+dfsg/pandas/tests/extension/test_arrow.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/test_arrow.py 2024-04-10 17:42:52.000000000 +0000 @@ -34,15 +34,14 @@ from pandas._libs.tslibs import timezones from pandas.compat import ( PY311, + PY312, is_ci_environment, is_platform_windows, - pa_version_under7p0, - pa_version_under8p0, - pa_version_under9p0, pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, ) +import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -63,7 +62,7 @@ ) from pandas.tests.extension import base -pa = pytest.importorskip("pyarrow", minversion="7.0.0") +pa = pytest.importorskip("pyarrow") from pandas.core.arrays.arrow.array import ArrowExtensionArray from pandas.core.arrays.arrow.extension_types import ArrowPeriodType @@ -78,7 +77,7 @@ "on CI to path to the tzdata for pyarrow." ), ) - request.node.add_marker(mark) + request.applymarker(mark) @pytest.fixture(params=tm.ALL_PYARROW_DTYPES, ids=str) @@ -267,19 +266,63 @@ # TODO: skip otherwise? 
-class TestBaseCasting(base.BaseCastingTests): +class TestArrowArray(base.ExtensionTests): + def test_compare_scalar(self, data, comparison_op): + ser = pd.Series(data) + self._compare_other(ser, data, comparison_op, data[0]) + + @pytest.mark.parametrize("na_action", [None, "ignore"]) + def test_map(self, data_missing, na_action): + if data_missing.dtype.kind in "mM": + result = data_missing.map(lambda x: x, na_action=na_action) + expected = data_missing.to_numpy(dtype=object) + tm.assert_numpy_array_equal(result, expected) + else: + result = data_missing.map(lambda x: x, na_action=na_action) + if data_missing.dtype == "float32[pyarrow]": + # map roundtrips through objects, which converts to float64 + expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) + else: + expected = data_missing.to_numpy() + tm.assert_numpy_array_equal(result, expected) + def test_astype_str(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_binary(pa_dtype): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"For {pa_dtype} .astype(str) decodes.", ) ) + elif ( + pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None + ) or pa.types.is_duration(pa_dtype): + request.applymarker( + pytest.mark.xfail( + reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", + ) + ) super().test_astype_str(data) + @pytest.mark.parametrize( + "nullable_string_dtype", + [ + "string[python]", + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_astype_string(self, data, nullable_string_dtype, request): + pa_dtype = data.dtype.pyarrow_dtype + if ( + pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None + ) or pa.types.is_duration(pa_dtype): + request.applymarker( + pytest.mark.xfail( + reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", + ) + ) + super().test_astype_string(data, nullable_string_dtype) -class TestConstructors(base.BaseConstructorsTests): def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype): @@ -288,7 +331,7 @@ else: reason = f"pyarrow.type_for_alias cannot infer {pa_dtype}" - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=reason, ) @@ -298,11 +341,13 @@ def test_from_sequence_pa_array(self, data): # https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784 # data._pa_array = pa.ChunkedArray - result = type(data)._from_sequence(data._pa_array) + result = type(data)._from_sequence(data._pa_array, dtype=data.dtype) tm.assert_extension_array_equal(result, data) assert isinstance(result._pa_array, pa.ChunkedArray) - result = type(data)._from_sequence(data._pa_array.combine_chunks()) + result = type(data)._from_sequence( + data._pa_array.combine_chunks(), dtype=data.dtype + ) tm.assert_extension_array_equal(result, data) assert isinstance(result._pa_array, pa.ChunkedArray) @@ -315,7 +360,7 @@ def test_from_sequence_of_strings_pa_array(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_time64(pa_dtype) and pa_dtype.equals("time64[ns]") and not PY311: - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="Nanosecond time parsing not supported.", ) @@ -323,7 +368,7 @@ elif pa_version_under11p0 and ( pa.types.is_duration(pa_dtype) or pa.types.is_decimal(pa_dtype) ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, reason=f"pyarrow doesn't support parsing {pa_dtype}", @@ -340,12 +385,6 
@@ result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype) tm.assert_extension_array_equal(result, data) - -class TestGetitemTests(base.BaseGetitemTests): - pass - - -class TestBaseAccumulateTests(base.BaseAccumulateTests): def check_accumulate(self, ser, op_name, skipna): result = getattr(ser, op_name)(skipna=skipna) @@ -397,9 +436,7 @@ data, all_numeric_accumulations, skipna ) - if pa_version_under9p0 or ( - pa_version_under13p0 and all_numeric_accumulations != "cumsum" - ): + if pa_version_under13p0 and all_numeric_accumulations != "cumsum": # xfailing takes a long time to run because pytest # renders the exception messages even when not showing them opt = request.config.option @@ -410,12 +447,12 @@ mark = pytest.mark.xfail( reason=f"{all_numeric_accumulations} not implemented for pyarrow < 9" ) - request.node.add_marker(mark) + request.applymarker(mark) elif all_numeric_accumulations == "cumsum" and ( pa.types.is_boolean(pa_type) or pa.types.is_decimal(pa_type) ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"{all_numeric_accumulations} not implemented for {pa_type}", raises=NotImplementedError, @@ -424,10 +461,8 @@ self.check_accumulate(ser, op_name, skipna) - -class TestReduce(base.BaseReduceTests): - def _supports_reduction(self, obj, op_name: str) -> bool: - dtype = tm.get_dtype(obj) + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + dtype = ser.dtype # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has # no attribute "pyarrow_dtype" pa_dtype = dtype.pyarrow_dtype # type: ignore[union-attr] @@ -470,20 +505,25 @@ return True - def check_reduce(self, ser, op_name, skipna): - pa_dtype = ser.dtype.pyarrow_dtype - if op_name == "count": - result = getattr(ser, op_name)() + def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): + # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no + # attribute "pyarrow_dtype" + pa_dtype = ser.dtype.pyarrow_dtype # type: ignore[union-attr] + if pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype): + alt = ser.astype("Float64") else: - result = getattr(ser, op_name)(skipna=skipna) + # TODO: in the opposite case, aren't we testing... nothing? For + # e.g. date/time dtypes trying to calculate 'expected' by converting + # to object will raise for mean, std etc + alt = ser - if pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype): - ser = ser.astype("Float64") # TODO: in the opposite case, aren't we testing... nothing? 
if op_name == "count": - expected = getattr(ser, op_name)() + result = getattr(ser, op_name)() + expected = getattr(alt, op_name)() else: - expected = getattr(ser, op_name)(skipna=skipna) + result = getattr(ser, op_name)(skipna=skipna) + expected = getattr(alt, op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) @pytest.mark.parametrize("skipna", [True, False]) @@ -501,19 +541,7 @@ if all_numeric_reductions in {"skew", "kurt"} and ( dtype._is_numeric or dtype.kind == "b" ): - request.node.add_marker(xfail_mark) - elif ( - all_numeric_reductions in {"var", "std", "median"} - and pa_version_under7p0 - and pa.types.is_decimal(pa_dtype) - ): - request.node.add_marker(xfail_mark) - elif ( - all_numeric_reductions == "sem" - and pa_version_under8p0 - and (dtype._is_numeric or pa.types.is_temporal(pa_dtype)) - ): - request.node.add_marker(xfail_mark) + request.applymarker(xfail_mark) elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in { "sem", @@ -521,7 +549,7 @@ "var", "median", }: - request.node.add_marker(xfail_mark) + request.applymarker(xfail_mark) super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("skipna", [True, False]) @@ -539,7 +567,7 @@ if pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype): # We *might* want to make this behave like the non-pyarrow cases, # but have not yet decided. - request.node.add_marker(xfail_mark) + request.applymarker(xfail_mark) return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) @@ -567,7 +595,7 @@ if op_name == "skew": if data.dtype._is_numeric: mark = pytest.mark.xfail(reason="skew not implemented") - request.node.add_marker(mark) + request.applymarker(mark) return super().test_reduce_frame(data, all_numeric_reductions, skipna) @pytest.mark.parametrize("typ", ["int64", "uint64", "float64"]) @@ -576,8 +604,6 @@ result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median() assert result == 1.5 - -class TestBaseGroupby(base.BaseGroupbyTests): def test_in_numeric_groupby(self, data_for_grouping): dtype = data_for_grouping.dtype if is_string_dtype(dtype): @@ -598,12 +624,10 @@ else: super().test_in_numeric_groupby(data_for_grouping) - -class TestBaseDtype(base.BaseDtypeTests): def test_construct_from_string_own_name(self, dtype, request): pa_dtype = dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=NotImplementedError, reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", @@ -627,7 +651,7 @@ assert not type(dtype).is_dtype(dtype.name) else: if pa.types.is_decimal(pa_dtype): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=NotImplementedError, reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", @@ -649,7 +673,7 @@ or pa.types.is_binary(pa_dtype) or pa.types.is_decimal(pa_dtype) ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=( f"{pa_dtype} does not have associated numpy " @@ -666,20 +690,12 @@ else: super().test_is_not_string_type(dtype) - -class TestBaseIndex(base.BaseIndexTests): - pass - - -class TestBaseInterface(base.BaseInterfaceTests): @pytest.mark.xfail( reason="GH 45419: pyarrow.ChunkedArray does not support views.", run=False ) def test_view(self, data): super().test_view(data) - -class TestBaseMissing(base.BaseMissingTests): def test_fillna_no_op_returns_copy(self, data): data = data[~data.isna()] @@ -692,48 +708,38 @@ assert result is not data tm.assert_extension_array_equal(result, data) - -class 
TestBasePrinting(base.BasePrintingTests): - pass - - -class TestBaseReshaping(base.BaseReshapingTests): @pytest.mark.xfail( reason="GH 45419: pyarrow.ChunkedArray does not support views", run=False ) def test_transpose(self, data): super().test_transpose(data) - -class TestBaseSetitem(base.BaseSetitemTests): @pytest.mark.xfail( reason="GH 45419: pyarrow.ChunkedArray does not support views", run=False ) def test_setitem_preserves_views(self, data): super().test_setitem_preserves_views(data) - -class TestBaseParsing(base.BaseParsingTests): @pytest.mark.parametrize("dtype_backend", ["pyarrow", no_default]) @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data, dtype_backend, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=NotImplementedError, reason=f"Parameterized types {pa_dtype} not supported.", ) ) elif pa.types.is_timestamp(pa_dtype) and pa_dtype.unit in ("us", "ns"): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=ValueError, reason="https://github.com/pandas-dev/pandas/issues/49767", ) ) elif pa.types.is_binary(pa_dtype): - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") ) df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) @@ -751,26 +757,32 @@ expected = df tm.assert_frame_equal(result, expected) - -class TestBaseUnaryOps(base.BaseUnaryOpsTests): def test_invert(self, data, request): pa_dtype = data.dtype.pyarrow_dtype - if not (pa.types.is_boolean(pa_dtype) or pa.types.is_integer(pa_dtype)): - request.node.add_marker( + if not ( + pa.types.is_boolean(pa_dtype) + or pa.types.is_integer(pa_dtype) + or pa.types.is_string(pa_dtype) + ): + request.applymarker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, reason=f"pyarrow.compute.invert does support {pa_dtype}", ) ) - super().test_invert(data) - + if PY312 and pa.types.is_boolean(pa_dtype): + with tm.assert_produces_warning( + DeprecationWarning, match="Bitwise inversion", check_stacklevel=False + ): + super().test_invert(data) + else: + super().test_invert(data) -class TestBaseMethods(base.BaseMethodsTests): @pytest.mark.parametrize("periods", [1, -2]) def test_diff(self, data, periods, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_unsigned_integer(pa_dtype) and periods == 1: - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=pa.ArrowInvalid, reason=( @@ -786,51 +798,8 @@ result = data.value_counts() assert result.dtype == ArrowDtype(pa.int64()) - def test_argmin_argmax( - self, data_for_sorting, data_missing_for_sorting, na_value, request - ): - pa_dtype = data_for_sorting.dtype.pyarrow_dtype - if pa.types.is_decimal(pa_dtype) and pa_version_under7p0: - request.node.add_marker( - pytest.mark.xfail( - reason=f"No pyarrow kernel for {pa_dtype}", - raises=pa.ArrowNotImplementedError, - ) - ) - super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value) - - @pytest.mark.parametrize( - "op_name, skipna, expected", - [ - ("idxmax", True, 0), - ("idxmin", True, 2), - ("argmax", True, 0), - ("argmin", True, 2), - ("idxmax", False, np.nan), - ("idxmin", False, np.nan), - ("argmax", False, -1), - ("argmin", False, -1), - ], - ) - def test_argreduce_series( - self, data_missing_for_sorting, op_name, skipna, expected, request - ): - pa_dtype = data_missing_for_sorting.dtype.pyarrow_dtype - if 
pa.types.is_decimal(pa_dtype) and pa_version_under7p0 and skipna: - request.node.add_marker( - pytest.mark.xfail( - reason=f"No pyarrow kernel for {pa_dtype}", - raises=pa.ArrowNotImplementedError, - ) - ) - super().test_argreduce_series( - data_missing_for_sorting, op_name, skipna, expected - ) - _combine_le_expected_dtype = "bool[pyarrow]" - -class TestBaseArithmeticOps(base.BaseArithmeticOpsTests): divmod_exc = NotImplementedError def get_op_from_name(self, op_name): @@ -853,6 +822,9 @@ # while ArrowExtensionArray maintains original type expected = pointwise_result + if op_name in ["eq", "ne", "lt", "le", "gt", "ge"]: + return pointwise_result.astype("boolean[pyarrow]") + was_frame = False if isinstance(expected, pd.DataFrame): was_frame = True @@ -947,7 +919,7 @@ return expected def _is_temporal_supported(self, opname, pa_dtype): - return not pa_version_under8p0 and ( + return ( ( opname in ("__add__", "__radd__") or ( @@ -1002,14 +974,10 @@ arrow_temporal_supported = self._is_temporal_supported(opname, pa_dtype) - if ( - opname == "__rpow__" - and ( - pa.types.is_floating(pa_dtype) - or pa.types.is_integer(pa_dtype) - or pa.types.is_decimal(pa_dtype) - ) - and not pa_version_under7p0 + if opname == "__rpow__" and ( + pa.types.is_floating(pa_dtype) + or pa.types.is_integer(pa_dtype) + or pa.types.is_decimal(pa_dtype) ): mark = pytest.mark.xfail( reason=( @@ -1032,47 +1000,38 @@ f"pd.NA and {pa_dtype} Python scalar" ), ) - elif ( - opname == "__rfloordiv__" - and (pa.types.is_integer(pa_dtype) or pa.types.is_decimal(pa_dtype)) - and not pa_version_under7p0 + elif opname == "__rfloordiv__" and ( + pa.types.is_integer(pa_dtype) or pa.types.is_decimal(pa_dtype) ): mark = pytest.mark.xfail( raises=pa.ArrowInvalid, reason="divide by 0", ) - elif ( - opname == "__rtruediv__" - and pa.types.is_decimal(pa_dtype) - and not pa_version_under7p0 - ): + elif opname == "__rtruediv__" and pa.types.is_decimal(pa_dtype): mark = pytest.mark.xfail( raises=pa.ArrowInvalid, reason="divide by 0", ) - elif ( - opname == "__pow__" - and pa.types.is_decimal(pa_dtype) - and pa_version_under7p0 - ): - mark = pytest.mark.xfail( - raises=pa.ArrowInvalid, - reason="Invalid decimal function: power_checked", - ) return mark def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): pa_dtype = data.dtype.pyarrow_dtype - if all_arithmetic_operators == "__rmod__" and ( - pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) - ): + if all_arithmetic_operators == "__rmod__" and pa.types.is_binary(pa_dtype): pytest.skip("Skip testing Python string formatting") + elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( + pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) + ): + request.applymarker( + pytest.mark.xfail( + raises=TypeError, reason="Can only string multiply by an integer." + ) + ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: - request.node.add_marker(mark) + request.applymarker(mark) super().test_arith_series_with_scalar(data, all_arithmetic_operators) @@ -1083,26 +1042,29 @@ pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) ): pytest.skip("Skip testing Python string formatting") + elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( + pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) + ): + request.applymarker( + pytest.mark.xfail( + raises=TypeError, reason="Can only string multiply by an integer." 
+ ) + ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: - request.node.add_marker(mark) + request.applymarker(mark) super().test_arith_frame_with_scalar(data, all_arithmetic_operators) def test_arith_series_with_array(self, data, all_arithmetic_operators, request): pa_dtype = data.dtype.pyarrow_dtype - if ( - all_arithmetic_operators - in ( - "__sub__", - "__rsub__", - ) - and pa.types.is_unsigned_integer(pa_dtype) - and not pa_version_under7p0 - ): - request.node.add_marker( + if all_arithmetic_operators in ( + "__sub__", + "__rsub__", + ) and pa.types.is_unsigned_integer(pa_dtype): + request.applymarker( pytest.mark.xfail( raises=pa.ArrowInvalid, reason=( @@ -1111,10 +1073,18 @@ ), ) ) + elif all_arithmetic_operators in ("__rmul__", "__mul__") and ( + pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype) + ): + request.applymarker( + pytest.mark.xfail( + raises=TypeError, reason="Can only string multiply by an integer." + ) + ) mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype) if mark is not None: - request.node.add_marker(mark) + request.applymarker(mark) op_name = all_arithmetic_operators ser = pd.Series(data) @@ -1128,7 +1098,7 @@ pa_dtype = data.dtype.pyarrow_dtype if pa_dtype.equals("int8"): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=pa.ArrowInvalid, reason=f"raises on overflow for {pa_dtype}", @@ -1136,28 +1106,6 @@ ) super().test_add_series_with_extension_array(data) - -class TestBaseComparisonOps(base.BaseComparisonOpsTests): - def test_compare_array(self, data, comparison_op, na_value): - ser = pd.Series(data) - # pd.Series([ser.iloc[0]] * len(ser)) may not return ArrowExtensionArray - # since ser.iloc[0] is a python scalar - other = pd.Series(pd.array([ser.iloc[0]] * len(ser), dtype=data.dtype)) - if comparison_op.__name__ in ["eq", "ne"]: - # comparison should match point-wise comparisons - result = comparison_op(ser, other) - # Series.combine does not calculate the NA mask correctly - # when comparing over an array - assert result[8] is na_value - assert result[97] is na_value - expected = ser.combine(other, comparison_op) - expected[8] = na_value - expected[97] = na_value - tm.assert_series_equal(result, expected) - - else: - return super().test_compare_array(data, comparison_op) - def test_invalid_other_comp(self, data, comparison_op): # GH 48833 with pytest.raises( @@ -1392,6 +1340,26 @@ pd.Series(range(3), dtype=invalid) +def test_arrow_string_multiplication(): + # GH 56537 + binary = pd.Series(["abc", "defg"], dtype=ArrowDtype(pa.string())) + repeat = pd.Series([2, -2], dtype="int64[pyarrow]") + result = binary * repeat + expected = pd.Series(["abcabc", ""], dtype=ArrowDtype(pa.string())) + tm.assert_series_equal(result, expected) + reflected_result = repeat * binary + tm.assert_series_equal(result, reflected_result) + + +def test_arrow_string_multiplication_scalar_repeat(): + binary = pd.Series(["abc", "defg"], dtype=ArrowDtype(pa.string())) + result = binary * 2 + expected = pd.Series(["abcabc", "defgdefg"], dtype=ArrowDtype(pa.string())) + tm.assert_series_equal(result, expected) + reflected_result = 2 * binary + tm.assert_series_equal(reflected_result, expected) + + @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] ) @@ -1409,10 +1377,7 @@ ): # For string, bytes, and bool, we don't *expect* to have quantile work # Note this matches the non-pyarrow behavior - if pa_version_under7p0: - msg = r"Function quantile has no 
kernel matching input types \(.*\)" - else: - msg = r"Function 'quantile' has no kernel matching input types \(.*\)" + msg = r"Function 'quantile' has no kernel matching input types \(.*\)" with pytest.raises(pa.ArrowNotImplementedError, match=msg): ser.quantile(q=quantile, interpolation=interpolation) return @@ -1420,13 +1385,13 @@ if ( pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype) - or (pa.types.is_decimal(pa_dtype) and not pa_version_under7p0) + or pa.types.is_decimal(pa_dtype) ): pass elif pa.types.is_temporal(data._pa_array.type): pass else: - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, reason=f"quantile not supported by pyarrow for {pa_dtype}", @@ -1607,21 +1572,26 @@ tm.assert_series_equal(result, expected) +def test_astype_errors_ignore(): + # GH 55399 + expected = pd.DataFrame({"col": [17000000]}, dtype="int32[pyarrow]") + result = expected.astype("float[pyarrow]", errors="ignore") + tm.assert_frame_equal(result, expected) + + def test_to_numpy_with_defaults(data): # GH49973 result = data.to_numpy() pa_type = data._pa_array.type - if ( - pa.types.is_duration(pa_type) - or pa.types.is_timestamp(pa_type) - or pa.types.is_date(pa_type) - ): + if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type): + pytest.skip("Tested in test_to_numpy_temporal") + elif pa.types.is_date(pa_type): expected = np.array(list(data)) else: expected = np.array(data._pa_array) - if data._hasna: + if data._hasna and not is_numeric_dtype(data.dtype): expected = expected.astype(object) expected[pd.isna(data)] = pd.NA @@ -1633,8 +1603,8 @@ data = [1, None] arr = pd.array(data, dtype="int64[pyarrow]") result = arr.to_numpy() - expected = np.array([1, pd.NA], dtype=object) - assert isinstance(result[0], int) + expected = np.array([1, np.nan]) + assert isinstance(result[0], float) tm.assert_numpy_array_equal(result, expected) @@ -1655,6 +1625,19 @@ tm.assert_numpy_array_equal(result, expected) +def test_to_numpy_without_dtype(): + # GH 54808 + arr = pd.array([True, pd.NA], dtype="boolean[pyarrow]") + result = arr.to_numpy(na_value=False) + expected = np.array([True, False], dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) + + arr = pd.array([1.0, pd.NA], dtype="float32[pyarrow]") + result = arr.to_numpy(na_value=0.0) + expected = np.array([1.0, 0.0], dtype=np.float32) + tm.assert_numpy_array_equal(result, expected) + + def test_setitem_null_slice(data): # GH50248 orig = data.copy() @@ -1663,7 +1646,7 @@ result[:] = data[0] expected = ArrowExtensionArray._from_sequence( [data[0]] * len(data), - dtype=data._pa_array.type, + dtype=data.dtype, ) tm.assert_extension_array_equal(result, expected) @@ -1701,7 +1684,6 @@ data[:] = fill_value -@pytest.mark.skipif(pa_version_under8p0, reason="returns object with 7.0") def test_from_arrow_respecting_given_dtype(): date_array = pa.array( [pd.Timestamp("2019-12-31"), pd.Timestamp("2019-12-31")], type=pa.date32() @@ -1716,7 +1698,6 @@ tm.assert_series_equal(result, expected) -@pytest.mark.skipif(pa_version_under8p0, reason="doesn't raise with 7") def test_from_arrow_respecting_given_dtype_unsafe(): array = pa.array([1.5, 2.5], type=pa.float64()) with pytest.raises(pa.ArrowInvalid, match="Float value 1.5 was truncated"): @@ -1826,19 +1807,33 @@ @pytest.mark.parametrize( "side, pat, na, exp", [ - ["startswith", "ab", None, [True, None]], - ["startswith", "b", False, [False, False]], - ["endswith", "b", True, [False, True]], - ["endswith", "bc", None, [True, None]], + 
["startswith", "ab", None, [True, None, False]], + ["startswith", "b", False, [False, False, False]], + ["endswith", "b", True, [False, True, False]], + ["endswith", "bc", None, [True, None, False]], + ["startswith", ("a", "e", "g"), None, [True, None, True]], + ["endswith", ("a", "c", "g"), None, [True, None, True]], + ["startswith", (), None, [False, None, False]], + ["endswith", (), None, [False, None, False]], ], ) def test_str_start_ends_with(side, pat, na, exp): - ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) + ser = pd.Series(["abc", None, "efg"], dtype=ArrowDtype(pa.string())) result = getattr(ser.str, side)(pat, na=na) expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("side", ("startswith", "endswith")) +def test_str_starts_ends_with_all_nulls_empty_tuple(side): + ser = pd.Series([None, None], dtype=ArrowDtype(pa.string())) + result = getattr(ser.str, side)(()) + + # bool datatype preserved for all nulls. + expected = pd.Series([None, None], dtype=ArrowDtype(pa.bool_())) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "arg_name, arg", [["pat", re.compile("b")], ["repl", str], ["case", False], ["flags", 1]], @@ -1866,17 +1861,20 @@ tm.assert_series_equal(result, expected) +def test_str_replace_negative_n(): + # GH 56404 + ser = pd.Series(["abc", "aaaaaa"], dtype=ArrowDtype(pa.string())) + actual = ser.str.replace("a", "", -3, True) + expected = pd.Series(["bc", ""], dtype=ArrowDtype(pa.string())) + tm.assert_series_equal(expected, actual) + + def test_str_repeat_unsupported(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) with pytest.raises(NotImplementedError, match="repeat is not"): ser.str.repeat([1, 2]) -@pytest.mark.xfail( - pa_version_under7p0, - reason="Unsupported for pyarrow < 7", - raises=NotImplementedError, -) def test_str_repeat(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) result = ser.str.repeat(2) @@ -1905,16 +1903,21 @@ @pytest.mark.parametrize( "pat, case, na, exp", [ - ["abc", False, None, [True, None]], - ["Abc", True, None, [False, None]], - ["bc", True, None, [False, None]], - ["ab", False, True, [True, True]], - ["a[a-z]{2}", False, None, [True, None]], - ["A[a-z]{1}", True, None, [False, None]], + ["abc", False, None, [True, True, False, None]], + ["Abc", True, None, [False, False, False, None]], + ["bc", True, None, [False, False, False, None]], + ["ab", False, None, [True, True, False, None]], + ["a[a-z]{2}", False, None, [True, True, False, None]], + ["A[a-z]{1}", True, None, [False, False, False, None]], + # GH Issue: #56652 + ["abc$", False, None, [True, False, False, None]], + ["abc\\$", False, None, [False, True, False, None]], + ["Abc$", True, None, [False, False, False, None]], + ["Abc\\$", True, None, [False, False, False, None]], ], ) def test_str_fullmatch(pat, case, na, exp): - ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) + ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string())) result = ser.str.match(pat, case=case, na=na) expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) @@ -1922,7 +1925,7 @@ @pytest.mark.parametrize( "sub, start, end, exp, exp_typ", - [["ab", 0, None, [0, None], pa.int32()], ["bc", 1, 3, [2, None], pa.int64()]], + [["ab", 0, None, [0, None], pa.int32()], ["bc", 1, 3, [1, None], pa.int64()]], ) def test_str_find(sub, start, end, exp, exp_typ): ser = pd.Series(["abc", None], 
dtype=ArrowDtype(pa.string())) @@ -1931,6 +1934,14 @@ tm.assert_series_equal(result, expected) +def test_str_find_negative_start(): + # GH 56411 + ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) + result = ser.str.find(sub="b", start=-1000, end=3) + expected = pd.Series([1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) + + def test_str_find_notimplemented(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) with pytest.raises(NotImplementedError, match="find not implemented"): @@ -2285,14 +2296,40 @@ tm.assert_frame_equal(result, expected) -def test_str_unsupported_extract(): - ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) - with pytest.raises( - NotImplementedError, match="str.extract not supported with pd.ArrowDtype" - ): +def test_str_extract_non_symbolic(): + ser = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string())) + with pytest.raises(ValueError, match="pat=.* must contain a symbolic group name."): ser.str.extract(r"[ab](\d)") +@pytest.mark.parametrize("expand", [True, False]) +def test_str_extract(expand): + ser = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string())) + result = ser.str.extract(r"(?P<letter>[ab])(?P<digit>\d)", expand=expand) + expected = pd.DataFrame( + { + "letter": ArrowExtensionArray(pa.array(["a", "b", None])), + "digit": ArrowExtensionArray(pa.array(["1", "2", None])), + } + ) + tm.assert_frame_equal(result, expected) + + +def test_str_extract_expand(): + ser = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string())) + result = ser.str.extract(r"[ab](?P<digit>\d)", expand=True) + expected = pd.DataFrame( + { + "digit": ArrowExtensionArray(pa.array(["1", "2", None])), + } + ) + tm.assert_frame_equal(result, expected) + + result = ser.str.extract(r"[ab](?P<digit>\d)", expand=False) + expected = pd.Series(ArrowExtensionArray(pa.array(["1", "2", None])), name="digit") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_duration_from_strings_with_nat(unit): # GH51175 @@ -2324,15 +2361,7 @@ ["dayofyear", 2], ["hour", 3], ["minute", 4], - pytest.param( - "is_leap_year", - False, - marks=pytest.mark.xfail( - pa_version_under8p0, - raises=NotImplementedError, - reason="is_leap_year not implemented for pyarrow < 8.0", - ), - ), + ["is_leap_year", False], ["microsecond", 5], ["month", 1], ["nanosecond", 6], @@ -2538,10 +2567,10 @@ dtype=ArrowDtype(pa.timestamp("ns")), ) with pytest.raises(NotImplementedError, match="ambiguous is not supported."): - getattr(ser.dt, method)("1H", ambiguous="NaT") + getattr(ser.dt, method)("1h", ambiguous="NaT") with pytest.raises(NotImplementedError, match="nonexistent is not supported."): - getattr(ser.dt, method)("1H", nonexistent="NaT") + getattr(ser.dt, method)("1h", nonexistent="NaT") @pytest.mark.parametrize("method", ["ceil", "floor", "round"]) @@ -2557,10 +2586,7 @@ getattr(ser.dt, method)(None) -@pytest.mark.xfail( - pa_version_under7p0, reason="Methods not supported for pyarrow < 7.0" -) -@pytest.mark.parametrize("freq", ["D", "H", "T", "S", "L", "U", "N"]) +@pytest.mark.parametrize("freq", ["D", "h", "min", "s", "ms", "us", "ns"]) @pytest.mark.parametrize("method", ["ceil", "floor", "round"]) def test_dt_ceil_year_floor(freq, method): ser = pd.Series( @@ -2702,6 +2728,111 @@ tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype", ["timestamp[ms][pyarrow]", "duration[ms][pyarrow]"]) +def test_as_unit(dtype): + # GH 52284 + ser = pd.Series([1000, None], dtype=dtype) + result = 
ser.dt.as_unit("ns") + expected = ser.astype(dtype.replace("ms", "ns")) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "prop, expected", + [ + ["days", 1], + ["seconds", 2], + ["microseconds", 3], + ["nanoseconds", 4], + ], +) +def test_dt_timedelta_properties(prop, expected): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = getattr(ser.dt, prop) + expected = pd.Series( + ArrowExtensionArray(pa.array([expected, None], type=pa.int32())) + ) + tm.assert_series_equal(result, expected) + + +def test_dt_timedelta_total_seconds(): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.total_seconds() + expected = pd.Series( + ArrowExtensionArray(pa.array([86402.000003, None], type=pa.float64())) + ) + tm.assert_series_equal(result, expected) + + +def test_dt_to_pytimedelta(): + # GH 52284 + data = [timedelta(1, 2, 3), timedelta(1, 2, 4)] + ser = pd.Series(data, dtype=ArrowDtype(pa.duration("ns"))) + + result = ser.dt.to_pytimedelta() + expected = np.array(data, dtype=object) + tm.assert_numpy_array_equal(result, expected) + assert all(type(res) is timedelta for res in result) + + expected = ser.astype("timedelta64[ns]").dt.to_pytimedelta() + tm.assert_numpy_array_equal(result, expected) + + +def test_dt_components(): + # GH 52284 + ser = pd.Series( + [ + pd.Timedelta( + days=1, + seconds=2, + microseconds=3, + nanoseconds=4, + ), + None, + ], + dtype=ArrowDtype(pa.duration("ns")), + ) + result = ser.dt.components + expected = pd.DataFrame( + [[1, 0, 0, 2, 0, 3, 4], [None, None, None, None, None, None, None]], + columns=[ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ], + dtype="int32[pyarrow]", + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("skipna", [True, False]) def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): # GH51624 @@ -2859,11 +2990,6 @@ assert repr(ser) == "0 2020-01-01\ndtype: date32[day][pyarrow]" -@pytest.mark.xfail( - pa_version_under8p0, - reason="Function 'add_checked' has no kernel matching input types", - raises=pa.ArrowNotImplementedError, -) def test_duration_overflow_from_ndarray_containing_nat(): # GH52843 data_ts = pd.to_datetime([1, None]) @@ -2884,7 +3010,7 @@ reason="in infer_dtype pd.NA is not ignored in these cases " "even with skipna=True in the list(data) check below" ) - request.node.add_marker(mark) + request.applymarker(mark) assert res == lib.infer_dtype(list(data), skipna=True) @@ -2930,13 +3056,6 @@ ) def test_arithmetic_temporal(pa_type, request): # GH 53171 - if pa_version_under8p0 and pa.types.is_duration(pa_type): - mark = pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason="Function 'subtract_checked' has no kernel matching input types", - ) - request.node.add_marker(mark) - arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type)) unit = pa_type.unit result = arr - pd.Timedelta(1, unit=unit).as_unit(unit) @@ -3013,26 +3132,31 @@ @pytest.mark.parametrize( - "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES + "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES, ids=repr ) -def test_to_numpy_temporal(pa_type): +@pytest.mark.parametrize("dtype", [None, object]) +def test_to_numpy_temporal(pa_type, dtype): # GH 
53326 + # GH 55997: Return datetime64/timedelta64 types with NaT if possible arr = ArrowExtensionArray(pa.array([1, None], type=pa_type)) - result = arr.to_numpy() + result = arr.to_numpy(dtype=dtype) if pa.types.is_duration(pa_type): - expected = [ - pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit), - pd.NA, - ] - assert isinstance(result[0], pd.Timedelta) + value = pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit) else: - expected = [ - pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit), - pd.NA, - ] - assert isinstance(result[0], pd.Timestamp) - expected = np.array(expected, dtype=object) - assert result[0].unit == expected[0].unit + value = pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit) + + if dtype == object or (pa.types.is_timestamp(pa_type) and pa_type.tz is not None): + if dtype == object: + na = pd.NA + else: + na = pd.NaT + expected = np.array([value, na], dtype=object) + assert result[0].unit == value.unit + else: + na = pa_type.to_pandas_dtype().type("nat", pa_type.unit) + value = value.to_numpy() + expected = np.array([value, na]) + assert np.datetime_data(result[0])[0] == pa_type.unit tm.assert_numpy_array_equal(result, expected) @@ -3071,6 +3195,14 @@ assert result == expected +def test_pow_missing_operand(): + # GH 55512 + k = pd.Series([2, None], dtype="int64[pyarrow]") + result = k.pow(None, fill_value=3) + expected = pd.Series([8, None], dtype="int64[pyarrow]") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES) def test_duration_fillna_numpy(pa_type): # GH 54707 @@ -3102,6 +3234,22 @@ tm.assert_index_equal(res_uniques, exp_uniques) +def test_dictionary_astype_categorical(): + # GH#56672 + arrs = [ + pa.array(np.array(["a", "x", "c", "a"])).dictionary_encode(), + pa.array(np.array(["a", "d", "c"])).dictionary_encode(), + ] + ser = pd.Series(ArrowExtensionArray(pa.chunked_array(arrs))) + result = ser.astype("category") + categories = pd.Index(["a", "x", "c", "d"], dtype=ArrowDtype(pa.string())) + expected = pd.Series( + ["a", "x", "c", "a", "a", "d", "c"], + dtype=pd.CategoricalDtype(categories=categories), + ) + tm.assert_series_equal(result, expected) + + def test_arrow_floordiv(): # GH 55561 a = pd.Series([-7], dtype="int64[pyarrow]") @@ -3111,6 +3259,92 @@ tm.assert_series_equal(result, expected) +def test_arrow_floordiv_large_values(): + # GH 56645 + a = pd.Series([1425801600000000000], dtype="int64[pyarrow]") + expected = pd.Series([1425801600000], dtype="int64[pyarrow]") + result = a // 1_000_000 + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floordiv_large_integral_result(dtype): + # GH 56676 + a = pd.Series([18014398509481983], dtype=dtype) + result = a // 1 + tm.assert_series_equal(result, a) + + +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_larger_divisor(pa_type): + # GH 56676 + dtype = ArrowDtype(pa_type) + a = pd.Series([-23], dtype=dtype) + result = a // 24 + expected = pd.Series([-1], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_integral_invalid(pa_type): + # GH 56676 + min_value = np.iinfo(pa_type.to_pandas_dtype()).min + a = pd.Series([min_value], dtype=ArrowDtype(pa_type)) + with pytest.raises(pa.lib.ArrowInvalid, match="overflow|not in range"): + a // -1 + with pytest.raises(pa.lib.ArrowInvalid, 
match="divide by zero"): + a // 0 + + +@pytest.mark.parametrize("dtype", tm.FLOAT_PYARROW_DTYPES_STR_REPR) +def test_arrow_floordiv_floating_0_divisor(dtype): + # GH 56676 + a = pd.Series([2], dtype=dtype) + result = a // 0 + expected = pd.Series([float("inf")], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["float64", "datetime64[ns]", "timedelta64[ns]"]) +def test_astype_int_with_null_to_numpy_dtype(dtype): + # GH 57093 + ser = pd.Series([1, None], dtype="int64[pyarrow]") + result = ser.astype(dtype) + expected = pd.Series([1, None], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) +def test_arrow_integral_floordiv_large_values(pa_type): + # GH 56676 + max_value = np.iinfo(pa_type.to_pandas_dtype()).max + dtype = ArrowDtype(pa_type) + a = pd.Series([max_value], dtype=dtype) + b = pd.Series([1], dtype=dtype) + result = a // b + tm.assert_series_equal(result, a) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_true_division_large_divisor(dtype): + # GH 56706 + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype="float64[pyarrow]") + result = a / b + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floor_division_large_divisor(dtype): + # GH 56706 + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype=dtype) + result = a // b + tm.assert_series_equal(result, expected) + + def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] @@ -3119,3 +3353,36 @@ ArrowExtensionArray(pa.array(pd.to_datetime(string_dates), from_pandas=True)) ) tm.assert_series_equal(result, expected) + + +def test_string_to_time_parsing_cast(): + # GH 56463 + string_times = ["11:41:43.076160"] + result = pd.Series(string_times, dtype="time64[us][pyarrow]") + expected = pd.Series( + ArrowExtensionArray(pa.array([time(11, 41, 43, 76160)], from_pandas=True)) + ) + tm.assert_series_equal(result, expected) + + +def test_to_numpy_float(): + # GH#56267 + ser = pd.Series([32, 40, None], dtype="float[pyarrow]") + result = ser.astype("float64") + expected = pd.Series([32, 40, np.nan], dtype="float64") + tm.assert_series_equal(result, expected) + + +def test_to_numpy_timestamp_to_int(): + # GH 55997 + ser = pd.Series(["2020-01-01 04:30:00"], dtype="timestamp[ns][pyarrow]") + result = ser.to_numpy(dtype=np.int64) + expected = np.array([1577853000000000000]) + tm.assert_numpy_array_equal(result, expected) + + +def test_map_numeric_na_action(): + ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") + result = ser.map(lambda x: 42, na_action="ignore") + expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/test_categorical.py pandas-2.2.2+dfsg/pandas/tests/extension/test_categorical.py --- pandas-2.1.4+dfsg/pandas/tests/extension/test_categorical.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/test_categorical.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,6 +18,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd from pandas import Categorical import pandas._testing as tm @@ -72,11 
+74,7 @@ return Categorical(["a", "a", None, None, "b", "b", "a", "c"]) -class TestDtype(base.BaseDtypeTests): - pass - - -class TestInterface(base.BaseInterfaceTests): +class TestCategorical(base.ExtensionTests): @pytest.mark.xfail(reason="Memory usage doesn't match") def test_memory_usage(self, data): # TODO: Is this deliberate? @@ -104,10 +102,10 @@ if na_value_obj is na_value: continue assert na_value_obj not in data - assert na_value_obj in data_missing # this line differs from super method + # this section suffers from super method + if not using_pyarrow_string_dtype(): + assert na_value_obj in data_missing - -class TestConstructors(base.BaseConstructorsTests): def test_empty(self, dtype): cls = dtype.construct_array_type() result = cls._empty((4,), dtype=dtype) @@ -117,12 +115,6 @@ # dtype on our result. assert result.dtype == CategoricalDtype([]) - -class TestReshaping(base.BaseReshapingTests): - pass - - -class TestGetitem(base.BaseGetitemTests): @pytest.mark.skip(reason="Backwards compatibility") def test_getitem_scalar(self, data): # CategoricalDtype.type isn't "correct" since it should @@ -130,28 +122,6 @@ # to break things by changing. super().test_getitem_scalar(data) - -class TestSetitem(base.BaseSetitemTests): - pass - - -class TestIndex(base.BaseIndexTests): - pass - - -class TestMissing(base.BaseMissingTests): - pass - - -class TestReduce(base.BaseReduceTests): - pass - - -class TestAccumulate(base.BaseAccumulateTests): - pass - - -class TestMethods(base.BaseMethodsTests): @pytest.mark.xfail(reason="Unobserved categories included") def test_value_counts(self, all_data, dropna): return super().test_value_counts(all_data, dropna) @@ -178,17 +148,11 @@ result = data.map(lambda x: x, na_action=na_action) tm.assert_extension_array_equal(result, data) - -class TestCasting(base.BaseCastingTests): - pass - - -class TestArithmeticOps(base.BaseArithmeticOpsTests): def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): # frame & scalar op_name = all_arithmetic_operators if op_name == "__rmod__": - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="rmod never called when string is first argument" ) @@ -198,27 +162,31 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): op_name = all_arithmetic_operators if op_name == "__rmod__": - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="rmod never called when string is first argument" ) ) super().test_arith_series_with_scalar(data, op_name) - -class TestComparisonOps(base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op, other): + def _compare_other(self, ser: pd.Series, data, op, other): op_name = f"__{op.__name__}__" if op_name not in ["__eq__", "__ne__"]: msg = "Unordered Categoricals can only compare equality or not" with pytest.raises(TypeError, match=msg): op(data, other) else: - return super()._compare_other(s, data, op, other) - + return super()._compare_other(ser, data, op, other) -class TestParsing(base.BaseParsingTests): - pass + @pytest.mark.xfail(reason="Categorical overrides __repr__") + @pytest.mark.parametrize("size", ["big", "small"]) + def test_array_repr(self, data, size): + super().test_array_repr(data, size) + + @pytest.mark.xfail(reason="TBD") + @pytest.mark.parametrize("as_index", [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping): + super().test_groupby_extension_agg(as_index, data_for_grouping) class Test2DCompat(base.NDArrayBacked2DTests): diff -Nru 
pandas-2.1.4+dfsg/pandas/tests/extension/test_common.py pandas-2.2.2+dfsg/pandas/tests/extension/test_common.py --- pandas-2.1.4+dfsg/pandas/tests/extension/test_common.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/test_common.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,7 +17,7 @@ def __init__(self, data) -> None: self.data = data - def __array__(self, dtype): + def __array__(self, dtype=None, copy=None): return self.data @property @@ -30,8 +30,10 @@ if copy: return type(self)(self.data) return self - - return np.array(self, dtype=dtype, copy=copy) + elif not copy: + return np.asarray(self, dtype=dtype) + else: + return np.array(self, dtype=dtype, copy=copy) class TestExtensionArrayDtype: diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/test_datetime.py pandas-2.2.2+dfsg/pandas/tests/extension/test_datetime.py --- pandas-2.1.4+dfsg/pandas/tests/extension/test_datetime.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/test_datetime.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,13 +31,15 @@ @pytest.fixture def data(dtype): - data = DatetimeArray(pd.date_range("2000", periods=100, tz=dtype.tz), dtype=dtype) + data = DatetimeArray._from_sequence( + pd.date_range("2000", periods=100, tz=dtype.tz), dtype=dtype + ) return data @pytest.fixture def data_missing(dtype): - return DatetimeArray( + return DatetimeArray._from_sequence( np.array(["NaT", "2000-01-01"], dtype="datetime64[ns]"), dtype=dtype ) @@ -47,14 +49,18 @@ a = pd.Timestamp("2000-01-01") b = pd.Timestamp("2000-01-02") c = pd.Timestamp("2000-01-03") - return DatetimeArray(np.array([b, c, a], dtype="datetime64[ns]"), dtype=dtype) + return DatetimeArray._from_sequence( + np.array([b, c, a], dtype="datetime64[ns]"), dtype=dtype + ) @pytest.fixture def data_missing_for_sorting(dtype): a = pd.Timestamp("2000-01-01") b = pd.Timestamp("2000-01-02") - return DatetimeArray(np.array([b, "NaT", a], dtype="datetime64[ns]"), dtype=dtype) + return DatetimeArray._from_sequence( + np.array([b, "NaT", a], dtype="datetime64[ns]"), dtype=dtype + ) @pytest.fixture @@ -68,7 +74,7 @@ b = pd.Timestamp("2000-01-02") c = pd.Timestamp("2000-01-03") na = "NaT" - return DatetimeArray( + return DatetimeArray._from_sequence( np.array([b, b, na, na, a, a, b, c], dtype="datetime64[ns]"), dtype=dtype ) @@ -82,78 +88,57 @@ # ---------------------------------------------------------------------------- -class BaseDatetimeTests: - pass +class TestDatetimeArray(base.ExtensionTests): + def _get_expected_exception(self, op_name, obj, other): + if op_name in ["__sub__", "__rsub__"]: + return None + return super()._get_expected_exception(op_name, obj, other) + def _supports_accumulation(self, ser, op_name: str) -> bool: + return op_name in ["cummin", "cummax"] -# ---------------------------------------------------------------------------- -# Tests -class TestDatetimeDtype(BaseDatetimeTests, base.BaseDtypeTests): - pass + def _supports_reduction(self, obj, op_name: str) -> bool: + return op_name in ["min", "max", "median", "mean", "std", "any", "all"] + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): + meth = all_boolean_reductions + msg = f"'{meth}' with datetime64 dtypes is deprecated and will raise in" + with tm.assert_produces_warning( + FutureWarning, match=msg, check_stacklevel=False + ): + super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) -class TestConstructors(BaseDatetimeTests, 
base.BaseConstructorsTests): def test_series_constructor(self, data): # Series construction drops any .freq attr data = data._with_freq(None) super().test_series_constructor(data) - -class TestGetitem(BaseDatetimeTests, base.BaseGetitemTests): - pass - - -class TestIndex(base.BaseIndexTests): - pass - - -class TestMethods(BaseDatetimeTests, base.BaseMethodsTests): @pytest.mark.parametrize("na_action", [None, "ignore"]) def test_map(self, data, na_action): result = data.map(lambda x: x, na_action=na_action) tm.assert_extension_array_equal(result, data) + def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): + if op_name in ["median", "mean", "std"]: + alt = ser.astype("int64") + + res_op = getattr(ser, op_name) + exp_op = getattr(alt, op_name) + result = res_op(skipna=skipna) + expected = exp_op(skipna=skipna) + if op_name in ["mean", "median"]: + # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" + # has no attribute "tz" + tz = ser.dtype.tz # type: ignore[union-attr] + expected = pd.Timestamp(expected, tz=tz) + else: + expected = pd.Timedelta(expected) + tm.assert_almost_equal(result, expected) -class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests): - pass - - -class TestArithmeticOps(BaseDatetimeTests, base.BaseArithmeticOpsTests): - implements = {"__sub__", "__rsub__"} - - def _get_expected_exception(self, op_name, obj, other): - if op_name in self.implements: - return None - return super()._get_expected_exception(op_name, obj, other) - - -class TestCasting(BaseDatetimeTests, base.BaseCastingTests): - pass - - -class TestComparisonOps(BaseDatetimeTests, base.BaseComparisonOpsTests): - pass - - -class TestMissing(BaseDatetimeTests, base.BaseMissingTests): - pass - - -class TestReshaping(BaseDatetimeTests, base.BaseReshapingTests): - pass - - -class TestSetitem(BaseDatetimeTests, base.BaseSetitemTests): - pass - - -class TestGroupby(BaseDatetimeTests, base.BaseGroupbyTests): - pass - - -class TestPrinting(BaseDatetimeTests, base.BasePrintingTests): - pass + else: + return super().check_reduce(ser, op_name, skipna) -class Test2DCompat(BaseDatetimeTests, base.NDArrayBacked2DTests): +class Test2DCompat(base.NDArrayBacked2DTests): pass diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/test_interval.py pandas-2.2.2+dfsg/pandas/tests/extension/test_interval.py --- pandas-2.1.4+dfsg/pandas/tests/extension/test_interval.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/test_interval.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,6 +13,10 @@ be added to the array-specific tests in `pandas/tests/arrays/`. 
""" +from __future__ import annotations + +from typing import TYPE_CHECKING + import numpy as np import pytest @@ -22,6 +26,9 @@ from pandas.core.arrays import IntervalArray from pandas.tests.extension import base +if TYPE_CHECKING: + import pandas as pd + def make_data(): N = 100 @@ -49,7 +56,7 @@ @pytest.fixture def data_for_twos(): - pytest.skip("Not a numeric dtype") + pytest.skip("Interval is not a numeric dtype") @pytest.fixture @@ -73,7 +80,7 @@ class TestIntervalArray(base.ExtensionTests): divmod_exc = TypeError - def _supports_reduction(self, obj, op_name: str) -> bool: + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return op_name in ["min", "max"] @pytest.mark.xfail( @@ -83,18 +90,6 @@ def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) - @pytest.mark.parametrize("engine", ["c", "python"]) - def test_EA_types(self, engine, data): - expected_msg = r".*must implement _from_sequence_of_strings.*" - with pytest.raises(NotImplementedError, match=expected_msg): - super().test_EA_types(engine, data) - - @pytest.mark.xfail( - reason="Looks like the test (incorrectly) implicitly assumes int/bool dtype" - ) - def test_invert(self, data): - super().test_invert(data) - # TODO: either belongs in tests.arrays.interval or move into base tests. def test_fillna_non_scalar_raises(data_missing): diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/test_masked.py pandas-2.2.2+dfsg/pandas/tests/extension/test_masked.py --- pandas-2.1.4+dfsg/pandas/tests/extension/test_masked.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/test_masked.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,6 +13,8 @@ be added to the array-specific tests in `pandas/tests/arrays/`. 
""" +import warnings + import numpy as np import pytest @@ -22,6 +24,12 @@ ) from pandas.compat.numpy import np_version_gt2 +from pandas.core.dtypes.common import ( + is_float_dtype, + is_signed_integer_dtype, + is_unsigned_integer_dtype, +) + import pandas as pd import pandas._testing as tm from pandas.core.arrays.boolean import BooleanDtype @@ -160,11 +168,26 @@ return pd.array([b, b, na, na, a, a, b, c], dtype=dtype) -class TestDtype(base.BaseDtypeTests): - pass +class TestMaskedArrays(base.ExtensionTests): + @pytest.mark.parametrize("na_action", [None, "ignore"]) + def test_map(self, data_missing, na_action): + result = data_missing.map(lambda x: x, na_action=na_action) + if data_missing.dtype == Float32Dtype(): + # map roundtrips through objects, which converts to float64 + expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) + else: + expected = data_missing.to_numpy() + tm.assert_numpy_array_equal(result, expected) + def test_map_na_action_ignore(self, data_missing_for_sorting): + zero = data_missing_for_sorting[2] + result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore") + if data_missing_for_sorting.dtype.kind == "b": + expected = np.array([False, pd.NA, False], dtype=object) + else: + expected = np.array([zero, np.nan, zero]) + tm.assert_numpy_array_equal(result, expected) -class TestArithmeticOps(base.BaseArithmeticOpsTests): def _get_expected_exception(self, op_name, obj, other): try: dtype = tm.get_dtype(obj) @@ -180,15 +203,25 @@ # exception message would include "numpy boolean subtract"" return TypeError return None - return super()._get_expected_exception(op_name, obj, other) + return None def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): sdtype = tm.get_dtype(obj) expected = pointwise_result + if op_name in ("eq", "ne", "le", "ge", "lt", "gt"): + return expected.astype("boolean") + if sdtype.kind in "iu": if op_name in ("__rtruediv__", "__truediv__", "__div__"): - expected = expected.fillna(np.nan).astype("Float64") + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Downcasting object dtype arrays", + category=FutureWarning, + ) + filled = expected.fillna(np.nan) + expected = filled.astype("Float64") else: # combine method result in 'biggest' (int64) dtype expected = expected.astype(sdtype) @@ -220,11 +253,6 @@ expected = expected.astype(sdtype) return expected - series_scalar_exc = None - series_array_exc = None - frame_scalar_exc = None - divmod_exc = None - def test_divmod_series_array(self, data, data_for_twos, request): if data.dtype.kind == "b": mark = pytest.mark.xfail( @@ -232,52 +260,9 @@ "floordiv but not for divmod. This matches what we do for " "non-masked bool dtype." 
) - request.node.add_marker(mark) + request.applymarker(mark) super().test_divmod_series_array(data, data_for_twos) - -class TestComparisonOps(base.BaseComparisonOpsTests): - series_scalar_exc = None - series_array_exc = None - frame_scalar_exc = None - - def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): - return pointwise_result.astype("boolean") - - -class TestInterface(base.BaseInterfaceTests): - pass - - -class TestConstructors(base.BaseConstructorsTests): - pass - - -class TestReshaping(base.BaseReshapingTests): - pass - - # for test_concat_mixed_dtypes test - # concat of an Integer and Int coerces to object dtype - # TODO(jreback) once integrated this would - - -class TestGetitem(base.BaseGetitemTests): - pass - - -class TestSetitem(base.BaseSetitemTests): - pass - - -class TestIndex(base.BaseIndexTests): - pass - - -class TestMissing(base.BaseMissingTests): - pass - - -class TestMethods(base.BaseMethodsTests): def test_combine_le(self, data_repeated): # TODO: patching self is a bad pattern here orig_data1, orig_data2 = data_repeated(2) @@ -288,18 +273,8 @@ self._combine_le_expected_dtype = object super().test_combine_le(data_repeated) - -class TestCasting(base.BaseCastingTests): - pass - - -class TestGroupby(base.BaseGroupbyTests): - pass - - -class TestReduce(base.BaseReduceTests): - def _supports_reduction(self, obj, op_name: str) -> bool: - if op_name in ["any", "all"] and tm.get_dtype(obj).kind != "b": + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + if op_name in ["any", "all"] and ser.dtype.kind != "b": pytest.skip(reason="Tested in tests/reductions/test_reductions.py") return True @@ -316,18 +291,22 @@ if op_name in ["min", "max"]: cmp_dtype = "bool" + # TODO: prod with integer dtypes does *not* match the result we would + # get if we used object for cmp_dtype. In that cae the object result + # is a large integer while the non-object case overflows and returns 0 + alt = ser.dropna().astype(cmp_dtype) if op_name == "count": result = getattr(ser, op_name)() - expected = getattr(ser.dropna().astype(cmp_dtype), op_name)() + expected = getattr(alt, op_name)() else: result = getattr(ser, op_name)(skipna=skipna) - expected = getattr(ser.dropna().astype(cmp_dtype), op_name)(skipna=skipna) + expected = getattr(alt, op_name)(skipna=skipna) if not skipna and ser.isna().any() and op_name not in ["any", "all"]: expected = pd.NA tm.assert_almost_equal(result, expected) def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): - if tm.is_float_dtype(arr.dtype): + if is_float_dtype(arr.dtype): cmp_dtype = arr.dtype.name elif op_name in ["mean", "median", "var", "std", "skew"]: cmp_dtype = "Float64" @@ -335,7 +314,7 @@ cmp_dtype = arr.dtype.name elif arr.dtype in ["Int64", "UInt64"]: cmp_dtype = arr.dtype.name - elif tm.is_signed_integer_dtype(arr.dtype): + elif is_signed_integer_dtype(arr.dtype): # TODO: Why does Window Numpy 2.0 dtype depend on skipna? 
cmp_dtype = ( "Int32" @@ -343,7 +322,7 @@ or not IS64 else "Int64" ) - elif tm.is_unsigned_integer_dtype(arr.dtype): + elif is_unsigned_integer_dtype(arr.dtype): cmp_dtype = ( "UInt32" if (is_platform_windows() and (not np_version_gt2 or not skipna)) @@ -368,8 +347,6 @@ raise TypeError("not supposed to reach this") return cmp_dtype - -class TestAccumulation(base.BaseAccumulateTests): def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: return True @@ -397,6 +374,13 @@ else: expected_dtype = f"Int{length}" + if expected_dtype == "Float32" and op_name == "cumprod" and skipna: + # TODO: xfail? + pytest.skip( + f"Float32 precision lead to large differences with op {op_name} " + f"and skipna={skipna}" + ) + if op_name == "cumsum": result = getattr(ser, op_name)(skipna=skipna) expected = pd.Series( @@ -429,24 +413,5 @@ raise NotImplementedError(f"{op_name} not supported") -class TestUnaryOps(base.BaseUnaryOpsTests): - def test_invert(self, data, request): - if data.dtype.kind == "f": - mark = pytest.mark.xfail( - reason="Looks like the base class test implicitly assumes " - "boolean/integer dtypes" - ) - request.node.add_marker(mark) - super().test_invert(data) - - -class TestPrinting(base.BasePrintingTests): - pass - - -class TestParsing(base.BaseParsingTests): - pass - - class Test2DCompat(base.Dim2CompatTests): pass diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/test_numpy.py pandas-2.2.2+dfsg/pandas/tests/extension/test_numpy.py --- pandas-2.1.4+dfsg/pandas/tests/extension/test_numpy.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/test_numpy.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,23 +18,14 @@ import numpy as np import pytest -from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.dtypes import NumpyEADtype import pandas as pd import pandas._testing as tm from pandas.api.types import is_object_dtype from pandas.core.arrays.numpy_ import NumpyExtensionArray -from pandas.core.internals import blocks from pandas.tests.extension import base - -def _can_hold_element_patched(obj, element) -> bool: - if isinstance(element, NumpyExtensionArray): - element = element.to_numpy() - return can_hold_element(obj, element) - - orig_assert_attr_equal = tm.assert_attr_equal @@ -78,7 +69,6 @@ """ with monkeypatch.context() as m: m.setattr(NumpyExtensionArray, "_typ", "extension") - m.setattr(blocks, "can_hold_element", _can_hold_element_patched) m.setattr(tm.asserters, "assert_attr_equal", _assert_attr_equal) yield @@ -151,7 +141,7 @@ @pytest.fixture def data_for_twos(dtype): if dtype.kind == "O": - pytest.skip("Not a numeric dtype") + pytest.skip(f"{dtype} is not a numeric dtype") arr = np.ones(100) * 2 return NumpyExtensionArray._from_sequence(arr, dtype=dtype) @@ -169,21 +159,13 @@ """ if dtype == "object": mark = pytest.mark.xfail(reason="Fails for object dtype") - request.node.add_marker(mark) + request.applymarker(mark) skip_nested = pytest.mark.usefixtures("skip_numpy_object") -class BaseNumPyTests: - pass - - -class TestCasting(BaseNumPyTests, base.BaseCastingTests): - pass - - -class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests): +class TestNumpyExtensionArray(base.ExtensionTests): @pytest.mark.skip(reason="We don't register our dtype") # We don't want to register. This test should probably be split in two. def test_from_dtype(self, data): @@ -194,11 +176,9 @@ # ValueError: Length of passed values is 1, index implies 3. 
super().test_series_constructor_scalar_with_index(data, dtype) - -class TestDtype(BaseNumPyTests, base.BaseDtypeTests): - def test_check_dtype(self, data, request): + def test_check_dtype(self, data, request, using_infer_string): if data.dtype.numpy_dtype == "object": - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"NumpyExtensionArray expectedly clashes with a " f"NumPy name: {data.dtype.numpy_dtype}" @@ -214,26 +194,11 @@ else: super().test_is_not_object_type(dtype) - -class TestGetitem(BaseNumPyTests, base.BaseGetitemTests): @skip_nested def test_getitem_scalar(self, data): # AssertionError super().test_getitem_scalar(data) - -class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): - pass - - -class TestInterface(BaseNumPyTests, base.BaseInterfaceTests): - @skip_nested - def test_array_interface(self, data): - # NumPy array shape inference - super().test_array_interface(data) - - -class TestMethods(BaseNumPyTests, base.BaseMethodsTests): @skip_nested def test_shift_fill_value(self, data): # np.array shape inference. Shift implementation fails. @@ -251,7 +216,9 @@ @skip_nested def test_searchsorted(self, data_for_sorting, as_series): - # Test setup fails. + # TODO: NumpyExtensionArray.searchsorted calls ndarray.searchsorted which + # isn't quite what we want in nested data cases. Instead we need to + # adapt something like libindex._bin_search. super().test_searchsorted(data_for_sorting, as_series) @pytest.mark.xfail(reason="NumpyExtensionArray.diff may fail on dtype") @@ -261,7 +228,7 @@ def test_insert(self, data, request): if data.dtype.numpy_dtype == object: mark = pytest.mark.xfail(reason="Dimension mismatch in np.concatenate") - request.node.add_marker(mark) + request.applymarker(mark) super().test_insert(data) @@ -270,47 +237,73 @@ # NumpyExtensionArray[object] can hold anything, so skip super().test_insert_invalid(data, invalid_scalar) - -class TestArithmetics(BaseNumPyTests, base.BaseArithmeticOpsTests): divmod_exc = None series_scalar_exc = None frame_scalar_exc = None series_array_exc = None - @skip_nested def test_divmod(self, data): + divmod_exc = None + if data.dtype.kind == "O": + divmod_exc = TypeError + self.divmod_exc = divmod_exc super().test_divmod(data) - @skip_nested - def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + def test_divmod_series_array(self, data): + ser = pd.Series(data) + exc = None + if data.dtype.kind == "O": + exc = TypeError + self.divmod_exc = exc + self._check_divmod_op(ser, divmod, data) + + def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): + opname = all_arithmetic_operators + series_scalar_exc = None + if data.dtype.numpy_dtype == object: + if opname in ["__mul__", "__rmul__"]: + mark = pytest.mark.xfail( + reason="the Series.combine step raises but not the Series method." 
+ ) + request.node.add_marker(mark) + series_scalar_exc = TypeError + self.series_scalar_exc = series_scalar_exc super().test_arith_series_with_scalar(data, all_arithmetic_operators) - def test_arith_series_with_array(self, data, all_arithmetic_operators, request): + def test_arith_series_with_array(self, data, all_arithmetic_operators): opname = all_arithmetic_operators + series_array_exc = None if data.dtype.numpy_dtype == object and opname not in ["__add__", "__radd__"]: - mark = pytest.mark.xfail(reason="Fails for object dtype") - request.node.add_marker(mark) + series_array_exc = TypeError + self.series_array_exc = series_array_exc super().test_arith_series_with_array(data, all_arithmetic_operators) - @skip_nested - def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): + opname = all_arithmetic_operators + frame_scalar_exc = None + if data.dtype.numpy_dtype == object: + if opname in ["__mul__", "__rmul__"]: + mark = pytest.mark.xfail( + reason="the Series.combine step raises but not the Series method." + ) + request.node.add_marker(mark) + frame_scalar_exc = TypeError + self.frame_scalar_exc = frame_scalar_exc super().test_arith_frame_with_scalar(data, all_arithmetic_operators) - -class TestPrinting(BaseNumPyTests, base.BasePrintingTests): - pass - - -class TestReduce(BaseNumPyTests, base.BaseReduceTests): - def _supports_reduction(self, obj, op_name: str) -> bool: - if tm.get_dtype(obj).kind == "O": + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + if ser.dtype.kind == "O": return op_name in ["sum", "min", "max", "any", "all"] return True - def check_reduce(self, s, op_name, skipna): - res_op = getattr(s, op_name) + def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): + res_op = getattr(ser, op_name) # avoid coercing int -> float. Just cast to the actual numpy type. - exp_op = getattr(s.astype(s.dtype._dtype), op_name) + # error: Item "ExtensionDtype" of "dtype[Any] | ExtensionDtype" has + # no attribute "numpy_dtype" + cmp_dtype = ser.dtype.numpy_dtype # type: ignore[union-attr] + alt = ser.astype(cmp_dtype) + exp_op = getattr(alt, op_name) if op_name == "count": result = res_op() expected = exp_op() @@ -319,13 +312,11 @@ expected = exp_op(skipna=skipna) tm.assert_almost_equal(result, expected) - @pytest.mark.skip("tests not written yet") + @pytest.mark.skip("TODO: tests not written yet") @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_frame(self, data, all_numeric_reductions, skipna): pass - -class TestMissing(BaseNumPyTests, base.BaseMissingTests): @skip_nested def test_fillna_series(self, data_missing): # Non-scalar "scalar" values. @@ -336,12 +327,6 @@ # Non-scalar "scalar" values. 
super().test_fillna_frame(data_missing) - -class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): - pass - - -class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): @skip_nested def test_setitem_invalid(self, data, invalid_scalar): # object dtype can hold anything, so doesn't raise @@ -425,13 +410,17 @@ if data.dtype.numpy_dtype != object: if not isinstance(key, slice) or key != slice(None): expected = pd.DataFrame({"data": data.to_numpy()}) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_column_type=False) + @pytest.mark.xfail(reason="NumpyEADtype is unpacked") + def test_index_from_listlike_with_dtype(self, data): + super().test_index_from_listlike_with_dtype(data) -@skip_nested -class TestParsing(BaseNumPyTests, base.BaseParsingTests): - pass + @skip_nested + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) -class Test2DCompat(BaseNumPyTests, base.NDArrayBacked2DTests): +class Test2DCompat(base.NDArrayBacked2DTests): pass diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/test_period.py pandas-2.2.2+dfsg/pandas/tests/extension/test_period.py --- pandas-2.1.4+dfsg/pandas/tests/extension/test_period.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/test_period.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,10 +13,17 @@ be added to the array-specific tests in `pandas/tests/arrays/`. """ +from __future__ import annotations + +from typing import TYPE_CHECKING + import numpy as np import pytest -from pandas._libs import iNaT +from pandas._libs import ( + Period, + iNaT, +) from pandas.compat import is_platform_windows from pandas.compat.numpy import np_version_gte1p24 @@ -26,6 +33,9 @@ from pandas.core.arrays import PeriodArray from pandas.tests.extension import base +if TYPE_CHECKING: + import pandas as pd + @pytest.fixture(params=["D", "2D"]) def dtype(request): @@ -61,27 +71,36 @@ return PeriodArray([B, B, NA, NA, A, A, B, C], dtype=dtype) -class BasePeriodTests: - pass - - -class TestPeriodDtype(BasePeriodTests, base.BaseDtypeTests): - pass - - -class TestConstructors(BasePeriodTests, base.BaseConstructorsTests): - pass - +class TestPeriodArray(base.ExtensionTests): + def _get_expected_exception(self, op_name, obj, other): + if op_name in ("__sub__", "__rsub__"): + return None + return super()._get_expected_exception(op_name, obj, other) -class TestGetitem(BasePeriodTests, base.BaseGetitemTests): - pass + def _supports_accumulation(self, ser, op_name: str) -> bool: + return op_name in ["cummin", "cummax"] + def _supports_reduction(self, obj, op_name: str) -> bool: + return op_name in ["min", "max", "median"] -class TestIndex(base.BaseIndexTests): - pass + def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): + if op_name == "median": + res_op = getattr(ser, op_name) + + alt = ser.astype("int64") + + exp_op = getattr(alt, op_name) + result = res_op(skipna=skipna) + expected = exp_op(skipna=skipna) + # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no + # attribute "freq" + freq = ser.dtype.freq # type: ignore[union-attr] + expected = Period._from_ordinal(int(expected), freq=freq) + tm.assert_almost_equal(result, expected) + else: + return super().check_reduce(ser, op_name, skipna) -class TestMethods(BasePeriodTests, base.BaseMethodsTests): @pytest.mark.parametrize("periods", [1, -2]) def test_diff(self, data, periods): if is_platform_windows() and np_version_gte1p24: @@ -96,48 
+115,5 @@ tm.assert_extension_array_equal(result, data) -class TestInterface(BasePeriodTests, base.BaseInterfaceTests): - pass - - -class TestArithmeticOps(BasePeriodTests, base.BaseArithmeticOpsTests): - def _get_expected_exception(self, op_name, obj, other): - if op_name in ("__sub__", "__rsub__"): - return None - return super()._get_expected_exception(op_name, obj, other) - - -class TestCasting(BasePeriodTests, base.BaseCastingTests): - pass - - -class TestComparisonOps(BasePeriodTests, base.BaseComparisonOpsTests): - pass - - -class TestMissing(BasePeriodTests, base.BaseMissingTests): - pass - - -class TestReshaping(BasePeriodTests, base.BaseReshapingTests): - pass - - -class TestSetitem(BasePeriodTests, base.BaseSetitemTests): - pass - - -class TestGroupby(BasePeriodTests, base.BaseGroupbyTests): - pass - - -class TestPrinting(BasePeriodTests, base.BasePrintingTests): - pass - - -class TestParsing(BasePeriodTests, base.BaseParsingTests): - pass - - -class Test2DCompat(BasePeriodTests, base.NDArrayBacked2DTests): +class Test2DCompat(base.NDArrayBacked2DTests): pass diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/test_sparse.py pandas-2.2.2+dfsg/pandas/tests/extension/test_sparse.py --- pandas-2.1.4+dfsg/pandas/tests/extension/test_sparse.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/test_sparse.py 2024-04-10 17:42:52.000000000 +0000 @@ -98,26 +98,64 @@ return SparseArray([0, 0, np.nan, -2, -1, 4, 2, 3, 0, 0], fill_value=request.param) -class BaseSparseTests: - def _check_unsupported(self, data): - if data.dtype == SparseDtype(int, 0): - pytest.skip("Can't store nan in int array.") - - -class TestDtype(BaseSparseTests, base.BaseDtypeTests): - def test_array_type_with_arg(self, data, dtype): - assert dtype.construct_array_type() is SparseArray - +class TestSparseArray(base.ExtensionTests): + def _supports_reduction(self, obj, op_name: str) -> bool: + return True + + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request): + if all_numeric_reductions in [ + "prod", + "median", + "var", + "std", + "sem", + "skew", + "kurt", + ]: + mark = pytest.mark.xfail( + reason="This should be viable but is not implemented" + ) + request.node.add_marker(mark) + elif ( + all_numeric_reductions in ["sum", "max", "min", "mean"] + and data.dtype.kind == "f" + and not skipna + ): + mark = pytest.mark.xfail(reason="getting a non-nan float") + request.node.add_marker(mark) -class TestInterface(BaseSparseTests, base.BaseInterfaceTests): - pass + super().test_reduce_series_numeric(data, all_numeric_reductions, skipna) + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_frame(self, data, all_numeric_reductions, skipna, request): + if all_numeric_reductions in [ + "prod", + "median", + "var", + "std", + "sem", + "skew", + "kurt", + ]: + mark = pytest.mark.xfail( + reason="This should be viable but is not implemented" + ) + request.node.add_marker(mark) + elif ( + all_numeric_reductions in ["sum", "max", "min", "mean"] + and data.dtype.kind == "f" + and not skipna + ): + mark = pytest.mark.xfail(reason="ExtensionArray NA mask are different") + request.node.add_marker(mark) -class TestConstructors(BaseSparseTests, base.BaseConstructorsTests): - pass + super().test_reduce_frame(data, all_numeric_reductions, skipna) + def _check_unsupported(self, data): + if data.dtype == SparseDtype(int, 0): + pytest.skip("Can't store nan in int array.") -class TestReshaping(BaseSparseTests, 
base.BaseReshapingTests): def test_concat_mixed_dtypes(self, data): # https://github.com/pandas-dev/pandas/issues/20762 # This should be the same, aside from concat([sparse, float]) @@ -133,6 +171,9 @@ ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) @pytest.mark.parametrize( "columns", [ @@ -170,8 +211,6 @@ self._check_unsupported(data) super().test_merge(data, na_value) - -class TestGetitem(BaseSparseTests, base.BaseGetitemTests): def test_get(self, data): ser = pd.Series(data, index=[2 * i for i in range(len(data))]) if np.isnan(ser.values.fill_value): @@ -184,16 +223,6 @@ self._check_unsupported(data) super().test_reindex(data, na_value) - -class TestSetitem(BaseSparseTests, base.BaseSetitemTests): - pass - - -class TestIndex(base.BaseIndexTests): - pass - - -class TestMissing(BaseSparseTests, base.BaseMissingTests): def test_isna(self, data_missing): sarr = SparseArray(data_missing) expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value)) @@ -214,14 +243,16 @@ def test_fillna_no_op_returns_copy(self, data, request): if np.isnan(data.fill_value): - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="returns array with different fill value") ) super().test_fillna_no_op_returns_copy(data) @pytest.mark.xfail(reason="Unsupported") - def test_fillna_series(self): + def test_fillna_series(self, data_missing): # this one looks doable. + # TODO: this fails bc we do not pass through data_missing. If we did, + # the 0-fill case would xpass super().test_fillna_series() def test_fillna_frame(self, data_missing): @@ -244,8 +275,6 @@ tm.assert_frame_equal(result, expected) - -class TestMethods(BaseSparseTests, base.BaseMethodsTests): _combine_le_expected_dtype = "Sparse[bool]" def test_fillna_copy_frame(self, data_missing, using_copy_on_write): @@ -346,14 +375,12 @@ with pytest.raises(ValueError, match=msg): data.map(lambda x: np.nan, na_action=na_action) - -class TestCasting(BaseSparseTests, base.BaseCastingTests): @pytest.mark.xfail(raises=TypeError, reason="no sparse StringDtype") - def test_astype_string(self, data): + def test_astype_string(self, data, nullable_string_dtype): + # TODO: this fails bc we do not pass through nullable_string_dtype; + # If we did, the 0-cases would xpass super().test_astype_string(data) - -class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests): series_scalar_exc = None frame_scalar_exc = None divmod_exc = None @@ -387,20 +414,30 @@ "rmod", ]: mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch") - request.node.add_marker(mark) + request.applymarker(mark) super().test_arith_frame_with_scalar(data, all_arithmetic_operators) - -class TestComparisonOps(BaseSparseTests): - def _compare_other(self, data_for_compare: SparseArray, comparison_op, other): + def _compare_other( + self, ser: pd.Series, data_for_compare: SparseArray, comparison_op, other + ): op = comparison_op result = op(data_for_compare, other) - assert isinstance(result, SparseArray) + if isinstance(other, pd.Series): + assert isinstance(result, pd.Series) + assert isinstance(result.dtype, SparseDtype) + else: + assert isinstance(result, SparseArray) assert result.dtype.subtype == np.bool_ - if isinstance(other, SparseArray): - fill_value = op(data_for_compare.fill_value, other.fill_value) + if isinstance(other, pd.Series): + fill_value = op(data_for_compare.fill_value, other._values.fill_value) + expected = SparseArray( + 
op(data_for_compare.to_dense(), np.asarray(other)), + fill_value=fill_value, + dtype=np.bool_, + ) + else: fill_value = np.all( op(np.asarray(data_for_compare.fill_value), np.asarray(other)) @@ -411,40 +448,51 @@ fill_value=fill_value, dtype=np.bool_, ) - tm.assert_sp_array_equal(result, expected) + if isinstance(other, pd.Series): + # error: Incompatible types in assignment + expected = pd.Series(expected) # type: ignore[assignment] + tm.assert_equal(result, expected) def test_scalar(self, data_for_compare: SparseArray, comparison_op): - self._compare_other(data_for_compare, comparison_op, 0) - self._compare_other(data_for_compare, comparison_op, 1) - self._compare_other(data_for_compare, comparison_op, -1) - self._compare_other(data_for_compare, comparison_op, np.nan) + ser = pd.Series(data_for_compare) + self._compare_other(ser, data_for_compare, comparison_op, 0) + self._compare_other(ser, data_for_compare, comparison_op, 1) + self._compare_other(ser, data_for_compare, comparison_op, -1) + self._compare_other(ser, data_for_compare, comparison_op, np.nan) + + def test_array(self, data_for_compare: SparseArray, comparison_op, request): + if data_for_compare.dtype.fill_value == 0 and comparison_op.__name__ in [ + "eq", + "ge", + "le", + ]: + mark = pytest.mark.xfail(reason="Wrong fill_value") + request.applymarker(mark) - @pytest.mark.xfail(reason="Wrong indices") - def test_array(self, data_for_compare: SparseArray, comparison_op): arr = np.linspace(-4, 5, 10) - self._compare_other(data_for_compare, comparison_op, arr) + ser = pd.Series(data_for_compare) + self._compare_other(ser, data_for_compare, comparison_op, arr) + + def test_sparse_array(self, data_for_compare: SparseArray, comparison_op, request): + if data_for_compare.dtype.fill_value == 0 and comparison_op.__name__ != "gt": + mark = pytest.mark.xfail(reason="Wrong fill_value") + request.applymarker(mark) - @pytest.mark.xfail(reason="Wrong indices") - def test_sparse_array(self, data_for_compare: SparseArray, comparison_op): + ser = pd.Series(data_for_compare) arr = data_for_compare + 1 - self._compare_other(data_for_compare, comparison_op, arr) + self._compare_other(ser, data_for_compare, comparison_op, arr) arr = data_for_compare * 2 - self._compare_other(data_for_compare, comparison_op, arr) + self._compare_other(ser, data_for_compare, comparison_op, arr) - -class TestPrinting(BaseSparseTests, base.BasePrintingTests): @pytest.mark.xfail(reason="Different repr") def test_array_repr(self, data, size): super().test_array_repr(data, size) - -class TestParsing(BaseSparseTests, base.BaseParsingTests): - @pytest.mark.parametrize("engine", ["c", "python"]) - def test_EA_types(self, engine, data): - expected_msg = r".*must implement _from_sequence_of_strings.*" - with pytest.raises(NotImplementedError, match=expected_msg): - super().test_EA_types(engine, data) + @pytest.mark.xfail(reason="result does not match expected") + @pytest.mark.parametrize("as_index", [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping): + super().test_groupby_extension_agg(as_index, data_for_grouping) -class TestNoNumericAccumulations(base.BaseAccumulateTests): - pass +def test_array_type_with_arg(dtype): + assert dtype.construct_array_type() is SparseArray diff -Nru pandas-2.1.4+dfsg/pandas/tests/extension/test_string.py pandas-2.2.2+dfsg/pandas/tests/extension/test_string.py --- pandas-2.1.4+dfsg/pandas/tests/extension/test_string.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/extension/test_string.py 
2024-04-10 17:42:52.000000000 +0000 @@ -13,7 +13,10 @@ be added to the array-specific tests in `pandas/tests/arrays/`. """ +from __future__ import annotations + import string +from typing import cast import numpy as np import pytest @@ -26,22 +29,21 @@ from pandas.tests.extension import base -def split_array(arr): - if arr.dtype.storage != "pyarrow": - pytest.skip("only applicable for pyarrow chunked array n/a") - - def _split_array(arr): - import pyarrow as pa - - arrow_array = arr._pa_array - split = len(arrow_array) // 2 - arrow_array = pa.chunked_array( - [*arrow_array[:split].chunks, *arrow_array[split:].chunks] - ) - assert arrow_array.num_chunks == 2 - return type(arr)(arrow_array) - - return _split_array(arr) +def maybe_split_array(arr, chunked): + if not chunked: + return arr + elif arr.dtype.storage != "pyarrow": + return arr + + pa = pytest.importorskip("pyarrow") + + arrow_array = arr._pa_array + split = len(arrow_array) // 2 + arrow_array = pa.chunked_array( + [*arrow_array[:split].chunks, *arrow_array[split:].chunks] + ) + assert arrow_array.num_chunks == 2 + return type(arr)(arrow_array) @pytest.fixture(params=[True, False]) @@ -60,38 +62,38 @@ while strings[0] == strings[1]: strings = np.random.default_rng(2).choice(list(string.ascii_letters), size=100) - arr = dtype.construct_array_type()._from_sequence(strings) - return split_array(arr) if chunked else arr + arr = dtype.construct_array_type()._from_sequence(strings, dtype=dtype) + return maybe_split_array(arr, chunked) @pytest.fixture def data_missing(dtype, chunked): """Length 2 array with [NA, Valid]""" - arr = dtype.construct_array_type()._from_sequence([pd.NA, "A"]) - return split_array(arr) if chunked else arr + arr = dtype.construct_array_type()._from_sequence([pd.NA, "A"], dtype=dtype) + return maybe_split_array(arr, chunked) @pytest.fixture def data_for_sorting(dtype, chunked): - arr = dtype.construct_array_type()._from_sequence(["B", "C", "A"]) - return split_array(arr) if chunked else arr + arr = dtype.construct_array_type()._from_sequence(["B", "C", "A"], dtype=dtype) + return maybe_split_array(arr, chunked) @pytest.fixture def data_missing_for_sorting(dtype, chunked): - arr = dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"]) - return split_array(arr) if chunked else arr + arr = dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"], dtype=dtype) + return maybe_split_array(arr, chunked) @pytest.fixture def data_for_grouping(dtype, chunked): arr = dtype.construct_array_type()._from_sequence( - ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"] + ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"], dtype=dtype ) - return split_array(arr) if chunked else arr + return maybe_split_array(arr, chunked) -class TestDtype(base.BaseDtypeTests): +class TestStringArray(base.ExtensionTests): def test_eq_with_str(self, dtype): assert dtype == f"string[{dtype.storage}]" super().test_eq_with_str(dtype) @@ -101,43 +103,25 @@ # because StringDtype is a string type assert is_string_dtype(dtype) - -class TestInterface(base.BaseInterfaceTests): def test_view(self, data, request, arrow_string_storage): if data.dtype.storage in arrow_string_storage: pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_view(data) - -class TestConstructors(base.BaseConstructorsTests): def test_from_dtype(self, data): # base test uses string representation of dtype pass - -class TestReshaping(base.BaseReshapingTests): def test_transpose(self, data, request, arrow_string_storage): if data.dtype.storage in 
arrow_string_storage: pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_transpose(data) - -class TestGetitem(base.BaseGetitemTests): - pass - - -class TestSetitem(base.BaseSetitemTests): def test_setitem_preserves_views(self, data, request, arrow_string_storage): if data.dtype.storage in arrow_string_storage: pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_setitem_preserves_views(data) - -class TestIndex(base.BaseIndexTests): - pass - - -class TestMissing(base.BaseMissingTests): def test_dropna_array(self, data_missing): result = data_missing.dropna() expected = data_missing[[1]] @@ -155,61 +139,80 @@ assert result is not data tm.assert_extension_array_equal(result, data) + def _get_expected_exception( + self, op_name: str, obj, other + ) -> type[Exception] | None: + if op_name in ["__divmod__", "__rdivmod__"]: + if isinstance(obj, pd.Series) and cast( + StringDtype, tm.get_dtype(obj) + ).storage in [ + "pyarrow", + "pyarrow_numpy", + ]: + # TODO: re-raise as TypeError? + return NotImplementedError + elif isinstance(other, pd.Series) and cast( + StringDtype, tm.get_dtype(other) + ).storage in [ + "pyarrow", + "pyarrow_numpy", + ]: + # TODO: re-raise as TypeError? + return NotImplementedError + return TypeError + elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: + if cast(StringDtype, tm.get_dtype(obj)).storage in [ + "pyarrow", + "pyarrow_numpy", + ]: + return NotImplementedError + return TypeError + elif op_name in ["__mul__", "__rmul__"]: + # Can only multiply strings by integers + return TypeError + elif op_name in [ + "__truediv__", + "__rtruediv__", + "__floordiv__", + "__rfloordiv__", + "__sub__", + "__rsub__", + ]: + if cast(StringDtype, tm.get_dtype(obj)).storage in [ + "pyarrow", + "pyarrow_numpy", + ]: + import pyarrow as pa + + # TODO: better to re-raise as TypeError? 
+ return pa.ArrowNotImplementedError + return TypeError + + return None -class TestReduce(base.BaseReduceTests): def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( - ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] + op_name in ["min", "max"] + or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] and op_name in ("any", "all") ) - @pytest.mark.parametrize("skipna", [True, False]) - def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): - op_name = all_numeric_reductions - - if op_name in ["min", "max"]: - return None - - ser = pd.Series(data) - with pytest.raises(TypeError): - getattr(ser, op_name)(skipna=skipna) - - -class TestMethods(base.BaseMethodsTests): - pass - - -class TestCasting(base.BaseCastingTests): - pass - - -class TestComparisonOps(base.BaseComparisonOpsTests): def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): - dtype = tm.get_dtype(obj) - # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no - # attribute "storage" - if dtype.storage == "pyarrow": # type: ignore[union-attr] - cast_to = "boolean[pyarrow]" - elif dtype.storage == "pyarrow_numpy": # type: ignore[union-attr] + dtype = cast(StringDtype, tm.get_dtype(obj)) + if op_name in ["__add__", "__radd__"]: + cast_to = dtype + elif dtype.storage == "pyarrow": + cast_to = "boolean[pyarrow]" # type: ignore[assignment] + elif dtype.storage == "pyarrow_numpy": cast_to = np.bool_ # type: ignore[assignment] else: - cast_to = "boolean" + cast_to = "boolean" # type: ignore[assignment] return pointwise_result.astype(cast_to) def test_compare_scalar(self, data, comparison_op): ser = pd.Series(data) self._compare_other(ser, data, comparison_op, "abc") - -class TestParsing(base.BaseParsingTests): - pass - - -class TestPrinting(base.BasePrintingTests): - pass - - -class TestGroupBy(base.BaseGroupbyTests): @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning") def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) @@ -217,7 +220,7 @@ class Test2DCompat(base.Dim2CompatTests): @pytest.fixture(autouse=True) - def arrow_not_supported(self, data, request): + def arrow_not_supported(self, data): if isinstance(data, ArrowStringArray): pytest.skip(reason="2D support not implemented for ArrowStringArray") diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/conftest.py pandas-2.2.2+dfsg/pandas/tests/frame/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/frame/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,82 +3,24 @@ from pandas import ( DataFrame, + Index, NaT, date_range, ) -import pandas._testing as tm @pytest.fixture -def float_frame_with_na(): +def datetime_frame() -> DataFrame: """ - Fixture for DataFrame of floats with index of unique strings + Fixture for DataFrame of floats with DatetimeIndex - Columns are ['A', 'B', 'C', 'D']; some entries are missing - - A B C D - ABwBzA0ljw -1.128865 -0.897161 0.046603 0.274997 - DJiRzmbyQF 0.728869 0.233502 0.722431 -0.890872 - neMgPD5UBF 0.486072 -1.027393 -0.031553 1.449522 - 0yWA4n8VeX -1.937191 -1.142531 0.805215 -0.462018 - 3slYUbbqU1 0.153260 1.164691 1.489795 -0.545826 - soujjZ0A08 NaN NaN NaN NaN - 7W6NLGsjB9 NaN NaN NaN NaN - ... ... ... ... ... 
- uhfeaNkCR1 -0.231210 -0.340472 0.244717 -0.901590 - n6p7GYuBIV -0.419052 1.922721 -0.125361 -0.727717 - ZhzAeY6p1y 1.234374 -1.425359 -0.827038 -0.633189 - uWdPsORyUh 0.046738 -0.980445 -1.102965 0.605503 - 3DJA6aN590 -0.091018 -1.684734 -1.100900 0.215947 - 2GBPAzdbMk -2.883405 -1.021071 1.209877 1.633083 - sHadBoyVHw -2.223032 -0.326384 0.258931 0.245517 - - [30 rows x 4 columns] - """ - df = DataFrame(tm.getSeriesData()) - # set some NAs - df.iloc[5:10] = np.nan - df.iloc[15:20, -2:] = np.nan - return df - - -@pytest.fixture -def bool_frame_with_na(): + Columns are ['A', 'B', 'C', 'D'] """ - Fixture for DataFrame of booleans with index of unique strings - - Columns are ['A', 'B', 'C', 'D']; some entries are missing - - A B C D - zBZxY2IDGd False False False False - IhBWBMWllt False True True True - ctjdvZSR6R True False True True - AVTujptmxb False True False True - G9lrImrSWq False False False True - sFFwdIUfz2 NaN NaN NaN NaN - s15ptEJnRb NaN NaN NaN NaN - ... ... ... ... ... - UW41KkDyZ4 True True False False - l9l6XkOdqV True False False False - X2MeZfzDYA False True False False - xWkIKU7vfX False True False True - QOhL6VmpGU False False False True - 22PwkRJdat False True False False - kfboQ3VeIK True False True False - - [30 rows x 4 columns] - """ - df = DataFrame(tm.getSeriesData()) > 0 - df = df.astype(object) - # set some NAs - df.iloc[5:10] = np.nan - df.iloc[15:20, -2:] = np.nan - - # For `any` tests we need to have at least one True before the first NaN - # in each column - for i in range(4): - df.iloc[i, i] = True - return df + return DataFrame( + np.random.default_rng(2).standard_normal((100, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100, freq="B"), + ) @pytest.fixture @@ -87,27 +29,12 @@ Fixture for DataFrame of floats and strings with index of unique strings Columns are ['A', 'B', 'C', 'D', 'foo']. - - A B C D foo - w3orJvq07g -1.594062 -1.084273 -1.252457 0.356460 bar - PeukuVdmz2 0.109855 -0.955086 -0.809485 0.409747 bar - ahp2KvwiM8 -1.533729 -0.142519 -0.154666 1.302623 bar - 3WSJ7BUCGd 2.484964 0.213829 0.034778 -2.327831 bar - khdAmufk0U -0.193480 -0.743518 -0.077987 0.153646 bar - LE2DZiFlrE -0.193566 -1.343194 -0.107321 0.959978 bar - HJXSJhVn7b 0.142590 1.257603 -0.659409 -0.223844 bar - ... ... ... ... ... ... - 9a1Vypttgw -1.316394 1.601354 0.173596 1.213196 bar - h5d1gVFbEy 0.609475 1.106738 -0.155271 0.294630 bar - mK9LsTQG92 1.303613 0.857040 -1.019153 0.369468 bar - oOLksd9gKH 0.558219 -0.134491 -0.289869 -0.951033 bar - 9jgoOjKyHg 0.058270 -0.496110 -0.413212 -0.852659 bar - jZLDHclHAO 0.096298 1.267510 0.549206 -0.005235 bar - lR0nxDp1C2 -2.119350 -0.794384 0.544118 0.145849 bar - - [30 rows x 5 columns] """ - df = DataFrame(tm.getSeriesData()) + df = DataFrame( + np.random.default_rng(2).standard_normal((30, 4)), + index=Index([f"foo_{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD"), dtype=object), + ) df["foo"] = "bar" return df @@ -118,31 +45,18 @@ Fixture for DataFrame of different float types with index of unique strings Columns are ['A', 'B', 'C', 'D']. - - A B C D - GI7bbDaEZe -0.237908 -0.246225 -0.468506 0.752993 - KGp9mFepzA -1.140809 -0.644046 -1.225586 0.801588 - VeVYLAb1l2 -1.154013 -1.677615 0.690430 -0.003731 - kmPME4WKhO 0.979578 0.998274 -0.776367 0.897607 - CPyopdXTiz 0.048119 -0.257174 0.836426 0.111266 - 0kJZQndAj0 0.274357 -0.281135 -0.344238 0.834541 - tqdwQsaHG8 -0.979716 -0.519897 0.582031 0.144710 - ... ... ... ... ... 
- 7FhZTWILQj -2.906357 1.261039 -0.780273 -0.537237 - 4pUDPM4eGq -2.042512 -0.464382 -0.382080 1.132612 - B8dUgUzwTi -1.506637 -0.364435 1.087891 0.297653 - hErlVYjVv9 1.477453 -0.495515 -0.713867 1.438427 - 1BKN3o7YLs 0.127535 -0.349812 -0.881836 0.489827 - 9S4Ekn7zga 1.445518 -2.095149 0.031982 0.373204 - xN1dNn6OV6 1.425017 -0.983995 -0.363281 -0.224502 - - [30 rows x 4 columns] """ - df = DataFrame(tm.getSeriesData()) - df.A = df.A.astype("float32") - df.B = df.B.astype("float32") - df.C = df.C.astype("float16") - df.D = df.D.astype("float64") + df = DataFrame( + { + col: np.random.default_rng(2).random(30, dtype=dtype) + for col, dtype in zip( + list("ABCD"), ["float32", "float32", "float32", "float64"] + ) + }, + index=Index([f"foo_{i}" for i in range(30)], dtype=object), + ) + # not supported by numpy random + df["C"] = df["C"].astype("float16") return df @@ -152,32 +66,14 @@ Fixture for DataFrame of different int types with index of unique strings Columns are ['A', 'B', 'C', 'D']. - - A B C D - mUrCZ67juP 0 1 2 2 - rw99ACYaKS 0 1 0 0 - 7QsEcpaaVU 0 1 1 1 - xkrimI2pcE 0 1 0 0 - dz01SuzoS8 0 1 255 255 - ccQkqOHX75 -1 1 0 0 - DN0iXaoDLd 0 1 0 0 - ... .. .. ... ... - Dfb141wAaQ 1 1 254 254 - IPD8eQOVu5 0 1 0 0 - CcaKulsCmv 0 1 0 0 - rIBa8gu7E5 0 1 0 0 - RP6peZmh5o 0 1 1 1 - NMb9pipQWQ 0 1 0 0 - PqgbJEzjib 0 1 3 3 - - [30 rows x 4 columns] """ - df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()}) - df.A = df.A.astype("int32") - df.B = np.ones(len(df.B), dtype="uint64") - df.C = df.C.astype("uint8") - df.D = df.C.astype("int64") - return df + return DataFrame( + { + col: np.ones(30, dtype=dtype) + for col, dtype in zip(list("ABCD"), ["int32", "uint64", "uint8", "int64"]) + }, + index=Index([f"foo_{i}" for i in range(30)], dtype=object), + ) @pytest.fixture @@ -202,60 +98,3 @@ df.iloc[1, 1] = NaT df.iloc[1, 2] = NaT return df - - -@pytest.fixture -def uint64_frame(): - """ - Fixture for DataFrame with uint64 values - - Columns are ['A', 'B'] - """ - return DataFrame( - {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, dtype=np.uint64 - ) - - -@pytest.fixture -def simple_frame(): - """ - Fixture for simple 3x3 DataFrame - - Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c']. - - one two three - a 1.0 2.0 3.0 - b 4.0 5.0 6.0 - c 7.0 8.0 9.0 - """ - arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) - - return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"]) - - -@pytest.fixture -def frame_of_index_cols(): - """ - Fixture for DataFrame of columns that can be used for indexing - - Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; - 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. 
- - A B C D E (tuple, as, label) - 0 foo one a 0.608477 -0.012500 -1.664297 - 1 foo two b -0.633460 0.249614 -0.364411 - 2 foo three c 0.615256 2.154968 -0.834666 - 3 bar one d 0.234246 1.085675 0.718445 - 4 bar two e 0.533841 -0.005702 -3.533912 - """ - df = DataFrame( - { - "A": ["foo", "foo", "foo", "bar", "bar"], - "B": ["one", "two", "three", "one", "two"], - "C": ["a", "b", "c", "d", "e"], - "D": np.random.default_rng(2).standard_normal(5), - "E": np.random.default_rng(2).standard_normal(5), - ("tuple", "as", "label"): np.random.default_rng(2).standard_normal(5), - } - ) - return df diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/constructors/test_from_dict.py pandas-2.2.2+dfsg/pandas/tests/frame/constructors/test_from_dict.py --- pandas-2.1.4+dfsg/pandas/tests/frame/constructors/test_from_dict.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/constructors/test_from_dict.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas import ( DataFrame, Index, @@ -42,6 +44,9 @@ ) tm.assert_frame_equal(result, expected) + @pytest.mark.skipif( + using_pyarrow_string_dtype(), reason="columns inferring logic broken" + ) def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), @@ -200,3 +205,24 @@ ) with pytest.raises(ValueError, match=msg): DataFrame.from_dict({"foo": 1, "baz": 3, "bar": 2}, orient="abc") + + def test_from_dict_order_with_single_column(self): + data = { + "alpha": { + "value2": 123, + "value1": 532, + "animal": 222, + "plant": False, + "name": "test", + } + } + result = DataFrame.from_dict( + data, + orient="columns", + ) + expected = DataFrame( + [[123], [532], [222], [False], ["test"]], + index=["value2", "value1", "animal", "plant", "name"], + columns=["alpha"], + ) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/constructors/test_from_records.py pandas-2.2.2+dfsg/pandas/tests/frame/constructors/test_from_records.py --- pandas-2.1.4+dfsg/pandas/tests/frame/constructors/test_from_records.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/constructors/test_from_records.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,6 +6,8 @@ import pytest import pytz +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import is_platform_little_endian from pandas import ( @@ -56,6 +58,9 @@ expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]") tm.assert_frame_equal(result, expected) + @pytest.mark.skipif( + using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work" + ) def test_from_records_sequencelike(self): df = DataFrame( { @@ -80,7 +85,7 @@ # this is actually tricky to create the recordlike arrays and # have the dtypes be intact - blocks = df._to_dict_of_blocks(copy=False) + blocks = df._to_dict_of_blocks() tuples = [] columns = [] dtypes = [] @@ -110,7 +115,7 @@ columns=df.columns ) - # list of tupels (no dtype info) + # list of tuples (no dtype info) result4 = DataFrame.from_records(lists, columns=columns).reindex( columns=df.columns ) @@ -169,7 +174,7 @@ # columns is in a different order here than the actual items iterated # from the dict - blocks = df._to_dict_of_blocks(copy=False) + blocks = df._to_dict_of_blocks() columns = [] for b in blocks.values(): columns.extend(b.columns) @@ -442,26 +447,27 @@ exp = DataFrame(data, index=["a", "b", "c"]) tm.assert_frame_equal(result, exp) + def 
test_from_records_misc_brokenness2(self): # GH#2623 rows = [] rows.append([datetime(2010, 1, 1), 1]) rows.append([datetime(2010, 1, 2), "hi"]) # test col upconverts to obj - df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) - result = df2_obj.dtypes - expected = Series( - [np.dtype("datetime64[ns]"), np.dtype("object")], index=["date", "test"] + result = DataFrame.from_records(rows, columns=["date", "test"]) + expected = DataFrame( + {"date": [row[0] for row in rows], "test": [row[1] for row in rows]} ) - tm.assert_series_equal(result, expected) + tm.assert_frame_equal(result, expected) + assert result.dtypes["test"] == np.dtype(object) + def test_from_records_misc_brokenness3(self): rows = [] rows.append([datetime(2010, 1, 1), 1]) rows.append([datetime(2010, 1, 2), 1]) - df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) - result = df2_obj.dtypes - expected = Series( - [np.dtype("datetime64[ns]"), np.dtype("int64")], index=["date", "test"] + result = DataFrame.from_records(rows, columns=["date", "test"]) + expected = DataFrame( + {"date": [row[0] for row in rows], "test": [row[1] for row in rows]} ) - tm.assert_series_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_from_records_empty(self): # GH#3562 diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_getitem.py pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_getitem.py --- pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_getitem.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_getitem.py 2024-04-10 17:42:52.000000000 +0000 @@ -42,9 +42,6 @@ ts = df[rng[0]] tm.assert_series_equal(ts, df.iloc[:, 0]) - # GH#1211; smoketest unrelated to the rest of this test - repr(df) - ts = df["1/1/2000"] tm.assert_series_equal(ts, df.iloc[:, 0]) @@ -106,7 +103,7 @@ def test_getitem_dupe_cols(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) - msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\"" + msg = "\"None of [Index(['baf'], dtype=" with pytest.raises(KeyError, match=re.escape(msg)): df[["baf"]] @@ -372,8 +369,6 @@ result = df[df.C > 6] tm.assert_frame_equal(result, expected) - result.dtypes - str(result) def test_getitem_boolean_frame_with_duplicate_columns(self, df_dup_cols): # where @@ -388,8 +383,6 @@ result = df[df > 6] tm.assert_frame_equal(result, expected) - result.dtypes - str(result) def test_getitem_empty_frame_with_boolean(self): # Test for issue GH#11859 @@ -399,13 +392,14 @@ tm.assert_frame_equal(df, df2) def test_getitem_returns_view_when_column_is_unique_in_df( - self, using_copy_on_write + self, using_copy_on_write, warn_copy_on_write ): # GH#45316 df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) df_orig = df.copy() view = df["b"] - view.loc[:] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + view.loc[:] = 100 if using_copy_on_write: expected = df_orig else: diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_indexing.py pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_indexing.py --- pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -48,7 +48,7 @@ # Column access for _, series in sl.items(): assert len(series.index) == 20 - assert tm.equalContents(series.index, sl.index) + tm.assert_index_equal(series.index, sl.index) for key, _ in float_frame._series.items(): assert float_frame[key] is not 
None @@ -288,7 +288,9 @@ df.foobar = 5 assert (df.foobar == 5).all() - def test_setitem(self, float_frame, using_copy_on_write): + def test_setitem( + self, float_frame, using_copy_on_write, warn_copy_on_write, using_infer_string + ): # not sure what else to do here series = float_frame["A"][::2] float_frame["col5"] = series @@ -324,14 +326,17 @@ smaller = float_frame[:2] msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: # With CoW, adding a new column doesn't raise a warning smaller["col10"] = ["1", "2"] else: with pytest.raises(SettingWithCopyError, match=msg): smaller["col10"] = ["1", "2"] - assert smaller["col10"].dtype == np.object_ + if using_infer_string: + assert smaller["col10"].dtype == "string" + else: + assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() def test_setitem2(self): @@ -426,7 +431,7 @@ float_frame["something"] = 2.5 assert float_frame["something"].dtype == np.float64 - def test_setitem_corner(self, float_frame): + def test_setitem_corner(self, float_frame, using_infer_string): # corner case df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3)) del df["B"] @@ -463,10 +468,16 @@ dm["foo"] = "bar" del dm["foo"] dm["foo"] = "bar" - assert dm["foo"].dtype == np.object_ + if using_infer_string: + assert dm["foo"].dtype == "string" + else: + assert dm["foo"].dtype == np.object_ dm["coercible"] = ["1", "2", "3"] - assert dm["coercible"].dtype == np.object_ + if using_infer_string: + assert dm["coercible"].dtype == "string" + else: + assert dm["coercible"].dtype == np.object_ def test_setitem_corner2(self): data = { @@ -483,7 +494,7 @@ assert df.loc[1, "title"] == "foobar" assert df.loc[1, "cruft"] == 0 - def test_setitem_ambig(self): + def test_setitem_ambig(self, using_infer_string): # Difficulties with mixed-type data # Created as float type dm = DataFrame(index=range(3), columns=range(3)) @@ -499,19 +510,22 @@ dm[2] = uncoercable_series assert len(dm.columns) == 3 - assert dm[2].dtype == np.object_ + if using_infer_string: + assert dm[2].dtype == "string" + else: + assert dm[2].dtype == np.object_ - def test_setitem_None(self, float_frame): + def test_setitem_None(self, float_frame, using_infer_string): # GH #766 float_frame[None] = float_frame["A"] + key = None if not using_infer_string else np.nan tm.assert_series_equal( float_frame.iloc[:, -1], float_frame["A"], check_names=False ) tm.assert_series_equal( - float_frame.loc[:, None], float_frame["A"], check_names=False + float_frame.loc[:, key], float_frame["A"], check_names=False ) - tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False) - repr(float_frame) + tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False) def test_loc_setitem_boolean_mask_allfalse(self): # GH 9596 @@ -520,7 +534,7 @@ ) result = df.copy() - result.loc[result.b.isna(), "a"] = result.a + result.loc[result.b.isna(), "a"] = result.a.copy() tm.assert_frame_equal(result, df) def test_getitem_fancy_slice_integers_step(self): @@ -562,7 +576,7 @@ @td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view def test_fancy_getitem_slice_mixed( - self, float_frame, float_string_frame, using_copy_on_write + self, float_frame, float_string_frame, using_copy_on_write, warn_copy_on_write ): sliced = float_string_frame.iloc[:, -3:] assert sliced["D"].dtype == np.float64 @@ -574,7 +588,8 @@ assert 
np.shares_memory(sliced["C"]._values, float_frame["C"]._values) - sliced.loc[:, "C"] = 4.0 + with tm.assert_cow_warning(warn_copy_on_write): + sliced.loc[:, "C"] = 4.0 if not using_copy_on_write: assert (float_frame["C"] == 4).all() @@ -584,7 +599,7 @@ tm.assert_frame_equal(float_frame, original) def test_getitem_setitem_non_ix_labels(self): - df = tm.makeTimeDataFrame() + df = DataFrame(range(20), index=date_range("2020-01-01", periods=20)) start, end = df.index[[5, 10]] @@ -748,11 +763,11 @@ expected = df.iloc[0:2] tm.assert_frame_equal(result, expected) - df.loc[1:2] = 0 + expected = df.iloc[0:2] msg = r"The behavior of obj\[i:j\] with a float-dtype index" with tm.assert_produces_warning(FutureWarning, match=msg): result = df[1:2] - assert (result == 0).all().all() + tm.assert_frame_equal(result, expected) # #2727 index = Index([1.0, 2.5, 3.5, 4.5, 5.0]) @@ -934,7 +949,8 @@ # needs upcasting df = DataFrame([[1, 2, "foo"], [3, 4, "bar"]], columns=["A", "B", "C"]) df2 = df.copy() - df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 expected = df.reindex(columns=["A", "B"]) expected += 0.5 expected["C"] = df["C"] @@ -1018,6 +1034,15 @@ result = df.loc[[0], "b"] tm.assert_series_equal(result, expected) + def test_iloc_callable_tuple_return_value(self): + # GH53769 + df = DataFrame(np.arange(40).reshape(10, 4), index=range(0, 20, 2)) + msg = "callable with iloc" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.iloc[lambda _: (0,)] + with tm.assert_produces_warning(FutureWarning, match=msg): + df.iloc[lambda _: (0,)] = 1 + def test_iloc_row(self): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), index=range(0, 20, 2) @@ -1041,7 +1066,7 @@ expected = df.reindex(df.index[[1, 2, 4, 6]]) tm.assert_frame_equal(result, expected) - def test_iloc_row_slice_view(self, using_copy_on_write, request): + def test_iloc_row_slice_view(self, using_copy_on_write, warn_copy_on_write): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), index=range(0, 20, 2) ) @@ -1054,9 +1079,9 @@ assert np.shares_memory(df[2], subset[2]) exp_col = original[2].copy() - subset.loc[:, 2] = 0.0 - if not using_copy_on_write: + with tm.assert_cow_warning(warn_copy_on_write): subset.loc[:, 2] = 0.0 + if not using_copy_on_write: exp_col._values[4:8] = 0.0 # With the enforcement of GH#45333 in 2.0, this remains a view @@ -1086,7 +1111,9 @@ expected = df.reindex(columns=df.columns[[1, 2, 4, 6]]) tm.assert_frame_equal(result, expected) - def test_iloc_col_slice_view(self, using_array_manager, using_copy_on_write): + def test_iloc_col_slice_view( + self, using_array_manager, using_copy_on_write, warn_copy_on_write + ): df = DataFrame( np.random.default_rng(2).standard_normal((4, 10)), columns=range(0, 20, 2) ) @@ -1097,7 +1124,8 @@ # verify slice is view assert np.shares_memory(df[8]._values, subset[8]._values) - subset.loc[:, 8] = 0.0 + with tm.assert_cow_warning(warn_copy_on_write): + subset.loc[:, 8] = 0.0 assert (df[8] == 0).all() @@ -1277,7 +1305,7 @@ df.loc[:] = pd.core.arrays.NumpyExtensionArray(df.values[:, ::-1]) tm.assert_frame_equal(df, orig) - df.iloc[:] = df.iloc[:, :] + df.iloc[:] = df.iloc[:, :].copy() tm.assert_frame_equal(df, orig) def test_getitem_segfault_with_empty_like_object(self): @@ -1287,6 +1315,7 @@ # this produces the segfault df[[0]] + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") 
@pytest.mark.parametrize( "null", [pd.NaT, pd.NaT.to_numpy("M8[ns]"), pd.NaT.to_numpy("m8[ns]")] ) @@ -1295,7 +1324,7 @@ ): # GH#44514 don't cast mismatched nulls to pd.NA df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype) - ser = df["A"] + ser = df["A"].copy() arr = ser._values msg = "|".join( @@ -1359,33 +1388,32 @@ tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( - "val, idxr, warn", + "val, idxr", [ - ("x", "a", None), # TODO: this should warn as well - ("x", ["a"], None), # TODO: this should warn as well - (1, "a", None), # TODO: this should warn as well - (1, ["a"], FutureWarning), + ("x", "a"), + ("x", ["a"]), + (1, "a"), + (1, ["a"]), ], ) - def test_loc_setitem_rhs_frame(self, idxr, val, warn): + def test_loc_setitem_rhs_frame(self, idxr, val): # GH#47578 df = DataFrame({"a": [1, 2]}) with tm.assert_produces_warning( - warn, match="Setting an item of incompatible dtype" + FutureWarning, match="Setting an item of incompatible dtype" ): df.loc[:, idxr] = DataFrame({"a": [val, 11]}, index=[1, 2]) expected = DataFrame({"a": [np.nan, val]}) tm.assert_frame_equal(df, expected) @td.skip_array_manager_invalid_test - def test_iloc_setitem_enlarge_no_warning(self): + def test_iloc_setitem_enlarge_no_warning(self, warn_copy_on_write): # GH#47381 df = DataFrame(columns=["a", "b"]) expected = df.copy() view = df[:] - with tm.assert_produces_warning(None): - df.iloc[:, 0] = np.array([1, 2], dtype=np.float64) + df.iloc[:, 0] = np.array([1, 2], dtype=np.float64) tm.assert_frame_equal(view, expected) def test_loc_internals_not_updated_correctly(self): @@ -1555,8 +1583,11 @@ class TestDataFrameIndexingUInt64: - def test_setitem(self, uint64_frame): - df = uint64_frame + def test_setitem(self): + df = DataFrame( + {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, + dtype=np.uint64, + ) idx = df["A"].rename("foo") # setitem @@ -1905,6 +1936,26 @@ tm.assert_frame_equal(df, expected) +@pytest.mark.parametrize( + ("dtype", "infer_string"), + [ + (object, False), + ("string[pyarrow_numpy]", True), + ], +) +def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: + # https://github.com/pandas-dev/pandas/issues/56204 + pytest.importorskip("pyarrow") + + df = DataFrame({"a": [1, 2], "b": [3, 4]}) + with pd.option_context("future.infer_string", infer_string): + df.loc[df["a"] == 1, "c"] = "1" + expected = DataFrame({"a": [1, 2], "b": [3, 4], "c": ["1", float("nan")]}).astype( + {"a": "int64", "b": "int64", "c": dtype} + ) + tm.assert_frame_equal(df, expected) + + def test_add_new_column_infer_string(): # GH#55366 pytest.importorskip("pyarrow") @@ -1946,7 +1997,7 @@ np.datetime64("NaT"), np.timedelta64("NaT"), ] - _indexers = [0, [0], slice(0, 1), [True, False, False]] + _indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)] @pytest.mark.parametrize( "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] @@ -1960,7 +2011,7 @@ @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype) - if isna(invalid) and invalid is not pd.NaT: + if isna(invalid) and invalid is not pd.NaT and not np.isnat(invalid): warn = None else: warn = FutureWarning diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_insert.py pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_insert.py --- pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_insert.py 2023-12-08 14:17:35.000000000 +0000 +++ 
pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_insert.py 2024-04-10 17:42:52.000000000 +0000 @@ -51,14 +51,12 @@ df.insert(0, "a", [1, 2]) result = df.rename(columns={}) - str(result) expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) df.insert(0, "c", [1.3, 2.3]) result = df.rename(columns={}) - str(result) expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_set_value.py pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_set_value.py --- pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_set_value.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_set_value.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,7 +16,7 @@ float_frame._set_value(idx, col, 1) assert float_frame[col][idx] == 1 - def test_set_value_resize(self, float_frame): + def test_set_value_resize(self, float_frame, using_infer_string): res = float_frame._set_value("foobar", "B", 0) assert res is None assert float_frame.index[-1] == "foobar" @@ -27,13 +27,12 @@ res = float_frame.copy() res._set_value("foobar", "baz", "sam") - assert res["baz"].dtype == np.object_ - + if using_infer_string: + assert res["baz"].dtype == "string" + else: + assert res["baz"].dtype == np.object_ res = float_frame.copy() - with tm.assert_produces_warning( - FutureWarning, match="Setting an item of incompatible dtype" - ): - res._set_value("foobar", "baz", True) + res._set_value("foobar", "baz", True) assert res["baz"].dtype == np.object_ res = float_frame.copy() diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_setitem.py pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_setitem.py --- pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_setitem.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_setitem.py 2024-04-10 17:42:52.000000000 +0000 @@ -93,6 +93,11 @@ with pytest.raises(ValueError, match=msg): df["gr"] = df.groupby(["b", "c"]).count() + # GH 55956, specific message for zero columns + msg = "Cannot set a DataFrame without columns to the column gr" + with pytest.raises(ValueError, match=msg): + df["gr"] = DataFrame() + def test_setitem_benchmark(self): # from the vb_suite/frame_methods/frame_insert_columns N = 10 @@ -770,6 +775,35 @@ df[col_name] = df[[col_name]] tm.assert_frame_equal(df, expected) + def test_loc_setitem_ea_dtype(self): + # GH#55604 + df = DataFrame({"a": np.array([10], dtype="i8")}) + df.loc[:, "a"] = Series([11], dtype="Int64") + expected = DataFrame({"a": np.array([11], dtype="i8")}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": np.array([10], dtype="i8")}) + df.iloc[:, 0] = Series([11], dtype="Int64") + tm.assert_frame_equal(df, expected) + + def test_setitem_object_inferring(self): + # GH#56102 + idx = Index([Timestamp("2019-12-31")], dtype=object) + df = DataFrame({"a": [1]}) + with tm.assert_produces_warning(FutureWarning, match="infer"): + df.loc[:, "b"] = idx + with tm.assert_produces_warning(FutureWarning, match="infer"): + df["c"] = idx + + expected = DataFrame( + { + "a": [1], + "b": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"), + "c": Series([Timestamp("2019-12-31")], dtype="datetime64[ns]"), + } + ) + tm.assert_frame_equal(df, expected) + class TestSetitemTZAwareValues: @pytest.fixture @@ -814,7 +848,7 @@ class TestDataFrameSetItemWithExpansion: - def test_setitem_listlike_views(self, using_copy_on_write): + def 
test_setitem_listlike_views(self, using_copy_on_write, warn_copy_on_write): # GH#38148 df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]}) @@ -825,7 +859,8 @@ df[["c", "d"]] = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.5]]) # edit in place the first column to check view semantics - df.iloc[0, 0] = 100 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 100 if using_copy_on_write: expected = Series([1, 2, 3], name="a") @@ -866,8 +901,6 @@ # setting with a Categorical df["D"] = cat - str(df) - result = df.dtypes expected = Series( [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], @@ -877,8 +910,6 @@ # setting with a Series df["E"] = ser - str(df) - result = df.dtypes expected = Series( [ @@ -1280,23 +1311,22 @@ tm.assert_frame_equal(view, expected) @td.skip_array_manager_invalid_test - def test_setitem_column_update_inplace(self, using_copy_on_write): + def test_setitem_column_update_inplace( + self, using_copy_on_write, warn_copy_on_write + ): # https://github.com/pandas-dev/pandas/issues/47172 labels = [f"c{i}" for i in range(10)] df = DataFrame({col: np.zeros(len(labels)) for col in labels}, index=labels) values = df._mgr.blocks[0].values - if not using_copy_on_write: + with tm.raises_chained_assignment_error(): for label in df.columns: df[label][label] = 1 - + if not using_copy_on_write: # diagonal values all updated assert np.all(values[np.arange(10), np.arange(10)] == 1) else: - with tm.raises_chained_assignment_error(): - for label in df.columns: - df[label][label] = 1 # original dataframe not updated assert np.all(values[np.arange(10), np.arange(10)] == 0) @@ -1307,7 +1337,7 @@ df["col2"] = Series([1, 2, 3], dtype="category") expected_types = Series( - ["int64", "category", "category"], index=[0, "col1", "col2"] + ["int64", "category", "category"], index=[0, "col1", "col2"], dtype=object ) tm.assert_series_equal(df.dtypes, expected_types) @@ -1338,7 +1368,8 @@ def test_frame_setitem_empty_dataframe(self): # GH#28871 - df = DataFrame({"date": [datetime(2000, 1, 1)]}).set_index("date") + dti = DatetimeIndex(["2000-01-01"], dtype="M8[ns]", name="date") + df = DataFrame({"date": dti}).set_index("date") df = df[0:0].copy() df["3010"] = None @@ -1347,6 +1378,42 @@ expected = DataFrame( [], columns=["3010", "2010"], - index=Index([], dtype="datetime64[ns]", name="date"), + index=dti[:0], ) tm.assert_frame_equal(df, expected) + + +def test_full_setter_loc_incompatible_dtype(): + # https://github.com/pandas-dev/pandas/issues/55791 + df = DataFrame({"a": [1, 2]}) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "a"] = True + expected = DataFrame({"a": [True, True]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": [1, 2]}) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "a"] = {0: 3.5, 1: 4.5} + expected = DataFrame({"a": [3.5, 4.5]}) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": [1, 2]}) + df.loc[:, "a"] = {0: 3, 1: 4} + expected = DataFrame({"a": [3, 4]}) + tm.assert_frame_equal(df, expected) + + +def test_setitem_partial_row_multiple_columns(): + # https://github.com/pandas-dev/pandas/issues/56503 + df = DataFrame({"A": [1, 2, 3], "B": [4.0, 5, 6]}) + # should not warn + df.loc[df.index <= 1, ["F", "G"]] = (1, "abc") + expected = DataFrame( + { + "A": [1, 2, 3], + "B": [4.0, 5, 6], + "F": [1.0, 1, float("nan")], + "G": ["abc", "abc", float("nan")], + } + ) + tm.assert_frame_equal(df, expected) diff -Nru 
pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_where.py pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_where.py --- pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_where.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_where.py 2024-04-10 17:42:52.000000000 +0000 @@ -96,6 +96,7 @@ tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") def test_where_alignment(self, where_frame, float_string_frame): # aligning def _check_align(df, cond, other, check_dtypes=True): @@ -170,12 +171,13 @@ with pytest.raises(ValueError, match=msg): df.mask(0) + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") def test_where_set(self, where_frame, float_string_frame, mixed_int_frame): # where inplace def _check_set(df, cond, check_dtypes=True): dfi = df.copy() - econd = cond.reindex_like(df).fillna(True) + econd = cond.reindex_like(df).fillna(True).infer_objects(copy=False) expected = dfi.mask(~econd) return_value = dfi.where(cond, np.nan, inplace=True) @@ -356,7 +358,9 @@ expected = a.copy() expected[~do_not_replace] = b - result = a.where(do_not_replace, b) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = a.where(do_not_replace, b) tm.assert_frame_equal(result, expected) a = DataFrame({0: [4, 6], 1: [1, 0]}) @@ -366,7 +370,8 @@ expected = a.copy() expected[~do_not_replace] = b - result = a.where(do_not_replace, b) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = a.where(do_not_replace, b) tm.assert_frame_equal(result, expected) def test_where_datetime(self): @@ -712,12 +717,11 @@ # TODO: ideally we would get Int64 instead of object result = df.where(mask, ser, axis=0) - expected = DataFrame({"A": [1, pd.NA, 3], "B": [4, pd.NA, 6]}).astype(object) + expected = DataFrame({"A": [1, np.nan, 3], "B": [4, np.nan, 6]}) tm.assert_frame_equal(result, expected) ser2 = Series(arr[:2], index=["A", "B"]) - expected = DataFrame({"A": [1, 7, 3], "B": [4, pd.NA, 6]}) - expected["B"] = expected["B"].astype(object) + expected = DataFrame({"A": [1, 7, 3], "B": [4, np.nan, 6]}) result = df.where(mask, ser2, axis=1) tm.assert_frame_equal(result, expected) @@ -735,7 +739,10 @@ # GH#45768 obj = frame_or_series([pd.Interval(0, 0)] * 2) other = frame_or_series([1.0, 2.0]) - res = obj.where(~obj.notna(), other) + + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = obj.where(~obj.notna(), other) # since all entries are being changed, we will downcast result # from object to ints (not floats) @@ -761,7 +768,8 @@ # GH#45135, analogue to GH#44181 for Period don't raise on no-op # For td64/dt64/dt64tz we already don't raise, but also are # checking that we don't unnecessarily upcast to object. 
- ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype) + with tm.assert_produces_warning(FutureWarning, match="is deprecated"): + ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype) df = ser.to_frame() mask = np.array([False, False, False]) @@ -780,7 +788,9 @@ # opposite case where we are replacing *all* values -> we downcast # from object dtype # GH#45768 - res5 = df.where(mask2, 4) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + res5 = df.where(mask2, 4) expected = DataFrame(4, index=df.index, columns=df.columns) tm.assert_frame_equal(res5, expected) @@ -984,10 +994,18 @@ td = pd.Timedelta(days=1) - res = ser.where(mask, td) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = ser.where(mask, td) expected = Series([td, td, td], dtype="m8[ns]") tm.assert_series_equal(res, expected) + with pd.option_context("future.no_silent_downcasting", True): + with tm.assert_produces_warning(None, match=msg): + res2 = ser.where(mask, td) + expected2 = expected.astype(object) + tm.assert_series_equal(res2, expected2) + def _check_where_equivalences(df, mask, other, expected): # similar to tests.series.indexing.test_setitem.SetitemCastingEquivalences @@ -1059,9 +1077,13 @@ @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) -def test_where_int_overflow(replacement): +def test_where_int_overflow(replacement, using_infer_string, request): # GH 31687 df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) + if using_infer_string and replacement not in (None, "snake"): + request.node.add_marker( + pytest.mark.xfail(reason="Can't set non-string into string column") + ) result = df.where(pd.notnull(df), replacement) expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_xs.py pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_xs.py --- pandas-2.1.4+dfsg/pandas/tests/frame/indexing/test_xs.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/indexing/test_xs.py 2024-04-10 17:42:52.000000000 +0000 @@ -36,7 +36,9 @@ class TestXS: - def test_xs(self, float_frame, datetime_frame, using_copy_on_write): + def test_xs( + self, float_frame, datetime_frame, using_copy_on_write, warn_copy_on_write + ): float_frame_orig = float_frame.copy() idx = float_frame.index[5] xs = float_frame.xs(idx) @@ -66,7 +68,8 @@ # view is returned if possible series = float_frame.xs("A", axis=1) - series[:] = 5 + with tm.assert_cow_warning(warn_copy_on_write): + series[:] = 5 if using_copy_on_write: # but with CoW the view shouldn't propagate mutations tm.assert_series_equal(float_frame["A"], float_frame_orig["A"]) @@ -119,7 +122,9 @@ result = df.xs((2008, "sat"), level=["year", "day"], drop_level=False) tm.assert_frame_equal(result, expected) - def test_xs_view(self, using_array_manager, using_copy_on_write): + def test_xs_view( + self, using_array_manager, using_copy_on_write, warn_copy_on_write + ): # in 0.14 this will return a view if possible a copy otherwise, but # this is numpy dependent @@ -138,7 +143,8 @@ dm.xs(2)[:] = 20 assert not (dm.xs(2) == 20).any() else: - dm.xs(2)[:] = 20 + with tm.raises_chained_assignment_error(): + dm.xs(2)[:] = 20 assert (dm.xs(2) == 20).all() @@ -198,14 +204,17 @@ tm.assert_frame_equal(result, expected) def test_xs_setting_with_copy_error( - self, 
multiindex_dataframe_random_data, using_copy_on_write + self, + multiindex_dataframe_random_data, + using_copy_on_write, + warn_copy_on_write, ): # this is a copy in 0.14 df = multiindex_dataframe_random_data df_orig = df.copy() result = df.xs("two", level="second") - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: result[:] = 10 else: # setting this will give a SettingWithCopyError @@ -216,14 +225,14 @@ tm.assert_frame_equal(df, df_orig) def test_xs_setting_with_copy_error_multiple( - self, four_level_index_dataframe, using_copy_on_write + self, four_level_index_dataframe, using_copy_on_write, warn_copy_on_write ): # this is a copy in 0.14 df = four_level_index_dataframe df_orig = df.copy() result = df.xs(("a", 4), level=["one", "four"]) - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: result[:] = 10 else: # setting this will give a SettingWithCopyError @@ -391,14 +400,17 @@ expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) - def test_xs_droplevel_false_view(self, using_array_manager, using_copy_on_write): + def test_xs_droplevel_false_view( + self, using_array_manager, using_copy_on_write, warn_copy_on_write + ): # GH#37832 df = DataFrame([[1, 2, 3]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) # check that result still views the same data as df assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values) - df.iloc[0, 0] = 2 + with tm.assert_cow_warning(warn_copy_on_write): + df.iloc[0, 0] = 2 if using_copy_on_write: # with copy on write the subset is never modified expected = DataFrame({"a": [1]}) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_align.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_align.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_align.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_align.py 2024-04-10 17:42:52.000000000 +0000 @@ -23,8 +23,8 @@ df.align(df.iloc[::-1], method="asfreq") def test_frame_align_aware(self): - idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") - idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern") + idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern") + idx2 = date_range("2001", periods=5, freq="2h", tz="US/Eastern") df1 = DataFrame(np.random.default_rng(2).standard_normal((len(idx1), 3)), idx1) df2 = DataFrame(np.random.default_rng(2).standard_normal((len(idx2), 3)), idx2) new1, new2 = df1.align(df2) @@ -107,7 +107,7 @@ af, bf = float_frame.align( other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None ) - tm.assert_index_equal(bf.index, Index([])) + tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype)) msg = ( "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " @@ -117,7 +117,7 @@ af, bf = float_frame.align( other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 ) - tm.assert_index_equal(bf.index, Index([])) + tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype)) # Try to align DataFrame to Series along bad axis msg = "No axis named 2 for object type DataFrame" @@ -392,27 +392,57 @@ with pytest.raises(ValueError, match=r"axis=0 or 1"): df.align(series) - def _check_align(self, a, b, axis, fill_axis, how, method, limit=None): + @pytest.mark.parametrize("method", ["pad", "bfill"]) + @pytest.mark.parametrize("axis", [0, 1, None]) + @pytest.mark.parametrize("fill_axis", [0, 1]) + @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"]) + 
@pytest.mark.parametrize( + "left_slice", + [ + [slice(4), slice(10)], + [slice(0), slice(0)], + ], + ) + @pytest.mark.parametrize( + "right_slice", + [ + [slice(2, None), slice(6, None)], + [slice(0), slice(0)], + ], + ) + @pytest.mark.parametrize("limit", [1, None]) + def test_align_fill_method( + self, how, method, axis, fill_axis, float_frame, left_slice, right_slice, limit + ): + frame = float_frame + left = frame.iloc[left_slice[0], left_slice[1]] + right = frame.iloc[right_slice[0], right_slice[1]] + msg = ( "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " "are deprecated" ) with tm.assert_produces_warning(FutureWarning, match=msg): - aa, ab = a.align( - b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis + aa, ab = left.align( + right, + axis=axis, + join=how, + method=method, + limit=limit, + fill_axis=fill_axis, ) join_index, join_columns = None, None - ea, eb = a, b + ea, eb = left, right if axis is None or axis == 0: - join_index = a.index.join(b.index, how=how) + join_index = left.index.join(right.index, how=how) ea = ea.reindex(index=join_index) eb = eb.reindex(index=join_index) if axis is None or axis == 1: - join_columns = a.columns.join(b.columns, how=how) + join_columns = left.columns.join(right.columns, how=how) ea = ea.reindex(columns=join_columns) eb = eb.reindex(columns=join_columns) @@ -424,42 +454,6 @@ tm.assert_frame_equal(aa, ea) tm.assert_frame_equal(ab, eb) - @pytest.mark.parametrize("meth", ["pad", "bfill"]) - @pytest.mark.parametrize("ax", [0, 1, None]) - @pytest.mark.parametrize("fax", [0, 1]) - @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"]) - def test_align_fill_method(self, how, meth, ax, fax, float_frame): - df = float_frame - self._check_align_fill(df, how, meth, ax, fax) - - def _check_align_fill(self, frame, kind, meth, ax, fax): - left = frame.iloc[0:4, :10] - right = frame.iloc[2:, 6:] - empty = frame.iloc[:0, :0] - - self._check_align(left, right, axis=ax, fill_axis=fax, how=kind, method=meth) - self._check_align( - left, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 - ) - - # empty left - self._check_align(empty, right, axis=ax, fill_axis=fax, how=kind, method=meth) - self._check_align( - empty, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 - ) - - # empty right - self._check_align(left, empty, axis=ax, fill_axis=fax, how=kind, method=meth) - self._check_align( - left, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 - ) - - # both empty - self._check_align(empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth) - self._check_align( - empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 - ) - def test_align_series_check_copy(self): # GH# df = DataFrame({0: [1, 2]}) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_asfreq.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_asfreq.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_asfreq.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_asfreq.py 2024-04-10 17:42:52.000000000 +0000 @@ -32,23 +32,24 @@ datetime(2009, 11, 30), datetime(2009, 12, 31), ], - freq="BM", + dtype="M8[ns]", + freq="BME", ), ) daily_ts = ts.asfreq("B") - monthly_ts = daily_ts.asfreq("BM") + monthly_ts = daily_ts.asfreq("BME") tm.assert_equal(monthly_ts, ts) daily_ts = ts.asfreq("B", method="pad") - monthly_ts = daily_ts.asfreq("BM") + monthly_ts = daily_ts.asfreq("BME") tm.assert_equal(monthly_ts, ts) daily_ts = ts.asfreq(offsets.BDay()) 
monthly_ts = daily_ts.asfreq(offsets.BMonthEnd()) tm.assert_equal(monthly_ts, ts) - result = ts[:0].asfreq("M") + result = ts[:0].asfreq("ME") assert len(result) == 0 assert result is not ts @@ -63,8 +64,8 @@ def test_asfreq_datetimeindex_empty(self, frame_or_series): # GH#14320 index = DatetimeIndex(["2016-09-29 11:00"]) - expected = frame_or_series(index=index, dtype=object).asfreq("H") - result = frame_or_series([3], index=index.copy()).asfreq("H") + expected = frame_or_series(index=index, dtype=object).asfreq("h") + result = frame_or_series([3], index=index.copy()).asfreq("h") tm.assert_index_equal(expected.index, result.index) @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) @@ -76,7 +77,7 @@ ) # it works! - obj.asfreq("T") + obj.asfreq("min") def test_asfreq_normalize(self, frame_or_series): rng = date_range("1/1/2000 09:30", periods=20) @@ -104,7 +105,7 @@ assert index_name == obj.asfreq("10D").index.name def test_asfreq_ts(self, frame_or_series): - index = period_range(freq="A", start="1/1/2001", end="12/31/2010") + index = period_range(freq="Y", start="1/1/2001", end="12/31/2010") obj = DataFrame( np.random.default_rng(2).standard_normal((len(index), 3)), index=index ) @@ -140,12 +141,12 @@ def test_asfreq_empty(self, datetime_frame): # test does not blow up on length-0 DataFrame zero_length = datetime_frame.reindex([]) - result = zero_length.asfreq("BM") + result = zero_length.asfreq("BME") assert result is not zero_length def test_asfreq(self, datetime_frame): offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd()) - rule_monthly = datetime_frame.asfreq("BM") + rule_monthly = datetime_frame.asfreq("BME") tm.assert_frame_equal(offset_monthly, rule_monthly) @@ -170,7 +171,7 @@ # test for fill value during upsampling, related to issue 3715 # setup - rng = date_range("1/1/2016", periods=10, freq="2S") + rng = date_range("1/1/2016", periods=10, freq="2s") # Explicit cast to 'float' to avoid implicit cast when setting None ts = Series(np.arange(len(rng)), index=rng, dtype="float") df = DataFrame({"one": ts}) @@ -178,13 +179,13 @@ # insert pre-existing missing value df.loc["2016-01-01 00:00:08", "one"] = None - actual_df = df.asfreq(freq="1S", fill_value=9.0) - expected_df = df.asfreq(freq="1S").fillna(9.0) + actual_df = df.asfreq(freq="1s", fill_value=9.0) + expected_df = df.asfreq(freq="1s").fillna(9.0) expected_df.loc["2016-01-01 00:00:08", "one"] = None tm.assert_frame_equal(expected_df, actual_df) - expected_series = ts.asfreq(freq="1S").fillna(9.0) - actual_series = ts.asfreq(freq="1S", fill_value=9.0) + expected_series = ts.asfreq(freq="1s").fillna(9.0) + actual_series = ts.asfreq(freq="1s", fill_value=9.0) tm.assert_series_equal(expected_series, actual_series) def test_asfreq_with_date_object_index(self, frame_or_series): @@ -194,8 +195,8 @@ ts2 = ts.copy() ts2.index = [x.date() for x in ts2.index] - result = ts2.asfreq("4H", method="ffill") - expected = ts.asfreq("4H", method="ffill") + result = ts2.asfreq("4h", method="ffill") + expected = ts.asfreq("4h", method="ffill") tm.assert_equal(result, expected) def test_asfreq_with_unsorted_index(self, frame_or_series): @@ -221,11 +222,11 @@ @pytest.mark.parametrize( "freq, freq_half", [ - ("2M", "M"), + ("2ME", "ME"), (MonthEnd(2), MonthEnd(1)), ], ) - def test_asfreq_2M(self, freq, freq_half): + def test_asfreq_2ME(self, freq, freq_half): index = date_range("1/1/2000", periods=6, freq=freq_half) df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], index=index)}) expected = df.asfreq(freq=freq) @@ 
-233,3 +234,30 @@ index = date_range("1/1/2000", periods=3, freq=freq) result = DataFrame({"s": Series([0.0, 2.0, 4.0], index=index)}) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "freq, freq_depr", + [ + ("2ME", "2M"), + ("2QE", "2Q"), + ("2QE-SEP", "2Q-SEP"), + ("1BQE", "1BQ"), + ("2BQE-SEP", "2BQ-SEP"), + ("1YE", "1Y"), + ("2YE-MAR", "2Y-MAR"), + ("1YE", "1A"), + ("2YE-MAR", "2A-MAR"), + ("2BYE-MAR", "2BA-MAR"), + ], + ) + def test_asfreq_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): + # GH#9586, #55978 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + f"in a future version, please use '{freq[1:]}' instead." + + index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}") + df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)}) + expected = df.asfreq(freq=freq) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = df.asfreq(freq=freq_depr) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_asof.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_asof.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_asof.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_asof.py 2024-04-10 17:42:52.000000000 +0000 @@ -178,7 +178,7 @@ def test_asof_periodindex_mismatched_freq(self): N = 50 - rng = period_range("1/1/1990", periods=N, freq="H") + rng = period_range("1/1/1990", periods=N, freq="h") df = DataFrame(np.random.default_rng(2).standard_normal(N), index=rng) # Mismatched freq diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_astype.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_astype.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_astype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_astype.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,7 +3,6 @@ import numpy as np import pytest -from pandas.compat import pa_version_under7p0 import pandas.util._test_decorators as td import pandas as pd @@ -68,9 +67,19 @@ casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float16") _check_cast(casted, "float16") - def test_astype_mixed_type(self, mixed_type_frame): + def test_astype_mixed_type(self): # mixed casting - mn = mixed_type_frame._get_numeric_data().copy() + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "float32": np.array([1.0] * 10, dtype="float32"), + "int32": np.array([1] * 10, dtype="int32"), + }, + index=np.arange(10), + ) + mn = df._get_numeric_data().copy() mn["little_float"] = np.array(12345.0, dtype="float16") mn["big_float"] = np.array(123456789101112.0, dtype="float64") @@ -157,7 +166,8 @@ "c": [Timedelta(x)._repr_base() for x in c._values], "d": list(map(str, d._values)), "e": list(map(str, e._values)), - } + }, + dtype="object", ) tm.assert_frame_equal(result, expected) @@ -165,13 +175,13 @@ def test_astype_str_float(self): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"]) + expected = DataFrame(["nan"], dtype="object") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = "1.1234567890123457" - expected = DataFrame([val]) + expected = DataFrame([val], dtype="object") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) @@ -190,7 +200,7 @@ expected = DataFrame( { "a": a, - "b": Series(["0", "1", "2", "3", "4"]), + "b": Series(["0", "1", "2", "3", "4"], dtype="object"), "c": c, "d": 
Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), } @@ -273,7 +283,7 @@ result = df.astype(dtypes) expected = DataFrame( { - 0: vals[:, 0].astype(str), + 0: Series(vals[:, 0].astype(str), dtype=object), 1: vals[:, 1], 2: pd.array(vals[:, 2], dtype="Float64"), 3: vals[:, 3], @@ -383,7 +393,12 @@ ["2017-01-01", "2017-01-02", "2017-02-03"], ] df = DataFrame(vals, dtype=object) - with pytest.raises(TypeError, match="Cannot cast"): + msg = ( + rf"Unexpected value for 'dtype': 'datetime64\[{unit}\]'. " + r"Must be 'datetime64\[s\]', 'datetime64\[ms\]', 'datetime64\[us\]', " + r"'datetime64\[ns\]' or DatetimeTZDtype" + ) + with pytest.raises(ValueError, match=msg): df.astype(f"M8[{unit}]") @pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"]) @@ -606,6 +621,7 @@ {"a": 2.2, "b": "15.3", "c": "another_test"}, ] ) + expected["c"] = expected["c"].astype("object") type_dict = {"a": "float64", "b": "float64", "c": "object"} result = df.astype(dtype=type_dict, errors="ignore") @@ -666,6 +682,7 @@ ], ], columns=timezone_frame.columns, + dtype="object", ) tm.assert_frame_equal(result, expected) @@ -740,7 +757,9 @@ result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) - def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture): + def test_astype_dt64_to_string( + self, frame_or_series, tz_naive_fixture, using_infer_string + ): # GH#41409 tz = tz_naive_fixture @@ -758,7 +777,10 @@ item = result.iloc[0] if frame_or_series is DataFrame: item = item.iloc[0] - assert item is pd.NA + if using_infer_string: + assert item is np.nan + else: + assert item is pd.NA # For non-NA values, we should match what we get for non-EA str alt = obj.astype(str) @@ -868,10 +890,10 @@ assert np.shares_memory(df.b.values, result.b.values) -@pytest.mark.skipif(pa_version_under7p0, reason="pyarrow is required for this test") @pytest.mark.parametrize("dtype", ["int64", "Int64"]) def test_astype_copies(dtype): # GH#50984 + pytest.importorskip("pyarrow") df = DataFrame({"a": [1, 2, 3]}, dtype=dtype) result = df.astype("int64[pyarrow]", copy=True) df.iloc[0, 0] = 100 diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_at_time.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_at_time.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_at_time.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_at_time.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,7 +18,7 @@ def test_localized_at_time(self, tzstr, frame_or_series): tz = timezones.maybe_get_tz(tzstr) - rng = date_range("4/16/2012", "5/1/2012", freq="H") + rng = date_range("4/16/2012", "5/1/2012", freq="h") ts = frame_or_series( np.random.default_rng(2).standard_normal(len(rng)), index=rng ) @@ -69,7 +69,7 @@ ) def test_at_time_errors(self, hour): # GH#24043 - dti = date_range("2018", periods=3, freq="H") + dti = date_range("2018", periods=3, freq="h") df = DataFrame(list(range(len(dti))), index=dti) if getattr(hour, "tzinfo", None) is None: result = df.at_time(hour) @@ -81,7 +81,7 @@ def test_at_time_tz(self): # GH#24043 - dti = date_range("2018", periods=3, freq="H", tz="US/Pacific") + dti = date_range("2018", periods=3, freq="h", tz="US/Pacific") df = DataFrame(list(range(len(dti))), index=dti) result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern"))) expected = df.iloc[1:2] diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_between_time.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_between_time.py --- 
pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_between_time.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_between_time.py 2024-04-10 17:42:52.000000000 +0000 @@ -46,7 +46,7 @@ def test_localized_between_time(self, tzstr, frame_or_series): tz = timezones.maybe_get_tz(tzstr) - rng = date_range("4/16/2012", "5/1/2012", freq="H") + rng = date_range("4/16/2012", "5/1/2012", freq="h") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) if frame_or_series is DataFrame: ts = ts.to_frame() diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_clip.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_clip.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_clip.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_clip.py 2024-04-10 17:42:52.000000000 +0000 @@ -94,9 +94,13 @@ (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]), ], ) - def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res): + def test_clip_against_list_like(self, inplace, lower, axis, res): # GH#15390 - original = simple_frame.copy(deep=True) + arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) + + original = DataFrame( + arr, columns=["one", "two", "three"], index=["a", "b", "c"] + ) result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) @@ -151,7 +155,11 @@ # GH#19992 and adjusted in GH#40420 df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) - result = df.clip(lower=[4, 5, np.nan], axis=0) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + # TODO: avoid this warning here? seems like we should never be upcasting + # in the first place? + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.clip(lower=[4, 5, np.nan], axis=0) expected = DataFrame( {"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]} ) @@ -167,7 +175,8 @@ data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]} df = DataFrame(data) t = Series([2, -4, np.nan, 6, 3]) - result = df.clip(lower=t, axis=0) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.clip(lower=t, axis=0) expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]}) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_combine_first.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_combine_first.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_combine_first.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_combine_first.py 2024-04-10 17:42:52.000000000 +0000 @@ -30,14 +30,14 @@ combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) - def test_combine_first(self, float_frame): + def test_combine_first(self, float_frame, using_infer_string): # disjoint head, tail = float_frame[:5], float_frame[5:] combined = head.combine_first(tail) reordered_frame = float_frame.reindex(combined.index) tm.assert_frame_equal(combined, reordered_frame) - assert tm.equalContents(combined.columns, float_frame.columns) + tm.assert_index_equal(combined.columns, float_frame.columns) tm.assert_series_equal(combined["A"], reordered_frame["A"]) # same index @@ -76,11 +76,13 @@ tm.assert_series_equal(combined["A"].reindex(g.index), g["A"]) # corner cases - comb = float_frame.combine_first(DataFrame()) + warning = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warning, 
match="empty entries"): + comb = float_frame.combine_first(DataFrame()) tm.assert_frame_equal(comb, float_frame) comb = DataFrame().combine_first(float_frame) - tm.assert_frame_equal(comb, float_frame) + tm.assert_frame_equal(comb, float_frame.sort_index()) comb = float_frame.combine_first(DataFrame(index=["faz", "boo"])) assert "faz" in comb.index @@ -218,15 +220,15 @@ # TODO: this must be int64 assert res["b"].dtype == "int64" - def test_combine_first_timezone(self): + def test_combine_first_timezone(self, unit): # see gh-7630 - data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC") + data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC").as_unit(unit) df1 = DataFrame( columns=["UTCdatetime", "abc"], data=data1, index=pd.date_range("20140627", periods=1), ) - data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") + data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC").as_unit(unit) df2 = DataFrame( columns=["UTCdatetime", "xyz"], data=data2, @@ -243,29 +245,32 @@ }, columns=["UTCdatetime", "abc"], index=pd.date_range("20140627", periods=2, freq="D"), + dtype=f"datetime64[{unit}, UTC]", ) - assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]" - assert res["abc"].dtype == "datetime64[ns, UTC]" + assert res["UTCdatetime"].dtype == f"datetime64[{unit}, UTC]" + assert res["abc"].dtype == f"datetime64[{unit}, UTC]" tm.assert_frame_equal(res, exp) + def test_combine_first_timezone2(self, unit): # see gh-10567 - dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC") + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC", unit=unit) df1 = DataFrame({"DATE": dts1}) - dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC") + dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC", unit=unit) df2 = DataFrame({"DATE": dts2}) res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) - assert res["DATE"].dtype == "datetime64[ns, UTC]" + assert res["DATE"].dtype == f"datetime64[{unit}, UTC]" + def test_combine_first_timezone3(self, unit): dts1 = pd.DatetimeIndex( ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern" - ) + ).as_unit(unit) df1 = DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) dts2 = pd.DatetimeIndex( ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern" - ) + ).as_unit(unit) df2 = DataFrame({"DATE": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) @@ -279,10 +284,12 @@ "2011-01-04", ], tz="US/Eastern", - ) + ).as_unit(unit) exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) + # FIXME: parametrizing over unit breaks on non-nano + def test_combine_first_timezone4(self): # different tz dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") df1 = DataFrame({"DATE": dts1}) @@ -294,9 +301,10 @@ tm.assert_frame_equal(res, df1) assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" - dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern") + def test_combine_first_timezone5(self, unit): + dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern", unit=unit) df1 = DataFrame({"DATE": dts1}) - dts2 = pd.date_range("2015-01-01", "2015-01-03") + dts2 = pd.date_range("2015-01-01", "2015-01-03", unit=unit) df2 = DataFrame({"DATE": dts2}) res = df1.combine_first(df2) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_convert_dtypes.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_convert_dtypes.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_convert_dtypes.py 2023-12-08 14:17:35.000000000 +0000 +++ 
pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_convert_dtypes.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,9 +11,13 @@ @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes(self, convert_integer, expected, string_storage): + def test_convert_dtypes( + self, convert_integer, expected, string_storage, using_infer_string + ): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here + if using_infer_string: + string_storage = "pyarrow_numpy" df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), @@ -175,3 +179,24 @@ expected = ser.astype("timestamp[ms][pyarrow]") result = expected.convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected) + + def test_convert_dtypes_avoid_block_splitting(self): + # GH#55341 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"}) + result = df.convert_dtypes(convert_integer=False) + expected = pd.DataFrame( + { + "a": [1, 2, 3], + "b": [4, 5, 6], + "c": pd.Series(["a"] * 3, dtype="string[python]"), + } + ) + tm.assert_frame_equal(result, expected) + assert result._mgr.nblocks == 2 + + def test_convert_dtypes_from_arrow(self): + # GH#56581 + df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"]) + result = df.convert_dtypes() + expected = df.astype({"a": "string[python]"}) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_copy.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_copy.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_copy.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_copy.py 2024-04-10 17:42:52.000000000 +0000 @@ -56,7 +56,7 @@ } ) - for i in range(0, 10): + for i in range(10): df.loc[:, f"n_{i}"] = np.random.default_rng(2).integers(0, 100, size=55) assert len(df._mgr.blocks) == 11 diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_cov_corr.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_cov_corr.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_cov_corr.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_cov_corr.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,7 +6,9 @@ import pandas as pd from pandas import ( DataFrame, + Index, Series, + date_range, isna, ) import pandas._testing as tm @@ -106,7 +108,7 @@ pytest.importorskip("scipy") float_frame.loc[float_frame.index[:5], "A"] = np.nan float_frame.loc[float_frame.index[5:10], "B"] = np.nan - float_frame.loc[float_frame.index[:10], "A"] = float_frame["A"][10:20] + float_frame.loc[float_frame.index[:10], "A"] = float_frame["A"][10:20].copy() correls = float_frame.corr(method=method) expected = float_frame["A"].corr(float_frame["C"], method=method) @@ -205,7 +207,7 @@ expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected) - def test_corr_item_cache(self, using_copy_on_write): + def test_corr_item_cache(self, using_copy_on_write, warn_copy_on_write): # Check that corr does not lead to incorrect entries in item_cache df = DataFrame({"A": range(10)}) @@ -223,7 +225,8 @@ # Check that the corr didn't break link between ser and df ser.values[0] = 99 assert df.loc[0, "A"] == 99 - assert df["A"] is ser + if not warn_copy_on_write: + assert df["A"] is ser assert df.values[0, 0] == 99 @pytest.mark.parametrize("length", [2, 20, 200, 2000]) @@ -323,16 +326,26 @@ for row in index[:4]: 
tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) - def test_corrwith_with_objects(self): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame() + def test_corrwith_with_objects(self, using_infer_string): + df1 = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + df2 = df1.copy() cols = ["A", "B", "C", "D"] df1["obj"] = "foo" df2["obj"] = "bar" - with pytest.raises(TypeError, match="Could not convert"): - df1.corrwith(df2) + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + df1.corrwith(df2) + else: + with pytest.raises(TypeError, match="Could not convert"): + df1.corrwith(df2) result = df1.corrwith(df2, numeric_only=True) expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_describe.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_describe.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_describe.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_describe.py 2024-04-10 17:42:52.000000000 +0000 @@ -204,7 +204,7 @@ def test_describe_timedelta_values(self): # GH#6145 t1 = pd.timedelta_range("1 days", freq="D", periods=5) - t2 = pd.timedelta_range("1 hours", freq="H", periods=5) + t2 = pd.timedelta_range("1 hours", freq="h", periods=5) df = DataFrame({"t1": t1, "t2": t2}) expected = DataFrame( diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_diff.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_diff.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_diff.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_diff.py 2024-04-10 17:42:52.000000000 +0000 @@ -70,15 +70,17 @@ tm.assert_equal(result, expected) @pytest.mark.parametrize("tz", [None, "UTC"]) - def test_diff_datetime_axis0_with_nat(self, tz): + def test_diff_datetime_axis0_with_nat(self, tz, unit): # GH#32441 - dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz) + dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz).as_unit(unit) ser = Series(dti) df = ser.to_frame() result = df.diff() - ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)]) + ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)]).as_unit( + unit + ) expected = Series(ex_index).to_frame() tm.assert_frame_equal(result, expected) @@ -87,7 +89,7 @@ # diff on NaT values should give NaT, not timedelta64(0) dti = date_range("2016-01-01", periods=4, tz=tz) ser = Series(dti) - df = ser.to_frame() + df = ser.to_frame().copy() df[1] = ser.copy() @@ -140,7 +142,7 @@ ) tm.assert_frame_equal(result, expected) - def test_diff_timedelta(self): + def test_diff_timedelta(self, unit): # GH#4533 df = DataFrame( { @@ -148,11 +150,13 @@ "value": [1.0, 2.0], } ) + df["time"] = df["time"].dt.as_unit(unit) res = df.diff() exp = DataFrame( [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"] ) + exp["time"] = exp["time"].dt.as_unit(unit) tm.assert_frame_equal(res, exp) def test_diff_mixed_dtype(self): diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_drop.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_drop.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_drop.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_drop.py 
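The test_corrwith_with_objects hunk above (like the later test_first_and_last and test_first_valid_index hunks) replaces the tm.makeTimeDataFrame and tm.makeStringIndex helpers, which are being phased out of pandas._testing, with explicit construction. A sketch of the inline pattern the updated tests use (shape, seed and labels mirror the hunk, otherwise arbitrary):

import numpy as np
from pandas import DataFrame, Index, date_range

# Explicit stand-in for the retired tm.makeTimeDataFrame(freq="B") helper
df = DataFrame(
    np.random.default_rng(2).standard_normal((10, 4)),
    columns=Index(list("ABCD"), dtype=object),
    index=date_range("2000-01-01", periods=10, freq="B"),
)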
2024-04-10 17:42:52.000000000 +0000 @@ -57,7 +57,7 @@ df = DataFrame( np.random.default_rng(2).standard_normal((5, 3)), columns=["a", "b", "c"], - index=pd.date_range("2012", freq="H", periods=5), + index=pd.date_range("2012", freq="h", periods=5), ) # create dataframe with non-unique datetime index df = df.iloc[[0, 2, 2, 3]].copy() @@ -510,7 +510,7 @@ def test_drop_inplace_no_leftover_column_reference(self): # GH 13934 - df = DataFrame({"a": [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object")) a = df.a df.drop(["a"], axis=1, inplace=True) tm.assert_index_equal(df.columns, Index([], dtype="object")) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_drop_duplicates.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_drop_duplicates.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_drop_duplicates.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_drop_duplicates.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,7 +16,7 @@ def test_drop_duplicates_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) - msg = re.escape("Index(['a'], dtype='object')") + msg = re.escape("Index(['a'], dtype=") with pytest.raises(KeyError, match=msg): df.drop_duplicates(subset) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_dtypes.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_dtypes.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_dtypes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_dtypes.py 2024-04-10 17:42:52.000000000 +0000 @@ -142,9 +142,12 @@ ) tm.assert_series_equal(result, expected) - def test_frame_apply_np_array_return_type(self): + def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) result = df.apply(lambda col: np.array("bar")) - expected = Series(["bar"]) + if using_infer_string: + expected = Series([np.array(["bar"])]) + else: + expected = Series(["bar"]) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_duplicated.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_duplicated.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_duplicated.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_duplicated.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,7 +16,7 @@ def test_duplicated_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) - msg = re.escape("Index(['a'], dtype='object')") + msg = re.escape("Index(['a'], dtype=") with pytest.raises(KeyError, match=msg): df.duplicated(subset) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_equals.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_equals.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_equals.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_equals.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,11 +14,11 @@ df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]}) assert df1.equals(df2) is False - def test_equals_different_blocks(self, using_array_manager): + def test_equals_different_blocks(self, using_array_manager, using_infer_string): # GH#9330 df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) df1 = df0.reset_index()[["A", "B", "C"]] - if not using_array_manager: + if not using_array_manager and not using_infer_string: # this assert verifies that the 
above operations have # induced a block rearrangement assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype @@ -35,7 +35,7 @@ np.random.default_rng(2).random(10), index=index, columns=["floats"] ) df1["text"] = "the sky is so blue. we could use more chocolate.".split() - df1["start"] = date_range("2000-1-1", periods=10, freq="T") + df1["start"] = date_range("2000-1-1", periods=10, freq="min") df1["end"] = date_range("2000-1-1", periods=10, freq="D") df1["diff"] = df1["end"] - df1["start"] # Explicitly cast to object, to avoid implicit cast when setting np.nan @@ -66,7 +66,7 @@ assert not df1.equals(different) # DatetimeIndex - index = date_range("2000-1-1", periods=10, freq="T") + index = date_range("2000-1-1", periods=10, freq="min") df1 = df1.set_index(index) df2 = df1.copy() assert df1.equals(df2) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_explode.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_explode.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_explode.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_explode.py 2024-04-10 17:42:52.000000000 +0000 @@ -203,7 +203,7 @@ ) def test_duplicate_index(input_dict, input_index, expected_dict, expected_index): # GH 28005 - df = pd.DataFrame(input_dict, index=input_index) + df = pd.DataFrame(input_dict, index=input_index, dtype=object) result = df.explode("col1") expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_fillna.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_fillna.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_fillna.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_fillna.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -20,14 +22,18 @@ class TestFillNA: - def test_fillna_dict_inplace_nonunique_columns(self, using_copy_on_write): + def test_fillna_dict_inplace_nonunique_columns( + self, using_copy_on_write, warn_copy_on_write + ): df = DataFrame( {"A": [np.nan] * 3, "B": [NaT, Timestamp(1), NaT], "C": [np.nan, "foo", 2]} ) df.columns = ["A", "A", "A"] orig = df[:] - df.fillna({"A": 2}, inplace=True) + # TODO(CoW-warn) better warning message + with tm.assert_cow_warning(warn_copy_on_write): + df.fillna({"A": 2}, inplace=True) # The first and third columns can be set inplace, while the second cannot. expected = DataFrame( @@ -54,7 +60,8 @@ df[0].fillna(-1, inplace=True) assert np.isnan(arr[:, 0]).all() else: - df[0].fillna(-1, inplace=True) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df[0].fillna(-1, inplace=True) assert (arr[:, 0] == -1).all() # i.e. 
we didn't create a new 49-column block @@ -84,6 +91,7 @@ with pytest.raises(ValueError, match=msg): datetime_frame.fillna(5, method="ffill") + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan @@ -117,19 +125,27 @@ df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) - def test_fillna_different_dtype(self): + def test_fillna_different_dtype(self, using_infer_string): # with different dtype (GH#3386) df = DataFrame( [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - result = df.fillna({2: "foo"}) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.fillna({2: "foo"}) + else: + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) tm.assert_frame_equal(result, expected) - return_value = df.fillna({2: "foo"}, inplace=True) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + return_value = df.fillna({2: "foo"}, inplace=True) + else: + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -353,20 +369,26 @@ expected["A"] = 0.0 tm.assert_frame_equal(result, expected) - def test_fillna_dtype_conversion(self): + def test_fillna_dtype_conversion(self, using_infer_string): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) result = df.dtypes expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5]) tm.assert_series_equal(result, expected) - result = df.fillna(1) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.fillna(1) expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) tm.assert_frame_equal(result, expected) # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - result = df.fillna("nan") + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.fillna("nan") + else: + result = df.fillna("nan") expected = DataFrame("nan", index=range(3), columns=["A", "B"]) tm.assert_frame_equal(result, expected) @@ -642,6 +664,7 @@ filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan @@ -731,12 +754,15 @@ @td.skip_array_manager_invalid_test @pytest.mark.parametrize("val", [-1, {"x": -1, "y": -1}]) - def test_inplace_dict_update_view(self, val, using_copy_on_write): + def test_inplace_dict_update_view( + self, val, using_copy_on_write, warn_copy_on_write + ): # GH#47188 df = DataFrame({"x": [np.nan, 2], "y": [np.nan, 2]}) df_orig = df.copy() result_view = df[:] - df.fillna(val, inplace=True) + with tm.assert_cow_warning(warn_copy_on_write): + df.fillna(val, inplace=True) expected = DataFrame({"x": [-1, 2.0], "y": [-1.0, 2]}) tm.assert_frame_equal(df, expected) if using_copy_on_write: @@ -817,7 +843,8 @@ [[None, None], [None, None]], columns=["A", "B"], ) - with tm.assert_produces_warning(False): + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): df.fillna(value={"A": 1, "B": 
2}, inplace=True) expected = DataFrame([[1, 2], [1, 2]], columns=["A", "B"]) @@ -830,3 +857,76 @@ df = DataFrame({"a": [1, 2, 3]}) with tm.assert_produces_warning(FutureWarning): getattr(df, func)() + + +@pytest.mark.parametrize( + "data, expected_data, method, kwargs", + ( + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan], + "ffill", + {"limit_area": "inside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan], + "ffill", + {"limit_area": "inside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0], + "ffill", + {"limit_area": "outside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan], + "ffill", + {"limit_area": "outside", "limit": 1}, + ), + ( + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + "ffill", + {"limit_area": "outside", "limit": 1}, + ), + ( + range(5), + range(5), + "ffill", + {"limit_area": "outside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "inside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "inside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "outside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "outside", "limit": 1}, + ), + ), +) +def test_ffill_bfill_limit_area(data, expected_data, method, kwargs): + # GH#56492 + df = DataFrame(data) + expected = DataFrame(expected_data) + result = getattr(df, method)(**kwargs) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_first_and_last.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_first_and_last.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_first_and_last.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_first_and_last.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,12 +1,15 @@ """ Note: includes tests for `last` """ +import numpy as np import pytest import pandas as pd from pandas import ( DataFrame, + Index, bdate_range, + date_range, ) import pandas._testing as tm @@ -16,20 +19,28 @@ class TestFirst: def test_first_subset(self, frame_or_series): - ts = tm.makeTimeDataFrame(freq="12h") + ts = DataFrame( + np.random.default_rng(2).standard_normal((100, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100, freq="12h"), + ) ts = tm.get_obj(ts, frame_or_series) with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): result = ts.first("10d") assert len(result) == 20 - ts = tm.makeTimeDataFrame(freq="D") + ts = DataFrame( + np.random.default_rng(2).standard_normal((100, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100, freq="D"), + ) ts = tm.get_obj(ts, frame_or_series) with 
tm.assert_produces_warning(FutureWarning, match=deprecated_msg): result = ts.first("10d") assert len(result) == 10 with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = ts.first("3M") + result = ts.first("3ME") expected = ts[:"3/31/2000"] tm.assert_equal(result, expected) @@ -39,7 +50,7 @@ tm.assert_equal(result, expected) with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = ts[:0].first("3M") + result = ts[:0].first("3ME") tm.assert_equal(result, ts[:0]) def test_first_last_raises(self, frame_or_series): @@ -64,13 +75,21 @@ obj.last("1D") def test_last_subset(self, frame_or_series): - ts = tm.makeTimeDataFrame(freq="12h") + ts = DataFrame( + np.random.default_rng(2).standard_normal((100, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100, freq="12h"), + ) ts = tm.get_obj(ts, frame_or_series) with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): result = ts.last("10d") assert len(result) == 20 - ts = tm.makeTimeDataFrame(nper=30, freq="D") + ts = DataFrame( + np.random.default_rng(2).standard_normal((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=30, freq="D"), + ) ts = tm.get_obj(ts, frame_or_series) with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): result = ts.last("10d") @@ -87,7 +106,7 @@ tm.assert_equal(result, expected) with tm.assert_produces_warning(FutureWarning, match=last_deprecated_msg): - result = ts[:0].last("3M") + result = ts[:0].last("3ME") tm.assert_equal(result, ts[:0]) @pytest.mark.parametrize("start, periods", [("2010-03-31", 1), ("2010-03-30", 2)]) @@ -95,7 +114,7 @@ # GH#29623 x = frame_or_series([1] * 100, index=bdate_range(start, periods=100)) with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = x.first("1M") + result = x.first("1ME") expected = frame_or_series( [1] * periods, index=bdate_range(start, periods=periods) ) @@ -105,7 +124,7 @@ # GH#29623 x = frame_or_series([1] * 100, index=bdate_range("2010-03-31", periods=100)) with tm.assert_produces_warning(FutureWarning, match=deprecated_msg): - result = x.first("2M") + result = x.first("2ME") expected = frame_or_series( [1] * 23, index=bdate_range("2010-03-31", "2010-04-30") ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_first_valid_index.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_first_valid_index.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_first_valid_index.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_first_valid_index.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,9 +6,10 @@ from pandas import ( DataFrame, + Index, Series, + date_range, ) -import pandas._testing as tm class TestFirstValidIndex: @@ -44,11 +45,12 @@ assert expected_first == df.first_valid_index() assert expected_last == df.last_valid_index() - @pytest.mark.parametrize("index_func", [tm.makeStringIndex, tm.makeDateIndex]) - def test_first_last_valid(self, index_func): - N = 30 - index = index_func(N) - mat = np.random.default_rng(2).standard_normal(N) + @pytest.mark.parametrize( + "index", + [Index([str(i) for i in range(20)]), date_range("2020-01-01", periods=20)], + ) + def test_first_last_valid(self, index): + mat = np.random.default_rng(2).standard_normal(len(index)) mat[:5] = np.nan mat[-5:] = np.nan @@ -60,10 +62,12 @@ assert ser.first_valid_index() == frame.index[5] assert ser.last_valid_index() == frame.index[-6] - 
@pytest.mark.parametrize("index_func", [tm.makeStringIndex, tm.makeDateIndex]) - def test_first_last_valid_all_nan(self, index_func): + @pytest.mark.parametrize( + "index", + [Index([str(i) for i in range(10)]), date_range("2020-01-01", periods=10)], + ) + def test_first_last_valid_all_nan(self, index): # GH#17400: no valid entries - index = index_func(30) frame = DataFrame(np.nan, columns=["foo"], index=index) assert frame.last_valid_index() is None diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_get_numeric_data.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_get_numeric_data.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_get_numeric_data.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_get_numeric_data.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,12 +15,12 @@ class TestGetNumericData: def test_get_numeric_data_preserve_dtype(self): # get the numeric data - obj = DataFrame({"A": [1, "2", 3.0]}) + obj = DataFrame({"A": [1, "2", 3.0]}, columns=Index(["A"], dtype="object")) result = obj._get_numeric_data() expected = DataFrame(dtype=object, index=pd.RangeIndex(3), columns=[]) tm.assert_frame_equal(result, expected) - def test_get_numeric_data(self): + def test_get_numeric_data(self, using_infer_string): datetime64name = np.dtype("M8[s]").name objectname = np.dtype(np.object_).name @@ -33,7 +33,7 @@ [ np.dtype("float64"), np.dtype("int64"), - np.dtype(objectname), + np.dtype(objectname) if not using_infer_string else "string", np.dtype(datetime64name), ], index=["a", "b", "c", "f"], diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_info.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_info.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_info.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_info.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,565 @@ +from io import StringIO +import re +from string import ascii_uppercase +import sys +import textwrap + +import numpy as np +import pytest + +from pandas.compat import ( + IS64, + PYPY, +) + +from pandas import ( + CategoricalIndex, + DataFrame, + MultiIndex, + Series, + date_range, + option_context, +) +import pandas._testing as tm + + +@pytest.fixture +def duplicate_columns_frame(): + """Dataframe with duplicate column names.""" + return DataFrame( + np.random.default_rng(2).standard_normal((1500, 4)), + columns=["a", "a", "b", "b"], + ) + + +def test_info_empty(): + # GH #45494 + df = DataFrame() + buf = StringIO() + df.info(buf=buf) + result = buf.getvalue() + expected = textwrap.dedent( + """\ + + RangeIndex: 0 entries + Empty DataFrame\n""" + ) + assert result == expected + + +def test_info_categorical_column_smoke_test(): + n = 2500 + df = DataFrame({"int64": np.random.default_rng(2).integers(100, size=n, dtype=int)}) + df["category"] = Series( + np.array(list("abcdefghij")).take( + np.random.default_rng(2).integers(0, 10, size=n, dtype=int) + ) + ).astype("category") + df.isna() + buf = StringIO() + df.info(buf=buf) + + df2 = df[df["category"] == "d"] + buf = StringIO() + df2.info(buf=buf) + + +@pytest.mark.parametrize( + "fixture_func_name", + [ + "int_frame", + "float_frame", + "datetime_frame", + "duplicate_columns_frame", + "float_string_frame", + ], +) +def test_info_smoke_test(fixture_func_name, request): + frame = request.getfixturevalue(fixture_func_name) + buf = StringIO() + frame.info(buf=buf) + result = buf.getvalue().splitlines() + assert len(result) > 10 + + buf = StringIO() + 
frame.info(buf=buf, verbose=False) + + +def test_info_smoke_test2(float_frame): + # pretty useless test, used to be mixed into the repr tests + buf = StringIO() + float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf) + float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf) + + # no columns or index + DataFrame().info(buf=buf) + + +@pytest.mark.parametrize( + "num_columns, max_info_columns, verbose", + [ + (10, 100, True), + (10, 11, True), + (10, 10, True), + (10, 9, False), + (10, 1, False), + ], +) +def test_info_default_verbose_selection(num_columns, max_info_columns, verbose): + frame = DataFrame(np.random.default_rng(2).standard_normal((5, num_columns))) + with option_context("display.max_info_columns", max_info_columns): + io_default = StringIO() + frame.info(buf=io_default) + result = io_default.getvalue() + + io_explicit = StringIO() + frame.info(buf=io_explicit, verbose=verbose) + expected = io_explicit.getvalue() + + assert result == expected + + +def test_info_verbose_check_header_separator_body(): + buf = StringIO() + size = 1001 + start = 5 + frame = DataFrame(np.random.default_rng(2).standard_normal((3, size))) + frame.info(verbose=True, buf=buf) + + res = buf.getvalue() + header = " # Column Dtype \n--- ------ ----- " + assert header in res + + frame.info(verbose=True, buf=buf) + buf.seek(0) + lines = buf.readlines() + assert len(lines) > 0 + + for i, line in enumerate(lines): + if start <= i < start + size: + line_nr = f" {i - start} " + assert line.startswith(line_nr) + + +@pytest.mark.parametrize( + "size, header_exp, separator_exp, first_line_exp, last_line_exp", + [ + ( + 4, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 3 3 3 non-null float64", + ), + ( + 11, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 10 10 3 non-null float64", + ), + ( + 101, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 100 100 3 non-null float64", + ), + ( + 1001, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 1000 1000 3 non-null float64", + ), + ( + 10001, + " # Column Non-Null Count Dtype ", + "--- ------ -------------- ----- ", + " 0 0 3 non-null float64", + " 10000 10000 3 non-null float64", + ), + ], +) +def test_info_verbose_with_counts_spacing( + size, header_exp, separator_exp, first_line_exp, last_line_exp +): + """Test header column, spacer, first line and last line in verbose mode.""" + frame = DataFrame(np.random.default_rng(2).standard_normal((3, size))) + with StringIO() as buf: + frame.info(verbose=True, show_counts=True, buf=buf) + all_lines = buf.getvalue().splitlines() + # Here table would contain only header, separator and table lines + # dframe repr, index summary, memory usage and dtypes are excluded + table = all_lines[3:-2] + header, separator, first_line, *rest, last_line = table + assert header == header_exp + assert separator == separator_exp + assert first_line == first_line_exp + assert last_line == last_line_exp + + +def test_info_memory(): + # https://github.com/pandas-dev/pandas/issues/21056 + df = DataFrame({"a": Series([1, 2], dtype="i8")}) + buf = StringIO() + df.info(buf=buf) + result = buf.getvalue() + bytes = float(df.memory_usage().sum()) + expected = textwrap.dedent( + f"""\ + + RangeIndex: 2 entries, 0 to 1 + Data columns (total 1 columns): + # Column Non-Null Count Dtype + --- ------ 
-------------- ----- + 0 a 2 non-null int64 + dtypes: int64(1) + memory usage: {bytes} bytes + """ + ) + assert result == expected + + +def test_info_wide(): + io = StringIO() + df = DataFrame(np.random.default_rng(2).standard_normal((5, 101))) + df.info(buf=io) + + io = StringIO() + df.info(buf=io, max_cols=101) + result = io.getvalue() + assert len(result.splitlines()) > 100 + + expected = result + with option_context("display.max_info_columns", 101): + io = StringIO() + df.info(buf=io) + result = io.getvalue() + assert result == expected + + +def test_info_duplicate_columns_shows_correct_dtypes(): + # GH11761 + io = StringIO() + frame = DataFrame([[1, 2.0]], columns=["a", "a"]) + frame.info(buf=io) + lines = io.getvalue().splitlines(True) + assert " 0 a 1 non-null int64 \n" == lines[5] + assert " 1 a 1 non-null float64\n" == lines[6] + + +def test_info_shows_column_dtypes(): + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + data = {} + n = 10 + for i, dtype in enumerate(dtypes): + data[i] = np.random.default_rng(2).integers(2, size=n).astype(dtype) + df = DataFrame(data) + buf = StringIO() + df.info(buf=buf) + res = buf.getvalue() + header = ( + " # Column Non-Null Count Dtype \n" + "--- ------ -------------- ----- " + ) + assert header in res + for i, dtype in enumerate(dtypes): + name = f" {i:d} {i:d} {n:d} non-null {dtype}" + assert name in res + + +def test_info_max_cols(): + df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) + for len_, verbose in [(5, None), (5, False), (12, True)]: + # For verbose always ^ setting ^ summarize ^ full output + with option_context("max_info_columns", 4): + buf = StringIO() + df.info(buf=buf, verbose=verbose) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + for len_, verbose in [(12, None), (5, False), (12, True)]: + # max_cols not exceeded + with option_context("max_info_columns", 5): + buf = StringIO() + df.info(buf=buf, verbose=verbose) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + for len_, max_cols in [(12, 5), (5, 4)]: + # setting truncates + with option_context("max_info_columns", 4): + buf = StringIO() + df.info(buf=buf, max_cols=max_cols) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + # setting wouldn't truncate + with option_context("max_info_columns", 5): + buf = StringIO() + df.info(buf=buf, max_cols=max_cols) + res = buf.getvalue() + assert len(res.strip().split("\n")) == len_ + + +def test_info_memory_usage(): + # Ensure memory usage is displayed, when asserted, on the last line + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] + data = {} + n = 10 + for i, dtype in enumerate(dtypes): + data[i] = np.random.default_rng(2).integers(2, size=n).astype(dtype) + df = DataFrame(data) + buf = StringIO() + + # display memory usage case + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + assert "memory usage: " in res[-1] + + # do not display memory usage case + df.info(buf=buf, memory_usage=False) + res = buf.getvalue().splitlines() + assert "memory usage: " not in res[-1] + + df.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + + # memory usage is a lower bound, so print it as XYZ+ MB + assert re.match(r"memory usage: [^+]+\+", res[-1]) + + df.iloc[:, :5].info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + + # excluded column with object dtype, so 
estimate is accurate + assert not re.match(r"memory usage: [^+]+\+", res[-1]) + + # Test a DataFrame with duplicate columns + dtypes = ["int64", "int64", "int64", "float64"] + data = {} + n = 100 + for i, dtype in enumerate(dtypes): + data[i] = np.random.default_rng(2).integers(2, size=n).astype(dtype) + df = DataFrame(data) + df.columns = dtypes + + df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + df_with_object_index.info(buf=buf, memory_usage=True) + res = buf.getvalue().splitlines() + assert re.match(r"memory usage: [^+]+\+", res[-1]) + + df_with_object_index.info(buf=buf, memory_usage="deep") + res = buf.getvalue().splitlines() + assert re.match(r"memory usage: [^+]+$", res[-1]) + + # Ensure df size is as expected + # (cols * rows * bytes) + index size + df_size = df.memory_usage().sum() + exp_size = len(dtypes) * n * 8 + df.index.nbytes + assert df_size == exp_size + + # Ensure number of cols in memory_usage is the same as df + size_df = np.size(df.columns.values) + 1 # index=True; default + assert size_df == np.size(df.memory_usage()) + + # assert deep works only on object + assert df.memory_usage().sum() == df.memory_usage(deep=True).sum() + + # test for validity + DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) + DataFrame(1, index=["a"], columns=["A"]).index.nbytes + df = DataFrame( + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] + ) + df.index.nbytes + df.memory_usage(index=True) + df.index.values.nbytes + + mem = df.memory_usage(deep=True).sum() + assert mem > 0 + + +@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") +def test_info_memory_usage_deep_not_pypy(): + df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + > df_with_object_index.memory_usage(index=True).sum() + ) + + df_object = DataFrame({"a": ["a"]}) + assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() + + +@pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result") +def test_info_memory_usage_deep_pypy(): + df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + == df_with_object_index.memory_usage(index=True).sum() + ) + + df_object = DataFrame({"a": ["a"]}) + assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() + + +@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") +def test_usage_via_getsizeof(): + df = DataFrame( + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] + ) + mem = df.memory_usage(deep=True).sum() + # sys.getsizeof will call the .memory_usage with + # deep=True, and add on some GC overhead + diff = mem - sys.getsizeof(df) + assert abs(diff) < 100 + + +def test_info_memory_usage_qualified(): + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) + df.info(buf=buf) + assert "+" not in buf.getvalue() + + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=list("ABC")) + df.info(buf=buf) + assert "+" in buf.getvalue() + + buf = StringIO() + df = DataFrame( + 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) + ) + df.info(buf=buf) + assert "+" not in buf.getvalue() + + buf = StringIO() + df = DataFrame( + 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]]) + ) + df.info(buf=buf) + assert "+" in buf.getvalue() + + +def 
test_info_memory_usage_bug_on_multiindex(): + # GH 14308 + # memory usage introspection should not materialize .values + + def memory_usage(f): + return f.memory_usage(deep=True).sum() + + N = 100 + M = len(ascii_uppercase) + index = MultiIndex.from_product( + [list(ascii_uppercase), date_range("20160101", periods=N)], + names=["id", "date"], + ) + df = DataFrame( + {"value": np.random.default_rng(2).standard_normal(N * M)}, index=index + ) + + unstacked = df.unstack("id") + assert df.values.nbytes == unstacked.values.nbytes + assert memory_usage(df) > memory_usage(unstacked) + + # high upper bound + assert memory_usage(unstacked) - memory_usage(df) < 2000 + + +def test_info_categorical(): + # GH14298 + idx = CategoricalIndex(["a", "b"]) + df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx) + + buf = StringIO() + df.info(buf=buf) + + +@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") +def test_info_int_columns(): + # GH#37245 + df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"]) + buf = StringIO() + df.info(show_counts=True, buf=buf) + result = buf.getvalue() + expected = textwrap.dedent( + """\ + + Index: 2 entries, A to B + Data columns (total 2 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 1 2 non-null int64 + 1 2 2 non-null int64 + dtypes: int64(2) + memory usage: 48.0+ bytes + """ + ) + assert result == expected + + +def test_memory_usage_empty_no_warning(): + # GH#50066 + df = DataFrame(index=["a", "b"]) + with tm.assert_produces_warning(None): + result = df.memory_usage() + expected = Series(16 if IS64 else 8, index=["Index"]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.single_cpu +def test_info_compute_numba(): + # GH#51922 + pytest.importorskip("numba") + df = DataFrame([[1, 2], [3, 4]]) + + with option_context("compute.use_numba", True): + buf = StringIO() + df.info(buf=buf) + result = buf.getvalue() + + buf = StringIO() + df.info(buf=buf) + expected = buf.getvalue() + assert result == expected + + +@pytest.mark.parametrize( + "row, columns, show_counts, result", + [ + [20, 20, None, True], + [20, 20, True, True], + [20, 20, False, False], + [5, 5, None, False], + [5, 5, True, False], + [5, 5, False, False], + ], +) +def test_info_show_counts(row, columns, show_counts, result): + # Explicit cast to float to avoid implicit cast when setting nan + df = DataFrame(1, columns=range(10), index=range(10)).astype({1: "float"}) + df.iloc[1, 1] = np.nan + + with option_context( + "display.max_info_rows", row, "display.max_info_columns", columns + ): + with StringIO() as buf: + df.info(buf=buf, show_counts=show_counts) + assert ("non-null" in buf.getvalue()) is result diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_interpolate.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_interpolate.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_interpolate.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_interpolate.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.errors import ChainedAssignmentError import pandas.util._test_decorators as td @@ -54,7 +56,7 @@ # GH#44749 if using_array_manager and frame_or_series is DataFrame: mark = pytest.mark.xfail(reason=".values-based in-place check is invalid") - request.node.add_marker(mark) + request.applymarker(mark) obj = frame_or_series([1, np.nan, 2]) orig = obj.values @@ -67,6 +69,9 @@ assert 
np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + ) def test_interp_basic(self, using_copy_on_write): df = DataFrame( { @@ -108,7 +113,10 @@ assert np.shares_memory(df["C"]._values, cvalues) assert np.shares_memory(df["D"]._values, dvalues) - def test_interp_basic_with_non_range_index(self): + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + ) + def test_interp_basic_with_non_range_index(self, using_infer_string): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -119,7 +127,8 @@ ) msg = "DataFrame.interpolate with object dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): + warning = FutureWarning if not using_infer_string else None + with tm.assert_produces_warning(warning, match=msg): result = df.set_index("C").interpolate() expected = df.set_index("C") expected.loc[3, "A"] = 3 @@ -322,7 +331,7 @@ # TODO: assert something? @pytest.mark.parametrize( - "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))] ) def test_interp_leading_nans(self, check_scipy): df = DataFrame( @@ -378,7 +387,8 @@ assert return_value is None tm.assert_frame_equal(result, expected_cow) else: - return_value = result["a"].interpolate(inplace=True) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + return_value = result["a"].interpolate(inplace=True) assert return_value is None tm.assert_frame_equal(result, expected) @@ -498,8 +508,41 @@ assert result is None tm.assert_frame_equal(df, expected) - def test_interpolate_ea_raise(self): + def test_interpolate_ea(self, any_int_ea_dtype): # GH#55347 - df = DataFrame({"a": [1, None, 2]}, dtype="Int64") - with pytest.raises(NotImplementedError, match="does not implement"): - df.interpolate() + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=any_int_ea_dtype) + orig = df.copy() + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="Float64") + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "dtype", + [ + "Float64", + "Float32", + pytest.param("float32[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_interpolate_ea_float(self, dtype): + # GH#55347 + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype) + orig = df.copy() + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype=dtype) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "dtype", + ["int64", "uint64", "int32", "int16", "int8", "uint32", "uint16", "uint8"], + ) + def test_interpolate_arrow(self, dtype): + # GH#55347 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, None, None, None, 3]}, dtype=dtype + "[pyarrow]") + result = df.interpolate(limit=2) + expected = DataFrame({"a": [1, 1.5, 2.0, None, 3]}, dtype="float64[pyarrow]") + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_is_homogeneous_dtype.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_is_homogeneous_dtype.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_is_homogeneous_dtype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_is_homogeneous_dtype.py 2024-04-10 
17:42:52.000000000 +0000 @@ -25,7 +25,8 @@ { "A": np.array([1, 2], dtype=object), "B": np.array(["a", "b"], dtype=object), - } + }, + dtype="object", ), True, ), diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_join.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_join.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_join.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_join.py 2024-04-10 17:42:52.000000000 +0000 @@ -22,7 +22,7 @@ return DataFrame( data=np.arange(20).reshape(4, 5), columns=list("abcde"), - index=period_range(start="2000", freq="A", periods=4), + index=period_range(start="2000", freq="Y", periods=4), ) @@ -558,13 +558,13 @@ test1 = DataFrame( np.zeros((6, 3)), index=date_range( - "2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central" + "2012-11-15 00:00:00", periods=6, freq="100ms", tz="US/Central" ), ) test2 = DataFrame( np.zeros((3, 3)), index=date_range( - "2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central" + "2012-11-15 00:00:00", periods=3, freq="250ms", tz="US/Central" ), columns=range(3, 6), ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_map.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_map.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_map.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_map.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,6 +12,8 @@ ) import pandas._testing as tm +from pandas.tseries.offsets import BDay + def test_map(float_frame): result = float_frame.map(lambda x: x * 2) @@ -158,8 +160,6 @@ def test_frame_map_dont_convert_datetime64(): - from pandas.tseries.offsets import BDay - df = DataFrame({"x1": [datetime(1996, 1, 1)]}) df = df.map(lambda x: x + BDay()) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_nlargest.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_nlargest.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_nlargest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_nlargest.py 2024-04-10 17:42:52.000000000 +0000 @@ -86,7 +86,7 @@ df = df_strings if "b" in order: error_msg = ( - f"Column 'b' has dtype object, " + f"Column 'b' has dtype (object|string), " f"cannot use method '{nselect_method}' with this dtype" ) with pytest.raises(TypeError, match=error_msg): @@ -169,7 +169,7 @@ if Version(np.__version__) >= Version("1.25") and ( (order == ["a"] and n in (1, 2, 3, 4)) or (order == ["a", "b"]) and n == 5 ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=( "pandas default unstable sorting of duplicates" diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_pop.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_pop.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_pop.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_pop.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,7 +9,7 @@ class TestDataFramePop: - def test_pop(self, float_frame): + def test_pop(self, float_frame, warn_copy_on_write): float_frame.columns.name = "baz" float_frame.pop("A") @@ -23,7 +23,8 @@ # gh-10912: inplace ops cause caching issue a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"]) b = a.pop("B") - b += 1 + with tm.assert_cow_warning(warn_copy_on_write): + b += 1 # original frame expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_quantile.py 
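The test_nlargest hunk above and the test_quantile hunks below switch from the private request.node.add_marker to pytest's public FixtureRequest.applymarker. A hypothetical usage sketch (test name, condition and reason are made up):

import pytest

def test_sketch(request):
    # Attach an xfail marker to the currently running test through the
    # public API rather than via request.node.add_marker(...).
    request.applymarker(pytest.mark.xfail(reason="illustrative known issue"))
    assert False  # reported as xfail instead of a failure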
pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_quantile.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_quantile.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_quantile.py 2024-04-10 17:42:52.000000000 +0000 @@ -63,7 +63,7 @@ tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result.index, expected.index) - request.node.add_marker( + request.applymarker( pytest.mark.xfail( using_array_manager, reason="Name set incorrectly for arraymanager" ) @@ -83,7 +83,7 @@ tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result.index, expected.index) - request.node.add_marker( + request.applymarker( pytest.mark.xfail( using_array_manager, reason="Name set incorrectly for arraymanager" ) @@ -107,9 +107,7 @@ if interpolation == "nearest": xp = (xp + 0.5).astype(np.int64) if method == "table" and using_array_manager: - request.node.add_marker( - pytest.mark.xfail(reason="Axis name incorrectly set.") - ) + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(rs, xp) def test_axis(self, interp_method, request, using_array_manager): @@ -121,9 +119,7 @@ if interpolation == "nearest": expected = expected.astype(np.int64) if method == "table" and using_array_manager: - request.node.add_marker( - pytest.mark.xfail(reason="Axis name incorrectly set.") - ) + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(result, expected) result = df.quantile( @@ -151,9 +147,7 @@ if interpolation == "nearest": expected = expected.astype(np.int64) if method == "table" and using_array_manager: - request.node.add_marker( - pytest.mark.xfail(reason="Axis name incorrectly set.") - ) + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(result, expected) def test_quantile_date_range(self, interp_method, request, using_array_manager): @@ -170,9 +164,7 @@ ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]" ) if method == "table" and using_array_manager: - request.node.add_marker( - pytest.mark.xfail(reason="Axis name incorrectly set.") - ) + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(result, expected) @@ -194,9 +186,7 @@ if interpolation == "nearest": expected -= 0.5 if method == "table" and using_array_manager: - request.node.add_marker( - pytest.mark.xfail(reason="Axis name incorrectly set.") - ) + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_series_equal(result, expected) # must raise @@ -208,9 +198,7 @@ # GH 9543/9544 interpolation, method = interp_method if method == "table" and using_array_manager: - request.node.add_marker( - pytest.mark.xfail(reason="Axis name incorrectly set.") - ) + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) result = df.quantile(0.5, axis=0, interpolation=interpolation, method=method) @@ -336,9 +324,7 @@ if interpolation == "nearest": expected = expected.astype(np.int64) if method == "table" and using_array_manager: - request.node.add_marker( - pytest.mark.xfail(reason="Axis name incorrectly set.") - ) + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_frame_equal(result, expected) def test_quantile_multi_axis_1(self, interp_method, request, using_array_manager): @@ -353,9 +339,7 @@ if interpolation == "nearest": expected = 
expected.astype(np.int64) if method == "table" and using_array_manager: - request.node.add_marker( - pytest.mark.xfail(reason="Axis name incorrectly set.") - ) + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) tm.assert_frame_equal(result, expected) def test_quantile_multi_empty(self, interp_method): @@ -368,8 +352,9 @@ ) tm.assert_frame_equal(result, expected) - def test_quantile_datetime(self): - df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]}) + def test_quantile_datetime(self, unit): + dti = pd.to_datetime(["2010", "2011"]).as_unit(unit) + df = DataFrame({"a": dti, "b": [0, 5]}) # exclude datetime result = df.quantile(0.5, numeric_only=True) @@ -386,17 +371,19 @@ # datetime w/ multi result = df.quantile([0.5], numeric_only=False) expected = DataFrame( - [[Timestamp("2010-07-02 12:00:00"), 2.5]], index=[0.5], columns=["a", "b"] + {"a": Timestamp("2010-07-02 12:00:00").as_unit(unit), "b": 2.5}, + index=[0.5], ) tm.assert_frame_equal(result, expected) # axis = 1 - df["c"] = pd.to_datetime(["2011", "2012"]) + df["c"] = pd.to_datetime(["2011", "2012"]).as_unit(unit) result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False) expected = Series( [Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")], index=[0, 1], name=0.5, + dtype=f"M8[{unit}]", ) tm.assert_series_equal(result, expected) @@ -405,6 +392,7 @@ [[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]], index=[0.5], columns=[0, 1], + dtype=f"M8[{unit}]", ) tm.assert_frame_equal(result, expected) @@ -458,9 +446,7 @@ def test_quantile_box(self, interp_method, request, using_array_manager): interpolation, method = interp_method if method == "table" and using_array_manager: - request.node.add_marker( - pytest.mark.xfail(reason="Axis name incorrectly set.") - ) + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) df = DataFrame( { "A": [ @@ -591,9 +577,7 @@ def test_quantile_nan(self, interp_method, request, using_array_manager): interpolation, method = interp_method if method == "table" and using_array_manager: - request.node.add_marker( - pytest.mark.xfail(reason="Axis name incorrectly set.") - ) + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) # GH 14357 - float block where some cols have missing values df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)}) df.iloc[-1, 1] = np.nan @@ -637,25 +621,23 @@ exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75]) tm.assert_frame_equal(res, exp) - def test_quantile_nat(self, interp_method, request, using_array_manager): + def test_quantile_nat(self, interp_method, request, using_array_manager, unit): interpolation, method = interp_method if method == "table" and using_array_manager: - request.node.add_marker( - pytest.mark.xfail(reason="Axis name incorrectly set.") - ) + request.applymarker(pytest.mark.xfail(reason="Axis name incorrectly set.")) # full NaT column - df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}) + df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}, dtype=f"M8[{unit}]") res = df.quantile( 0.5, numeric_only=False, interpolation=interpolation, method=method ) - exp = Series([pd.NaT], index=["a"], name=0.5) + exp = Series([pd.NaT], index=["a"], name=0.5, dtype=f"M8[{unit}]") tm.assert_series_equal(res, exp) res = df.quantile( [0.5], numeric_only=False, interpolation=interpolation, method=method ) - exp = DataFrame({"a": [pd.NaT]}, index=[0.5]) + exp = DataFrame({"a": [pd.NaT]}, index=[0.5], dtype=f"M8[{unit}]") 
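The quantile hunks in this region parametrize their datetime data over non-nanosecond resolutions through a unit fixture. A short sketch of the underlying pandas APIs, assuming arbitrary resolution strings:

import pandas as pd

# Non-nanosecond datetime64 resolutions exercised by the parametrized tests
dti = pd.to_datetime(["2010", "2011"]).as_unit("s")           # datetime64[s]
ts = pd.Timestamp("2010-07-02 12:00:00").as_unit("ms")        # datetime64[ms]
rng = pd.date_range("2015-01-01", periods=3, tz="UTC", unit="us")

assert str(dti.dtype) == "datetime64[s]"
assert ts.unit == "ms"
assert str(rng.dtype) == "datetime64[us, UTC]"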
tm.assert_frame_equal(res, exp) # mixed non-null / full null column @@ -667,20 +649,29 @@ Timestamp("2012-01-03"), ], "b": [pd.NaT, pd.NaT, pd.NaT], - } + }, + dtype=f"M8[{unit}]", ) res = df.quantile( 0.5, numeric_only=False, interpolation=interpolation, method=method ) - exp = Series([Timestamp("2012-01-02"), pd.NaT], index=["a", "b"], name=0.5) + exp = Series( + [Timestamp("2012-01-02"), pd.NaT], + index=["a", "b"], + name=0.5, + dtype=f"M8[{unit}]", + ) tm.assert_series_equal(res, exp) res = df.quantile( [0.5], numeric_only=False, interpolation=interpolation, method=method ) exp = DataFrame( - [[Timestamp("2012-01-02"), pd.NaT]], index=[0.5], columns=["a", "b"] + [[Timestamp("2012-01-02"), pd.NaT]], + index=[0.5], + columns=["a", "b"], + dtype=f"M8[{unit}]", ) tm.assert_frame_equal(res, exp) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_rank.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_rank.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_rank.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_rank.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,6 +13,7 @@ from pandas import ( DataFrame, + Index, Series, ) import pandas._testing as tm @@ -469,21 +470,28 @@ ("top", False, [2.0, 3.0, 1.0, 4.0]), ], ) - def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): + def test_rank_object_first( + self, frame_or_series, na_option, ascending, expected, using_infer_string + ): obj = frame_or_series(["foo", "foo", None, "foo"]) result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) + if using_infer_string and isinstance(obj, Series): + expected = expected.astype("uint64") tm.assert_equal(result, expected) @pytest.mark.parametrize( "data,expected", [ - ({"a": [1, 2, "a"], "b": [4, 5, 6]}, DataFrame({"b": [1.0, 2.0, 3.0]})), + ( + {"a": [1, 2, "a"], "b": [4, 5, 6]}, + DataFrame({"b": [1.0, 2.0, 3.0]}, columns=Index(["b"], dtype=object)), + ), ({"a": [1, 2, "a"]}, DataFrame(index=range(3), columns=[])), ], ) def test_rank_mixed_axis_zero(self, data, expected): - df = DataFrame(data) + df = DataFrame(data, columns=Index(list(data.keys()), dtype=object)) with pytest.raises(TypeError, match="'<' not supported between instances of"): df.rank() result = df.rank(numeric_only=True) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_reindex.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_reindex.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_reindex.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_reindex.py 2024-04-10 17:42:52.000000000 +0000 @@ -27,7 +27,7 @@ isna, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype class TestReindexSetIndex: @@ -36,8 +36,8 @@ def test_dti_set_index_reindex_datetimeindex(self): # GH#6631 df = DataFrame(np.random.default_rng(2).random(6)) - idx1 = date_range("2011/01/01", periods=6, freq="M", tz="US/Eastern") - idx2 = date_range("2013", periods=6, freq="A", tz="Asia/Tokyo") + idx1 = date_range("2011/01/01", periods=6, freq="ME", tz="US/Eastern") + idx2 = date_range("2013", periods=6, freq="YE", tz="Asia/Tokyo") df = df.set_index(idx1) tm.assert_index_equal(df.index, idx1) @@ -47,7 +47,7 @@ def test_dti_set_index_reindex_freq_with_tz(self): # GH#11314 with tz index = date_range( - datetime(2015, 10, 1), datetime(2015, 10, 1, 23), freq="H", tz="US/Eastern" + datetime(2015, 10, 1), 
datetime(2015, 10, 1, 23), freq="h", tz="US/Eastern" ) df = DataFrame( np.random.default_rng(2).standard_normal((24, 1)), @@ -55,7 +55,7 @@ index=index, ) new_index = date_range( - datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="H", tz="US/Eastern" + datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="h", tz="US/Eastern" ) result = df.set_index(new_index) @@ -120,7 +120,7 @@ exp = DataFrame({"index": ["1", "2"], "vals": [np.nan, np.nan]}).set_index( "index" ) - exp = exp.astype(object) + exp = exp.astype(df.vals.dtype) tm.assert_frame_equal( df, exp, @@ -390,9 +390,9 @@ # GH#38566 obj = frame_or_series( [0, 1, 2, 3], - index=date_range("2020-01-01 00:00:00", periods=4, freq="H", tz="UTC"), + index=date_range("2020-01-01 00:00:00", periods=4, freq="h", tz="UTC"), ) - new_index = date_range("2020-01-01 00:01:00", periods=4, freq="H", tz="UTC") + new_index = date_range("2020-01-01 00:01:00", periods=4, freq="h", tz="UTC") result = obj.reindex(new_index, method=method, tolerance=pd.Timedelta("1 hour")) expected = frame_or_series(exp_values, index=new_index) tm.assert_equal(result, expected) @@ -609,7 +609,9 @@ tm.assert_frame_equal(result, expected) def test_reindex(self, float_frame, using_copy_on_write): - datetime_series = tm.makeTimeSeries(nper=30) + datetime_series = Series( + np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + ) newFrame = float_frame.reindex(datetime_series.index) @@ -624,7 +626,7 @@ assert np.isnan(val) for col, series in newFrame.items(): - assert tm.equalContents(series.index, newFrame.index) + tm.assert_index_equal(series.index, newFrame.index) emptyFrame = float_frame.reindex(Index([])) assert len(emptyFrame.index) == 0 @@ -642,7 +644,7 @@ assert np.isnan(val) for col, series in nonContigFrame.items(): - assert tm.equalContents(series.index, nonContigFrame.index) + tm.assert_index_equal(series.index, nonContigFrame.index) # corner cases @@ -838,8 +840,8 @@ # other dtypes df["foo"] = "foo" - result = df.reindex(range(15), fill_value=0) - expected = df.reindex(range(15)).fillna(0) + result = df.reindex(range(15), fill_value="0") + expected = df.reindex(range(15)).fillna("0") tm.assert_frame_equal(result, expected) def test_reindex_uint_dtypes_fill_value(self, any_unsigned_int_numpy_dtype): @@ -1068,7 +1070,7 @@ midx = MultiIndex.from_product( [ Categorical(["a", "b", "c"]), - Categorical(date_range("2012-01-01", periods=3, freq="H")), + Categorical(date_range("2012-01-01", periods=3, freq="h")), ] ) df = DataFrame({"a": range(len(midx))}, index=midx) @@ -1083,7 +1085,9 @@ { "A": np.arange(3, dtype="int64"), }, - index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"), + index=CategoricalIndex( + list("abc"), dtype=CategoricalDtype(list("cabe")), name="B" + ), ) # reindexing @@ -1112,13 +1116,13 @@ result = df.reindex(Categorical(["a", "e"], categories=cats)) expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} + {"A": [0, np.nan], "B": Series(list("ae")).astype(CategoricalDtype(cats))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(Categorical(["a"], categories=cats)) expected = DataFrame( - {"A": [0], "B": Series(list("a")).astype(CDT(cats))} + {"A": [0], "B": Series(list("a")).astype(CategoricalDtype(cats))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) @@ -1139,13 +1143,19 @@ # give back the type of categorical that we received result = df.reindex(Categorical(["a", "e"], categories=cats, 
ordered=True)) expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} + { + "A": [0, np.nan], + "B": Series(list("ae")).astype(CategoricalDtype(cats, ordered=True)), + } ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} + { + "A": [0, np.nan], + "B": Series(list("ad")).astype(CategoricalDtype(["a", "d"])), + } ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) @@ -1153,7 +1163,9 @@ { "A": np.arange(6, dtype="int64"), }, - index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B" + ), ) # passed duplicate indexers are not allowed msg = "cannot reindex on an axis with duplicate labels" diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_rename.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_rename.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_rename.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_rename.py 2024-04-10 17:42:52.000000000 +0000 @@ -50,13 +50,12 @@ # index data = {"A": {"foo": 0, "bar": 1}} - # gets sorted alphabetical df = DataFrame(data) renamed = df.rename(index={"foo": "bar", "bar": "foo"}) - tm.assert_index_equal(renamed.index, Index(["foo", "bar"])) + tm.assert_index_equal(renamed.index, Index(["bar", "foo"])) renamed = df.rename(index=str.upper) - tm.assert_index_equal(renamed.index, Index(["BAR", "FOO"])) + tm.assert_index_equal(renamed.index, Index(["FOO", "BAR"])) # have to pass something with pytest.raises(TypeError, match="must pass an index to rename"): @@ -165,12 +164,13 @@ renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) tm.assert_index_equal(renamed.index, new_index) - def test_rename_nocopy(self, float_frame, using_copy_on_write): + def test_rename_nocopy(self, float_frame, using_copy_on_write, warn_copy_on_write): renamed = float_frame.rename(columns={"C": "foo"}, copy=False) assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values) - renamed.loc[:, "foo"] = 1.0 + with tm.assert_cow_warning(warn_copy_on_write): + renamed.loc[:, "foo"] = 1.0 if using_copy_on_write: assert not (float_frame["C"] == 1.0).all() else: @@ -388,8 +388,6 @@ # TODO: can we construct this without merge? 
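The test_rename_nocopy hunk above wraps the in-place assignment in tm.assert_cow_warning because of the Copy-on-Write transition. A minimal sketch of the user-visible behavior those branches distinguish, assuming pandas >= 2.0 with the opt-in option enabled:

import pandas as pd

pd.set_option("mode.copy_on_write", True)
df = pd.DataFrame({"C": [1.0, 2.0]})
renamed = df.rename(columns={"C": "foo"}, copy=False)
renamed.loc[:, "foo"] = 9.0       # under CoW this copies instead of writing through
print(df["C"].tolist())           # [1.0, 2.0] -- the original frame is untouched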
k = merge(df4, df5, how="inner", left_index=True, right_index=True) result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"}) - str(result) - result.dtypes expected = DataFrame( [[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]], diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_replace.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_replace.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_replace.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_replace.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -28,6 +30,9 @@ class TestDataFrameReplace: + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -278,20 +283,36 @@ tm.assert_frame_equal(res3, expec) tm.assert_frame_equal(res4, expec) - def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): + def test_regex_replace_dict_nested_non_first_character( + self, any_string_dtype, using_infer_string + ): # GH 25259 dtype = any_string_dtype df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) - result = df.replace({"a": "."}, regex=True) + if using_infer_string and any_string_dtype == "object": + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) + + else: + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) - result = df.replace({"Type": {"Q": 0, "T": 1}}) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( @@ -301,21 +322,28 @@ "c": [np.nan, np.nan, np.nan, "d"], } ) - res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) res2 = df.copy() res3 = df.copy() - return_value = res2.replace( - [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = res2.replace( + [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True + ) assert return_value is None - return_value = res3.replace( - regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = res3.replace( + regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True + ) assert return_value is None 
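Several replace tests in this file gain an xfail keyed on using_pyarrow_string_dtype() or branch on the using_infer_string fixture, reflecting that the suite is also run with the opt-in pyarrow-backed string dtype. A rough illustration of that mode, assuming pandas >= 2.1 with pyarrow installed:

import pandas as pd

pd.set_option("future.infer_string", True)    # opt-in string inference
ser = pd.Series(["abc", "bca", "cab"])
print(ser.dtype)                              # pyarrow-backed string dtype, not object
print(ser.str.replace("a", "."))              # string ops keep the string dtype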
tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? df = DataFrame(mix_abc) @@ -331,6 +359,9 @@ tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -409,12 +440,31 @@ ], ) def test_regex_replace_string_types( - self, data, to_replace, expected, frame_or_series, any_string_dtype + self, + data, + to_replace, + expected, + frame_or_series, + any_string_dtype, + using_infer_string, + request, ): # GH-41333, GH-35977 dtype = any_string_dtype obj = frame_or_series(data, dtype=dtype) - result = obj.replace(to_replace, regex=True) + if using_infer_string and any_string_dtype == "object": + if len(to_replace) > 1 and isinstance(obj, DataFrame): + request.node.add_marker( + pytest.mark.xfail( + reason="object input array that gets downcasted raises on " + "second pass" + ) + ) + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = obj.replace(to_replace, regex=True) + dtype = "string[pyarrow_numpy]" + else: + result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) tm.assert_equal(result, expected) @@ -516,15 +566,23 @@ result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) m = {"foo": 1, "bar": 2, "bah": 3} - rep = df.replace(m) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + rep = df.replace(m) expec = Series([np.int64] * 3) res = rep.dtypes tm.assert_series_equal(expec, res) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan @@ -580,7 +638,7 @@ result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) - def test_replace_mixed2(self): + def test_replace_mixed2(self, using_infer_string): # to object block upcasting df = DataFrame( { @@ -599,11 +657,15 @@ expected = DataFrame( { - "A": Series(["foo", "bar"], dtype="object"), + "A": Series(["foo", "bar"]), "B": Series([0, "foo"], dtype="object"), } ) - result = df.replace([1, 2], ["foo", "bar"]) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.replace([1, 2], ["foo", "bar"]) + else: + result = df.replace([1, 2], ["foo", "bar"]) tm.assert_frame_equal(result, expected) def test_replace_mixed3(self): @@ -721,11 +783,7 @@ tsframe.loc[tsframe.index[:5], "A"] = np.nan tsframe.loc[tsframe.index[-5:], "A"] = np.nan - tsframe.loc[tsframe.index[:5], "B"] = -1e8 - - b = tsframe["B"] - b[b == -1e8] = np.nan - tsframe["B"] = b + tsframe.loc[tsframe.index[:5], "B"] = np.nan msg = "DataFrame.fillna with 'method' is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): # TODO: what is this even testing? 
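Most replace hunks in this stretch wrap the call in tm.assert_produces_warning(FutureWarning, match="Downcasting behavior in `replace`") because pandas 2.2 deprecates the silent dtype downcasting that replace used to perform. A minimal sketch of the deprecation being asserted, assuming pandas 2.2:

import warnings

import pandas as pd

df = pd.DataFrame({"Type": ["Q", "T", "Q"], "tmp": 2})
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    out = df.replace({"Type": {"Q": 0, "T": 1}})   # object column downcasts to int64
print(out["Type"].dtype)                           # int64
print(any("Downcasting behavior" in str(w.message) for w in caught))  # True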
@@ -801,11 +859,13 @@ Timestamp("20130104", tz="US/Eastern"), DataFrame( { - "A": [ - Timestamp("20130101", tz="US/Eastern"), - Timestamp("20130104", tz="US/Eastern"), - Timestamp("20130103", tz="US/Eastern"), - ], + "A": pd.DatetimeIndex( + [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + Timestamp("20130103", tz="US/Eastern"), + ] + ).as_unit("ns"), "B": [0, np.nan, 2], } ), @@ -838,7 +898,12 @@ ], ) def test_replace_dtypes(self, frame, to_replace, value, expected): - result = frame.replace(to_replace, value) + warn = None + if isinstance(to_replace, datetime) and to_replace.year == 2920: + warn = FutureWarning + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(warn, match=msg): + result = frame.replace(to_replace, value) tm.assert_frame_equal(result, expected) def test_replace_input_formats_listlike(self): @@ -881,6 +946,9 @@ with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -909,6 +977,9 @@ # TODO pass + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_dict_no_regex(self): answer = Series( { @@ -927,9 +998,14 @@ "Strongly Disagree": 1, } expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) - result = answer.replace(weights) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = answer.replace(weights) tm.assert_series_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_series_no_regex(self): answer = Series( { @@ -950,7 +1026,9 @@ } ) expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) - result = answer.replace(weights) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = answer.replace(weights) tm.assert_series_equal(result, expected) def test_replace_dict_tuple_list_ordering_remains_the_same(self): @@ -1034,7 +1112,10 @@ expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - def test_replace_swapping_bug(self): + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def test_replace_swapping_bug(self, using_infer_string): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) expect = DataFrame({"a": ["Y", "N", "Y"]}) @@ -1045,6 +1126,9 @@ expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_period(self): d = { "fname": { @@ -1076,9 +1160,14 @@ expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) assert expected.dtypes.iloc[0] == "Period[M]" - result = df.replace(d) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(d) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_datetime(self): d = { "fname": { @@ -1106,7 +1195,9 @@ ) assert set(df.fname.values) == set(d["fname"].keys()) expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) - result = 
df.replace(d) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace(d) tm.assert_frame_equal(result, expected) def test_replace_datetimetz(self): @@ -1153,6 +1244,7 @@ "B": [0, np.nan, 2], } ) + expected["A"] = expected["A"].dt.as_unit("ns") tm.assert_frame_equal(result, expected) result = df.copy() @@ -1174,6 +1266,7 @@ "B": [0, np.nan, 2], } ) + expected["A"] = expected["A"].dt.as_unit("ns") tm.assert_frame_equal(result, expected) result = df.copy() @@ -1258,7 +1351,9 @@ b = pd.Categorical(final_data[:, 1], categories=ex_cat) expected = DataFrame({"a": a, "b": b}) - result = df.replace(replace_dict, 3) + msg2 = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg2): + result = df.replace(replace_dict, 3) tm.assert_frame_equal(result, expected) msg = ( r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " @@ -1267,7 +1362,8 @@ with pytest.raises(AssertionError, match=msg): # ensure non-inplace call does not affect original tm.assert_frame_equal(df, expected) - return_value = df.replace(replace_dict, 3, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg2): + return_value = df.replace(replace_dict, 3, inplace=True) assert return_value is None tm.assert_frame_equal(df, expected) @@ -1297,6 +1393,9 @@ result = df.replace(to_replace) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) @pytest.mark.parametrize( "replacer", [ @@ -1307,10 +1406,12 @@ np.float64(1), ], ) - def test_replace_replacer_dtype(self, request, replacer): + def test_replace_replacer_dtype(self, replacer): # GH26632 df = DataFrame(["a"]) - result = df.replace({"a": replacer, "b": replacer}) + msg = "Downcasting behavior in `replace` " + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.replace({"a": replacer, "b": replacer}) expected = DataFrame([replacer]) tm.assert_frame_equal(result, expected) @@ -1415,9 +1516,14 @@ ) # replace values in input dataframe - input_df = input_df.replace("d", "z") - input_df = input_df.replace("obj1", "obj9") - result = input_df.replace("cat2", "catX") + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + input_df = input_df.replace("d", "z") + input_df = input_df.replace("obj1", "obj9") + result = input_df.replace("cat2", "catX") tm.assert_frame_equal(result, expected) @@ -1443,7 +1549,12 @@ ) # replace values in input dataframe using a dict - result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) + msg = ( + r"The behavior of Series\.replace \(and DataFrame.replace\) " + "with CategoricalDtype" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) @@ -1455,10 +1566,12 @@ expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) - def test_replace_intervals(self): + def test_replace_intervals(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) - result = df.replace({"a": {pd.Interval(0, 1): "x"}}) + warning = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warning, match="Downcasting"): + result = df.replace({"a": {pd.Interval(0, 1): "x"}}) expected = 
DataFrame({"a": ["x", "x"]}) tm.assert_frame_equal(result, expected) @@ -1559,17 +1672,23 @@ expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): # GH-48644 df1 = DataFrame({"A": ["0"], "B": ["0"]}) expected_df1 = DataFrame({"A": [1], "B": [1]}) - result_df1 = df1.replace(to_replace="0", value=1, regex=regex) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result_df1 = df1.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df1, expected_df1) df2 = DataFrame({"A": ["0"], "B": ["1"]}) expected_df2 = DataFrame({"A": [1], "B": ["1"]}) - result_df2 = df2.replace(to_replace="0", value=1, regex=regex) + with tm.assert_produces_warning(FutureWarning, match=msg): + result_df2 = df2.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df2, expected_df2) def test_replace_with_value_also_being_replaced(self): @@ -1593,9 +1712,15 @@ result = df.replace(to_replace=[".", "def"], value=["_", None]) tm.assert_frame_equal(result, expected) - def test_replace_object_splitting(self): + def test_replace_object_splitting(self, using_infer_string): # GH#53977 df = DataFrame({"a": ["a"], "b": "b"}) - assert len(df._mgr.blocks) == 1 + if using_infer_string: + assert len(df._mgr.blocks) == 2 + else: + assert len(df._mgr.blocks) == 1 df.replace(to_replace=r"^\s*$", value="", inplace=True, regex=True) - assert len(df._mgr.blocks) == 1 + if using_infer_string: + assert len(df._mgr.blocks) == 2 + else: + assert len(df._mgr.blocks) == 1 diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_reset_index.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_reset_index.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_reset_index.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_reset_index.py 2024-04-10 17:42:52.000000000 +0000 @@ -76,20 +76,14 @@ expected = DataFrame( { - "idx": [ - datetime(2011, 1, 1), - datetime(2011, 1, 2), - datetime(2011, 1, 3), - datetime(2011, 1, 4), - datetime(2011, 1, 5), - ], + "idx": idx, "a": range(5), "b": ["A", "B", "C", "D", "E"], }, columns=["idx", "a", "b"], ) - expected["idx"] = expected["idx"].apply(lambda d: Timestamp(d, tz=tz)) - tm.assert_frame_equal(df.reset_index(), expected) + result = df.reset_index() + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_frame_reset_index_tzaware_index(self, tz): @@ -494,23 +488,20 @@ expected = DataFrame( { - "idx1": [ - datetime(2011, 1, 1), - datetime(2011, 1, 2), - datetime(2011, 1, 3), - datetime(2011, 1, 4), - datetime(2011, 1, 5), - ], + "idx1": idx1, "idx2": np.arange(5, dtype="int64"), "a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"], }, columns=["idx1", "idx2", "a", "b"], ) - expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz)) tm.assert_frame_equal(df.reset_index(), expected) + def test_reset_index_datetime2(self, tz_naive_fixture): + tz = tz_naive_fixture + idx1 = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1") + idx2 = Index(range(5), name="idx2", dtype="int64") idx3 = date_range( "1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3" ) @@ -522,36 +513,22 @@ expected = DataFrame( { - "idx1": [ - datetime(2011, 
1, 1), - datetime(2011, 1, 2), - datetime(2011, 1, 3), - datetime(2011, 1, 4), - datetime(2011, 1, 5), - ], + "idx1": idx1, "idx2": np.arange(5, dtype="int64"), - "idx3": [ - datetime(2012, 1, 1), - datetime(2012, 2, 1), - datetime(2012, 3, 1), - datetime(2012, 4, 1), - datetime(2012, 5, 1), - ], + "idx3": idx3, "a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"], }, columns=["idx1", "idx2", "idx3", "a", "b"], ) - expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz)) - expected["idx3"] = expected["idx3"].apply( - lambda d: Timestamp(d, tz="Europe/Paris") - ) - tm.assert_frame_equal(df.reset_index(), expected) + result = df.reset_index() + tm.assert_frame_equal(result, expected) + def test_reset_index_datetime3(self, tz_naive_fixture): # GH#7793 - idx = MultiIndex.from_product( - [["a", "b"], date_range("20130101", periods=3, tz=tz)] - ) + tz = tz_naive_fixture + dti = date_range("20130101", periods=3, tz=tz) + idx = MultiIndex.from_product([["a", "b"], dti]) df = DataFrame( np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx ) @@ -559,17 +536,11 @@ expected = DataFrame( { "level_0": "a a a b b b".split(), - "level_1": [ - datetime(2013, 1, 1), - datetime(2013, 1, 2), - datetime(2013, 1, 3), - ] - * 2, + "level_1": dti.append(dti), "a": np.arange(6, dtype="int64"), }, columns=["level_0", "level_1", "a"], ) - expected["level_1"] = expected["level_1"].apply(lambda d: Timestamp(d, tz=tz)) result = df.reset_index() tm.assert_frame_equal(result, expected) @@ -683,21 +654,22 @@ ), ], ) -def test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype): +def test_reset_index_dtypes_on_empty_frame_with_multiindex( + array, dtype, using_infer_string +): # GH 19602 - Preserve dtype on empty DataFrame with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = DataFrame(index=idx)[:0].reset_index().dtypes + if using_infer_string and dtype == object: + dtype = "string" expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) def test_reset_index_empty_frame_with_datetime64_multiindex(): # https://github.com/pandas-dev/pandas/issues/35606 - idx = MultiIndex( - levels=[[Timestamp("2020-07-20 00:00:00")], [3, 4]], - codes=[[], []], - names=["a", "b"], - ) + dti = pd.DatetimeIndex(["2020-07-20 00:00:00"], dtype="M8[ns]") + idx = MultiIndex.from_product([dti, [3, 4]], names=["a", "b"])[:0] df = DataFrame(index=idx, columns=["c", "d"]) result = df.reset_index() expected = DataFrame( @@ -708,9 +680,12 @@ tm.assert_frame_equal(result, expected) -def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): +def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby( + using_infer_string, +): # https://github.com/pandas-dev/pandas/issues/35657 - df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": pd.to_datetime("2020-01-01")}) + dti = pd.DatetimeIndex(["2020-01-01"], dtype="M8[ns]") + df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": dti}) df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum() result = df.reset_index() expected = DataFrame( @@ -718,6 +693,8 @@ ) expected["c3"] = expected["c3"].astype("datetime64[ns]") expected["c1"] = expected["c1"].astype("float64") + if using_infer_string: + expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) @@ -728,9 +705,12 @@ df = DataFrame({"id": idx, "tstamp": tstamp, "a": list("abc")}) df.loc[2, "tstamp"] = pd.NaT result = df.set_index(["id", 
"tstamp"]).reset_index("id") + exp_dti = pd.DatetimeIndex( + ["2015-07-01", "2015-07-02", "NaT"], dtype="M8[ns]", name="tstamp" + ) expected = DataFrame( {"id": range(3), "a": list("abc")}, - index=pd.DatetimeIndex(["2015-07-01", "2015-07-02", "NaT"], name="tstamp"), + index=exp_dti, ) tm.assert_frame_equal(result, expected) @@ -788,15 +768,15 @@ def test_reset_index_false_index_name(): - result_series = Series(data=range(5, 10), index=range(0, 5)) + result_series = Series(data=range(5, 10), index=range(5)) result_series.index.name = False result_series.reset_index() - expected_series = Series(range(5, 10), RangeIndex(range(0, 5), name=False)) + expected_series = Series(range(5, 10), RangeIndex(range(5), name=False)) tm.assert_series_equal(result_series, expected_series) # GH 38147 - result_frame = DataFrame(data=range(5, 10), index=range(0, 5)) + result_frame = DataFrame(data=range(5, 10), index=range(5)) result_frame.index.name = False result_frame.reset_index() - expected_frame = DataFrame(range(5, 10), RangeIndex(range(0, 5), name=False)) + expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False)) tm.assert_frame_equal(result_frame, expected_frame) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_select_dtypes.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_select_dtypes.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_select_dtypes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_select_dtypes.py 2024-04-10 17:42:52.000000000 +0000 @@ -32,7 +32,7 @@ self.data = data self._dtype = dtype - def __array__(self, dtype): + def __array__(self, dtype=None, copy=None): return self.data @property @@ -282,7 +282,7 @@ result = df.select_dtypes(include=[np.number], exclude=["floating"]) tm.assert_frame_equal(result, expected) - def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): + def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -296,11 +296,17 @@ df["g"] = df.f.diff() assert not hasattr(np, "u8") r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) - e = df[["a", "b"]] + if using_infer_string: + e = df[["b"]] + else: + e = df[["a", "b"]] tm.assert_frame_equal(r, e) r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) - e = df[["a", "b", "g"]] + if using_infer_string: + e = df[["b", "g"]] + else: + e = df[["a", "b", "g"]] tm.assert_frame_equal(r, e) def test_select_dtypes_empty(self): @@ -378,12 +384,9 @@ def test_select_dtypes_typecodes(self): # GH 11990 - df = tm.makeCustomDataframe( - 30, 3, data_gen_f=lambda x, y: np.random.default_rng(2).random() - ) - expected = df + df = DataFrame(np.random.default_rng(2).random((5, 3))) FLOAT_TYPES = list(np.typecodes["AllFloat"]) - tm.assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected) + tm.assert_frame_equal(df.select_dtypes(FLOAT_TYPES), df) @pytest.mark.parametrize( "arr,expected", diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_set_index.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_set_index.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_set_index.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_set_index.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,6 +12,7 @@ from pandas import ( Categorical, + CategoricalIndex, DataFrame, DatetimeIndex, Index, @@ -24,6 +25,34 @@ import pandas._testing as tm +@pytest.fixture +def frame_of_index_cols(): + """ + Fixture for DataFrame of columns that 
can be used for indexing + + Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; + 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. + + A B C D E (tuple, as, label) + 0 foo one a 0.608477 -0.012500 -1.664297 + 1 foo two b -0.633460 0.249614 -0.364411 + 2 foo three c 0.615256 2.154968 -0.834666 + 3 bar one d 0.234246 1.085675 0.718445 + 4 bar two e 0.533841 -0.005702 -3.533912 + """ + df = DataFrame( + { + "A": ["foo", "foo", "foo", "bar", "bar"], + "B": ["one", "two", "three", "one", "two"], + "C": ["a", "b", "c", "d", "e"], + "D": np.random.default_rng(2).standard_normal(5), + "E": np.random.default_rng(2).standard_normal(5), + ("tuple", "as", "label"): np.random.default_rng(2).standard_normal(5), + } + ) + return df + + class TestSetIndex: def test_set_index_multiindex(self): # segfault in GH#3308 @@ -99,7 +128,7 @@ assert isinstance(idf.index, DatetimeIndex) def test_set_index_dst(self): - di = date_range("2006-10-29 00:00:00", periods=3, freq="H", tz="US/Pacific") + di = date_range("2006-10-29 00:00:00", periods=3, freq="h", tz="US/Pacific") df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index() # single level @@ -127,7 +156,11 @@ df.set_index(idx[::2]) def test_set_index_names(self): - df = tm.makeDataFrame() + df = DataFrame( + np.ones((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(10)], dtype=object), + ) df.index.name = "name" assert df.set_index(df.index).index.names == ["name"] @@ -370,8 +403,7 @@ tm.assert_frame_equal(result, expected) def test_construction_with_categorical_index(self): - ci = tm.makeCategoricalIndex(10) - ci.name = "B" + ci = CategoricalIndex(list("ab") * 5, name="B") # with Categorical df = DataFrame( @@ -491,16 +523,16 @@ df = DataFrame(np.random.default_rng(2).random(6)) idx1 = period_range("2011-01-01", periods=3, freq="M") idx1 = idx1.append(idx1) - idx2 = period_range("2013-01-01 09:00", periods=2, freq="H") + idx2 = period_range("2013-01-01 09:00", periods=2, freq="h") idx2 = idx2.append(idx2).append(idx2) - idx3 = period_range("2005", periods=6, freq="A") + idx3 = period_range("2005", periods=6, freq="Y") df = df.set_index(idx1) df = df.set_index(idx2, append=True) df = df.set_index(idx3, append=True) expected1 = period_range("2011-01-01", periods=3, freq="M") - expected2 = period_range("2013-01-01 09:00", periods=2, freq="H") + expected2 = period_range("2013-01-01 09:00", periods=2, freq="h") tm.assert_index_equal(df.index.levels[0], expected1) tm.assert_index_equal(df.index.levels[1], expected2) @@ -694,7 +726,7 @@ # GH#6631 df = DataFrame(np.random.default_rng(2).random(6)) idx1 = period_range("2011/01/01", periods=6, freq="M") - idx2 = period_range("2013", periods=6, freq="A") + idx2 = period_range("2013", periods=6, freq="Y") df = df.set_index(idx1) tm.assert_index_equal(df.index, idx1) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_shift.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_shift.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_shift.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_shift.py 2024-04-10 17:42:52.000000000 +0000 @@ -36,7 +36,7 @@ # Can't pass both! 
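Many hunks in this section swap deprecated frequency aliases for the pandas 2.2 spellings: "H" -> "h", "5T" -> "5min", "S" -> "s", "M" -> "ME", and "A"/"A-DEC" -> "YE"/"YE-DEC" for date_range versus "A" -> "Y" for period_range. A minimal sketch of the new aliases, assuming pandas 2.2:

import pandas as pd

hours = pd.date_range("2000-01-01", periods=3, freq="h")     # formerly freq="H"
years = pd.date_range("2001", periods=3, freq="YE-DEC")      # formerly freq="A-DEC"
pers = pd.period_range("2005", periods=3, freq="Y")          # formerly freq="A"
print(hours.freqstr, years.freqstr, pers.freqstr)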
obj = frame_or_series( np.random.default_rng(2).standard_normal(5), - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) msg = ( @@ -44,12 +44,12 @@ "fill_value" ) with tm.assert_produces_warning(FutureWarning, match=msg): - obj.shift(1, fill_value=1, freq="H") + obj.shift(1, fill_value=1, freq="h") if frame_or_series is DataFrame: - obj.columns = date_range("1/1/2000", periods=1, freq="H") + obj.columns = date_range("1/1/2000", periods=1, freq="h") with tm.assert_produces_warning(FutureWarning, match=msg): - obj.shift(1, axis=1, fill_value=1, freq="H") + obj.shift(1, axis=1, fill_value=1, freq="h") @pytest.mark.parametrize( "input_data, output_data", @@ -76,15 +76,15 @@ def test_shift_mismatched_freq(self, frame_or_series): ts = frame_or_series( np.random.default_rng(2).standard_normal(5), - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) - result = ts.shift(1, freq="5T") - exp_index = ts.index.shift(1, freq="5T") + result = ts.shift(1, freq="5min") + exp_index = ts.index.shift(1, freq="5min") tm.assert_index_equal(result.index, exp_index) # GH#1063, multiple of same base - result = ts.shift(1, freq="4H") + result = ts.shift(1, freq="4h") exp_index = ts.index + offsets.Hour(4) tm.assert_index_equal(result.index, exp_index) @@ -92,7 +92,7 @@ "obj", [ Series([np.arange(5)]), - date_range("1/1/2011", periods=24, freq="H"), + date_range("1/1/2011", periods=24, freq="h"), Series(range(5), index=date_range("2017", periods=5)), ], ) @@ -144,20 +144,20 @@ # GH#21275 obj = frame_or_series( range(periods), - index=date_range("2016-1-1 00:00:00", periods=periods, freq="H"), + index=date_range("2016-1-1 00:00:00", periods=periods, freq="h"), ) - result = obj.shift(1, "2H") + result = obj.shift(1, "2h") expected = frame_or_series( range(periods), - index=date_range("2016-1-1 02:00:00", periods=periods, freq="H"), + index=date_range("2016-1-1 02:00:00", periods=periods, freq="h"), ) tm.assert_equal(result, expected) def test_shift_dst(self, frame_or_series): # GH#13926 - dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern") + dates = date_range("2016-11-06", freq="h", periods=10, tz="US/Eastern") obj = frame_or_series(dates) res = obj.shift(0) @@ -179,7 +179,7 @@ @pytest.mark.parametrize("ex", [10, -10, 20, -20]) def test_shift_dst_beyond(self, frame_or_series, ex): # GH#13926 - dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern") + dates = date_range("2016-11-06", freq="h", periods=10, tz="US/Eastern") obj = frame_or_series(dates) res = obj.shift(ex) exp = frame_or_series([NaT] * 10, dtype="datetime64[ns, US/Eastern]") @@ -241,7 +241,9 @@ def test_shift_with_periodindex(self, frame_or_series): # Shifting with PeriodIndex - ps = tm.makePeriodFrame() + ps = DataFrame( + np.arange(4, dtype=float), index=pd.period_range("2020-01-01", periods=4) + ) ps = tm.get_obj(ps, frame_or_series) shifted = ps.shift(1) @@ -366,7 +368,7 @@ def test_shift_fill_value(self, frame_or_series): # GH#24128 - dti = date_range("1/1/2000", periods=5, freq="H") + dti = date_range("1/1/2000", periods=5, freq="h") ts = frame_or_series([1.0, 2.0, 3.0, 4.0, 5.0], index=dti) exp = frame_or_series([0.0, 1.0, 2.0, 3.0, 4.0], index=dti) @@ -492,7 +494,7 @@ tm.assert_frame_equal(result, expected) def test_period_index_frame_shift_with_freq(self, frame_or_series): - ps = tm.makePeriodFrame() + ps = DataFrame(range(4), index=pd.period_range("2020-01-01", periods=4)) ps = tm.get_obj(ps, 
frame_or_series) shifted = ps.shift(1, freq="infer") @@ -529,7 +531,7 @@ tm.assert_equal(unshifted, inferred_ts) def test_period_index_frame_shift_with_freq_error(self, frame_or_series): - ps = tm.makePeriodFrame() + ps = DataFrame(range(4), index=pd.period_range("2020-01-01", periods=4)) ps = tm.get_obj(ps, frame_or_series) msg = "Given freq M does not match PeriodIndex freq D" with pytest.raises(ValueError, match=msg): @@ -706,7 +708,7 @@ # GH#44424 df = DataFrame( np.random.default_rng(2).standard_normal(5), - index=date_range("1/1/2000", periods=5, freq="H"), + index=date_range("1/1/2000", periods=5, freq="h"), ) tm.assert_frame_equal( @@ -716,8 +718,8 @@ ) tm.assert_frame_equal( - df.shift([1], freq="H").rename(columns=lambda x: int(x[0])), - df.shift(1, freq="H"), + df.shift([1], freq="h").rename(columns=lambda x: int(x[0])), + df.shift(1, freq="h"), ) msg = ( @@ -725,7 +727,7 @@ "fill_value" ) with tm.assert_produces_warning(FutureWarning, match=msg): - df.shift([1, 2], fill_value=1, freq="H") + df.shift([1, 2], fill_value=1, freq="h") def test_shift_with_iterable_check_other_arguments(self): # GH#44424 @@ -754,3 +756,9 @@ msg = "Cannot specify `suffix` if `periods` is an int." with pytest.raises(ValueError, match=msg): df.shift(1, suffix="fails") + + def test_shift_axis_one_empty(self): + # GH#57301 + df = DataFrame() + result = df.shift(1, axis=1) + tm.assert_frame_equal(result, df) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_sort_index.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_sort_index.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_sort_index.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_sort_index.py 2024-04-10 17:42:52.000000000 +0000 @@ -911,7 +911,7 @@ expected = DataFrame( { i: pd.array([0.0, 0.0, 0.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)) - for i in range(0, 4) + for i in range(4) }, index=MultiIndex.from_product([[1, 2], [1, 2]]), ) @@ -994,3 +994,35 @@ ), ) tm.assert_frame_equal(result, expected) + + +def test_axis_columns_ignore_index(): + # GH 56478 + df = DataFrame([[1, 2]], columns=["d", "c"]) + result = df.sort_index(axis="columns", ignore_index=True) + expected = DataFrame([[2, 1]]) + tm.assert_frame_equal(result, expected) + + +def test_sort_index_stable_sort(): + # GH 57151 + df = DataFrame( + data=[ + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + result = df.sort_index(level="dt", kind="stable") + expected = DataFrame( + data=[ + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_sort_values.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_sort_values.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_sort_values.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_sort_values.py 2024-04-10 17:42:52.000000000 +0000 @@ -863,7 +863,7 @@ Version(np.__version__) >= Version("1.25") and request.node.callspec.id == "df_idx0-inner-True" ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=( "pandas default unstable sorting of duplicates" @@ -907,7 
+907,7 @@ result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1) if Version(np.__version__) >= Version("1.25"): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=( "pandas default unstable sorting of duplicates" diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_to_csv.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_to_csv.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_to_csv.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_to_csv.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,6 +16,7 @@ Series, Timestamp, date_range, + period_range, read_csv, to_datetime, ) @@ -155,7 +156,11 @@ chunksize = 5 N = int(chunksize * 2.5) - df = tm.makeCustomDataframe(N, 3) + df = DataFrame( + np.ones((N, 3)), + index=Index([f"i-{i}" for i in range(N)], name="a"), + columns=Index([f"i-{i}" for i in range(3)], name="a"), + ) cs = df.columns cols = [cs[2], cs[0]] @@ -171,8 +176,11 @@ N = int(chunksize * 2.5) # dupe cols - df = tm.makeCustomDataframe(N, 3) - df.columns = ["a", "a", "b"] + df = DataFrame( + np.ones((N, 3)), + index=Index([f"i-{i}" for i in range(N)], name="a"), + columns=["a", "a", "b"], + ) with tm.ensure_clean() as path: df.to_csv(path, columns=cols, chunksize=chunksize) rs_c = read_csv(path, index_col=0) @@ -335,7 +343,11 @@ "nrows", [2, 10, 99, 100, 101, 102, 198, 199, 200, 201, 202, 249, 250, 251] ) def test_to_csv_nrows(self, nrows): - df = tm.makeCustomDataframe(nrows, 4, r_idx_type="dt", c_idx_type="s") + df = DataFrame( + np.ones((nrows, 4)), + index=date_range("2020-01-01", periods=nrows), + columns=Index(list("abcd"), dtype=object), + ) result, expected = self._return_result_expected(df, 1000, "dt", "s") tm.assert_frame_equal(result, expected, check_names=False) @@ -349,8 +361,16 @@ @pytest.mark.parametrize("ncols", [1, 2, 3, 4]) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_to_csv_idx_types(self, nrows, r_idx_type, c_idx_type, ncols): - df = tm.makeCustomDataframe( - nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type + axes = { + "i": lambda n: Index(np.arange(n), dtype=np.int64), + "s": lambda n: Index([f"{i}_{chr(i)}" for i in range(97, 97 + n)]), + "dt": lambda n: date_range("2020-01-01", periods=n), + "p": lambda n: period_range("2020-01-01", periods=n, freq="D"), + } + df = DataFrame( + np.ones((nrows, ncols)), + index=axes[r_idx_type](nrows), + columns=axes[c_idx_type](ncols), ) result, expected = self._return_result_expected( df, @@ -366,14 +386,23 @@ ) @pytest.mark.parametrize("ncols", [1, 2, 3, 4]) def test_to_csv_idx_ncols(self, nrows, ncols): - df = tm.makeCustomDataframe(nrows, ncols) + df = DataFrame( + np.ones((nrows, ncols)), + index=Index([f"i-{i}" for i in range(nrows)], name="a"), + columns=Index([f"i-{i}" for i in range(ncols)], name="a"), + ) result, expected = self._return_result_expected(df, 1000) tm.assert_frame_equal(result, expected, check_names=False) @pytest.mark.slow @pytest.mark.parametrize("nrows", [10, 98, 99, 100, 101, 102]) def test_to_csv_dup_cols(self, nrows): - df = tm.makeCustomDataframe(nrows, 3) + df = DataFrame( + np.ones((nrows, 3)), + index=Index([f"i-{i}" for i in range(nrows)], name="a"), + columns=Index([f"i-{i}" for i in range(3)], name="a"), + ) + cols = list(df.columns) cols[:2] = ["dupe", "dupe"] cols[-2:] = ["dupe", "dupe"] @@ -394,7 +423,12 @@ @pytest.mark.slow def test_to_csv_chunksize(self): chunksize = 1000 - df = tm.makeCustomDataframe(chunksize // 2 + 1, 2, 
r_idx_nlevels=2) + rows = chunksize // 2 + 1 + df = DataFrame( + np.ones((rows, 2)), + columns=Index(list("ab"), dtype=object), + index=MultiIndex.from_arrays([range(rows) for _ in range(2)]), + ) result, expected = self._return_result_expected(df, chunksize, rnlvl=2) tm.assert_frame_equal(result, expected, check_names=False) @@ -412,7 +446,22 @@ ], ) def test_to_csv_params(self, nrows, df_params, func_params, ncols): - df = tm.makeCustomDataframe(nrows, ncols, **df_params) + if df_params.get("r_idx_nlevels"): + index = MultiIndex.from_arrays( + [f"i-{i}" for i in range(nrows)] + for _ in range(df_params["r_idx_nlevels"]) + ) + else: + index = None + + if df_params.get("c_idx_nlevels"): + columns = MultiIndex.from_arrays( + [f"i-{i}" for i in range(ncols)] + for _ in range(df_params["c_idx_nlevels"]) + ) + else: + columns = Index([f"i-{i}" for i in range(ncols)], dtype=object) + df = DataFrame(np.ones((nrows, ncols)), index=index, columns=columns) result, expected = self._return_result_expected(df, 1000, **func_params) tm.assert_frame_equal(result, expected, check_names=False) @@ -420,7 +469,7 @@ # test roundtrip with inf, -inf, nan, as full columns and mix float_frame["G"] = np.nan f = lambda x: [np.inf, np.nan][np.random.default_rng(2).random() < 0.5] - float_frame["H"] = float_frame.index.map(f) + float_frame["h"] = float_frame.index.map(f) with tm.ensure_clean() as path: float_frame.to_csv(path) @@ -545,19 +594,40 @@ ) # column & index are multi-index - df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) + df = DataFrame( + np.ones((5, 3)), + columns=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + ), + index=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(5)] for _ in range(2)], names=list("ab") + ), + ) df.to_csv(path) result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1]) tm.assert_frame_equal(df, result) # column is mi - df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=1, c_idx_nlevels=4) + df = DataFrame( + np.ones((5, 3)), + columns=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + ), + ) df.to_csv(path) result = read_csv(path, header=[0, 1, 2, 3], index_col=0) tm.assert_frame_equal(df, result) # dup column names? 
- df = tm.makeCustomDataframe(5, 3, r_idx_nlevels=3, c_idx_nlevels=4) + df = DataFrame( + np.ones((5, 3)), + columns=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(3)] for _ in range(4)], names=list("abcd") + ), + index=MultiIndex.from_arrays( + [[f"i-{i}" for i in range(5)] for _ in range(3)], names=list("abc") + ), + ) df.to_csv(path) result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2]) tm.assert_frame_equal(df, result) @@ -612,7 +682,7 @@ tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0 - def test_to_csv_interval_index(self): + def test_to_csv_interval_index(self, using_infer_string): # GH 28210 df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3)) @@ -622,7 +692,10 @@ # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = df.copy() - expected.index = expected.index.astype(str) + if using_infer_string: + expected.index = expected.index.astype("string[pyarrow_numpy]") + else: + expected.index = expected.index.astype(str) tm.assert_frame_equal(result, expected) @@ -737,11 +810,13 @@ result.columns = df.columns tm.assert_frame_equal(result, df) + def test_to_csv_dups_cols2(self): # GH3457 - - N = 10 - df = tm.makeCustomDataframe(N, 3) - df.columns = ["a", "a", "b"] + df = DataFrame( + np.ones((5, 3)), + index=Index([f"i-{i}" for i in range(5)], name="foo"), + columns=Index(["a", "a", "b"], dtype=object), + ) with tm.ensure_clean() as filename: df.to_csv(filename) @@ -1077,7 +1152,7 @@ "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", - freq="H", + freq="h", ambiguous="infer", ) i = times + td @@ -1095,7 +1170,7 @@ def test_to_csv_with_dst_transitions_with_pickle(self): # GH11619 - idx = date_range("2015-01-01", "2015-12-31", freq="H", tz="Europe/Paris") + idx = date_range("2015-01-01", "2015-12-31", freq="h", tz="Europe/Paris") idx = idx._with_freq(None) # freq does not round-trip idx._data._freq = None # otherwise there is trouble on unpickle df = DataFrame({"values": 1, "idx": idx}, index=idx) @@ -1326,6 +1401,6 @@ ) df["a"] = df["a"].astype("category") result = df.to_csv() - expected_rows = [",a", '0,"[2020-01-01, 2020-01-02]"'] + expected_rows = [",a", '0,"[2020-01-01 00:00:00, 2020-01-02 00:00:00]"'] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_to_dict.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_to_dict.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_to_dict.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_to_dict.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,8 +12,11 @@ NA, DataFrame, Index, + Interval, MultiIndex, + Period, Series, + Timedelta, Timestamp, ) import pandas._testing as tm @@ -99,19 +102,19 @@ for k2, v2 in v.items(): assert v2 == recons_data[k][k2] - recons_data = DataFrame(test_data).to_dict("list", mapping) + recons_data = DataFrame(test_data).to_dict("list", into=mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][int(k2) - 1] - recons_data = DataFrame(test_data).to_dict("series", mapping) + recons_data = DataFrame(test_data).to_dict("series", into=mapping) for k, v in test_data.items(): for k2, v2 in v.items(): assert v2 == recons_data[k][k2] - recons_data = DataFrame(test_data).to_dict("split", mapping) + recons_data = DataFrame(test_data).to_dict("split", into=mapping) expected_split = { "columns": ["A", "B"], "index": ["1", "2", "3"], @@ -119,7 +122,7 
@@ } tm.assert_dict_equal(recons_data, expected_split) - recons_data = DataFrame(test_data).to_dict("records", mapping) + recons_data = DataFrame(test_data).to_dict("records", into=mapping) expected_records = [ {"A": 1.0, "B": "1"}, {"A": 2.0, "B": "2"}, @@ -166,6 +169,21 @@ with tm.assert_produces_warning(UserWarning): df.to_dict() + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.parametrize( + "orient,expected", + [ + ("list", {"A": [2, 5], "B": [3, 6]}), + ("dict", {"A": {0: 2, 1: 5}, "B": {0: 3, 1: 6}}), + ], + ) + def test_to_dict_not_unique(self, orient, expected): + # GH#54824: This is to make sure that dataframes with non-unique column + # would have uniform behavior throughout different orients + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "A", "B"]) + result = df.to_dict(orient) + assert result == expected + # orient - orient argument to to_dict function # item_getter - function for extracting value from # the resulting dict using column name and index @@ -489,8 +507,29 @@ # GH#34665 df = DataFrame({"a": Series([1, 2], dtype="Int64"), "B": 1}) result = df.to_dict(orient="records") - assert type(result[0]["a"]) is int + assert isinstance(result[0]["a"], int) df = DataFrame({"a": Series([1, NA], dtype="Int64"), "B": 1}) result = df.to_dict(orient="records") - assert type(result[0]["a"]) is int + assert isinstance(result[0]["a"], int) + + def test_to_dict_pos_args_deprecation(self): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_dict except for the " + r"argument 'orient' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_dict("records", {}) + + +@pytest.mark.parametrize( + "val", [Timestamp(2020, 1, 1), Timedelta(1), Period("2020"), Interval(1, 2)] +) +def test_to_dict_list_pd_scalars(val): + # GH 54824 + df = DataFrame({"a": [val]}) + result = df.to_dict(orient="list") + expected = {"a": [val]} + assert result == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_to_dict_of_blocks.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_to_dict_of_blocks.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_to_dict_of_blocks.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_to_dict_of_blocks.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,22 +14,7 @@ class TestToDictOfBlocks: - def test_copy_blocks(self, float_frame): - # GH#9607 - df = DataFrame(float_frame, copy=True) - column = df.columns[0] - - # use the default copy=True, change a column - _last_df = None - blocks = df._to_dict_of_blocks(copy=True) - for _df in blocks.values(): - _last_df = _df - if column in _df: - _df.loc[:, column] = _df[column] + 1 - - # make sure we did not change the original DataFrame - assert _last_df is not None and not _last_df[column].equals(df[column]) - + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_no_copy_blocks(self, float_frame, using_copy_on_write): # GH#9607 df = DataFrame(float_frame, copy=True) @@ -37,7 +22,7 @@ _last_df = None # use the copy=False, change a column - blocks = df._to_dict_of_blocks(copy=False) + blocks = df._to_dict_of_blocks() for _df in blocks.values(): _last_df = _df if column in _df: @@ -50,9 +35,7 @@ assert _last_df is not None and not _last_df[column].equals(df[column]) -def test_to_dict_of_blocks_item_cache(request, using_copy_on_write): - if using_copy_on_write: - request.node.add_marker(pytest.mark.xfail(reason="CoW - not yet 
implemented")) +def test_to_dict_of_blocks_item_cache(using_copy_on_write, warn_copy_on_write): # Calling to_dict_of_blocks should not poison item_cache df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) df["c"] = NumpyExtensionArray(np.array([1, 2, None, 3], dtype=object)) @@ -64,10 +47,13 @@ df._to_dict_of_blocks() if using_copy_on_write: - # TODO(CoW) we should disallow this, so `df` doesn't get updated, - # this currently still updates df, so this test fails + with pytest.raises(ValueError, match="read-only"): + ser.values[0] = "foo" + elif warn_copy_on_write: ser.values[0] = "foo" - assert df.loc[0, "b"] == "a" + assert df.loc[0, "b"] == "foo" + # with warning mode, the item cache is disabled + assert df["b"] is not ser else: # Check that the to_dict_of_blocks didn't break link between ser and df ser.values[0] = "foo" diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_to_records.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_to_records.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_to_records.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_to_records.py 2024-04-10 17:42:52.000000000 +0000 @@ -512,7 +512,7 @@ @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"]) def test_to_records_datetimeindex_with_tz(self, tz): # GH#13937 - dr = date_range("2016-01-01", periods=10, freq="S", tz=tz) + dr = date_range("2016-01-01", periods=10, freq="s", tz=tz) df = DataFrame({"datetime": dr}, index=dr) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_to_timestamp.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_to_timestamp.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_to_timestamp.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_to_timestamp.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,7 +16,7 @@ import pandas._testing as tm -def _get_with_delta(delta, freq="A-DEC"): +def _get_with_delta(delta, freq="YE-DEC"): return date_range( to_datetime("1/1/2001") + delta, to_datetime("12/31/2009") + delta, @@ -27,7 +27,7 @@ class TestToTimestamp: def test_to_timestamp(self, frame_or_series): K = 5 - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") obj = DataFrame( np.random.default_rng(2).standard_normal((len(index), K)), index=index, @@ -36,7 +36,7 @@ obj["mix"] = "a" obj = tm.get_obj(obj, frame_or_series) - exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + exp_index = date_range("1/1/2001", end="12/31/2009", freq="YE-DEC") exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") result = obj.to_timestamp("D", "end") tm.assert_index_equal(result.index, exp_index) @@ -44,7 +44,7 @@ if frame_or_series is Series: assert result.name == "A" - exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN") result = obj.to_timestamp("D", "start") tm.assert_index_equal(result.index, exp_index) @@ -71,7 +71,7 @@ def test_to_timestamp_columns(self): K = 5 - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") df = DataFrame( np.random.default_rng(2).standard_normal((len(index), K)), index=index, @@ -82,13 +82,13 @@ # columns df = df.T - exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + exp_index = date_range("1/1/2001", end="12/31/2009", freq="YE-DEC") exp_index = exp_index + Timedelta(1, 
"D") - Timedelta(1, "ns") result = df.to_timestamp("D", "end", axis=1) tm.assert_index_equal(result.columns, exp_index) tm.assert_numpy_array_equal(result.values, df.values) - exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + exp_index = date_range("1/1/2001", end="1/1/2009", freq="YS-JAN") result = df.to_timestamp("D", "start", axis=1) tm.assert_index_equal(result.columns, exp_index) @@ -99,7 +99,7 @@ tm.assert_index_equal(result.columns, exp_index) delta = timedelta(hours=23, minutes=59) - result = df.to_timestamp("T", "end", axis=1) + result = df.to_timestamp("min", "end", axis=1) exp_index = _get_with_delta(delta) exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") tm.assert_index_equal(result.columns, exp_index) @@ -110,19 +110,19 @@ exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.columns, exp_index) - result1 = df.to_timestamp("5t", axis=1) - result2 = df.to_timestamp("t", axis=1) - expected = date_range("2001-01-01", "2009-01-01", freq="AS") + result1 = df.to_timestamp("5min", axis=1) + result2 = df.to_timestamp("min", axis=1) + expected = date_range("2001-01-01", "2009-01-01", freq="YS") assert isinstance(result1.columns, DatetimeIndex) assert isinstance(result2.columns, DatetimeIndex) tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8) tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8) # PeriodIndex.to_timestamp always use 'infer' - assert result1.columns.freqstr == "AS-JAN" - assert result2.columns.freqstr == "AS-JAN" + assert result1.columns.freqstr == "YS-JAN" + assert result2.columns.freqstr == "YS-JAN" def test_to_timestamp_invalid_axis(self): - index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009") obj = DataFrame( np.random.default_rng(2).standard_normal((len(index), 5)), index=index ) @@ -132,12 +132,12 @@ obj.to_timestamp(axis=2) def test_to_timestamp_hourly(self, frame_or_series): - index = period_range(freq="H", start="1/1/2001", end="1/2/2001") + index = period_range(freq="h", start="1/1/2001", end="1/2/2001") obj = Series(1, index=index, name="foo") if frame_or_series is not Series: obj = obj.to_frame() - exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="H") + exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="h") result = obj.to_timestamp(how="end") exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_transpose.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_transpose.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_transpose.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_transpose.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,12 +3,15 @@ import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, + Index, IntervalIndex, Series, Timestamp, + bdate_range, date_range, timedelta_range, ) @@ -29,7 +32,8 @@ def test_transpose_empty_preserves_datetimeindex(self): # GH#41382 - df = DataFrame(index=DatetimeIndex([])) + dti = DatetimeIndex([], dtype="M8[ns]") + df = DataFrame(index=dti) expected = DatetimeIndex([], dtype="datetime64[ns]", freq=None) @@ -70,7 +74,7 @@ @pytest.mark.parametrize("tz", [None, "America/New_York"]) def test_transpose_preserves_dtindex_equality_with_dst(self, tz): # GH#19970 - idx = date_range("20161101", 
"20161130", freq="4H", tz=tz) + idx = date_range("20161101", "20161130", freq="4h", tz=tz) df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx) result = df.T == df.T expected = DataFrame(True, index=list("ab"), columns=idx) @@ -87,9 +91,13 @@ res2 = df2.T assert (res2.dtypes == object).all() - def test_transpose_uint64(self, uint64_frame): - result = uint64_frame.T - expected = DataFrame(uint64_frame.values.T) + def test_transpose_uint64(self): + df = DataFrame( + {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, + dtype=np.uint64, + ) + result = df.T + expected = DataFrame(df.values.T) expected.index = ["A", "B"] tm.assert_frame_equal(result, expected) @@ -103,9 +111,17 @@ else: assert value == frame[col][idx] + def test_transpose_mixed(self): # mixed type - index, data = tm.getMixedTypeDict() - mixed = DataFrame(data, index=index) + mixed = DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + }, + index=Index(["a", "b", "c", "d", "e"], dtype=object), + ) mixed_T = mixed.T for col, s in mixed_T.items(): @@ -175,3 +191,19 @@ dtype=object, ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype1", ["Int64", "Float64"]) + @pytest.mark.parametrize("dtype2", ["Int64", "Float64"]) + def test_transpose(self, dtype1, dtype2): + # GH#57315 - transpose should have F contiguous blocks + df = DataFrame( + { + "a": pd.array([1, 1, 2], dtype=dtype1), + "b": pd.array([3, 4, 5], dtype=dtype2), + } + ) + result = df.T + for blk in result._mgr.blocks: + # When dtypes are unequal, we get NumPy object array + data = blk.values._data if dtype1 == dtype2 else blk.values + assert data.flags["F_CONTIGUOUS"] diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_truncate.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_truncate.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_truncate.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_truncate.py 2024-04-10 17:42:52.000000000 +0000 @@ -60,7 +60,7 @@ truncated = ts.truncate(before=ts.index[-1] + ts.index.freq) assert len(truncated) == 0 - msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-02-04 00:00:00" + msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-05-16 00:00:00" with pytest.raises(ValueError, match=msg): ts.truncate( before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_tz_convert.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_tz_convert.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_tz_convert.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_tz_convert.py 2024-04-10 17:42:52.000000000 +0000 @@ -120,7 +120,7 @@ # GH#6326 obj = frame_or_series( np.arange(0, 5), - index=date_range("20131027", periods=5, freq="1H", tz="Europe/Berlin"), + index=date_range("20131027", periods=5, freq="h", tz="Europe/Berlin"), ) orig = obj.copy() result = obj.tz_convert("UTC", copy=copy) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_tz_localize.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_tz_localize.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_tz_localize.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_tz_localize.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,7 +16,7 @@ # test_tz_convert_and_localize in test_tz_convert def 
test_tz_localize(self, frame_or_series): - rng = date_range("1/1/2011", periods=100, freq="H") + rng = date_range("1/1/2011", periods=100, freq="h") obj = DataFrame({"a": 1}, index=rng) obj = tm.get_obj(obj, frame_or_series) @@ -29,7 +29,7 @@ tm.assert_equal(result, expected) def test_tz_localize_axis1(self): - rng = date_range("1/1/2011", periods=100, freq="H") + rng = date_range("1/1/2011", periods=100, freq="h") df = DataFrame({"a": 1}, index=rng) @@ -43,7 +43,7 @@ def test_tz_localize_naive(self, frame_or_series): # Can't localize if already tz-aware - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") ts = Series(1, index=rng) ts = frame_or_series(ts) @@ -54,13 +54,13 @@ def test_tz_localize_copy_inplace_mutate(self, copy, frame_or_series): # GH#6326 obj = frame_or_series( - np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=None) + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1h", tz=None) ) orig = obj.copy() result = obj.tz_localize("UTC", copy=copy) expected = frame_or_series( np.arange(0, 5), - index=date_range("20131027", periods=5, freq="1H", tz="UTC"), + index=date_range("20131027", periods=5, freq="1h", tz="UTC"), ) tm.assert_equal(result, expected) tm.assert_equal(obj, orig) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_update.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_update.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_update.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_update.py 2024-04-10 17:42:52.000000000 +0000 @@ -48,16 +48,18 @@ def test_update_dtypes(self): # gh 3016 df = DataFrame( - [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], - columns=["A", "B", "bool1", "bool2"], + [[1.0, 2.0, 1, False, True], [4.0, 5.0, 2, True, False]], + columns=["A", "B", "int", "bool1", "bool2"], ) - other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) + other = DataFrame( + [[45, 45, 3, True]], index=[0], columns=["A", "B", "int", "bool1"] + ) df.update(other) expected = DataFrame( - [[45.0, 45.0, False, True], [4.0, 5.0, True, False]], - columns=["A", "B", "bool1", "bool2"], + [[45.0, 45.0, 3, True, True], [4.0, 5.0, 2, True, False]], + columns=["A", "B", "int", "bool1", "bool2"], ) tm.assert_frame_equal(df, expected) @@ -140,30 +142,53 @@ expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) tm.assert_frame_equal(result, expected) + def test_update_datetime_tz_in_place(self, using_copy_on_write, warn_copy_on_write): + # https://github.com/pandas-dev/pandas/issues/56227 + result = DataFrame([pd.Timestamp("2019", tz="UTC")]) + orig = result.copy() + view = result[:] + with tm.assert_produces_warning( + FutureWarning if warn_copy_on_write else None, match="Setting a value" + ): + result.update(result + pd.Timedelta(days=1)) + expected = DataFrame([pd.Timestamp("2019-01-02", tz="UTC")]) + tm.assert_frame_equal(result, expected) + if not using_copy_on_write: + tm.assert_frame_equal(view, expected) + else: + tm.assert_frame_equal(view, orig) + def test_update_with_different_dtype(self, using_copy_on_write): # GH#3217 df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) df["c"] = np.nan - if using_copy_on_write: + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): df.update({"c": Series(["foo"], index=[0])}) - else: - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): - df["c"].update(Series(["foo"], index=[0])) - expected = DataFrame({"a": 
[1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]}) + expected = DataFrame( + { + "a": [1, 3], + "b": [np.nan, 2], + "c": Series(["foo", np.nan], dtype="object"), + } + ) tm.assert_frame_equal(df, expected) @td.skip_array_manager_invalid_test - def test_update_modify_view(self, using_copy_on_write): + def test_update_modify_view( + self, using_copy_on_write, warn_copy_on_write, using_infer_string + ): # GH#47188 df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]}) df2_orig = df2.copy() result_view = df2[:] - df2.update(df) + # TODO(CoW-warn) better warning message + with tm.assert_cow_warning(warn_copy_on_write): + df2.update(df) expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]}) tm.assert_frame_equal(df2, expected) - if using_copy_on_write: + if using_copy_on_write or using_infer_string: tm.assert_frame_equal(result_view, df2_orig) else: tm.assert_frame_equal(result_view, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_value_counts.py pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_value_counts.py --- pandas-2.1.4+dfsg/pandas/tests/frame/methods/test_value_counts.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/methods/test_value_counts.py 2024-04-10 17:42:52.000000000 +0000 @@ -147,7 +147,7 @@ index=pd.MultiIndex( levels=[ pd.Index(["Anne", "Beth", "John"]), - pd.Index(["Louise", "Smith", nulls_fixture]), + pd.Index(["Louise", "Smith", np.nan]), ], codes=[[0, 1, 2, 2], [2, 0, 1, 2]], names=["first_name", "middle_name"], @@ -188,4 +188,18 @@ ), name="count", ) + tm.assert_series_equal(result, expected) + + +def test_value_counts_with_missing_category(): + # GH-54836 + df = pd.DataFrame({"a": pd.Categorical([1, 2, 4], categories=[1, 2, 3, 4])}) + result = df.value_counts() + expected = pd.Series( + [1, 1, 1, 0], + index=pd.MultiIndex.from_arrays( + [pd.CategoricalIndex([1, 2, 4, 3], categories=[1, 2, 3, 4], name="a")] + ), + name="count", + ) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_api.py pandas-2.2.2+dfsg/pandas/tests/frame/test_api.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_api.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_api.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,7 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype from pandas._config.config import option_context import pandas as pd @@ -112,6 +113,7 @@ with pytest.raises(TypeError, match=msg): hash(empty_frame) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="surrogates not allowed") def test_column_name_contains_unicode_surrogate(self): # GH 25509 colname = "\ud83d" @@ -217,10 +219,8 @@ def test_deepcopy(self, float_frame): cp = deepcopy(float_frame) - series = cp["A"] - series[:] = 10 - for idx, value in series.items(): - assert float_frame["A"][idx] != value + cp.loc[0, "A"] = 10 + assert not float_frame.equals(cp) def test_inplace_return_self(self): # GH 1893 @@ -311,9 +311,22 @@ result = df.rename(columns=str) assert result.attrs == {"version": 1} + def test_attrs_deepcopy(self): + df = DataFrame({"A": [2, 3]}) + assert df.attrs == {} + df.attrs["tags"] = {"spam", "ham"} + + result = df.rename(columns=str) + assert result.attrs == df.attrs + assert result.attrs["tags"] is not df.attrs["tags"] + @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) def test_set_flags( - self, allows_duplicate_labels, frame_or_series, 
using_copy_on_write + self, + allows_duplicate_labels, + frame_or_series, + using_copy_on_write, + warn_copy_on_write, ): obj = DataFrame({"A": [1, 2]}) key = (0, 0) @@ -341,13 +354,15 @@ else: assert np.may_share_memory(obj["A"].values, result["A"].values) - result.iloc[key] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[key] = 0 if using_copy_on_write: assert obj.iloc[key] == 1 else: assert obj.iloc[key] == 0 # set back to 1 for test below - result.iloc[key] = 1 + with tm.assert_cow_warning(warn_copy_on_write): + result.iloc[key] = 1 # Now we do copy. result = obj.set_flags( diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_arithmetic.py pandas-2.2.2+dfsg/pandas/tests/frame/test_arithmetic.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_arithmetic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_arithmetic.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -22,20 +24,34 @@ ) import pandas._testing as tm from pandas.core.computation import expressions as expr -from pandas.core.computation.expressions import _MIN_ELEMENTS from pandas.tests.frame.common import ( _check_mixed_float, _check_mixed_int, ) -from pandas.util.version import Version -@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) -def switch_numexpr_min_elements(request): - _MIN_ELEMENTS = expr._MIN_ELEMENTS - expr._MIN_ELEMENTS = request.param - yield request.param - expr._MIN_ELEMENTS = _MIN_ELEMENTS +@pytest.fixture +def simple_frame(): + """ + Fixture for simple 3x3 DataFrame + + Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c']. 
+ + one two three + a 1.0 2.0 3.0 + b 4.0 5.0 6.0 + c 7.0 8.0 9.0 + """ + arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) + + return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"]) + + +@pytest.fixture(autouse=True, params=[0, 100], ids=["numexpr", "python"]) +def switch_numexpr_min_elements(request, monkeypatch): + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", request.param) + yield request.param class DummyElement: @@ -43,7 +59,7 @@ self.value = value self.dtype = np.dtype(dtype) - def __array__(self): + def __array__(self, dtype=None, copy=None): return np.array(self.value, dtype=self.dtype) def __str__(self) -> str: @@ -237,6 +253,9 @@ with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("nat"), df) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't compare string and int" + ) def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError @@ -418,8 +437,8 @@ def test_bool_flex_frame_object_dtype(self): # corner, dtype=object - df1 = DataFrame({"col": ["foo", np.nan, "bar"]}) - df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}) + df1 = DataFrame({"col": ["foo", np.nan, "bar"]}, dtype=object) + df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}, dtype=object) result = df1.ne(df2) exp = DataFrame({"col": [False, True, False]}) tm.assert_frame_equal(result, exp) @@ -500,34 +519,6 @@ result2 = df.floordiv(ser.values, axis=0) tm.assert_frame_equal(result2, expected) - @pytest.mark.parametrize("opname", ["floordiv", "pow"]) - def test_floordiv_axis0_numexpr_path(self, opname, request): - # case that goes through numexpr and has to fall back to masked_arith_op - ne = pytest.importorskip("numexpr") - if ( - Version(ne.__version__) >= Version("2.8.7") - and opname == "pow" - and "python" in request.node.callspec.id - ): - request.node.add_marker( - pytest.mark.xfail(reason="https://github.com/pydata/numexpr/issues/454") - ) - - op = getattr(operator, opname) - - arr = np.arange(_MIN_ELEMENTS + 100).reshape(_MIN_ELEMENTS // 100 + 1, -1) * 100 - df = DataFrame(arr) - df["C"] = 1.0 - - ser = df[0] - result = getattr(df, opname)(ser, axis=0) - - expected = DataFrame({col: op(df[col], ser) for col in df.columns}) - tm.assert_frame_equal(result, expected) - - result2 = getattr(df, opname)(ser.values, axis=0) - tm.assert_frame_equal(result2, expected) - def test_df_add_td64_columnwise(self): # GH 22534 Check that column-wise addition broadcasts correctly dti = pd.date_range("2016-01-01", periods=10) @@ -638,10 +629,12 @@ # corner cases result = float_frame.add(float_frame[:0]) - tm.assert_frame_equal(result, float_frame * np.nan) + expected = float_frame.sort_index() * np.nan + tm.assert_frame_equal(result, expected) result = float_frame[:0].add(float_frame) - tm.assert_frame_equal(result, float_frame * np.nan) + expected = float_frame.sort_index() * np.nan + tm.assert_frame_equal(result, expected) with pytest.raises(NotImplementedError, match="fill_value"): float_frame.add(float_frame.iloc[0], fill_value=3) @@ -720,8 +713,6 @@ df.columns = ["A", "A"] result = getattr(df, op)(df) tm.assert_frame_equal(result, expected) - str(result) - result.dtypes @pytest.mark.parametrize("level", [0, None]) def test_broadcast_multiindex(self, level): @@ -1045,6 +1036,7 @@ "bar": [pd.Timestamp("2018"), pd.Timestamp("2021")], }, columns=["foo", "bar"], + dtype="M8[ns]", ) df2 = df[["foo"]] @@ -1081,7 +1073,7 @@ ], ids=lambda x: x.__name__, ) - def 
test_binop_other(self, op, value, dtype, switch_numexpr_min_elements, request): + def test_binop_other(self, op, value, dtype, switch_numexpr_min_elements): skip = { (operator.truediv, "bool"), (operator.pow, "bool"), @@ -1224,7 +1216,7 @@ class TestFrameArithmeticUnsorted: def test_frame_add_tz_mismatch_converts_to_utc(self): - rng = pd.date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") + rng = pd.date_range("1/1/2011", periods=10, freq="h", tz="US/Eastern") df = DataFrame( np.random.default_rng(2).standard_normal(len(rng)), index=rng, columns=["a"] ) @@ -1237,7 +1229,7 @@ assert result.index.tz is timezone.utc def test_align_frame(self): - rng = pd.period_range("1/1/2000", "1/1/2010", freq="A") + rng = pd.period_range("1/1/2000", "1/1/2010", freq="Y") ts = DataFrame( np.random.default_rng(2).standard_normal((len(rng), 3)), index=rng ) @@ -1261,7 +1253,9 @@ # since filling converts dtypes from object, changed expected to be # object - filled = df.fillna(np.nan) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + filled = df.fillna(np.nan) result = op(df, 3) expected = op(filled, 3).astype(object) expected[pd.isna(expected)] = np.nan @@ -1272,10 +1266,14 @@ expected[pd.isna(expected)] = np.nan tm.assert_frame_equal(result, expected) - result = op(df, df.fillna(7)) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = op(df, df.fillna(7)) tm.assert_frame_equal(result, expected) - result = op(df.fillna(7), df) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = op(df.fillna(7), df) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)]) @@ -1532,8 +1530,12 @@ [operator.eq, operator.ne, operator.lt, operator.gt, operator.ge, operator.le], ) def test_comparisons(self, simple_frame, float_frame, func): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame() + df1 = DataFrame( + np.random.default_rng(2).standard_normal((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=pd.date_range("2000-01-01", periods=30, freq="B"), + ) + df2 = df1.copy() row = simple_frame.xs("a") ndim_5 = np.ones(df1.shape + (1, 1, 1)) @@ -1575,7 +1577,10 @@ f(df, 0) def test_comparison_protected_from_errstate(self): - missing_df = tm.makeDataFrame() + missing_df = DataFrame( + np.ones((10, 4), dtype=np.float64), + columns=Index(list("ABCD"), dtype=object), + ) missing_df.loc[missing_df.index[0], "A"] = np.nan with np.errstate(invalid="ignore"): expected = missing_df.values < 0 @@ -1931,20 +1936,6 @@ tm.assert_frame_equal(result, expected) -# TODO: move to tests.arithmetic and parametrize -def test_pow_nan_with_zero(): - left = DataFrame({"A": [np.nan, np.nan, np.nan]}) - right = DataFrame({"A": [0, 0, 0]}) - - expected = DataFrame({"A": [1.0, 1.0, 1.0]}) - - result = left**right - tm.assert_frame_equal(result, expected) - - result = left["A"] ** right["A"] - tm.assert_series_equal(result, expected["A"]) - - def test_dataframe_series_extension_dtypes(): # https://github.com/pandas-dev/pandas/issues/34311 df = DataFrame( @@ -1992,7 +1983,12 @@ "df, col_dtype", [ (DataFrame([[1.0, 2.0], [4.0, 5.0]], columns=list("ab")), "float64"), - (DataFrame([[1.0, "b"], [4.0, "b"]], columns=list("ab")), "object"), + ( + DataFrame([[1.0, "b"], [4.0, "b"]], columns=list("ab")).astype( + {"b": object} + ), + "object", + ), ], ) def test_dataframe_operation_with_non_numeric_types(df, 
col_dtype): @@ -2024,14 +2020,15 @@ to_add + df -def test_inplace_arithmetic_series_update(using_copy_on_write): +def test_inplace_arithmetic_series_update(using_copy_on_write, warn_copy_on_write): # https://github.com/pandas-dev/pandas/issues/36373 df = DataFrame({"A": [1, 2, 3]}) df_orig = df.copy() series = df["A"] vals = series._values - series += 1 + with tm.assert_cow_warning(warn_copy_on_write): + series += 1 if using_copy_on_write: assert series._values is not vals tm.assert_frame_equal(df, df_orig) @@ -2127,3 +2124,13 @@ expected = Series([True, True, True], name=Cols.col1) tm.assert_series_equal(result, expected) + + +def test_mixed_col_index_dtype(): + # GH 47382 + df1 = DataFrame(columns=list("abc"), data=1.0, index=[0]) + df2 = DataFrame(columns=list("abc"), data=0.0, index=[0]) + df1.columns = df2.columns.astype("string") + result = df1 + df2 + expected = DataFrame(columns=list("abc"), data=1.0, index=[0]) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_arrow_interface.py pandas-2.2.2+dfsg/pandas/tests/frame/test_arrow_interface.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_arrow_interface.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_arrow_interface.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,45 @@ +import ctypes + +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd + +pa = pytest.importorskip("pyarrow") + + +@td.skip_if_no("pyarrow", min_version="14.0") +def test_dataframe_arrow_interface(): + df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + capsule = df.__arrow_c_stream__() + assert ( + ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + table = pa.table(df) + expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + assert table.equals(expected) + + schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) + table = pa.table(df, schema=schema) + expected = expected.cast(schema) + assert table.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="15.0") +def test_dataframe_to_arrow(): + df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + table = pa.RecordBatchReader.from_stream(df).read_all() + expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + assert table.equals(expected) + + schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) + table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all() + expected = expected.cast(schema) + assert table.equals(expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_block_internals.py pandas-2.2.2+dfsg/pandas/tests/frame/test_block_internals.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_block_internals.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_block_internals.py 2024-04-10 17:42:52.000000000 +0000 @@ -50,11 +50,18 @@ assert dti[1] == ts def test_cast_internals(self, float_frame): - casted = DataFrame(float_frame._mgr, dtype=int) + msg = "Passing a BlockManager to DataFrame" + with tm.assert_produces_warning( + DeprecationWarning, match=msg, check_stacklevel=False + ): + casted = DataFrame(float_frame._mgr, dtype=int) expected = DataFrame(float_frame._series, dtype=int) tm.assert_frame_equal(casted, expected) - casted = DataFrame(float_frame._mgr, dtype=np.int32) + with tm.assert_produces_warning( + DeprecationWarning, match=msg, check_stacklevel=False + ): + casted = DataFrame(float_frame._mgr, dtype=np.int32) expected = 
DataFrame(float_frame._series, dtype=np.int32) tm.assert_frame_equal(casted, expected) @@ -176,7 +183,7 @@ ) tm.assert_series_equal(result, expected) - def test_construction_with_mixed(self, float_string_frame): + def test_construction_with_mixed(self, float_string_frame, using_infer_string): # test construction edge cases with mixed types # f7u12, this does not work without extensive workaround @@ -199,7 +206,7 @@ expected = Series( [np.dtype("float64")] * 4 + [ - np.dtype("object"), + np.dtype("object") if not using_infer_string else "string", np.dtype("datetime64[us]"), np.dtype("timedelta64[us]"), ], @@ -331,7 +338,7 @@ assert not float_frame._is_mixed_type assert float_string_frame._is_mixed_type - def test_stale_cached_series_bug_473(self, using_copy_on_write): + def test_stale_cached_series_bug_473(self, using_copy_on_write, warn_copy_on_write): # this is chained, but ok with option_context("chained_assignment", None): Y = DataFrame( @@ -341,10 +348,7 @@ ) repr(Y) Y["e"] = Y["e"].astype("object") - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - Y["g"]["c"] = np.nan - else: + with tm.raises_chained_assignment_error(): Y["g"]["c"] = np.nan repr(Y) Y.sum() @@ -354,13 +358,16 @@ else: assert pd.isna(Y["g"]["c"]) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_strange_column_corruption_issue(self, using_copy_on_write): # TODO(wesm): Unclear how exactly this is related to internal matters df = DataFrame(index=[0, 1]) df[0] = np.nan wasCol = {} - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): for i, dt in enumerate(df.index): for col in range(100, 200): if col not in wasCol: @@ -416,7 +423,8 @@ with tm.raises_chained_assignment_error(): df["a"].fillna(1, inplace=True) else: - df["a"].fillna(1, inplace=True) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["a"].fillna(1, inplace=True) # check we haven't put a Series into any block.values assert isinstance(df._mgr.blocks[0].values, Categorical) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/frame/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -21,7 +21,10 @@ import pytest import pytz +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib +from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -79,7 +82,7 @@ # with an array of strings each of which is e.g. 
"[0 1 2]" arr = np.arange(12).reshape(4, 3) df = DataFrame(arr, dtype=str) - expected = DataFrame(arr.astype(str)) + expected = DataFrame(arr.astype(str), dtype=object) tm.assert_frame_equal(df, expected) def test_constructor_from_2d_datetimearray(self, using_array_manager): @@ -261,8 +264,9 @@ result = DataFrame(emptylike) tm.assert_frame_equal(result, expected) - def test_constructor_mixed(self, float_string_frame): - assert float_string_frame["foo"].dtype == np.object_ + def test_constructor_mixed(self, float_string_frame, using_infer_string): + dtype = "string" if using_infer_string else np.object_ + assert float_string_frame["foo"].dtype == dtype def test_constructor_cast_failure(self): # as of 2.0, we raise if we can't respect "dtype", previously we @@ -293,23 +297,28 @@ new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 - def test_constructor_dtype_nocast_view_dataframe(self, using_copy_on_write): + def test_constructor_dtype_nocast_view_dataframe( + self, using_copy_on_write, warn_copy_on_write + ): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) if using_copy_on_write: should_be_view.iloc[0, 0] = 99 assert df.values[0, 0] == 1 else: - should_be_view[0][0] = 99 + with tm.assert_cow_warning(warn_copy_on_write): + should_be_view.iloc[0, 0] = 99 assert df.values[0, 0] == 99 def test_constructor_dtype_nocast_view_2d_array( - self, using_array_manager, using_copy_on_write + self, using_array_manager, using_copy_on_write, warn_copy_on_write ): df = DataFrame([[1, 2], [3, 4]], dtype="int64") if not using_array_manager and not using_copy_on_write: should_be_view = DataFrame(df.values, dtype=df[0].dtype) - should_be_view[0][0] = 97 + # TODO(CoW-warn) this should warn + # with tm.assert_cow_warning(warn_copy_on_write): + should_be_view.iloc[0, 0] = 97 assert df.values[0, 0] == 97 else: # INFO(ArrayManager) DataFrame(ndarray) doesn't necessarily preserve @@ -318,6 +327,7 @@ assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") @@ -325,6 +335,7 @@ assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") @@ -495,9 +506,11 @@ assert expected == list(df.columns) def test_constructor_dict(self): - datetime_series = tm.makeTimeSeries(nper=30) + datetime_series = Series( + np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + ) # test expects index shifted by 5 - datetime_series_short = tm.makeTimeSeries(nper=30)[5:] + datetime_series_short = datetime_series[5:] frame = DataFrame({"col1": datetime_series, "col2": datetime_series_short}) @@ -621,8 +634,10 @@ tm.assert_frame_equal(result, expected) def test_constructor_dict_order_insertion(self): - datetime_series = tm.makeTimeSeries(nper=30) - datetime_series_short = tm.makeTimeSeries(nper=25) + datetime_series = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + datetime_series_short = datetime_series[:5] # GH19018 # initialization ordering: by insertion order if python>= 3.6 @@ -692,12 +707,12 @@ arr = np.array([[4, 5, 6]]) msg = r"Shape of passed values is \(1, 
3\), indices imply \(1, 4\)" with pytest.raises(ValueError, match=msg): - DataFrame(index=[0], columns=range(0, 4), data=arr) + DataFrame(index=[0], columns=range(4), data=arr) arr = np.array([4, 5, 6]) msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)" with pytest.raises(ValueError, match=msg): - DataFrame(index=[0], columns=range(0, 4), data=arr) + DataFrame(index=[0], columns=range(4), data=arr) # higher dim raise exception with pytest.raises(ValueError, match="Must pass 2-d input"): @@ -764,7 +779,7 @@ ) tm.assert_numpy_array_equal(df.values, expected) - def test_constructor_dict_cast(self): + def test_constructor_dict_cast(self, using_infer_string): # cast float tests test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} frame = DataFrame(test_data, dtype=float) @@ -774,13 +789,13 @@ frame = DataFrame(test_data) assert len(frame) == 3 - assert frame["B"].dtype == np.object_ + assert frame["B"].dtype == np.object_ if not using_infer_string else "string" assert frame["A"].dtype == np.float64 def test_constructor_dict_cast2(self): # can't cast to float test_data = { - "A": dict(zip(range(20), tm.makeStringIndex(20))), + "A": dict(zip(range(20), [f"word_{i}" for i in range(20)])), "B": dict(zip(range(15), np.random.default_rng(2).standard_normal(15))), } with pytest.raises(ValueError, match="could not convert string"): @@ -1097,8 +1112,8 @@ mat2[0, 0] = 1 mat2[1, 2] = 2 frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) - assert 1 == frame["A"].view("i8")[1] - assert 2 == frame["C"].view("i8")[2] + assert 1 == frame["A"].astype("i8")[1] + assert 2 == frame["C"].astype("i8")[2] # masked bool promoted to object mat = ma.masked_all((2, 3), dtype=bool) @@ -1186,7 +1201,7 @@ df = DataFrame({"a": data}, dtype=input_dtype) assert df["a"].dtype == expected_dtype() - def test_constructor_scalar_inference(self): + def test_constructor_scalar_inference(self, using_infer_string): data = {"int": 1, "bool": True, "float": 3.0, "complex": 4j, "object": "foo"} df = DataFrame(data, index=np.arange(10)) @@ -1194,7 +1209,7 @@ assert df["bool"].dtype == np.bool_ assert df["float"].dtype == np.float64 assert df["complex"].dtype == np.complex128 - assert df["object"].dtype == np.object_ + assert df["object"].dtype == np.object_ if not using_infer_string else "string" def test_constructor_arrays_and_scalars(self): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10), "b": True}) @@ -1273,11 +1288,11 @@ df = DataFrame(empty_gen(), columns=["A", "B"]) tm.assert_frame_equal(df, expected) - def test_constructor_list_of_lists(self): + def test_constructor_list_of_lists(self, using_infer_string): # GH #484 df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"]) assert is_integer_dtype(df["num"]) - assert df["str"].dtype == np.object_ + assert df["str"].dtype == np.object_ if not using_infer_string else "string" # GH 4851 # list of 0-dim ndarrays @@ -1741,7 +1756,11 @@ index = list(float_frame.index[:5]) columns = list(float_frame.columns[:3]) - result = DataFrame(float_frame._mgr, index=index, columns=columns) + msg = "Passing a BlockManager to DataFrame" + with tm.assert_produces_warning( + DeprecationWarning, match=msg, check_stacklevel=False + ): + result = DataFrame(float_frame._mgr, index=index, columns=columns) tm.assert_index_equal(result.index, Index(index)) tm.assert_index_equal(result.columns, Index(columns)) @@ -1822,7 +1841,7 @@ with pytest.raises(TypeError, match=msg): DataFrame("a", [1, 2], ["a", "c"], float) - def 
test_constructor_with_datetimes(self): + def test_constructor_with_datetimes(self, using_infer_string): intname = np.dtype(int).name floatname = np.dtype(np.float64).name objectname = np.dtype(np.object_).name @@ -1841,7 +1860,7 @@ result = df.dtypes expected = Series( [np.dtype("int64")] - + [np.dtype(objectname)] * 2 + + [np.dtype(objectname) if not using_infer_string else "string"] * 2 + [np.dtype("M8[s]"), np.dtype("M8[us]")], index=list("ABCDE"), ) @@ -1863,7 +1882,7 @@ expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object")] + + [np.dtype("object") if not using_infer_string else "string"] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1885,7 +1904,7 @@ expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object")] + + [np.dtype("object") if not using_infer_string else "string"] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1922,13 +1941,13 @@ df = DataFrame({"End Date": dt}, index=[0]) assert df.iat[0, 0] == dt tm.assert_series_equal( - df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}) + df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}, dtype=object) ) df = DataFrame([{"End Date": dt}]) assert df.iat[0, 0] == dt tm.assert_series_equal( - df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) + df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}, dtype=object) ) def test_constructor_with_datetimes4(self): @@ -2053,7 +2072,7 @@ # dtype=exp_dtype. tm.assert_frame_equal(df, expected) - def test_constructor_for_list_with_dtypes(self): + def test_constructor_for_list_with_dtypes(self, using_infer_string): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) result = df.dtypes @@ -2104,7 +2123,7 @@ [ np.dtype("int64"), np.dtype("float64"), - np.dtype("object"), + np.dtype("object") if not using_infer_string else "string", np.dtype("datetime64[ns]"), np.dtype("float64"), ], @@ -2391,7 +2410,7 @@ def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 - ser = Series(range(0, 100)) + ser = Series(range(100)) ser1 = cut(ser, 10).value_counts().head(5) ser2 = cut(ser, 10).value_counts().tail(5) result = DataFrame({"1": ser1, "2": ser2}) @@ -2471,8 +2490,8 @@ [ ([1, 2]), (["1", "2"]), - (list(date_range("1/1/2011", periods=2, freq="H"))), - (list(date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + (list(date_range("1/1/2011", periods=2, freq="h"))), + (list(date_range("1/1/2011", periods=2, freq="h", tz="US/Eastern"))), ([Interval(left=0, right=5)]), ], ) @@ -2750,6 +2769,23 @@ df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 + def test_inference_on_pandas_objects(self): + # GH#56012 + idx = Index([Timestamp("2019-12-31")], dtype=object) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = DataFrame(idx, columns=["a"]) + assert result.dtypes.iloc[0] != np.object_ + result = DataFrame({"a": idx}) + assert result.dtypes.iloc[0] == np.object_ + + ser = Series([Timestamp("2019-12-31")], dtype=object) + + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = DataFrame(ser, columns=["a"]) + assert result.dtypes.iloc[0] != np.object_ + result = DataFrame({"a": ser}) + assert result.dtypes.iloc[0] == np.object_ + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): @@ -2822,7 
+2858,7 @@ ) result = DataFrame({key_val: [1, 2]}, columns=cols) expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols) - expected.iloc[:, 1] = expected.iloc[:, 1].astype(object) + expected.isetitem(1, expected.iloc[:, 1].astype(object)) tm.assert_frame_equal(result, expected) @@ -2968,7 +3004,9 @@ def test_frame_timeseries_column(self): # GH19157 - dr = date_range(start="20130101T10:00:00", periods=3, freq="T", tz="US/Eastern") + dr = date_range( + start="20130101T10:00:00", periods=3, freq="min", tz="US/Eastern" + ) result = DataFrame(dr, columns=["timestamps"]) expected = DataFrame( { @@ -3081,6 +3119,24 @@ with pytest.raises(ValueError, match="columns cannot be a set"): DataFrame(data, columns={"a", "b", "c"}) + # TODO: make this not cast to object in pandas 3.0 + @pytest.mark.skipif( + not np_version_gt2, reason="StringDType only available in numpy 2 and above" + ) + @pytest.mark.parametrize( + "data", + [ + {"a": ["a", "b", "c"], "b": [1.0, 2.0, 3.0], "c": ["d", "e", "f"]}, + ], + ) + def test_np_string_array_object_cast(self, data): + from numpy.dtypes import StringDType + + data["a"] = np.array(data["a"], dtype=StringDType()) + res = DataFrame(data) + assert res["a"].dtype == np.object_ + assert (res["a"] == data["a"]).all() + def get1(obj): # TODO: make a helper in tm? if isinstance(obj, Series): @@ -3146,9 +3202,9 @@ dtype = {np.datetime64: "m8[ns]", np.timedelta64: "M8[ns]"}[cls] if cls is np.datetime64: - msg1 = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" + msg1 = "Invalid type for timedelta scalar: " else: - msg1 = r"dtype timedelta64\[ns\] cannot be converted to datetime64\[ns\]" + msg1 = " is not convertible to datetime" msg = "|".join(["Cannot cast", msg1]) with pytest.raises(TypeError, match=msg): @@ -3169,7 +3225,7 @@ "non-nano, but DatetimeArray._from_sequence has not", strict=True, ) - request.node.add_marker(mark) + request.applymarker(mark) scalar = datetime(9999, 1, 1) exp_dtype = "M8[us]" # pydatetime objects default to this reso @@ -3186,6 +3242,7 @@ assert item.asm8.dtype == exp_dtype assert dtype == exp_dtype + @pytest.mark.skip_ubsan def test_out_of_s_bounds_datetime64(self, constructor): scalar = np.datetime64(np.iinfo(np.int64).max, "D") result = constructor(scalar) @@ -3205,7 +3262,7 @@ "to non-nano, but TimedeltaArray._from_sequence has not", strict=True, ) - request.node.add_marker(mark) + request.applymarker(mark) scalar = datetime(9999, 1, 1) - datetime(1970, 1, 1) exp_dtype = "m8[us]" # smallest reso that fits @@ -3221,6 +3278,7 @@ assert item.asm8.dtype == exp_dtype assert dtype == exp_dtype + @pytest.mark.skip_ubsan @pytest.mark.parametrize("cls", [np.datetime64, np.timedelta64]) def test_out_of_s_bounds_timedelta64(self, constructor, cls): scalar = cls(np.iinfo(np.int64).max, "D") diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_iteration.py pandas-2.2.2+dfsg/pandas/tests/frame/test_iteration.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_iteration.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_iteration.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,7 @@ import datetime import numpy as np +import pytest from pandas.compat import ( IS64, @@ -39,7 +40,7 @@ assert v.name == k def test_iter(self, float_frame): - assert tm.equalContents(list(float_frame), float_frame.columns) + assert list(float_frame) == list(float_frame.columns) def test_iterrows(self, float_frame, float_string_frame): for k, v in float_frame.iterrows(): @@ -55,7 +56,7 @@ s = DataFrame( { "non_iso8601": 
["M1701", "M1802", "M1903", "M2004"], - "iso8601": date_range("2000-01-01", periods=4, freq="M"), + "iso8601": date_range("2000-01-01", periods=4, freq="ME"), } ) for k, v in s.iterrows(): @@ -91,6 +92,7 @@ expected = float_frame.iloc[i, :].reset_index(drop=True) tm.assert_series_equal(ser, expected) + def test_itertuples_index_false(self): df = DataFrame( {"floats": np.random.default_rng(2).standard_normal(5), "ints": range(5)}, columns=["floats", "ints"], @@ -99,6 +101,7 @@ for tup in df.itertuples(index=False): assert isinstance(tup[1], int) + def test_itertuples_duplicate_cols(self): df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]}) dfaa = df[["a", "a"]] @@ -111,32 +114,27 @@ == "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]" ) + def test_itertuples_tuple_name(self): + df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]}) tup = next(df.itertuples(name="TestName")) assert tup._fields == ("Index", "a", "b") assert (tup.Index, tup.a, tup.b) == tup assert type(tup).__name__ == "TestName" - df.columns = ["def", "return"] + def test_itertuples_disallowed_col_labels(self): + df = DataFrame(data={"def": [1, 2, 3], "return": [4, 5, 6]}) tup2 = next(df.itertuples(name="TestName")) assert tup2 == (0, 1, 4) assert tup2._fields == ("Index", "_1", "_2") - df3 = DataFrame({"f" + str(i): [i] for i in range(1024)}) - # will raise SyntaxError if trying to create namedtuple - tup3 = next(df3.itertuples()) - assert isinstance(tup3, tuple) - assert hasattr(tup3, "_fields") - + @pytest.mark.parametrize("limit", [254, 255, 1024]) + @pytest.mark.parametrize("index", [True, False]) + def test_itertuples_py2_3_field_limit_namedtuple(self, limit, index): # GH#28282 - df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}]) - result_254_columns = next(df_254_columns.itertuples(index=False)) - assert isinstance(result_254_columns, tuple) - assert hasattr(result_254_columns, "_fields") - - df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}]) - result_255_columns = next(df_255_columns.itertuples(index=False)) - assert isinstance(result_255_columns, tuple) - assert hasattr(result_255_columns, "_fields") + df = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(limit)}]) + result = next(df.itertuples(index=index)) + assert isinstance(result, tuple) + assert hasattr(result, "_fields") def test_sequence_like_with_categorical(self): # GH#7839 diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_logical_ops.py pandas-2.2.2+dfsg/pandas/tests/frame/test_logical_ops.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_logical_ops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_logical_ops.py 2024-04-10 17:42:52.000000000 +0000 @@ -96,7 +96,7 @@ res_ser = df1a_int["A"] | df1a_bool["A"] tm.assert_series_equal(res_ser, df1a_bool["A"]) - def test_logical_ops_invalid(self): + def test_logical_ops_invalid(self, using_infer_string): # GH#5808 df1 = DataFrame(1.0, index=[1], columns=["A"]) @@ -108,8 +108,14 @@ df1 = DataFrame("foo", index=[1], columns=["A"]) df2 = DataFrame(True, index=[1], columns=["A"]) msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'") - with pytest.raises(TypeError, match=msg): - df1 | df2 + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="|has no kernel"): + df1 | df2 + else: + with pytest.raises(TypeError, match=msg): + df1 | df2 def test_logical_operators(self): def _check_bin_op(op): @@ -151,6 +157,7 @@ _check_unary_op(operator.inv) # TODO: belongs elsewhere + 
@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") def test_logical_with_nas(self): d = DataFrame({"a": [np.nan, False], "b": [True, True]}) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_nonunique_indexes.py pandas-2.2.2+dfsg/pandas/tests/frame/test_nonunique_indexes.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_nonunique_indexes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_nonunique_indexes.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,13 +10,6 @@ import pandas._testing as tm -def check(result, expected=None): - if expected is not None: - tm.assert_frame_equal(result, expected) - result.dtypes - str(result) - - class TestDataFrameNonuniqueIndexes: def test_setattr_columns_vs_construct_with_columns(self): # assignment @@ -26,16 +19,16 @@ df = DataFrame(arr, columns=["A", "A"]) df.columns = idx expected = DataFrame(arr, columns=idx) - check(df, expected) + tm.assert_frame_equal(df, expected) def test_setattr_columns_vs_construct_with_columns_datetimeindx(self): - idx = date_range("20130101", periods=4, freq="Q-NOV") + idx = date_range("20130101", periods=4, freq="QE-NOV") df = DataFrame( [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"] ) df.columns = idx expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx) - check(df, expected) + tm.assert_frame_equal(df, expected) def test_insert_with_duplicate_columns(self): # insert @@ -48,7 +41,7 @@ [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]], columns=["foo", "bar", "foo", "hello", "string"], ) - check(df, expected) + tm.assert_frame_equal(df, expected) with pytest.raises(ValueError, match="Length of value"): df.insert(0, "AnotherColumn", range(len(df.index) - 1)) @@ -58,7 +51,7 @@ [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]], columns=["foo", "bar", "foo", "hello", "string", "foo2"], ) - check(df, expected) + tm.assert_frame_equal(df, expected) # set (non-dup) df["foo2"] = 4 @@ -66,7 +59,7 @@ [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]], columns=["foo", "bar", "foo", "hello", "string", "foo2"], ) - check(df, expected) + tm.assert_frame_equal(df, expected) df["foo2"] = 3 # delete (non dup) @@ -75,7 +68,7 @@ [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]], columns=["foo", "foo", "hello", "string", "foo2"], ) - check(df, expected) + tm.assert_frame_equal(df, expected) # try to delete again (its not consolidated) del df["hello"] @@ -83,7 +76,7 @@ [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]], columns=["foo", "foo", "string", "foo2"], ) - check(df, expected) + tm.assert_frame_equal(df, expected) # consolidate df = df._consolidate() @@ -91,7 +84,7 @@ [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]], columns=["foo", "foo", "string", "foo2"], ) - check(df, expected) + tm.assert_frame_equal(df, expected) # insert df.insert(2, "new_col", 5.0) @@ -99,7 +92,7 @@ [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]], columns=["foo", "foo", "new_col", "string", "foo2"], ) - check(df, expected) + tm.assert_frame_equal(df, expected) # insert a dup with pytest.raises(ValueError, match="cannot insert"): @@ -114,7 +107,7 @@ ], columns=["foo", "foo", "new_col", "new_col", "string", "foo2"], ) - check(df, expected) + tm.assert_frame_equal(df, expected) # delete (dup) del df["foo"] @@ -130,18 +123,17 @@ [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]], columns=["foo", "bar", "foo", "hello"], ) - check(df) df["foo2"] = 7.0 
expected = DataFrame( [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]], columns=["foo", "bar", "foo", "hello", "foo2"], ) - check(df, expected) + tm.assert_frame_equal(df, expected) result = df["foo"] expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"]) - check(result, expected) + tm.assert_frame_equal(result, expected) # multiple replacements df["foo"] = "string" @@ -153,13 +145,13 @@ ], columns=["foo", "bar", "foo", "hello", "foo2"], ) - check(df, expected) + tm.assert_frame_equal(df, expected) del df["foo"] expected = DataFrame( [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"] ) - check(df, expected) + tm.assert_frame_equal(df, expected) def test_column_dups_indexes(self): # check column dups with index equal and not equal to df's index @@ -176,7 +168,7 @@ columns=["A", "B", "A"], ) this_df["A"] = index - check(this_df, expected_df) + tm.assert_frame_equal(this_df, expected_df) def test_changing_dtypes_with_duplicate_columns(self): # multiple assignments that change dtypes @@ -188,7 +180,7 @@ expected = DataFrame(1.0, index=range(5), columns=["that", "that"]) df["that"] = 1.0 - check(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame( np.random.default_rng(2).random((5, 2)), columns=["that", "that"] @@ -196,7 +188,7 @@ expected = DataFrame(1, index=range(5), columns=["that", "that"]) df["that"] = 1 - check(df, expected) + tm.assert_frame_equal(df, expected) def test_dup_columns_comparisons(self): # equality @@ -231,7 +223,7 @@ ) expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1) result = dfbool[["one", "three", "one"]] - check(result, expected) + tm.assert_frame_equal(result, expected) def test_multi_axis_dups(self): # multi-axis dups @@ -251,7 +243,7 @@ ) z = df[["A", "C", "A"]] result = z.loc[["a", "c", "a"]] - check(result, expected) + tm.assert_frame_equal(result, expected) def test_columns_with_dups(self): # GH 3468 related @@ -259,13 +251,11 @@ # basic df = DataFrame([[1, 2]], columns=["a", "a"]) df.columns = ["a", "a.1"] - str(df) expected = DataFrame([[1, 2]], columns=["a", "a.1"]) tm.assert_frame_equal(df, expected) df = DataFrame([[1, 2, 3]], columns=["b", "a", "a"]) df.columns = ["b", "a", "a.1"] - str(df) expected = DataFrame([[1, 2, 3]], columns=["b", "a", "a.1"]) tm.assert_frame_equal(df, expected) @@ -273,7 +263,6 @@ # with a dup index df = DataFrame([[1, 2]], columns=["a", "a"]) df.columns = ["b", "b"] - str(df) expected = DataFrame([[1, 2]], columns=["b", "b"]) tm.assert_frame_equal(df, expected) @@ -284,7 +273,6 @@ columns=["a", "a", "b", "b", "d", "c", "c"], ) df.columns = list("ABCDEFG") - str(df) expected = DataFrame( [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("ABCDEFG") ) @@ -293,7 +281,6 @@ def test_multi_dtype2(self): df = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a", "a", "a"]) df.columns = ["a", "a.1", "a.2", "a.3"] - str(df) expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"]) tm.assert_frame_equal(df, expected) @@ -335,7 +322,7 @@ df = DataFrame(np.arange(9).reshape(3, 3).T) df.columns = list("AAA") - expected = df.iloc[:, 2] + expected = df.iloc[:, 2].copy() with tm.assert_produces_warning(warn, match=msg): df.iloc[:, 0] = 3 @@ -343,7 +330,7 @@ df = DataFrame(np.arange(9).reshape(3, 3).T) df.columns = [2, float(2), str(2)] - expected = df.iloc[:, 1] + expected = df.iloc[:, 1].copy() with tm.assert_produces_warning(warn, match=msg): df.iloc[:, 0] = 3 diff -Nru 
pandas-2.1.4+dfsg/pandas/tests/frame/test_query_eval.py pandas-2.2.2+dfsg/pandas/tests/frame/test_query_eval.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_query_eval.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_query_eval.py 2024-04-10 17:42:52.000000000 +0000 @@ -27,7 +27,8 @@ @pytest.fixture( - params=["python", pytest.param("numexpr", marks=td.skip_if_no_ne)], ids=lambda x: x + params=["python", pytest.param("numexpr", marks=td.skip_if_no("numexpr"))], + ids=lambda x: x, ) def engine(request): return request.param @@ -35,7 +36,7 @@ def skip_if_no_pandas_parser(parser): if parser != "pandas": - pytest.skip(f"cannot evaluate with parser {repr(parser)}") + pytest.skip(f"cannot evaluate with parser={parser}") class TestCompat: @@ -359,8 +360,11 @@ tm.assert_frame_equal(res, exp) def test_query_multiindex_get_index_resolvers(self): - df = tm.makeCustomDataframe( - 10, 3, r_idx_nlevels=2, r_idx_names=["spam", "eggs"] + df = DataFrame( + np.ones((10, 3)), + index=MultiIndex.from_arrays( + [range(10) for _ in range(2)], names=["spam", "eggs"] + ), ) resolvers = df._get_index_resolvers() @@ -376,7 +380,7 @@ "columns": col_series, "spam": to_series(df.index, "spam"), "eggs": to_series(df.index, "eggs"), - "C0": col_series, + "clevel_0": col_series, } for k, v in resolvers.items(): if isinstance(v, Index): @@ -387,7 +391,7 @@ raise AssertionError("object must be a Series or Index") -@td.skip_if_no_ne +@td.skip_if_no("numexpr") class TestDataFrameQueryNumExprPandas: @pytest.fixture def engine(self): @@ -765,7 +769,7 @@ tm.assert_frame_equal(result, expected) -@td.skip_if_no_ne +@td.skip_if_no("numexpr") class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): @pytest.fixture def engine(self): @@ -1031,7 +1035,7 @@ with pytest.raises(NotImplementedError, match=msg): df.query("a in b and c < d", parser=parser, engine=engine) - def test_object_array_eq_ne(self, parser, engine): + def test_object_array_eq_ne(self, parser, engine, using_infer_string): df = DataFrame( { "a": list("aaaabbbbcccc"), @@ -1040,11 +1044,14 @@ "d": np.random.default_rng(2).integers(9, size=12), } ) - res = df.query("a == b", parser=parser, engine=engine) + warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None + with tm.assert_produces_warning(warning): + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] tm.assert_frame_equal(res, exp) - res = df.query("a != b", parser=parser, engine=engine) + with tm.assert_produces_warning(warning): + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] tm.assert_frame_equal(res, exp) @@ -1083,12 +1090,16 @@ [">=", operator.ge], ], ) - def test_query_lex_compare_strings(self, parser, engine, op, func): + def test_query_lex_compare_strings( + self, parser, engine, op, func, using_infer_string + ): a = Series(np.random.default_rng(2).choice(list("abcde"), 20)) b = Series(np.arange(a.size)) df = DataFrame({"X": a, "Y": b}) - res = df.query(f'X {op} "d"', engine=engine, parser=parser) + warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None + with tm.assert_produces_warning(warning): + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) @@ -1162,7 +1173,7 @@ @pytest.mark.parametrize("op", ["+", "-", "*", "/"]) def test_invalid_type_for_operator_raises(self, parser, engine, op): df = DataFrame({"a": [1, 2], "b": ["c", "d"]}) - msg = r"unsupported operand 
type\(s\) for .+: '.+' and '.+'" + msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'|Cannot" with pytest.raises(TypeError, match=msg): df.eval(f"a {op} b", engine=engine, parser=parser) @@ -1404,3 +1415,11 @@ } ) tm.assert_frame_equal(result, expected) + + def test_all_nat_in_object(self): + # GH#57068 + now = pd.Timestamp.now("UTC") # noqa: F841 + df = DataFrame({"a": pd.to_datetime([None, None], utc=True)}, dtype=object) + result = df.query("a > @now") + expected = DataFrame({"a": []}, dtype=object) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_reductions.py pandas-2.2.2+dfsg/pandas/tests/frame/test_reductions.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_reductions.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_reductions.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import ( IS64, is_platform_windows, @@ -18,7 +20,10 @@ Categorical, CategoricalDtype, DataFrame, + DatetimeIndex, Index, + PeriodIndex, + RangeIndex, Series, Timestamp, date_range, @@ -37,6 +42,38 @@ is_windows_or_is32 = is_platform_windows() or not IS64 +def make_skipna_wrapper(alternative, skipna_alternative=None): + """ + Create a function for calling on an array. + + Parameters + ---------- + alternative : function + The function to be called on the array with no NaNs. + Only used when 'skipna_alternative' is None. + skipna_alternative : function + The function to be called on the original array + + Returns + ------- + function + """ + if skipna_alternative: + + def skipna_wrapper(x): + return skipna_alternative(x.values) + + else: + + def skipna_wrapper(x): + nona = x.dropna() + if len(nona) == 0: + return np.nan + return alternative(nona) + + return skipna_wrapper + + def assert_stat_op_calc( opname, alternative, @@ -93,7 +130,7 @@ def wrapper(x): return alternative(x.values) - skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative) + skipna_wrapper = make_skipna_wrapper(alternative, skipna_alternative) result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) tm.assert_series_equal( @@ -147,6 +184,45 @@ tm.assert_series_equal(r1, expected) +@pytest.fixture +def bool_frame_with_na(): + """ + Fixture for DataFrame of booleans with index of unique strings + + Columns are ['A', 'B', 'C', 'D']; some entries are missing + """ + df = DataFrame( + np.concatenate( + [np.ones((15, 4), dtype=bool), np.zeros((15, 4), dtype=bool)], axis=0 + ), + index=Index([f"foo_{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD"), dtype=object), + dtype=object, + ) + # set some NAs + df.iloc[5:10] = np.nan + df.iloc[15:20, -2:] = np.nan + return df + + +@pytest.fixture +def float_frame_with_na(): + """ + Fixture for DataFrame of floats with index of unique strings + + Columns are ['A', 'B', 'C', 'D']; some entries are missing + """ + df = DataFrame( + np.random.default_rng(2).standard_normal((30, 4)), + index=Index([f"foo_{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD"), dtype=object), + ) + # set some NAs + df.iloc[5:10] = np.nan + df.iloc[15:20, -2:] = np.nan + return df + + class TestDataFrameAnalytics: # --------------------------------------------------------------------- # Reductions @@ -165,15 +241,21 @@ "var", "std", "sem", - pytest.param("skew", marks=td.skip_if_no_scipy), - pytest.param("kurt", marks=td.skip_if_no_scipy), + pytest.param("skew", 
marks=td.skip_if_no("scipy")), + pytest.param("kurt", marks=td.skip_if_no("scipy")), ], ) - def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): - if (opname in ("sum", "min", "max") and axis == 0) or opname in ( - "count", - "nunique", - ): + def test_stat_op_api_float_string_frame( + self, float_string_frame, axis, opname, using_infer_string + ): + if ( + (opname in ("sum", "min", "max") and axis == 0) + or opname + in ( + "count", + "nunique", + ) + ) and not (using_infer_string and opname == "sum"): getattr(float_string_frame, opname)(axis=axis) else: if opname in ["var", "std", "sem", "skew", "kurt"]: @@ -199,7 +281,11 @@ elif opname in ["min", "max"]: msg = "'[><]=' not supported between instances of 'float' and 'str'" elif opname == "median": - msg = re.compile(r"Cannot convert \[.*\] to numeric", flags=re.S) + msg = re.compile( + r"Cannot convert \[.*\] to numeric|does not support", flags=re.S + ) + if not isinstance(msg, re.Pattern): + msg = msg + "|does not support" with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -219,8 +305,8 @@ "var", "std", "sem", - pytest.param("skew", marks=td.skip_if_no_scipy), - pytest.param("kurt", marks=td.skip_if_no_scipy), + pytest.param("skew", marks=td.skip_if_no("scipy")), + pytest.param("kurt", marks=td.skip_if_no("scipy")), ], ) def test_stat_op_api_float_frame(self, float_frame, axis, opname): @@ -360,6 +446,7 @@ "Could not convert", "could not convert", "can't multiply sequence by non-int", + "does not support", ] ) with pytest.raises(TypeError, match=msg): @@ -371,11 +458,15 @@ "Could not convert", "could not convert", "can't multiply sequence by non-int", + "does not support", ] ) with pytest.raises(TypeError, match=msg): getattr(df, op)() + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="sum doesn't work for arrow strings" + ) def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame( @@ -442,7 +533,9 @@ df = DataFrame(d) - with pytest.raises(TypeError, match="unsupported operand type"): + with pytest.raises( + TypeError, match="unsupported operand type|does not support" + ): df.mean() result = df[["A", "C"]].mean() expected = Series([2.7, 681.6], index=["A", "C"], dtype=object) @@ -526,7 +619,7 @@ "C": [1.0], "D": ["a"], "E": Categorical(["a"], categories=["a"]), - "F": to_datetime(["2000-1-2"]), + "F": DatetimeIndex(["2000-01-02"], dtype="M8[ns]"), "G": to_timedelta(["1 days"]), }, ), @@ -538,7 +631,7 @@ "C": [np.nan], "D": np.array([np.nan], dtype=object), "E": Categorical([np.nan], categories=["a"]), - "F": [pd.NaT], + "F": DatetimeIndex([pd.NaT], dtype="M8[ns]"), "G": to_timedelta([pd.NaT]), }, ), @@ -549,7 +642,9 @@ "I": [8, 9, np.nan, np.nan], "J": [1, np.nan, np.nan, np.nan], "K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]), - "L": to_datetime(["2000-1-2", "NaT", "NaT", "NaT"]), + "L": DatetimeIndex( + ["2000-01-02", "NaT", "NaT", "NaT"], dtype="M8[ns]" + ), "M": to_timedelta(["1 days", "nan", "nan", "nan"]), "N": [0, 1, 2, 3], }, @@ -561,7 +656,9 @@ "I": [8, 9, np.nan, np.nan], "J": [1, np.nan, np.nan, np.nan], "K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]), - "L": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), + "L": DatetimeIndex( + ["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]" + ), "M": to_timedelta(["nan", "1 days", "nan", "nan"]), "N": [0, 1, 2, 3], }, @@ -574,15 +671,17 @@ "A": [12, 12, 19, 11], "B": [10, 10, np.nan, 3], "C": [1, np.nan, np.nan, np.nan], - "D": [np.nan, np.nan, 
"a", np.nan], + "D": Series([np.nan, np.nan, "a", np.nan], dtype=object), "E": Categorical([np.nan, np.nan, "a", np.nan]), - "F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), + "F": DatetimeIndex(["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"), "G": to_timedelta(["1 days", "nan", "nan", "nan"]), "H": [8, 8, 9, 9], "I": [9, 9, 8, 8], "J": [1, 1, np.nan, np.nan], "K": Categorical(["a", np.nan, "a", np.nan]), - "L": to_datetime(["2000-1-2", "2000-1-2", "NaT", "NaT"]), + "L": DatetimeIndex( + ["2000-01-02", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]" + ), "M": to_timedelta(["1 days", "nan", "1 days", "nan"]), "N": np.arange(4, dtype="int64"), } @@ -592,14 +691,15 @@ expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - def test_mode_sortwarning(self): + def test_mode_sortwarning(self, using_infer_string): # Check for the warning that is raised when the mode # results cannot be sorted df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - with tm.assert_produces_warning(UserWarning): + warning = None if using_infer_string else UserWarning + with tm.assert_produces_warning(warning): result = df.mode(dropna=False) result = result.sort_values(by="A").reset_index(drop=True) @@ -718,7 +818,7 @@ "values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]] ) def test_std_datetime64_with_nat( - self, values, skipna, using_array_manager, request + self, values, skipna, using_array_manager, request, unit ): # GH#51335 if using_array_manager and ( @@ -727,14 +827,15 @@ mark = pytest.mark.xfail( reason="GH#51446: Incorrect type inference on NaT in reduction result" ) - request.node.add_marker(mark) - df = DataFrame({"a": to_datetime(values)}) + request.applymarker(mark) + dti = to_datetime(values).as_unit(unit) + df = DataFrame({"a": dti}) result = df.std(skipna=skipna) if not skipna or all(value is pd.NaT for value in values): - expected = Series({"a": pd.NaT}, dtype="timedelta64[ns]") + expected = Series({"a": pd.NaT}, dtype=f"timedelta64[{unit}]") else: # 86400000000000ns == 1 day - expected = Series({"a": 86400000000000}, dtype="timedelta64[ns]") + expected = Series({"a": 86400000000000}, dtype=f"timedelta64[{unit}]") tm.assert_series_equal(result, expected) def test_sum_corner(self): @@ -750,15 +851,15 @@ @pytest.mark.parametrize( "index", [ - tm.makeRangeIndex(0), - tm.makeDateIndex(0), - tm.makeNumericIndex(0, dtype=int), - tm.makeNumericIndex(0, dtype=float), - tm.makeDateIndex(0, freq="M"), - tm.makePeriodIndex(0), + RangeIndex(0), + DatetimeIndex([]), + Index([], dtype=np.int64), + Index([], dtype=np.float64), + DatetimeIndex([], freq="ME"), + PeriodIndex([], freq="D"), ], ) - def test_axis_1_empty(self, all_reductions, index, using_array_manager): + def test_axis_1_empty(self, all_reductions, index): df = DataFrame(columns=["a"], index=index) result = getattr(df, all_reductions)(axis=1) if all_reductions in ("any", "all"): @@ -888,7 +989,8 @@ def test_mean_corner(self, float_frame, float_string_frame): # unit test when have object data - with pytest.raises(TypeError, match="Could not convert"): + msg = "Could not convert|does not support" + with pytest.raises(TypeError, match=msg): float_string_frame.mean(axis=0) # xs sum mixed type, just want to know it works... 
@@ -910,7 +1012,7 @@ "A": np.arange(3), "B": date_range("2016-01-01", periods=3), "C": pd.timedelta_range("1D", periods=3), - "D": pd.period_range("2016", periods=3, freq="A"), + "D": pd.period_range("2016", periods=3, freq="Y"), } ) result = df.mean(numeric_only=True) @@ -935,7 +1037,7 @@ tm.assert_series_equal(result, expected) # mean of period is not allowed - df["D"] = pd.period_range("2016", periods=3, freq="A") + df["D"] = pd.period_range("2016", periods=3, freq="Y") with pytest.raises(TypeError, match="mean is not implemented for Period"): df.mean(numeric_only=False) @@ -1179,6 +1281,7 @@ def test_any_all_bool_with_na(self, opname, axis, bool_frame_with_na): getattr(bool_frame_with_na, opname)(axis=axis, bool_only=False) + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("opname", ["any", "all"]) def test_any_all_bool_frame(self, opname, bool_frame_with_na): # GH#12863: numpy gives back non-boolean data for object type @@ -1259,7 +1362,9 @@ @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) - def test_any_all_object_dtype(self, axis, bool_agg_func, skipna): + def test_any_all_object_dtype( + self, axis, bool_agg_func, skipna, using_infer_string + ): # GH#35450 df = DataFrame( data=[ @@ -1269,8 +1374,13 @@ [np.nan, np.nan, "5", np.nan], ] ) + if using_infer_string: + # na in object is True while in string pyarrow numpy it's false + val = not axis == 0 and not skipna and bool_agg_func == "all" + else: + val = True result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna) - expected = Series([True, True, True, True]) + expected = Series([True, True, val, True]) tm.assert_series_equal(result, expected) # GH#50947 deprecates this but it is not emitting a warning in some builds. 
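A rough stand-alone equivalent of the using_infer_string / using_pyarrow_string_dtype() switches threaded through these tests — a sketch only, assuming pyarrow is installed:

    import pandas as pd

    # Global opt-in to pyarrow-backed string inference; this is the mode the
    # using_infer_string fixture parametrizes over.
    pd.set_option("future.infer_string", True)

    s = pd.Series(["a", "b", None])
    print(s.dtype)  # string[pyarrow_numpy] rather than object
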
@@ -1296,7 +1406,8 @@ def test_any_all_bool_only(self): # GH 25101 df = DataFrame( - {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]} + {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]}, + columns=Index(["col1", "col2", "col3"], dtype=object), ) result = df.all(bool_only=True) @@ -1591,7 +1702,7 @@ self, request, frame_or_series, all_reductions ): if all_reductions == "count": - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="Count does not accept skipna") ) obj = frame_or_series([1, 2, 3]) @@ -1821,7 +1932,7 @@ mark = pytest.mark.xfail( reason="Incorrect type inference on NaT in reduction result" ) - request.node.add_marker(mark) + request.applymarker(mark) arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2) arr[-1, -1] = "Nat" @@ -1849,6 +1960,9 @@ tm.assert_series_equal(result, expected) +@pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="sum doesn't work with arrow strings" +) def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) @@ -1869,7 +1983,7 @@ expected = Series( [getattr(int64_info, method)], dtype="Int64", - index=Index(["Int64"], dtype="object"), + index=Index(["Int64"]), ) tm.assert_series_equal(result, expected) @@ -1887,7 +2001,7 @@ df = DataFrame([1, "a", True]) result = df.prod(axis=0, min_count=1, numeric_only=False) - expected = Series(["a"]) + expected = Series(["a"], dtype=object) tm.assert_series_equal(result, expected) msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'") diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_repr.py pandas-2.2.2+dfsg/pandas/tests/frame/test_repr.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_repr.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_repr.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,521 @@ +from datetime import ( + datetime, + timedelta, +) +from io import StringIO + +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +from pandas import ( + NA, + Categorical, + CategoricalIndex, + DataFrame, + IntervalIndex, + MultiIndex, + NaT, + PeriodIndex, + Series, + Timestamp, + date_range, + option_context, + period_range, +) +import pandas._testing as tm + + +class TestDataFrameRepr: + def test_repr_should_return_str(self): + # https://docs.python.org/3/reference/datamodel.html#object.__repr__ + # "...The return value must be a string object." 
+ + # (str on py2.x, str (unicode) on py3) + + data = [8, 5, 3, 5] + index1 = ["\u03c3", "\u03c4", "\u03c5", "\u03c6"] + cols = ["\u03c8"] + df = DataFrame(data, columns=cols, index=index1) + assert type(df.__repr__()) is str # noqa: E721 + + ser = df[cols[0]] + assert type(ser.__repr__()) is str # noqa: E721 + + def test_repr_bytes_61_lines(self): + # GH#12857 + lets = list("ACDEFGHIJKLMNOP") + words = np.random.default_rng(2).choice(lets, (1000, 50)) + df = DataFrame(words).astype("U1") + assert (df.dtypes == object).all() + + # smoke tests; at one point this raised with 61 but not 60 + repr(df) + repr(df.iloc[:60, :]) + repr(df.iloc[:61, :]) + + def test_repr_unicode_level_names(self, frame_or_series): + index = MultiIndex.from_tuples([(0, 0), (1, 1)], names=["\u0394", "i1"]) + + obj = DataFrame(np.random.default_rng(2).standard_normal((2, 4)), index=index) + obj = tm.get_obj(obj, frame_or_series) + repr(obj) + + def test_assign_index_sequences(self): + # GH#2200 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index( + ["a", "b"] + ) + index = list(df.index) + index[0] = ("faz", "boo") + df.index = index + repr(df) + + # this travels an improper code path + index[0] = ["faz", "boo"] + df.index = index + repr(df) + + def test_repr_with_mi_nat(self): + df = DataFrame({"X": [1, 2]}, index=[[NaT, Timestamp("20130101")], ["a", "b"]]) + result = repr(df) + expected = " X\nNaT a 1\n2013-01-01 b 2" + assert result == expected + + def test_repr_with_different_nulls(self): + # GH45263 + df = DataFrame([1, 2, 3, 4], [True, None, np.nan, NaT]) + result = repr(df) + expected = """ 0 +True 1 +None 2 +NaN 3 +NaT 4""" + assert result == expected + + def test_repr_with_different_nulls_cols(self): + # GH45263 + d = {np.nan: [1, 2], None: [3, 4], NaT: [6, 7], True: [8, 9]} + df = DataFrame(data=d) + result = repr(df) + expected = """ NaN None NaT True +0 1 3 6 8 +1 2 4 7 9""" + assert result == expected + + def test_multiindex_na_repr(self): + # only an issue with long columns + df3 = DataFrame( + { + "A" * 30: {("A", "A0006000", "nuit"): "A0006000"}, + "B" * 30: {("A", "A0006000", "nuit"): np.nan}, + "C" * 30: {("A", "A0006000", "nuit"): np.nan}, + "D" * 30: {("A", "A0006000", "nuit"): np.nan}, + "E" * 30: {("A", "A0006000", "nuit"): "A"}, + "F" * 30: {("A", "A0006000", "nuit"): np.nan}, + } + ) + + idf = df3.set_index(["A" * 30, "C" * 30]) + repr(idf) + + def test_repr_name_coincide(self): + index = MultiIndex.from_tuples( + [("a", 0, "foo"), ("b", 1, "bar")], names=["a", "b", "c"] + ) + + df = DataFrame({"value": [0, 1]}, index=index) + + lines = repr(df).split("\n") + assert lines[2].startswith("a 0 foo") + + def test_repr_to_string( + self, + multiindex_year_month_day_dataframe_random_data, + multiindex_dataframe_random_data, + ): + ymd = multiindex_year_month_day_dataframe_random_data + frame = multiindex_dataframe_random_data + + repr(frame) + repr(ymd) + repr(frame.T) + repr(ymd.T) + + buf = StringIO() + frame.to_string(buf=buf) + ymd.to_string(buf=buf) + frame.T.to_string(buf=buf) + ymd.T.to_string(buf=buf) + + def test_repr_empty(self): + # empty + repr(DataFrame()) + + # empty with index + frame = DataFrame(index=np.arange(1000)) + repr(frame) + + def test_repr_mixed(self, float_string_frame): + # mixed + repr(float_string_frame) + + @pytest.mark.slow + def test_repr_mixed_big(self): + # big mixed + biggie = DataFrame( + { + "A": np.random.default_rng(2).standard_normal(200), + "B": [str(i) for i in range(200)], + }, + index=range(200), + ) + biggie.loc[:20, "A"] = np.nan 
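A minimal sketch of the option_context toggling that the repr tests in this new file rely on; display.show_dimensions defaults to "truncate", so a small, untruncated frame omits the dimensions line:

    import pandas as pd

    df = pd.DataFrame([[1, 2], [3, 4]])

    # option_context overrides a display option for the duration of the block
    # and restores the previous value on exit.
    with pd.option_context("display.show_dimensions", True):
        assert "2 rows x 2 columns" in repr(df)
    assert "2 rows x 2 columns" not in repr(df)
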
+ biggie.loc[:20, "B"] = np.nan + + repr(biggie) + + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="/r in") + def test_repr(self): + # columns but no index + no_index = DataFrame(columns=[0, 1, 3]) + repr(no_index) + + df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"]) + assert "\t" not in repr(df) + assert "\r" not in repr(df) + assert "a\n" not in repr(df) + + def test_repr_dimensions(self): + df = DataFrame([[1, 2], [3, 4]]) + with option_context("display.show_dimensions", True): + assert "2 rows x 2 columns" in repr(df) + + with option_context("display.show_dimensions", False): + assert "2 rows x 2 columns" not in repr(df) + + with option_context("display.show_dimensions", "truncate"): + assert "2 rows x 2 columns" not in repr(df) + + @pytest.mark.slow + def test_repr_big(self): + # big one + biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200)) + repr(biggie) + + def test_repr_unsortable(self): + # columns are not sortable + + unsortable = DataFrame( + { + "foo": [1] * 50, + datetime.today(): [1] * 50, + "bar": ["bar"] * 50, + datetime.today() + timedelta(1): ["bar"] * 50, + }, + index=np.arange(50), + ) + repr(unsortable) + + def test_repr_float_frame_options(self, float_frame): + repr(float_frame) + + with option_context("display.precision", 3): + repr(float_frame) + + with option_context("display.max_rows", 10, "display.max_columns", 2): + repr(float_frame) + + with option_context("display.max_rows", 1000, "display.max_columns", 1000): + repr(float_frame) + + def test_repr_unicode(self): + uval = "\u03c3\u03c3\u03c3\u03c3" + + df = DataFrame({"A": [uval, uval]}) + + result = repr(df) + ex_top = " A" + assert result.split("\n")[0].rstrip() == ex_top + + df = DataFrame({"A": [uval, uval]}) + result = repr(df) + assert result.split("\n")[0].rstrip() == ex_top + + def test_unicode_string_with_unicode(self): + df = DataFrame({"A": ["\u05d0"]}) + str(df) + + def test_repr_unicode_columns(self): + df = DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]}) + repr(df.columns) # should not raise UnicodeDecodeError + + def test_str_to_bytes_raises(self): + # GH 26447 + df = DataFrame({"A": ["abc"]}) + msg = "^'str' object cannot be interpreted as an integer$" + with pytest.raises(TypeError, match=msg): + bytes(df) + + def test_very_wide_repr(self): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 20)), + columns=np.array(["a" * 10] * 20, dtype=object), + ) + repr(df) + + def test_repr_column_name_unicode_truncation_bug(self): + # #1906 + df = DataFrame( + { + "Id": [7117434], + "StringCol": ( + "Is it possible to modify drop plot code" + "so that the output graph is displayed " + "in iphone simulator, Is it possible to " + "modify drop plot code so that the " + "output graph is \xe2\x80\xa8displayed " + "in iphone simulator.Now we are adding " + "the CSV file externally. I want to Call " + "the File through the code.." 
+ ), + } + ) + + with option_context("display.max_columns", 20): + assert "StringCol" in repr(df) + + def test_latex_repr(self): + pytest.importorskip("jinja2") + expected = r"""\begin{tabular}{llll} +\toprule + & 0 & 1 & 2 \\ +\midrule +0 & $\alpha$ & b & c \\ +1 & 1 & 2 & 3 \\ +\bottomrule +\end{tabular} +""" + with option_context( + "styler.format.escape", None, "styler.render.repr", "latex" + ): + df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]]) + result = df._repr_latex_() + assert result == expected + + # GH 12182 + assert df._repr_latex_() is None + + def test_repr_with_datetimeindex(self): + df = DataFrame({"A": [1, 2, 3]}, index=date_range("2000", periods=3)) + result = repr(df) + expected = " A\n2000-01-01 1\n2000-01-02 2\n2000-01-03 3" + assert result == expected + + def test_repr_with_intervalindex(self): + # https://github.com/pandas-dev/pandas/pull/24134/files + df = DataFrame( + {"A": [1, 2, 3, 4]}, index=IntervalIndex.from_breaks([0, 1, 2, 3, 4]) + ) + result = repr(df) + expected = " A\n(0, 1] 1\n(1, 2] 2\n(2, 3] 3\n(3, 4] 4" + assert result == expected + + def test_repr_with_categorical_index(self): + df = DataFrame({"A": [1, 2, 3]}, index=CategoricalIndex(["a", "b", "c"])) + result = repr(df) + expected = " A\na 1\nb 2\nc 3" + assert result == expected + + def test_repr_categorical_dates_periods(self): + # normal DataFrame + dt = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") + p = period_range("2011-01", freq="M", periods=5) + df = DataFrame({"dt": dt, "p": p}) + exp = """ dt p +0 2011-01-01 09:00:00-05:00 2011-01 +1 2011-01-01 10:00:00-05:00 2011-02 +2 2011-01-01 11:00:00-05:00 2011-03 +3 2011-01-01 12:00:00-05:00 2011-04 +4 2011-01-01 13:00:00-05:00 2011-05""" + + assert repr(df) == exp + + df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)}) + assert repr(df2) == exp + + @pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64]) + @pytest.mark.parametrize( + "box, expected", + [[Series, "0 NaT\ndtype: object"], [DataFrame, " 0\n0 NaT"]], + ) + def test_repr_np_nat_with_object(self, arg, box, expected): + # GH 25445 + result = repr(box([arg("NaT")], dtype=object)) + assert result == expected + + def test_frame_datetime64_pre1900_repr(self): + df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="YE-DEC")}) + # it works! + repr(df) + + def test_frame_to_string_with_periodindex(self): + index = PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M") + frame = DataFrame(np.random.default_rng(2).standard_normal((3, 4)), index=index) + + # it works! 
+ frame.to_string() + + def test_to_string_ea_na_in_multiindex(self): + # GH#47986 + df = DataFrame( + {"a": [1, 2]}, + index=MultiIndex.from_arrays([Series([NA, 1], dtype="Int64")]), + ) + + result = df.to_string() + expected = """ a + 1 +1 2""" + assert result == expected + + def test_datetime64tz_slice_non_truncate(self): + # GH 30263 + df = DataFrame({"x": date_range("2019", periods=10, tz="UTC")}) + expected = repr(df) + df = df.iloc[:, :5] + result = repr(df) + assert result == expected + + def test_to_records_no_typeerror_in_repr(self): + # GH 48526 + df = DataFrame([["a", "b"], ["c", "d"], ["e", "f"]], columns=["left", "right"]) + df["record"] = df[["left", "right"]].to_records() + expected = """ left right record +0 a b [0, a, b] +1 c d [1, c, d] +2 e f [2, e, f]""" + result = repr(df) + assert result == expected + + def test_to_records_with_na_record_value(self): + # GH 48526 + df = DataFrame( + [["a", np.nan], ["c", "d"], ["e", "f"]], columns=["left", "right"] + ) + df["record"] = df[["left", "right"]].to_records() + expected = """ left right record +0 a NaN [0, a, nan] +1 c d [1, c, d] +2 e f [2, e, f]""" + result = repr(df) + assert result == expected + + def test_to_records_with_na_record(self): + # GH 48526 + df = DataFrame( + [["a", "b"], [np.nan, np.nan], ["e", "f"]], columns=[np.nan, "right"] + ) + df["record"] = df[[np.nan, "right"]].to_records() + expected = """ NaN right record +0 a b [0, a, b] +1 NaN NaN [1, nan, nan] +2 e f [2, e, f]""" + result = repr(df) + assert result == expected + + def test_to_records_with_inf_as_na_record(self): + # GH 48526 + expected = """ NaN inf record +0 inf b [0, inf, b] +1 NaN NaN [1, nan, nan] +2 e f [2, e, f]""" + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with option_context("use_inf_as_na", True): + df = DataFrame( + [[np.inf, "b"], [np.nan, np.nan], ["e", "f"]], + columns=[np.nan, np.inf], + ) + df["record"] = df[[np.nan, np.inf]].to_records() + result = repr(df) + assert result == expected + + def test_to_records_with_inf_record(self): + # GH 48526 + expected = """ NaN inf record +0 inf b [0, inf, b] +1 NaN NaN [1, nan, nan] +2 e f [2, e, f]""" + msg = "use_inf_as_na option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with option_context("use_inf_as_na", False): + df = DataFrame( + [[np.inf, "b"], [np.nan, np.nan], ["e", "f"]], + columns=[np.nan, np.inf], + ) + df["record"] = df[[np.nan, np.inf]].to_records() + result = repr(df) + assert result == expected + + def test_masked_ea_with_formatter(self): + # GH#39336 + df = DataFrame( + { + "a": Series([0.123456789, 1.123456789], dtype="Float64"), + "b": Series([1, 2], dtype="Int64"), + } + ) + result = df.to_string(formatters=["{:.2f}".format, "{:.2f}".format]) + expected = """ a b +0 0.12 1.00 +1 1.12 2.00""" + assert result == expected + + def test_repr_ea_columns(self, any_string_dtype): + # GH#54797 + pytest.importorskip("pyarrow") + df = DataFrame({"long_column_name": [1, 2, 3], "col2": [4, 5, 6]}) + df.columns = df.columns.astype(any_string_dtype) + expected = """ long_column_name col2 +0 1 4 +1 2 5 +2 3 6""" + assert repr(df) == expected + + +@pytest.mark.parametrize( + "data,output", + [ + ([2, complex("nan"), 1], [" 2.0+0.0j", " NaN+0.0j", " 1.0+0.0j"]), + ([2, complex("nan"), -1], [" 2.0+0.0j", " NaN+0.0j", "-1.0+0.0j"]), + ([-2, complex("nan"), -1], ["-2.0+0.0j", " NaN+0.0j", "-1.0+0.0j"]), + ([-1.23j, complex("nan"), -1], ["-0.00-1.23j", " NaN+0.00j", "-1.00+0.00j"]), + 
([1.23j, complex("nan"), 1.23], [" 0.00+1.23j", " NaN+0.00j", " 1.23+0.00j"]), + ( + [-1.23j, complex(np.nan, np.nan), 1], + ["-0.00-1.23j", " NaN+ NaNj", " 1.00+0.00j"], + ), + ( + [-1.23j, complex(1.2, np.nan), 1], + ["-0.00-1.23j", " 1.20+ NaNj", " 1.00+0.00j"], + ), + ( + [-1.23j, complex(np.nan, -1.2), 1], + ["-0.00-1.23j", " NaN-1.20j", " 1.00+0.00j"], + ), + ], +) +@pytest.mark.parametrize("as_frame", [True, False]) +def test_repr_with_complex_nans(data, output, as_frame): + # GH#53762, GH#53841 + obj = Series(np.array(data)) + if as_frame: + obj = obj.to_frame(name="val") + reprs = [f"{i} {val}" for i, val in enumerate(output)] + expected = f"{'val': >{len(reprs[0])}}\n" + "\n".join(reprs) + else: + reprs = [f"{i} {val}" for i, val in enumerate(output)] + expected = "\n".join(reprs) + "\ndtype: complex128" + assert str(obj) == expected, f"\n{str(obj)}\n\n{expected}" diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_repr_info.py pandas-2.2.2+dfsg/pandas/tests/frame/test_repr_info.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_repr_info.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_repr_info.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,468 +0,0 @@ -from datetime import ( - datetime, - timedelta, -) -from io import StringIO - -import numpy as np -import pytest - -from pandas import ( - NA, - Categorical, - DataFrame, - MultiIndex, - NaT, - PeriodIndex, - Series, - Timestamp, - date_range, - option_context, - period_range, -) -import pandas._testing as tm - -import pandas.io.formats.format as fmt - - -class TestDataFrameReprInfoEtc: - def test_repr_bytes_61_lines(self): - # GH#12857 - lets = list("ACDEFGHIJKLMNOP") - slen = 50 - nseqs = 1000 - words = [ - [np.random.default_rng(2).choice(lets) for x in range(slen)] - for _ in range(nseqs) - ] - df = DataFrame(words).astype("U1") - assert (df.dtypes == object).all() - - # smoke tests; at one point this raised with 61 but not 60 - repr(df) - repr(df.iloc[:60, :]) - repr(df.iloc[:61, :]) - - def test_repr_unicode_level_names(self, frame_or_series): - index = MultiIndex.from_tuples([(0, 0), (1, 1)], names=["\u0394", "i1"]) - - obj = DataFrame(np.random.default_rng(2).standard_normal((2, 4)), index=index) - obj = tm.get_obj(obj, frame_or_series) - repr(obj) - - def test_assign_index_sequences(self): - # GH#2200 - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index( - ["a", "b"] - ) - index = list(df.index) - index[0] = ("faz", "boo") - df.index = index - repr(df) - - # this travels an improper code path - index[0] = ["faz", "boo"] - df.index = index - repr(df) - - def test_repr_with_mi_nat(self): - df = DataFrame({"X": [1, 2]}, index=[[NaT, Timestamp("20130101")], ["a", "b"]]) - result = repr(df) - expected = " X\nNaT a 1\n2013-01-01 b 2" - assert result == expected - - def test_repr_with_different_nulls(self): - # GH45263 - df = DataFrame([1, 2, 3, 4], [True, None, np.nan, NaT]) - result = repr(df) - expected = """ 0 -True 1 -None 2 -NaN 3 -NaT 4""" - assert result == expected - - def test_repr_with_different_nulls_cols(self): - # GH45263 - d = {np.nan: [1, 2], None: [3, 4], NaT: [6, 7], True: [8, 9]} - df = DataFrame(data=d) - result = repr(df) - expected = """ NaN None NaT True -0 1 3 6 8 -1 2 4 7 9""" - assert result == expected - - def test_multiindex_na_repr(self): - # only an issue with long columns - df3 = DataFrame( - { - "A" * 30: {("A", "A0006000", "nuit"): "A0006000"}, - "B" * 30: {("A", "A0006000", "nuit"): np.nan}, - "C" * 30: {("A", "A0006000", "nuit"): 
np.nan}, - "D" * 30: {("A", "A0006000", "nuit"): np.nan}, - "E" * 30: {("A", "A0006000", "nuit"): "A"}, - "F" * 30: {("A", "A0006000", "nuit"): np.nan}, - } - ) - - idf = df3.set_index(["A" * 30, "C" * 30]) - repr(idf) - - def test_repr_name_coincide(self): - index = MultiIndex.from_tuples( - [("a", 0, "foo"), ("b", 1, "bar")], names=["a", "b", "c"] - ) - - df = DataFrame({"value": [0, 1]}, index=index) - - lines = repr(df).split("\n") - assert lines[2].startswith("a 0 foo") - - def test_repr_to_string( - self, - multiindex_year_month_day_dataframe_random_data, - multiindex_dataframe_random_data, - ): - ymd = multiindex_year_month_day_dataframe_random_data - frame = multiindex_dataframe_random_data - - repr(frame) - repr(ymd) - repr(frame.T) - repr(ymd.T) - - buf = StringIO() - frame.to_string(buf=buf) - ymd.to_string(buf=buf) - frame.T.to_string(buf=buf) - ymd.T.to_string(buf=buf) - - def test_repr_empty(self): - # empty - repr(DataFrame()) - - # empty with index - frame = DataFrame(index=np.arange(1000)) - repr(frame) - - def test_repr_mixed(self, float_string_frame): - buf = StringIO() - - # mixed - repr(float_string_frame) - float_string_frame.info(verbose=False, buf=buf) - - @pytest.mark.slow - def test_repr_mixed_big(self): - # big mixed - biggie = DataFrame( - { - "A": np.random.default_rng(2).standard_normal(200), - "B": tm.makeStringIndex(200), - }, - index=range(200), - ) - biggie.loc[:20, "A"] = np.nan - biggie.loc[:20, "B"] = np.nan - - repr(biggie) - - def test_repr(self, float_frame): - buf = StringIO() - - # small one - repr(float_frame) - float_frame.info(verbose=False, buf=buf) - - # even smaller - float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf) - float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf) - - # exhausting cases in DataFrame.info - - # columns but no index - no_index = DataFrame(columns=[0, 1, 3]) - repr(no_index) - - # no columns or index - DataFrame().info(buf=buf) - - df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"]) - assert "\t" not in repr(df) - assert "\r" not in repr(df) - assert "a\n" not in repr(df) - - def test_repr_dimensions(self): - df = DataFrame([[1, 2], [3, 4]]) - with option_context("display.show_dimensions", True): - assert "2 rows x 2 columns" in repr(df) - - with option_context("display.show_dimensions", False): - assert "2 rows x 2 columns" not in repr(df) - - with option_context("display.show_dimensions", "truncate"): - assert "2 rows x 2 columns" not in repr(df) - - @pytest.mark.slow - def test_repr_big(self): - # big one - biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200)) - repr(biggie) - - def test_repr_unsortable(self, float_frame): - # columns are not sortable - - unsortable = DataFrame( - { - "foo": [1] * 50, - datetime.today(): [1] * 50, - "bar": ["bar"] * 50, - datetime.today() + timedelta(1): ["bar"] * 50, - }, - index=np.arange(50), - ) - repr(unsortable) - - fmt.set_option("display.precision", 3) - repr(float_frame) - - fmt.set_option("display.max_rows", 10, "display.max_columns", 2) - repr(float_frame) - - fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000) - repr(float_frame) - - tm.reset_display_options() - - def test_repr_unicode(self): - uval = "\u03c3\u03c3\u03c3\u03c3" - - df = DataFrame({"A": [uval, uval]}) - - result = repr(df) - ex_top = " A" - assert result.split("\n")[0].rstrip() == ex_top - - df = DataFrame({"A": [uval, uval]}) - result = repr(df) - assert result.split("\n")[0].rstrip() == ex_top - - def 
test_unicode_string_with_unicode(self): - df = DataFrame({"A": ["\u05d0"]}) - str(df) - - def test_repr_unicode_columns(self): - df = DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]}) - repr(df.columns) # should not raise UnicodeDecodeError - - def test_str_to_bytes_raises(self): - # GH 26447 - df = DataFrame({"A": ["abc"]}) - msg = "^'str' object cannot be interpreted as an integer$" - with pytest.raises(TypeError, match=msg): - bytes(df) - - def test_very_wide_info_repr(self): - df = DataFrame( - np.random.default_rng(2).standard_normal((10, 20)), - columns=np.array(["a" * 10] * 20, dtype=object), - ) - repr(df) - - def test_repr_column_name_unicode_truncation_bug(self): - # #1906 - df = DataFrame( - { - "Id": [7117434], - "StringCol": ( - "Is it possible to modify drop plot code" - "so that the output graph is displayed " - "in iphone simulator, Is it possible to " - "modify drop plot code so that the " - "output graph is \xe2\x80\xa8displayed " - "in iphone simulator.Now we are adding " - "the CSV file externally. I want to Call " - "the File through the code.." - ), - } - ) - - with option_context("display.max_columns", 20): - assert "StringCol" in repr(df) - - def test_latex_repr(self): - pytest.importorskip("jinja2") - expected = r"""\begin{tabular}{llll} -\toprule - & 0 & 1 & 2 \\ -\midrule -0 & $\alpha$ & b & c \\ -1 & 1 & 2 & 3 \\ -\bottomrule -\end{tabular} -""" - with option_context( - "styler.format.escape", None, "styler.render.repr", "latex" - ): - df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]]) - result = df._repr_latex_() - assert result == expected - - # GH 12182 - assert df._repr_latex_() is None - - def test_repr_categorical_dates_periods(self): - # normal DataFrame - dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") - p = period_range("2011-01", freq="M", periods=5) - df = DataFrame({"dt": dt, "p": p}) - exp = """ dt p -0 2011-01-01 09:00:00-05:00 2011-01 -1 2011-01-01 10:00:00-05:00 2011-02 -2 2011-01-01 11:00:00-05:00 2011-03 -3 2011-01-01 12:00:00-05:00 2011-04 -4 2011-01-01 13:00:00-05:00 2011-05""" - - assert repr(df) == exp - - df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)}) - assert repr(df2) == exp - - @pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64]) - @pytest.mark.parametrize( - "box, expected", - [[Series, "0 NaT\ndtype: object"], [DataFrame, " 0\n0 NaT"]], - ) - def test_repr_np_nat_with_object(self, arg, box, expected): - # GH 25445 - result = repr(box([arg("NaT")], dtype=object)) - assert result == expected - - def test_frame_datetime64_pre1900_repr(self): - df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="A-DEC")}) - # it works! - repr(df) - - def test_frame_to_string_with_periodindex(self): - index = PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M") - frame = DataFrame(np.random.default_rng(2).standard_normal((3, 4)), index=index) - - # it works! 
- frame.to_string() - - def test_to_string_ea_na_in_multiindex(self): - # GH#47986 - df = DataFrame( - {"a": [1, 2]}, - index=MultiIndex.from_arrays([Series([NA, 1], dtype="Int64")]), - ) - - result = df.to_string() - expected = """ a - 1 -1 2""" - assert result == expected - - def test_datetime64tz_slice_non_truncate(self): - # GH 30263 - df = DataFrame({"x": date_range("2019", periods=10, tz="UTC")}) - expected = repr(df) - df = df.iloc[:, :5] - result = repr(df) - assert result == expected - - def test_to_records_no_typeerror_in_repr(self): - # GH 48526 - df = DataFrame([["a", "b"], ["c", "d"], ["e", "f"]], columns=["left", "right"]) - df["record"] = df[["left", "right"]].to_records() - expected = """ left right record -0 a b [0, a, b] -1 c d [1, c, d] -2 e f [2, e, f]""" - result = repr(df) - assert result == expected - - def test_to_records_with_na_record_value(self): - # GH 48526 - df = DataFrame( - [["a", np.nan], ["c", "d"], ["e", "f"]], columns=["left", "right"] - ) - df["record"] = df[["left", "right"]].to_records() - expected = """ left right record -0 a NaN [0, a, nan] -1 c d [1, c, d] -2 e f [2, e, f]""" - result = repr(df) - assert result == expected - - def test_to_records_with_na_record(self): - # GH 48526 - df = DataFrame( - [["a", "b"], [np.nan, np.nan], ["e", "f"]], columns=[np.nan, "right"] - ) - df["record"] = df[[np.nan, "right"]].to_records() - expected = """ NaN right record -0 a b [0, a, b] -1 NaN NaN [1, nan, nan] -2 e f [2, e, f]""" - result = repr(df) - assert result == expected - - def test_to_records_with_inf_as_na_record(self): - # GH 48526 - expected = """ NaN inf record -0 NaN b [0, inf, b] -1 NaN NaN [1, nan, nan] -2 e f [2, e, f]""" - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with option_context("use_inf_as_na", True): - df = DataFrame( - [[np.inf, "b"], [np.nan, np.nan], ["e", "f"]], - columns=[np.nan, np.inf], - ) - df["record"] = df[[np.nan, np.inf]].to_records() - result = repr(df) - assert result == expected - - def test_to_records_with_inf_record(self): - # GH 48526 - expected = """ NaN inf record -0 inf b [0, inf, b] -1 NaN NaN [1, nan, nan] -2 e f [2, e, f]""" - msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - with option_context("use_inf_as_na", False): - df = DataFrame( - [[np.inf, "b"], [np.nan, np.nan], ["e", "f"]], - columns=[np.nan, np.inf], - ) - df["record"] = df[[np.nan, np.inf]].to_records() - result = repr(df) - assert result == expected - - def test_masked_ea_with_formatter(self): - # GH#39336 - df = DataFrame( - { - "a": Series([0.123456789, 1.123456789], dtype="Float64"), - "b": Series([1, 2], dtype="Int64"), - } - ) - result = df.to_string(formatters=["{:.2f}".format, "{:.2f}".format]) - expected = """ a b -0 0.12 1.00 -1 1.12 2.00""" - assert result == expected - - def test_repr_ea_columns(self, any_string_dtype): - # GH#54797 - pytest.importorskip("pyarrow") - df = DataFrame({"long_column_name": [1, 2, 3], "col2": [4, 5, 6]}) - df.columns = df.columns.astype(any_string_dtype) - expected = """ long_column_name col2 -0 1 4 -1 2 5 -2 3 6""" - assert repr(df) == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_stack_unstack.py pandas-2.2.2+dfsg/pandas/tests/frame/test_stack_unstack.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_stack_unstack.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_stack_unstack.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,5 +1,4 @@ from 
datetime import datetime -from io import StringIO import itertools import re @@ -47,6 +46,9 @@ tm.assert_frame_equal(unstacked_cols.T, df) tm.assert_frame_equal(unstacked_cols_df["bar"].T, df) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_mixed_level(self, future_stack): # GH 18310 levels = [range(3), [3, "a", "b"], [1, 2]] @@ -82,6 +84,9 @@ expected = df.unstack() tm.assert_series_equal(res, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_unstack_fill(self, future_stack): # GH #9746: fill_value keyword argument for Series # and DataFrame unstack @@ -388,6 +393,9 @@ s = df1["A"] unstack_and_compare(s, "index") + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_ints(self, future_stack): columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3))) df = DataFrame( @@ -418,6 +426,9 @@ ), ) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_mixed_levels(self, future_stack): columns = MultiIndex.from_tuples( [ @@ -474,6 +485,9 @@ check_names=False, ) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_int_level_names(self, future_stack): columns = MultiIndex.from_tuples( [ @@ -549,6 +563,9 @@ ) tm.assert_frame_equal(rs, xp) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_unstack_level_binding(self, future_stack): # GH9856 mi = MultiIndex( @@ -602,7 +619,7 @@ data = data.unstack() tm.assert_frame_equal(old_data, data) - def test_unstack_dtypes(self): + def test_unstack_dtypes(self, using_infer_string): # GH 2929 rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]] @@ -638,8 +655,9 @@ df2["D"] = "foo" df3 = df2.unstack("B") result = df3.dtypes + dtype = "string" if using_infer_string else np.dtype("object") expected = Series( - [np.dtype("float64")] * 2 + [np.dtype("object")] * 2, + [np.dtype("float64")] * 2 + [dtype] * 2, index=MultiIndex.from_arrays( [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") ), @@ -676,6 +694,9 @@ assert left.shape == (3, 2) tm.assert_frame_equal(left, right) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_unstack_non_unique_index_names(self, future_stack): idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"]) df = DataFrame([1, 2], index=idx) @@ -1044,13 +1065,19 @@ # GH 8039 t = datetime(2014, 1, 1) df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")])) - result = df.stack(future_stack=future_stack) + warn = None if future_stack else FutureWarning + msg = "The previous implementation of stack is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = df.stack(future_stack=future_stack) eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) ecols = MultiIndex.from_tuples([(t, "A")]) expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) @pytest.mark.parametrize( "multiindex_columns", [ @@ -1111,6 +1138,9 @@ else: tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_full_multiIndex(self, future_stack): # GH 8844 
full_multiindex = MultiIndex.from_tuples( @@ -1146,6 +1176,9 @@ tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) @pytest.mark.parametrize("ordered", [False, True]) @pytest.mark.parametrize( "labels,data", @@ -1184,6 +1217,10 @@ ) tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize( "index, columns", [ @@ -1194,6 +1231,7 @@ ) def test_stack_multi_columns_non_unique_index(self, index, columns, future_stack): # GH-28301 + df = DataFrame(index=index, columns=columns).fillna(1) stacked = df.stack(future_stack=future_stack) new_index = MultiIndex.from_tuples(stacked.index.to_numpy()) @@ -1205,6 +1243,9 @@ expected_codes = np.asarray(new_index.codes) tm.assert_numpy_array_equal(stacked_codes, expected_codes) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) @pytest.mark.parametrize( "vals1, vals2, dtype1, dtype2, expected_dtype", [ @@ -1319,14 +1360,16 @@ # By default missing values will be NaN result = data.unstack() expected = DataFrame( - {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, index=list("xyz") + {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, + index=list("xyz"), + dtype=object, ) tm.assert_frame_equal(result, expected) # Fill with any value replaces missing values as expected result = data.unstack(fill_value="d") expected = DataFrame( - {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz") + {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz"), dtype=object ) tm.assert_frame_equal(result, expected) @@ -1367,6 +1410,7 @@ tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") @pytest.mark.parametrize("dropna", [True, False, lib.no_default]) def test_stack_empty_frame(dropna, future_stack): # GH 36113 @@ -1382,6 +1426,7 @@ tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") @pytest.mark.parametrize("dropna", [True, False, lib.no_default]) @pytest.mark.parametrize("fill_value", [None, 0]) def test_stack_unstack_empty_frame(dropna, fill_value, future_stack): @@ -1439,6 +1484,7 @@ tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") def test_stack_positional_level_duplicate_column_names(future_stack): # https://github.com/pandas-dev/pandas/issues/36353 columns = MultiIndex.from_product([("x", "y"), ("y", "z")], names=["a", "a"]) @@ -1474,6 +1520,7 @@ tm.assert_frame_equal(res, expected) +@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") def test_stack_sort_false(future_stack): # GH 15105 data = [[1, 2, 3.0, 4.0], [2, 3, 4.0, 5.0], [3, 4, np.nan, np.nan]] @@ -1512,6 +1559,7 @@ tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") def test_stack_sort_false_multi_level(future_stack): # GH 15105 idx = MultiIndex.from_tuples([("weight", "kg"), ("height", "m")]) @@ -1598,6 +1646,9 @@ expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack(self, 
multiindex_year_month_day_dataframe_random_data, future_stack): ymd = multiindex_year_month_day_dataframe_random_data @@ -1718,22 +1769,25 @@ li, ri = result.index, expected.index tm.assert_index_equal(li, ri) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_unstack_odd_failure(self, future_stack): - data = """day,time,smoker,sum,len -Fri,Dinner,No,8.25,3. -Fri,Dinner,Yes,27.03,9 -Fri,Lunch,No,3.0,1 -Fri,Lunch,Yes,13.68,6 -Sat,Dinner,No,139.63,45 -Sat,Dinner,Yes,120.77,42 -Sun,Dinner,No,180.57,57 -Sun,Dinner,Yes,66.82,19 -Thu,Dinner,No,3.0,1 -Thu,Lunch,No,117.32,44 -Thu,Lunch,Yes,51.51,17""" - - df = pd.read_csv(StringIO(data)).set_index(["day", "time", "smoker"]) - + mi = MultiIndex.from_arrays( + [ + ["Fri"] * 4 + ["Sat"] * 2 + ["Sun"] * 2 + ["Thu"] * 3, + ["Dinner"] * 2 + ["Lunch"] * 2 + ["Dinner"] * 5 + ["Lunch"] * 2, + ["No", "Yes"] * 4 + ["No", "No", "Yes"], + ], + names=["day", "time", "smoker"], + ) + df = DataFrame( + { + "sum": np.arange(11, dtype="float64"), + "len": np.arange(11, dtype="float64"), + }, + index=mi, + ) # it works, #2100 result = df.unstack(2) @@ -1743,6 +1797,9 @@ recons = recons.dropna(how="all") tm.assert_frame_equal(recons, df) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_mixed_dtype(self, multiindex_dataframe_random_data, future_stack): frame = multiindex_dataframe_random_data @@ -1767,12 +1824,17 @@ } ) - result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack(future_stack=future_stack) tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float)) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_unstack_preserve_names( self, multiindex_dataframe_random_data, future_stack ): @@ -1812,6 +1874,9 @@ expected = frame.unstack(level=1) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_level_name(self, multiindex_dataframe_random_data, future_stack): frame = multiindex_dataframe_random_data @@ -1824,6 +1889,9 @@ expected = frame.stack(future_stack=future_stack) tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_unstack_multiple( self, multiindex_year_month_day_dataframe_random_data, future_stack ): @@ -1858,6 +1926,9 @@ expected = ymd.unstack(2).unstack(1).dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns]) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_names_and_numbers( self, multiindex_year_month_day_dataframe_random_data, future_stack ): @@ -1869,6 +1940,9 @@ with pytest.raises(ValueError, match="level should contain"): unstacked.stack([0, "month"], future_stack=future_stack) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_multiple_out_of_bounds( self, multiindex_year_month_day_dataframe_random_data, future_stack ): @@ -1998,6 +2072,9 @@ tm.assert_frame_equal(result3, expected) + @pytest.mark.filterwarnings( + "ignore:The previous 
implementation of stack is deprecated" + ) def test_stack_multiple_bug(self, future_stack): # bug when some uniques are not present in the data GH#3170 id_col = ([1] * 3) + ([2] * 3) @@ -2009,7 +2086,7 @@ multi = df.set_index(["DATE", "ID"]) multi.columns.name = "Params" unst = multi.unstack("ID") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) @@ -2023,6 +2100,9 @@ xp.columns.name = "Params" tm.assert_frame_equal(rs, xp) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_dropna(self, future_stack): # GH#3997 df = DataFrame({"A": ["a1", "a2"], "B": ["b1", "b2"], "C": [1, 1]}) @@ -2076,6 +2156,9 @@ # it works! is sufficient idf.unstack("E") + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_unstack_unobserved_keys(self, future_stack): # related to GH#2278 refactoring levels = [[0, 1], [0, 1, 2, 3]] @@ -2105,7 +2188,7 @@ with monkeypatch.context() as m: m.setattr(reshape_lib, "_Unstacker", MockUnstacker) df = DataFrame( - np.random.default_rng(2).standard_normal((2**16, 2)), + np.zeros((2**16, 2)), index=[np.arange(2**16), np.arange(2**16)], ) msg = "The following operation may generate" @@ -2113,6 +2196,9 @@ with pytest.raises(Exception, match="Don't compute final result."): df.unstack() + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) @pytest.mark.parametrize( "levels", itertools.chain.from_iterable( @@ -2139,6 +2225,9 @@ result = df_stacked.loc[result_row, result_col] assert result == expected + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_order_with_unsorted_levels_multi_row(self, future_stack): # GH#16323 @@ -2157,6 +2246,9 @@ for col in df.columns ) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_order_with_unsorted_levels_multi_row_2(self, future_stack): # GH#53636 levels = ((0, 1), (1, 0)) @@ -2178,6 +2270,9 @@ ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_unstack_unordered_multiindex(self, future_stack): # GH# 18265 values = np.arange(5) @@ -2206,7 +2301,7 @@ tm.assert_frame_equal(result, expected) def test_unstack_preserve_types( - self, multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, using_infer_string ): # GH#403 ymd = multiindex_year_month_day_dataframe_random_data @@ -2215,7 +2310,11 @@ unstacked = ymd.unstack("month") assert unstacked["A", 1].dtype == np.float64 - assert unstacked["E", 1].dtype == np.object_ + assert ( + unstacked["E", 1].dtype == np.object_ + if not using_infer_string + else "string" + ) assert unstacked["F", 1].dtype == np.float64 def test_unstack_group_index_overflow(self, future_stack): @@ -2275,7 +2374,7 @@ expected = DataFrame( [[10.0, 10.0, 1.0, 1.0], [np.nan, 10.0, 0.0, 1.0]], - index=Index(["A", "B"], dtype="object", name="a"), + index=Index(["A", "B"], name="a"), columns=MultiIndex.from_tuples( [("v", "ca"), ("v", "cb"), ("is_", "ca"), ("is_", "cb")], names=[None, "b"], @@ -2311,6 +2410,9 @@ tm.assert_index_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation 
of stack is deprecated" + ) def test_stack_nan_in_multiindex_columns(self, future_stack): # GH#39481 df = DataFrame( @@ -2339,6 +2441,9 @@ ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_multi_level_stack_categorical(self, future_stack): # GH 15239 midx = MultiIndex.from_arrays( @@ -2394,6 +2499,9 @@ ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_nan_level(self, future_stack): # GH 9406 df_nan = DataFrame( @@ -2437,6 +2545,9 @@ expected.columns = MultiIndex.from_tuples([("cat", 0), ("cat", 1)]) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_unsorted(self, future_stack): # GH 16925 PAE = ["ITA", "FRA"] @@ -2459,6 +2570,9 @@ ) tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:The previous implementation of stack is deprecated" + ) def test_stack_nullable_dtype(self, future_stack): # GH#43561 columns = MultiIndex.from_product( @@ -2524,3 +2638,41 @@ ), ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype, na_value", + [ + ("float64", np.nan), + ("Float64", np.nan), + ("Float64", pd.NA), + ("Int64", pd.NA), + ], +) +@pytest.mark.parametrize("test_multiindex", [True, False]) +def test_stack_preserves_na(dtype, na_value, test_multiindex): + # GH#56573 + if test_multiindex: + index = MultiIndex.from_arrays(2 * [Index([na_value], dtype=dtype)]) + else: + index = Index([na_value], dtype=dtype) + df = DataFrame({"a": [1]}, index=index) + result = df.stack(future_stack=True) + + if test_multiindex: + expected_index = MultiIndex.from_arrays( + [ + Index([na_value], dtype=dtype), + Index([na_value], dtype=dtype), + Index(["a"]), + ] + ) + else: + expected_index = MultiIndex.from_arrays( + [ + Index([na_value], dtype=dtype), + Index(["a"]), + ] + ) + expected = Series(1, index=expected_index) + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_subclass.py pandas-2.2.2+dfsg/pandas/tests/frame/test_subclass.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_subclass.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_subclass.py 2024-04-10 17:42:52.000000000 +0000 @@ -26,6 +26,17 @@ class TestDataFrameSubclassing: + def test_no_warning_on_mgr(self): + # GH#57032 + df = tm.SubclassedDataFrame( + {"X": [1, 2, 3], "Y": [1, 2, 3]}, index=["a", "b", "c"] + ) + with tm.assert_produces_warning(None): + # df.isna() goes through _constructor_from_mgr, which we want to + # *not* pass a Manager do __init__ + df.isna() + df["X"].isna() + def test_frame_subclassing_and_slicing(self): # Subclass frame and ensure it returns the right class on slicing it # In reference to PR 9632 diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_ufunc.py pandas-2.2.2+dfsg/pandas/tests/frame/test_ufunc.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_ufunc.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_ufunc.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,7 +31,7 @@ def test_unary_binary(request, dtype): # unary input, binary output if is_extension_array_dtype(dtype) or isinstance(dtype, dict): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="Extension / mixed with multiple outputs not implemented." 
) @@ -106,7 +106,7 @@ or is_extension_array_dtype(dtype_b) or isinstance(dtype_b, dict) ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="Extension / mixed with multiple inputs not implemented." ) @@ -135,7 +135,7 @@ @pytest.mark.parametrize("dtype", dtypes) def test_binary_input_aligns_index(request, dtype): if is_extension_array_dtype(dtype) or isinstance(dtype, dict): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="Extension / mixed with multiple inputs not implemented." ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/frame/test_unary.py pandas-2.2.2+dfsg/pandas/tests/frame/test_unary.py --- pandas-2.1.4+dfsg/pandas/tests/frame/test_unary.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/frame/test_unary.py 2024-04-10 17:42:52.000000000 +0000 @@ -48,15 +48,25 @@ pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}), ], ) - def test_neg_raises(self, df): + def test_neg_raises(self, df, using_infer_string): msg = ( "bad operand type for unary -: 'str'|" r"bad operand type for unary -: 'DatetimeArray'" ) - with pytest.raises(TypeError, match=msg): - (-df) - with pytest.raises(TypeError, match=msg): - (-df["a"]) + if using_infer_string and df.dtypes.iloc[0] == "string": + import pyarrow as pa + + msg = "has no kernel" + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + (-df) + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + (-df["a"]) + + else: + with pytest.raises(TypeError, match=msg): + (-df) + with pytest.raises(TypeError, match=msg): + (-df["a"]) def test_invert(self, float_frame): df = float_frame diff -Nru pandas-2.1.4+dfsg/pandas/tests/generic/test_duplicate_labels.py pandas-2.2.2+dfsg/pandas/tests/generic/test_duplicate_labels.py --- pandas-2.1.4+dfsg/pandas/tests/generic/test_duplicate_labels.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/generic/test_duplicate_labels.py 2024-04-10 17:42:52.000000000 +0000 @@ -90,9 +90,11 @@ assert df.loc[[0]].flags.allows_duplicate_labels is False assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False - def test_ndframe_getitem_caching_issue(self, request, using_copy_on_write): - if not using_copy_on_write: - request.node.add_marker(pytest.mark.xfail(reason="Unclear behavior.")) + def test_ndframe_getitem_caching_issue( + self, request, using_copy_on_write, warn_copy_on_write + ): + if not (using_copy_on_write or warn_copy_on_write): + request.applymarker(pytest.mark.xfail(reason="Unclear behavior.")) # NDFrame.__getitem__ will cache the first df['A']. May need to # invalidate that cache? Update the cached entries? df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False) diff -Nru pandas-2.1.4+dfsg/pandas/tests/generic/test_finalize.py pandas-2.2.2+dfsg/pandas/tests/generic/test_finalize.py --- pandas-2.1.4+dfsg/pandas/tests/generic/test_finalize.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/generic/test_finalize.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,11 +31,6 @@ # - Callable: pass the constructed value with attrs set to this. 
_all_methods = [ - ( - pd.Series, - (np.array([0], dtype="float64")), - operator.methodcaller("view", "int64"), - ), (pd.Series, ([0],), operator.methodcaller("take", [])), (pd.Series, ([0],), operator.methodcaller("__getitem__", [True])), (pd.Series, ([0],), operator.methodcaller("repeat", 2)), @@ -281,12 +276,12 @@ ( pd.Series, (1, pd.date_range("2000", periods=4)), - operator.methodcaller("asfreq", "H"), + operator.methodcaller("asfreq", "h"), ), ( pd.DataFrame, ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), - operator.methodcaller("asfreq", "H"), + operator.methodcaller("asfreq", "h"), ), ( pd.Series, @@ -490,7 +485,7 @@ if not (isinstance(left, int) or isinstance(right, int)) and annotate != "both": if not all_binary_operators.__name__.startswith("r"): if annotate == "right" and isinstance(left, type(right)): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"{all_binary_operators} doesn't work when right has " f"attrs and both are {type(left)}" @@ -498,14 +493,14 @@ ) if not isinstance(left, type(right)): if annotate == "left" and isinstance(left, pd.Series): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"{all_binary_operators} doesn't work when the " "objects are different Series has attrs" ) ) elif annotate == "right" and isinstance(right, pd.Series): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"{all_binary_operators} doesn't work when the " "objects are different Series has attrs" @@ -513,7 +508,7 @@ ) else: if annotate == "left" and isinstance(left, type(right)): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"{all_binary_operators} doesn't work when left has " f"attrs and both are {type(left)}" @@ -521,14 +516,14 @@ ) if not isinstance(left, type(right)): if annotate == "right" and isinstance(right, pd.Series): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"{all_binary_operators} doesn't work when the " "objects are different Series has attrs" ) ) elif annotate == "left" and isinstance(left, pd.Series): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"{all_binary_operators} doesn't work when the " "objects are different Series has attrs" @@ -628,9 +623,9 @@ operator.methodcaller("tz_localize", "CET"), operator.methodcaller("normalize"), operator.methodcaller("strftime", "%Y"), - operator.methodcaller("round", "H"), - operator.methodcaller("floor", "H"), - operator.methodcaller("ceil", "H"), + operator.methodcaller("round", "h"), + operator.methodcaller("floor", "h"), + operator.methodcaller("ceil", "h"), operator.methodcaller("month_name"), operator.methodcaller("day_name"), ], diff -Nru pandas-2.1.4+dfsg/pandas/tests/generic/test_frame.py pandas-2.2.2+dfsg/pandas/tests/generic/test_frame.py --- pandas-2.1.4+dfsg/pandas/tests/generic/test_frame.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/generic/test_frame.py 2024-04-10 17:42:52.000000000 +0000 @@ -87,7 +87,7 @@ np.random.default_rng(2).standard_normal((1000, 2)), index=date_range("20130101", periods=1000, freq="s"), ) - result = df.resample("1T") + result = df.resample("1min") tm.assert_metadata_equivalent(df, result) def test_metadata_propagation_indiv(self, monkeypatch): diff -Nru pandas-2.1.4+dfsg/pandas/tests/generic/test_generic.py pandas-2.2.2+dfsg/pandas/tests/generic/test_generic.py --- pandas-2.1.4+dfsg/pandas/tests/generic/test_generic.py 2023-12-08 14:17:35.000000000 +0000 +++ 
pandas-2.2.2+dfsg/pandas/tests/generic/test_generic.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,7 +10,9 @@ from pandas import ( DataFrame, + Index, Series, + date_range, ) import pandas._testing as tm @@ -316,7 +318,11 @@ # tests that don't fit elsewhere @pytest.mark.parametrize( - "ser", [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()] + "ser", + [ + Series(range(10), dtype=np.float64), + Series([str(i) for i in range(10)], dtype=object), + ], ) def test_squeeze_series_noop(self, ser): # noop @@ -324,12 +330,16 @@ def test_squeeze_frame_noop(self): # noop - df = tm.makeTimeDataFrame() + df = DataFrame(np.eye(2)) tm.assert_frame_equal(df.squeeze(), df) def test_squeeze_frame_reindex(self): # squeezing - df = tm.makeTimeDataFrame().reindex(columns=["A"]) + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ).reindex(columns=["A"]) tm.assert_series_equal(df.squeeze(), df["A"]) def test_squeeze_0_len_dim(self): @@ -341,7 +351,11 @@ def test_squeeze_axis(self): # axis argument - df = tm.makeTimeDataFrame(nper=1).iloc[:, :1] + df = DataFrame( + np.random.default_rng(2).standard_normal((1, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=1, freq="B"), + ).iloc[:, :1] assert df.shape == (1, 1) tm.assert_series_equal(df.squeeze(axis=0), df.iloc[0]) tm.assert_series_equal(df.squeeze(axis="index"), df.iloc[0]) @@ -356,29 +370,49 @@ df.squeeze(axis="x") def test_squeeze_axis_len_3(self): - df = tm.makeTimeDataFrame(3) + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=3, freq="B"), + ) tm.assert_frame_equal(df.squeeze(axis=0), df) def test_numpy_squeeze(self): - s = tm.makeFloatSeries() + s = Series(range(2), dtype=np.float64) tm.assert_series_equal(np.squeeze(s), s) - df = tm.makeTimeDataFrame().reindex(columns=["A"]) + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ).reindex(columns=["A"]) tm.assert_series_equal(np.squeeze(df), df["A"]) @pytest.mark.parametrize( - "ser", [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()] + "ser", + [ + Series(range(10), dtype=np.float64), + Series([str(i) for i in range(10)], dtype=object), + ], ) def test_transpose_series(self, ser): # calls implementation in pandas/core/base.py tm.assert_series_equal(ser.transpose(), ser) def test_transpose_frame(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) tm.assert_frame_equal(df.transpose().transpose(), df) def test_numpy_transpose(self, frame_or_series): - obj = tm.makeTimeDataFrame() + obj = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) obj = tm.get_obj(obj, frame_or_series) if frame_or_series is Series: @@ -393,7 +427,11 @@ np.transpose(obj, axes=1) @pytest.mark.parametrize( - "ser", [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()] + "ser", + [ + Series(range(10), dtype=np.float64), + Series([str(i) for i in range(10)], dtype=object), + ], ) def test_take_series(self, ser): indices = [1, 5, -2, 
6, 3, -1] @@ -407,7 +445,11 @@ def test_take_frame(self): indices = [1, 5, -2, 6, 3, -1] - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) out = df.take(indices) expected = DataFrame( data=df.values.take(indices, axis=0), @@ -419,7 +461,7 @@ def test_take_invalid_kwargs(self, frame_or_series): indices = [-3, 2, 0, 1] - obj = tm.makeTimeDataFrame() + obj = DataFrame(range(5)) obj = tm.get_obj(obj, frame_or_series) msg = r"take\(\) got an unexpected keyword argument 'foo'" diff -Nru pandas-2.1.4+dfsg/pandas/tests/generic/test_series.py pandas-2.2.2+dfsg/pandas/tests/generic/test_series.py --- pandas-2.1.4+dfsg/pandas/tests/generic/test_series.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/generic/test_series.py 2024-04-10 17:42:52.000000000 +0000 @@ -111,13 +111,13 @@ index=date_range("20130101", periods=1000, freq="s"), name="foo", ) - result = ts.resample("1T").mean() + result = ts.resample("1min").mean() tm.assert_metadata_equivalent(ts, result) - result = ts.resample("1T").min() + result = ts.resample("1min").min() tm.assert_metadata_equivalent(ts, result) - result = ts.resample("1T").apply(lambda x: x.sum()) + result = ts.resample("1min").apply(lambda x: x.sum()) tm.assert_metadata_equivalent(ts, result) def test_metadata_propagation_indiv(self, monkeypatch): diff -Nru pandas-2.1.4+dfsg/pandas/tests/generic/test_to_xarray.py pandas-2.2.2+dfsg/pandas/tests/generic/test_to_xarray.py --- pandas-2.1.4+dfsg/pandas/tests/generic/test_to_xarray.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/generic/test_to_xarray.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,18 +18,18 @@ def df(self): return DataFrame( { - "a": list("abc"), - "b": list(range(1, 4)), - "c": np.arange(3, 6).astype("u1"), - "d": np.arange(4.0, 7.0, dtype="float64"), - "e": [True, False, True], - "f": Categorical(list("abc")), - "g": date_range("20130101", periods=3), - "h": date_range("20130101", periods=3, tz="US/Eastern"), + "a": list("abcd"), + "b": list(range(1, 5)), + "c": np.arange(3, 7).astype("u1"), + "d": np.arange(4.0, 8.0, dtype="float64"), + "e": [True, False, True, False], + "f": Categorical(list("abcd")), + "g": date_range("20130101", periods=4), + "h": date_range("20130101", periods=4, tz="US/Eastern"), } ) - def test_to_xarray_index_types(self, index_flat, df): + def test_to_xarray_index_types(self, index_flat, df, using_infer_string): index = index_flat # MultiIndex is tested in test_to_xarray_with_multiindex if len(index) == 0: @@ -37,11 +37,11 @@ from xarray import Dataset - df.index = index[:3] + df.index = index[:4] df.index.name = "foo" df.columns.name = "bar" result = df.to_xarray() - assert result.dims["foo"] == 3 + assert result.sizes["foo"] == 4 assert len(result.coords) == 1 assert len(result.data_vars) == 8 tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) @@ -51,7 +51,9 @@ # datetimes w/tz are preserved # column names are lost expected = df.copy() - expected["f"] = expected["f"].astype(object) + expected["f"] = expected["f"].astype( + object if not using_infer_string else "string[pyarrow_numpy]" + ) expected.columns.name = None tm.assert_frame_equal(result.to_dataframe(), expected) @@ -60,17 +62,17 @@ df.index.name = "foo" result = df[0:0].to_xarray() - assert result.dims["foo"] == 0 + assert result.sizes["foo"] == 0 assert isinstance(result, Dataset) - def 
test_to_xarray_with_multiindex(self, df): + def test_to_xarray_with_multiindex(self, df, using_infer_string): from xarray import Dataset # MultiIndex - df.index = MultiIndex.from_product([["a"], range(3)], names=["one", "two"]) + df.index = MultiIndex.from_product([["a"], range(4)], names=["one", "two"]) result = df.to_xarray() - assert result.dims["one"] == 1 - assert result.dims["two"] == 3 + assert result.sizes["one"] == 1 + assert result.sizes["two"] == 4 assert len(result.coords) == 2 assert len(result.data_vars) == 8 tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"]) @@ -78,7 +80,9 @@ result = result.to_dataframe() expected = df.copy() - expected["f"] = expected["f"].astype(object) + expected["f"] = expected["f"].astype( + object if not using_infer_string else "string[pyarrow_numpy]" + ) expected.columns.name = None tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/aggregate/test_aggregate.py pandas-2.2.2+dfsg/pandas/tests/groupby/aggregate/test_aggregate.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/aggregate/test_aggregate.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/aggregate/test_aggregate.py 2024-04-10 17:42:52.000000000 +0000 @@ -161,25 +161,29 @@ def test_agg_grouping_is_list_tuple(ts): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=pd.date_range("2000-01-01", periods=30, freq="B"), + ) grouped = df.groupby(lambda x: x.year) - grouper = grouped.grouper.groupings[0].grouping_vector - grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper)) + grouper = grouped._grouper.groupings[0].grouping_vector + grouped._grouper.groupings[0] = Grouping(ts.index, list(grouper)) result = grouped.agg("mean") expected = grouped.mean() tm.assert_frame_equal(result, expected) - grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper)) + grouped._grouper.groupings[0] = Grouping(ts.index, tuple(grouper)) result = grouped.agg("mean") expected = grouped.mean() tm.assert_frame_equal(result, expected) -def test_agg_python_multiindex(mframe): - grouped = mframe.groupby(["A", "B"]) +def test_agg_python_multiindex(multiindex_dataframe_random_data): + grouped = multiindex_dataframe_random_data.groupby(["A", "B"]) result = grouped.agg("mean") expected = grouped.mean() @@ -356,20 +360,26 @@ tm.assert_index_equal(result.columns, exp_cols) +def test_series_index_name(df): + grouped = df.loc[:, ["C"]].groupby(df["A"]) + result = grouped.agg(lambda x: x.mean()) + assert result.index.name == "A" + + def test_agg_multiple_functions_same_name(): # GH 30880 df = DataFrame( np.random.default_rng(2).standard_normal((1000, 3)), - index=pd.date_range("1/1/2012", freq="S", periods=1000), + index=pd.date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) - result = df.resample("3T").agg( + result = df.resample("3min").agg( {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]} ) - expected_index = pd.date_range("1/1/2012", freq="3T", periods=6) + expected_index = pd.date_range("1/1/2012", freq="3min", periods=6) expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")]) expected_values = np.array( - [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] + [df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]] ).T expected = DataFrame( expected_values, columns=expected_columns, index=expected_index @@ -382,13 +392,13 @@ # 
ohlc expands dimensions, so different test to the above is required. df = DataFrame( np.random.default_rng(2).standard_normal((1000, 3)), - index=pd.date_range("1/1/2012", freq="S", periods=1000, name="dti"), + index=pd.date_range("1/1/2012", freq="s", periods=1000, name="dti"), columns=Index(["A", "B", "C"], name="alpha"), ) - result = df.resample("3T").agg( + result = df.resample("3min").agg( {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]} ) - expected_index = pd.date_range("1/1/2012", freq="3T", periods=6, name="dti") + expected_index = pd.date_range("1/1/2012", freq="3min", periods=6, name="dti") expected_columns = MultiIndex.from_tuples( [ ("A", "ohlc", "open"), @@ -401,9 +411,11 @@ names=["alpha", None, None], ) non_ohlc_expected_values = np.array( - [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] + [df.resample("3min").A.quantile(q=q).values for q in [0.9999, 0.1111]] ).T - expected_values = np.hstack([df.resample("3T").A.ohlc(), non_ohlc_expected_values]) + expected_values = np.hstack( + [df.resample("3min").A.ohlc(), non_ohlc_expected_values] + ) expected = DataFrame( expected_values, columns=expected_columns, index=expected_index ) @@ -1643,3 +1655,18 @@ gb = df.groupby("grps") result = gb.agg(td=("td", "cumsum")) tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregation_empty_group(): + # https://github.com/pandas-dev/pandas/issues/18869 + def func(x): + if len(x) == 0: + raise ValueError("length must not be 0") + return len(x) + + df = DataFrame( + {"A": pd.Categorical(["a", "a"], categories=["a", "b", "c"]), "B": [1, 1]} + ) + msg = "length must not be 0" + with pytest.raises(ValueError, match=msg): + df.groupby("A", observed=False).agg(func) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/aggregate/test_cython.py pandas-2.2.2+dfsg/pandas/tests/groupby/aggregate/test_cython.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/aggregate/test_cython.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/aggregate/test_cython.py 2024-04-10 17:42:52.000000000 +0000 @@ -118,7 +118,7 @@ { "a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25, - "dates": pd.date_range("now", periods=50, freq="T"), + "dates": pd.date_range("now", periods=50, freq="min"), } ) msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes" diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/aggregate/test_other.py pandas-2.2.2+dfsg/pandas/tests/groupby/aggregate/test_other.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/aggregate/test_other.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/aggregate/test_other.py 2024-04-10 17:42:52.000000000 +0000 @@ -106,7 +106,7 @@ df = DataFrame( { "class": ["A", "A", "B", "B", "C", "C", "D", "D"], - "time": date_range("1/1/2011", periods=8, freq="H"), + "time": date_range("1/1/2011", periods=8, freq="h"), } ) df.loc[[0, 1, 2, 5], "time"] = None @@ -296,7 +296,9 @@ def test_series_agg_multikey(): - ts = tm.makeTimeSeries() + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.agg("sum") @@ -499,13 +501,17 @@ assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + 
assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] def test_sum_uint64_overflow(): diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/conftest.py pandas-2.2.2+dfsg/pandas/tests/groupby/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,8 +1,12 @@ import numpy as np import pytest -from pandas import DataFrame -import pandas._testing as tm +from pandas import ( + DataFrame, + Index, + Series, + date_range, +) from pandas.core.groupby.base import ( reduction_kernels, transformation_kernels, @@ -25,21 +29,11 @@ @pytest.fixture(params=[True, False]) -def skipna(request): - return request.param - - -@pytest.fixture(params=[True, False]) def observed(request): return request.param @pytest.fixture -def mframe(multiindex_dataframe_random_data): - return multiindex_dataframe_random_data - - -@pytest.fixture def df(): return DataFrame( { @@ -53,28 +47,18 @@ @pytest.fixture def ts(): - return tm.makeTimeSeries() - - -@pytest.fixture -def tsd(): - return tm.getTimeSeriesData() - - -@pytest.fixture -def tsframe(tsd): - return DataFrame(tsd) + return Series( + np.random.default_rng(2).standard_normal(30), + index=date_range("2000-01-01", periods=30, freq="B"), + ) @pytest.fixture -def df_mixed_floats(): +def tsframe(): return DataFrame( - { - "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], - "B": ["one", "one", "two", "three", "two", "two", "one", "three"], - "C": np.random.default_rng(2).standard_normal(8), - "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), - } + np.random.default_rng(2).standard_normal((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=30, freq="B"), ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_corrwith.py pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_corrwith.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_corrwith.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_corrwith.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,24 @@ +import numpy as np + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +def test_corrwith_with_1_axis(): + # GH 47723 + df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]}) + gb = df.groupby("a") + + msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.corrwith(df, axis=1) + index = Index( + data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)], + name=("a", None), + ) + expected = Series([np.nan] * 6, index=index) + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_describe.py pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_describe.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_describe.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_describe.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,297 @@ +import numpy as np +import pytest + +import 
pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +def test_apply_describe_bug(multiindex_dataframe_random_data): + grouped = multiindex_dataframe_random_data.groupby(level="first") + grouped.describe() # it works! + + +def test_series_describe_multikey(): + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) + tm.assert_series_equal(result["std"], grouped.std(), check_names=False) + tm.assert_series_equal(result["min"], grouped.min(), check_names=False) + + +def test_series_describe_single(): + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x.describe()) + expected = grouped.describe().stack(future_stack=True) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]]) +def test_series_describe_as_index(as_index, keys): + # GH#49256 + df = DataFrame( + { + "key1": ["one", "two", "two", "three", "two"], + "key2": ["one", "two", "two", "three", "two"], + "foo2": [1, 2, 4, 4, 6], + } + ) + gb = df.groupby(keys, as_index=as_index)["foo2"] + result = gb.describe() + expected = DataFrame( + { + "key1": ["one", "three", "two"], + "count": [1.0, 1.0, 3.0], + "mean": [1.0, 4.0, 4.0], + "std": [np.nan, np.nan, 2.0], + "min": [1.0, 4.0, 2.0], + "25%": [1.0, 4.0, 3.0], + "50%": [1.0, 4.0, 4.0], + "75%": [1.0, 4.0, 5.0], + "max": [1.0, 4.0, 6.0], + } + ) + if len(keys) == 2: + expected.insert(1, "key2", expected["key1"]) + if as_index: + expected = expected.set_index(keys) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_multikey(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + desc_groups = [] + for col in tsframe: + group = grouped[col].describe() + # GH 17464 - Remove duplicate MultiIndex levels + group_col = MultiIndex( + levels=[[col], group.columns], + codes=[[0] * len(group.columns), range(len(group.columns))], + ) + group = DataFrame(group.values, columns=group_col, index=group.index) + desc_groups.append(group) + expected = pd.concat(desc_groups, axis=1) + tm.assert_frame_equal(result, expected) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) + result = groupedT.describe() + expected = tsframe.describe().T + # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ + expected.index = MultiIndex( + levels=[[0, 1], expected.index], + codes=[[0, 0, 1, 1], range(len(expected.index))], + ) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_tupleindex(): + # GH 14848 - regression from 0.19.0 to 0.19.1 + df1 = DataFrame( + { + "x": [1, 2, 3, 4, 5] * 3, + "y": [10, 20, 30, 40, 50] * 3, + "z": [100, 200, 300, 400, 500] * 3, + } + ) + df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={"k": "key"}) + msg = "Names should be list-like for a MultiIndex" + with pytest.raises(ValueError, match=msg): + df1.groupby("k").describe() + with pytest.raises(ValueError, match=msg): + df2.groupby("key").describe() + + +def test_frame_describe_unstacked_format(): + # 
GH 4792 + prices = { + Timestamp("2011-01-06 10:59:05", tz=None): 24990, + Timestamp("2011-01-06 12:43:33", tz=None): 25499, + Timestamp("2011-01-06 12:54:09", tz=None): 25499, + } + volumes = { + Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, + Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, + Timestamp("2011-01-06 12:54:09", tz=None): 100000000, + } + df = DataFrame({"PRICE": prices, "VOLUME": volumes}) + result = df.groupby("PRICE").VOLUME.describe() + data = [ + df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist(), + ] + expected = DataFrame( + data, + index=Index([24990, 25499], name="PRICE"), + columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.filterwarnings( + "ignore:" + "indexing past lexsort depth may impact performance:" + "pandas.errors.PerformanceWarning" +) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +def test_describe_with_duplicate_output_column_names(as_index, keys): + # GH 35314 + df = DataFrame( + { + "a1": [99, 99, 99, 88, 88, 88], + "a2": [99, 99, 99, 88, 88, 88], + "b": [1, 2, 3, 4, 5, 6], + "c": [10, 20, 30, 40, 50, 60], + }, + columns=["a1", "a2", "b", "b"], + copy=False, + ) + if keys == ["a1"]: + df = df.drop(columns="a2") + + expected = ( + DataFrame.from_records( + [ + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ], + ) + .set_index([0, 1]) + .T + ) + expected.columns.names = [None, None] + if len(keys) == 2: + expected.index = MultiIndex( + levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"] + ) + else: + expected.index = Index([88, 99], name="a1") + + if not as_index: + expected = expected.reset_index() + + result = df.groupby(keys, as_index=as_index).describe() + + tm.assert_frame_equal(result, expected) + + +def test_describe_duplicate_columns(): + # GH#50806 + df = DataFrame([[0, 1, 2, 3]]) + df.columns = [0, 1, 2, 0] + gb = df.groupby(df[1]) + result = gb.describe(percentiles=[]) + + columns = ["count", "mean", "std", "min", "50%", "max"] + frames = [ + DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) + for val in (0.0, 2.0, 3.0) + ] + expected = pd.concat(frames, axis=1) + expected.columns = MultiIndex( + levels=[[0, 2], columns], + codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], + ) + expected.index.names = [1] + tm.assert_frame_equal(result, expected) + + +class TestGroupByNonCythonPaths: + # GH#5610 non-cython calls should not include the grouper + # Tests for code not expected to go through cython paths. 
+ + @pytest.fixture + def df(self): + df = DataFrame( + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], + columns=["A", "B", "C"], + ) + return df + + @pytest.fixture + def gb(self, df): + gb = df.groupby("A") + return gb + + @pytest.fixture + def gni(self, df): + gni = df.groupby("A", as_index=False) + return gni + + def test_describe(self, df, gb, gni): + # describe + expected_index = Index([1, 3], name="A") + expected_col = MultiIndex( + levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], + codes=[[0] * 8, list(range(8))], + ) + expected = DataFrame( + [ + [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + ], + index=expected_index, + columns=expected_col, + ) + result = gb.describe() + tm.assert_frame_equal(result, expected) + + expected = expected.reset_index() + result = gni.describe() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [int, float, object]) +@pytest.mark.parametrize( + "kwargs", + [ + {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None}, + {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]}, + {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None}, + ], +) +def test_groupby_empty_dataset(dtype, kwargs): + # GH#41575 + df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype) + df["B"] = df["B"].astype(int) + df["C"] = df["C"].astype(float) + + result = df.iloc[:0].groupby("A").describe(**kwargs) + expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0] + tm.assert_frame_equal(result, expected) + + result = df.iloc[:0].groupby("A").B.describe(**kwargs) + expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0] + expected.index = Index([]) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_groupby_shift_diff.py pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_groupby_shift_diff.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_groupby_shift_diff.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_groupby_shift_diff.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,255 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + NaT, + Series, + Timedelta, + Timestamp, + date_range, +) +import pandas._testing as tm + + +def test_group_shift_with_null_key(): + # This test is designed to replicate the segfault in issue #13813. + n_rows = 1200 + + # Generate a moderately large dataframe with occasional missing + # values in column `B`, and then group by [`A`, `B`]. This should + # force `-1` in `labels` array of `g._grouper.group_info` exactly + # at those places, where the group-by key is partially missing. 
+ df = DataFrame( + [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) + g = df.groupby(["A", "B"]) + + expected = DataFrame( + [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) + result = g.shift(-1) + + tm.assert_frame_equal(result, expected) + + +def test_group_shift_with_fill_value(): + # GH #24128 + n_rows = 24 + df = DataFrame( + [(i % 12, i % 3, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) + g = df.groupby(["A", "B"]) + + expected = DataFrame( + [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) + result = g.shift(-1, fill_value=0) + + tm.assert_frame_equal(result, expected) + + +def test_group_shift_lose_timezone(): + # GH 30134 + now_dt = Timestamp.utcnow().as_unit("ns") + df = DataFrame({"a": [1, 1], "date": now_dt}) + result = df.groupby("a").shift(0).iloc[0] + expected = Series({"date": now_dt}, name=result.name) + tm.assert_series_equal(result, expected) + + +def test_group_diff_real_series(any_real_numpy_dtype): + df = DataFrame( + {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, + dtype=any_real_numpy_dtype, + ) + result = df.groupby("a")["b"].diff() + exp_dtype = "float" + if any_real_numpy_dtype in ["int8", "int16", "float32"]: + exp_dtype = "float32" + expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b") + tm.assert_series_equal(result, expected) + + +def test_group_diff_real_frame(any_real_numpy_dtype): + df = DataFrame( + { + "a": [1, 2, 3, 3, 2], + "b": [1, 2, 3, 4, 5], + "c": [1, 2, 3, 4, 6], + }, + dtype=any_real_numpy_dtype, + ) + result = df.groupby("a").diff() + exp_dtype = "float" + if any_real_numpy_dtype in ["int8", "int16", "float32"]: + exp_dtype = "float32" + expected = DataFrame( + { + "b": [np.nan, np.nan, np.nan, 1.0, 3.0], + "c": [np.nan, np.nan, np.nan, 1.0, 4.0], + }, + dtype=exp_dtype, + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + [ + Timestamp("2013-01-01"), + Timestamp("2013-01-02"), + Timestamp("2013-01-03"), + ], + [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")], + ], +) +def test_group_diff_datetimelike(data, unit): + df = DataFrame({"a": [1, 2, 2], "b": data}) + df["b"] = df["b"].dt.as_unit(unit) + result = df.groupby("a")["b"].diff() + expected = Series([NaT, NaT, Timedelta("1 days")], name="b").dt.as_unit(unit) + tm.assert_series_equal(result, expected) + + +def test_group_diff_bool(): + df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}) + result = df.groupby("a")["b"].diff() + expected = Series([np.nan, np.nan, np.nan, False, False], name="b") + tm.assert_series_equal(result, expected) + + +def test_group_diff_object_raises(object_dtype): + df = DataFrame( + {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype + ) + with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"): + df.groupby("a")["b"].diff() + + +def test_empty_shift_with_fill(): + # GH 41264, single-index check + df = DataFrame(columns=["a", "b", "c"]) + shifted = df.groupby(["a"]).shift(1) + shifted_with_fill = df.groupby(["a"]).shift(1, fill_value=0) + tm.assert_frame_equal(shifted, shifted_with_fill) + tm.assert_index_equal(shifted.index, shifted_with_fill.index) + + +def test_multindex_empty_shift_with_fill(): + # GH 41264, multi-index check + df = DataFrame(columns=["a", 
"b", "c"]) + shifted = df.groupby(["a", "b"]).shift(1) + shifted_with_fill = df.groupby(["a", "b"]).shift(1, fill_value=0) + tm.assert_frame_equal(shifted, shifted_with_fill) + tm.assert_index_equal(shifted.index, shifted_with_fill.index) + + +def test_shift_periods_freq(): + # GH 54093 + data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} + df = DataFrame(data, index=date_range(start="20100101", periods=6)) + result = df.groupby(df.index).shift(periods=-2, freq="D") + expected = DataFrame(data, index=date_range(start="2009-12-30", periods=6)) + tm.assert_frame_equal(result, expected) + + +def test_shift_deprecate_freq_and_fill_value(): + # GH 53832 + data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} + df = DataFrame(data, index=date_range(start="20100101", periods=6)) + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1") + + +def test_shift_disallow_suffix_if_periods_is_int(): + # GH#44424 + data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} + df = DataFrame(data) + msg = "Cannot specify `suffix` if `periods` is an int." + with pytest.raises(ValueError, match=msg): + df.groupby("b").shift(1, suffix="fails") + + +def test_group_shift_with_multiple_periods(): + # GH#44424 + df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}) + + shifted_df = df.groupby("b")[["a"]].shift([0, 1]) + expected_df = DataFrame( + {"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]} + ) + tm.assert_frame_equal(shifted_df, expected_df) + + # series + shifted_series = df.groupby("b")["a"].shift([0, 1]) + tm.assert_frame_equal(shifted_series, expected_df) + + +def test_group_shift_with_multiple_periods_and_freq(): + # GH#44424 + df = DataFrame( + {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, + index=date_range("1/1/2000", periods=5, freq="h"), + ) + shifted_df = df.groupby("b")[["a"]].shift( + [0, 1], + freq="h", + ) + expected_df = DataFrame( + { + "a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan], + "a_1": [ + np.nan, + 1.0, + 2.0, + 3.0, + 4.0, + 5.0, + ], + }, + index=date_range("1/1/2000", periods=6, freq="h"), + ) + tm.assert_frame_equal(shifted_df, expected_df) + + +def test_group_shift_with_multiple_periods_and_fill_value(): + # GH#44424 + df = DataFrame( + {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, + ) + shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1) + expected_df = DataFrame( + {"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]}, + ) + tm.assert_frame_equal(shifted_df, expected_df) + + +def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated(): + # GH#44424 + df = DataFrame( + {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, + index=date_range("1/1/2000", periods=5, freq="h"), + ) + msg = ( + "Passing a 'freq' together with a 'fill_value' silently ignores the " + "fill_value" + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h") diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_is_monotonic.py pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_is_monotonic.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_is_monotonic.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_is_monotonic.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,78 @@ +import numpy as np +import 
pytest + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) +def test_is_monotonic_increasing(in_vals, out_vals): + # GH 17015 + source_dict = { + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } + df = DataFrame(source_dict) + result = df.groupby("B").C.is_monotonic_increasing + index = Index(list("abcd"), name="B") + expected = Series(index=index, data=out_vals, name="C") + tm.assert_series_equal(result, expected) + + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) +def test_is_monotonic_decreasing(in_vals, out_vals): + # GH 17015 + source_dict = { + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } + + df = DataFrame(source_dict) + result = df.groupby("B").C.is_monotonic_decreasing + index = Index(list("abcd"), name="B") + expected = Series(index=index, data=out_vals, name="C") + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_nlargest_nsmallest.py pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_nlargest_nsmallest.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_nlargest_nsmallest.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_nlargest_nsmallest.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,115 @@ +import numpy as np +import pytest + +from pandas import ( + MultiIndex, + Series, + date_range, +) +import pandas._testing as tm + + +def test_nlargest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list("a" * 5 + "b" * 5)) + gb = a.groupby(b) + r = gb.nlargest(3) + e = Series( + [7, 5, 3, 10, 9, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]), + ) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series( + [3, 2, 1, 3, 3, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]), + ) + tm.assert_series_equal(gb.nlargest(3, keep="last"), e) + + +def test_nlargest_mi_grouper(): + # see gh-21411 + npr = np.random.default_rng(2) + + dts = date_range("20180101", periods=10) + iterables = [dts, ["one", "two"]] + + idx = MultiIndex.from_product(iterables, 
names=["first", "second"]) + s = Series(npr.standard_normal(20), index=idx) + + result = s.groupby("first").nlargest(1) + + exp_idx = MultiIndex.from_tuples( + [ + (dts[0], dts[0], "one"), + (dts[1], dts[1], "one"), + (dts[2], dts[2], "one"), + (dts[3], dts[3], "two"), + (dts[4], dts[4], "one"), + (dts[5], dts[5], "one"), + (dts[6], dts[6], "one"), + (dts[7], dts[7], "one"), + (dts[8], dts[8], "one"), + (dts[9], dts[9], "one"), + ], + names=["first", "first", "second"], + ) + + exp_values = [ + 0.18905338179353307, + -0.41306354339189344, + 1.799707382720902, + 0.7738065867276614, + 0.28121066979764925, + 0.9775674511260357, + -0.3288239040579627, + 0.45495807124085547, + 0.5452887139646817, + 0.12682784711186987, + ] + + expected = Series(exp_values, index=exp_idx) + tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3) + + +def test_nsmallest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list("a" * 5 + "b" * 5)) + gb = a.groupby(b) + r = gb.nsmallest(3) + e = Series( + [1, 2, 3, 0, 4, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), + ) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series( + [0, 1, 1, 0, 1, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), + ) + tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) + + +@pytest.mark.parametrize( + "data, groups", + [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])], +) +@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES]) +@pytest.mark.parametrize("method", ["nlargest", "nsmallest"]) +def test_nlargest_and_smallest_noop(data, groups, dtype, method): + # GH 15272, GH 16345, GH 29129 + # Test nlargest/smallest when it results in a noop, + # i.e. input is sorted and group size <= n + if dtype is not None: + data = np.array(data, dtype=dtype) + if method == "nlargest": + data = list(reversed(data)) + ser = Series(data, name="a") + result = getattr(ser.groupby(groups), method)(n=2) + expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups + expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_nth.py pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_nth.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_nth.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_nth.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,921 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + isna, +) +import pandas._testing as tm + + +def test_first_last_nth(df): + # tests for first / last / nth + grouped = df.groupby("A") + first = grouped.first() + expected = df.loc[[1, 0], ["B", "C", "D"]] + expected.index = Index(["bar", "foo"], name="A") + expected = expected.sort_index() + tm.assert_frame_equal(first, expected) + + nth = grouped.nth(0) + expected = df.loc[[0, 1]] + tm.assert_frame_equal(nth, expected) + + last = grouped.last() + expected = df.loc[[5, 7], ["B", "C", "D"]] + expected.index = Index(["bar", "foo"], name="A") + tm.assert_frame_equal(last, expected) + + nth = grouped.nth(-1) + expected = df.iloc[[5, 7]] + tm.assert_frame_equal(nth, expected) + + nth = grouped.nth(1) + expected = df.iloc[[2, 3]] + tm.assert_frame_equal(nth, expected) + + # it works! 
+ grouped["B"].first() + grouped["B"].last() + grouped["B"].nth(0) + + df = df.copy() + df.loc[df["A"] == "foo", "B"] = np.nan + grouped = df.groupby("A") + assert isna(grouped["B"].first()["foo"]) + assert isna(grouped["B"].last()["foo"]) + assert isna(grouped["B"].nth(0).iloc[0]) + + # v0.14.0 whatsnew + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") + result = g.first() + expected = df.iloc[[1, 2]].set_index("A") + tm.assert_frame_equal(result, expected) + + expected = df.iloc[[1, 2]] + result = g.nth(0, dropna="any") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("method", ["first", "last"]) +def test_first_last_with_na_object(method, nulls_fixture): + # https://github.com/pandas-dev/pandas/issues/32123 + groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a") + result = getattr(groups, method)() + + if method == "first": + values = [1, 3] + else: + values = [2, 3] + + values = np.array(values, dtype=result["b"].dtype) + idx = Index([1, 2], name="a") + expected = DataFrame({"b": values}, index=idx) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index", [0, -1]) +def test_nth_with_na_object(index, nulls_fixture): + # https://github.com/pandas-dev/pandas/issues/32123 + df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}) + groups = df.groupby("a") + result = groups.nth(index) + expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("method", ["first", "last"]) +def test_first_last_with_None(method): + # https://github.com/pandas-dev/pandas/issues/32800 + # None should be preserved as object dtype + df = DataFrame.from_dict({"id": ["a"], "value": [None]}) + groups = df.groupby("id", as_index=False) + result = getattr(groups, method)() + + tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("method", ["first", "last"]) +@pytest.mark.parametrize( + "df, expected", + [ + ( + DataFrame({"id": "a", "value": [None, "foo", np.nan]}), + DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")), + ), + ( + DataFrame({"id": "a", "value": [np.nan]}, dtype=object), + DataFrame({"value": [None]}, index=Index(["a"], name="id")), + ), + ], +) +def test_first_last_with_None_expanded(method, df, expected): + # GH 32800, 38286 + result = getattr(df.groupby("id"), method)() + tm.assert_frame_equal(result, expected) + + +def test_first_last_nth_dtypes(): + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), + } + ) + df["E"] = True + df["F"] = 1 + + # tests for first / last / nth + grouped = df.groupby("A") + first = grouped.first() + expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") + expected = expected.sort_index() + tm.assert_frame_equal(first, expected) + + last = grouped.last() + expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") + expected = expected.sort_index() + tm.assert_frame_equal(last, expected) + + nth = grouped.nth(1) + expected = df.iloc[[2, 3]] + tm.assert_frame_equal(nth, expected) + + +def test_first_last_nth_dtypes2(): + # GH 2763, first/last shifting dtypes + idx = list(range(10)) + idx.append(9) + ser = Series(data=range(11), 
index=idx, name="IntCol") + assert ser.dtype == "int64" + f = ser.groupby(level=0).first() + assert f.dtype == "int64" + + +def test_first_last_nth_nan_dtype(): + # GH 33591 + df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)}) + grouped = df.groupby("data") + + expected = df.set_index("data").nans + tm.assert_series_equal(grouped.nans.first(), expected) + tm.assert_series_equal(grouped.nans.last(), expected) + + expected = df.nans + tm.assert_series_equal(grouped.nans.nth(-1), expected) + tm.assert_series_equal(grouped.nans.nth(0), expected) + + +def test_first_strings_timestamps(): + # GH 11244 + test = DataFrame( + { + Timestamp("2012-01-01 00:00:00"): ["a", "b"], + Timestamp("2012-01-02 00:00:00"): ["c", "d"], + "name": ["e", "e"], + "aaaa": ["f", "g"], + } + ) + result = test.groupby("name").first() + expected = DataFrame( + [["a", "c", "f"]], + columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]), + index=Index(["e"], name="name"), + ) + tm.assert_frame_equal(result, expected) + + +def test_nth(): + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + gb = df.groupby("A") + + tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 2]]) + tm.assert_frame_equal(gb.nth(1), df.iloc[[1]]) + tm.assert_frame_equal(gb.nth(2), df.loc[[]]) + tm.assert_frame_equal(gb.nth(-1), df.iloc[[1, 2]]) + tm.assert_frame_equal(gb.nth(-2), df.iloc[[0]]) + tm.assert_frame_equal(gb.nth(-3), df.loc[[]]) + tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]]) + tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]]) + tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]]) + + tm.assert_frame_equal(gb.nth(0, dropna="any"), df.iloc[[1, 2]]) + tm.assert_frame_equal(gb.nth(-1, dropna="any"), df.iloc[[1, 2]]) + + tm.assert_frame_equal(gb.nth(7, dropna="any"), df.iloc[:0]) + tm.assert_frame_equal(gb.nth(2, dropna="any"), df.iloc[:0]) + + +def test_nth2(): + # out of bounds, regression from 0.13.1 + # GH 6621 + df = DataFrame( + { + "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"}, + "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"}, + "two": { + 0: 1.5456590000000001, + 1: -0.070345000000000005, + 2: -2.4004539999999999, + 3: 0.46206000000000003, + 4: 0.52350799999999997, + }, + "one": { + 0: 0.56573799999999996, + 1: -0.9742360000000001, + 2: 1.033801, + 3: -0.78543499999999999, + 4: 0.70422799999999997, + }, + } + ).set_index(["color", "food"]) + + result = df.groupby(level=0, as_index=False).nth(2) + expected = df.iloc[[-1]] + tm.assert_frame_equal(result, expected) + + result = df.groupby(level=0, as_index=False).nth(3) + expected = df.loc[[]] + tm.assert_frame_equal(result, expected) + + +def test_nth3(): + # GH 7559 + # from the vbench + df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64") + ser = df[1] + gb = df[0] + expected = ser.groupby(gb).first() + expected2 = ser.groupby(gb).apply(lambda x: x.iloc[0]) + tm.assert_series_equal(expected2, expected, check_names=False) + assert expected.name == 1 + assert expected2.name == 1 + + # validate first + v = ser[gb == 1].iloc[0] + assert expected.iloc[0] == v + assert expected2.iloc[0] == v + + with pytest.raises(ValueError, match="For a DataFrame"): + ser.groupby(gb, sort=False).nth(0, dropna=True) + + +def test_nth4(): + # doc example + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + gb = df.groupby("A") + result = gb.B.nth(0, dropna="all") + expected = df.B.iloc[[1, 2]] + tm.assert_series_equal(result, expected) + + +def test_nth5(): + # 
test multiple nth values + df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"]) + gb = df.groupby("A") + + tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 3]]) + tm.assert_frame_equal(gb.nth([0]), df.iloc[[0, 3]]) + tm.assert_frame_equal(gb.nth([0, 1]), df.iloc[[0, 1, 3, 4]]) + tm.assert_frame_equal(gb.nth([0, -1]), df.iloc[[0, 2, 3, 4]]) + tm.assert_frame_equal(gb.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]]) + tm.assert_frame_equal(gb.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]]) + tm.assert_frame_equal(gb.nth([2]), df.iloc[[2]]) + tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]]) + + +def test_nth_bdays(unit): + business_dates = pd.date_range( + start="4/1/2014", end="6/30/2014", freq="B", unit=unit + ) + df = DataFrame(1, index=business_dates, columns=["a", "b"]) + # get the first, fourth and last two business days for each month + key = [df.index.year, df.index.month] + result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) + expected_dates = pd.to_datetime( + [ + "2014/4/1", + "2014/4/4", + "2014/4/29", + "2014/4/30", + "2014/5/1", + "2014/5/6", + "2014/5/29", + "2014/5/30", + "2014/6/2", + "2014/6/5", + "2014/6/27", + "2014/6/30", + ] + ).as_unit(unit) + expected = DataFrame(1, columns=["a", "b"], index=expected_dates) + tm.assert_frame_equal(result, expected) + + +def test_nth_multi_grouper(three_group): + # PR 9090, related to issue 8979 + # test nth on multiple groupers + grouped = three_group.groupby(["A", "B"]) + result = grouped.nth(0) + expected = three_group.iloc[[0, 3, 4, 7]] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data, expected_first, expected_last", + [ + ( + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + ), + ( + { + "id": ["A", "B", "A"], + "time": [ + Timestamp("2012-01-01 13:00:00", tz="America/New_York"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + Timestamp("2012-03-01 12:00:00", tz="Europe/London"), + ], + "foo": [1, 2, 3], + }, + { + "id": ["A", "B"], + "time": [ + Timestamp("2012-01-01 13:00:00", tz="America/New_York"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + ], + "foo": [1, 2], + }, + { + "id": ["A", "B"], + "time": [ + Timestamp("2012-03-01 12:00:00", tz="Europe/London"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + ], + "foo": [3, 2], + }, + ), + ], +) +def test_first_last_tz(data, expected_first, expected_last): + # GH15884 + # Test that the timezone is retained when calling first + # or last on groupby with as_index=False + + df = DataFrame(data) + + result = df.groupby("id", as_index=False).first() + expected = DataFrame(expected_first) + cols = ["id", "time", "foo"] + tm.assert_frame_equal(result[cols], expected[cols]) + + result = df.groupby("id", as_index=False)["time"].first() + tm.assert_frame_equal(result, expected[["id", "time"]]) + + result = df.groupby("id", as_index=False).last() + expected = DataFrame(expected_last) + cols = ["id", "time", "foo"] + tm.assert_frame_equal(result[cols], expected[cols]) + + result = df.groupby("id", as_index=False)["time"].last() + tm.assert_frame_equal(result, expected[["id", "time"]]) + + +@pytest.mark.parametrize( + "method, ts, alpha", + [ + ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"], + ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"], + ], +) +def 
test_first_last_tz_multi_column(method, ts, alpha, unit): + # GH 21603 + category_string = Series(list("abc")).astype("category") + dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit) + df = DataFrame( + { + "group": [1, 1, 2], + "category_string": category_string, + "datetimetz": dti, + } + ) + result = getattr(df.groupby("group"), method)() + expected = DataFrame( + { + "category_string": pd.Categorical( + [alpha, "c"], dtype=category_string.dtype + ), + "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")], + }, + index=Index([1, 2], name="group"), + ) + expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + pd.array([True, False], dtype="boolean"), + pd.array([1, 2], dtype="Int64"), + pd.to_datetime(["2020-01-01", "2020-02-01"]), + pd.to_timedelta([1, 2], unit="D"), + ], +) +@pytest.mark.parametrize("function", ["first", "last", "min", "max"]) +def test_first_last_extension_array_keeps_dtype(values, function): + # https://github.com/pandas-dev/pandas/issues/33071 + # https://github.com/pandas-dev/pandas/issues/32194 + df = DataFrame({"a": [1, 2], "b": values}) + grouped = df.groupby("a") + idx = Index([1, 2], name="a") + expected_series = Series(values, name="b", index=idx) + expected_frame = DataFrame({"b": values}, index=idx) + + result_series = getattr(grouped["b"], function)() + tm.assert_series_equal(result_series, expected_series) + + result_frame = grouped.agg({"b": function}) + tm.assert_frame_equal(result_frame, expected_frame) + + +def test_nth_multi_index_as_expected(): + # PR 9090, related to issue 8979 + # test nth on MultiIndex + three_group = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + } + ) + grouped = three_group.groupby(["A", "B"]) + result = grouped.nth(0) + expected = three_group.iloc[[0, 3, 4, 7]] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "op, n, expected_rows", + [ + ("head", -1, [0]), + ("head", 0, []), + ("head", 1, [0, 2]), + ("head", 7, [0, 1, 2]), + ("tail", -1, [1]), + ("tail", 0, []), + ("tail", 1, [1, 2]), + ("tail", 7, [0, 1, 2]), + ], +) +@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]]) +@pytest.mark.parametrize("as_index", [True, False]) +def test_groupby_head_tail(op, n, expected_rows, columns, as_index): + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A", as_index=as_index) + expected = df.iloc[expected_rows] + if columns is not None: + g = g[columns] + expected = expected[columns] + result = getattr(g, op)(n) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "op, n, expected_cols", + [ + ("head", -1, [0]), + ("head", 0, []), + ("head", 1, [0, 2]), + ("head", 7, [0, 1, 2]), + ("tail", -1, [1]), + ("tail", 0, []), + ("tail", 1, [1, 2]), + ("tail", 7, [0, 1, 2]), + ], +) +def test_groupby_head_tail_axis_1(op, n, expected_cols): + # GH 9772 + df = DataFrame( + [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"] + ) + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + g = df.groupby([0, 0, 1], axis=1) + expected = 
df.iloc[:, expected_cols] + result = getattr(g, op)(n) + tm.assert_frame_equal(result, expected) + + +def test_group_selection_cache(): + # GH 12839 nth, head, and tail should return same result consistently + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + expected = df.iloc[[0, 2]] + + g = df.groupby("A") + result1 = g.head(n=2) + result2 = g.nth(0) + tm.assert_frame_equal(result1, df) + tm.assert_frame_equal(result2, expected) + + g = df.groupby("A") + result1 = g.tail(n=2) + result2 = g.nth(0) + tm.assert_frame_equal(result1, df) + tm.assert_frame_equal(result2, expected) + + g = df.groupby("A") + result1 = g.nth(0) + result2 = g.head(n=2) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, df) + + g = df.groupby("A") + result1 = g.nth(0) + result2 = g.tail(n=2) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, df) + + +def test_nth_empty(): + # GH 16064 + df = DataFrame(index=[0], columns=["a", "b", "c"]) + result = df.groupby("a").nth(10) + expected = df.iloc[:0] + tm.assert_frame_equal(result, expected) + + result = df.groupby(["a", "b"]).nth(10) + expected = df.iloc[:0] + tm.assert_frame_equal(result, expected) + + +def test_nth_column_order(): + # GH 20760 + # Check that nth preserves column order + df = DataFrame( + [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]], + columns=["A", "C", "B"], + ) + result = df.groupby("A").nth(0) + expected = df.iloc[[0, 3]] + tm.assert_frame_equal(result, expected) + + result = df.groupby("A").nth(-1, dropna="any") + expected = df.iloc[[1, 4]] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dropna", [None, "any", "all"]) +def test_nth_nan_in_grouper(dropna): + # GH 26011 + df = DataFrame( + { + "a": [np.nan, "a", np.nan, "b", np.nan], + "b": [0, 2, 4, 6, 8], + "c": [1, 3, 5, 7, 9], + } + ) + result = df.groupby("a").nth(0, dropna=dropna) + expected = df.iloc[[1, 3]] + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dropna", [None, "any", "all"]) +def test_nth_nan_in_grouper_series(dropna): + # GH 26454 + df = DataFrame( + { + "a": [np.nan, "a", np.nan, "b", np.nan], + "b": [0, 2, 4, 6, 8], + } + ) + result = df.groupby("a")["b"].nth(0, dropna=dropna) + expected = df["b"].iloc[[1, 3]] + + tm.assert_series_equal(result, expected) + + +def test_first_categorical_and_datetime_data_nat(): + # GH 20520 + df = DataFrame( + { + "group": ["first", "first", "second", "third", "third"], + "time": 5 * [np.datetime64("NaT")], + "categories": Series(["a", "b", "c", "a", "b"], dtype="category"), + } + ) + result = df.groupby("group").first() + expected = DataFrame( + { + "time": 3 * [np.datetime64("NaT")], + "categories": Series(["a", "c", "a"]).astype( + pd.CategoricalDtype(["a", "b", "c"]) + ), + } + ) + expected.index = Index(["first", "second", "third"], name="group") + tm.assert_frame_equal(result, expected) + + +def test_first_multi_key_groupby_categorical(): + # GH 22512 + df = DataFrame( + { + "A": [1, 1, 1, 2, 2], + "B": [100, 100, 200, 100, 100], + "C": ["apple", "orange", "mango", "mango", "orange"], + "D": ["jupiter", "mercury", "mars", "venus", "venus"], + } + ) + df = df.astype({"D": "category"}) + result = df.groupby(by=["A", "B"]).first() + expected = DataFrame( + { + "C": ["apple", "mango", "mango"], + "D": Series(["jupiter", "mars", "venus"]).astype( + pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"]) + ), + } + ) + expected.index = MultiIndex.from_tuples( + [(1, 100), (1, 200), (2, 100)], 
names=["A", "B"] + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("method", ["first", "last", "nth"]) +def test_groupby_last_first_nth_with_none(method, nulls_fixture): + # GH29645 + expected = Series(["y"]) + data = Series( + [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture], + index=[0, 0, 0, 0, 0], + ).groupby(level=0) + + if method == "nth": + result = getattr(data, method)(3) + else: + result = getattr(data, method)() + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "arg, expected_rows", + [ + [slice(None, 3, 2), [0, 1, 4, 5]], + [slice(None, -2), [0, 2, 5]], + [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], + [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], + ], +) +def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): + # Test slices GH #42947 + + result = slice_test_grouped.nth[arg] + equivalent = slice_test_grouped.nth(arg) + expected = slice_test_df.iloc[expected_rows] + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(equivalent, expected) + + +def test_nth_indexed(slice_test_df, slice_test_grouped): + # Test index notation GH #44688 + + result = slice_test_grouped.nth[0, 1, -2:] + equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)]) + expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(equivalent, expected) + + +def test_invalid_argument(slice_test_grouped): + # Test for error on invalid argument + + with pytest.raises(TypeError, match="Invalid index"): + slice_test_grouped.nth(3.14) + + +def test_negative_step(slice_test_grouped): + # Test for error on negative slice step + + with pytest.raises(ValueError, match="Invalid step"): + slice_test_grouped.nth(slice(None, None, -1)) + + +def test_np_ints(slice_test_df, slice_test_grouped): + # Test np ints work + + result = slice_test_grouped.nth(np.array([0, 1])) + expected = slice_test_df.iloc[[0, 1, 2, 3, 4]] + tm.assert_frame_equal(result, expected) + + +def test_groupby_nth_with_column_axis(): + # GH43926 + df = DataFrame( + [ + [4, 5, 6], + [8, 8, 7], + ], + index=["z", "y"], + columns=["C", "B", "A"], + ) + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby(df.iloc[1], axis=1) + result = gb.nth(0) + expected = df.iloc[:, [0, 2]] + tm.assert_frame_equal(result, expected) + + +def test_groupby_nth_interval(): + # GH#24205 + idx_result = MultiIndex( + [ + pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]), + pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]), + ], + [[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]], + ) + df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result) + result = df_result.groupby(level=[0, 1], observed=False).nth(0) + val_expected = [0, 1, 3] + idx_expected = MultiIndex( + [ + pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]), + pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]), + ], + [[0, 0, 1], [0, 1, 0]], + ) + expected = DataFrame(val_expected, index=idx_expected, columns=["col"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "start, stop, expected_values, expected_columns", + [ + (None, None, [0, 1, 2, 3, 4], list("ABCDE")), + (None, 1, [0, 3], list("AD")), + (None, 9, [0, 1, 2, 3, 4], list("ABCDE")), + (None, -1, [0, 1, 3], list("ABD")), + (1, None, [1, 2, 4], list("BCE")), + (1, -1, [1], list("B")), + (-1, None, [2, 4], list("CE")), + (-1, 2, [4], 
list("E")), + ], +) +@pytest.mark.parametrize("method", ["call", "index"]) +def test_nth_slices_with_column_axis( + start, stop, expected_values, expected_columns, method +): + df = DataFrame([range(5)], columns=[list("ABCDE")]) + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby([5, 5, 5, 6, 6], axis=1) + result = { + "call": lambda start, stop: gb.nth(slice(start, stop)), + "index": lambda start, stop: gb.nth[start:stop], + }[method](start, stop) + expected = DataFrame([expected_values], columns=[expected_columns]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.filterwarnings( + "ignore:invalid value encountered in remainder:RuntimeWarning" +) +def test_head_tail_dropna_true(): + # GH#45089 + df = DataFrame( + [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"] + ) + expected = DataFrame([["a", "z"]], columns=["X", "Y"]) + + result = df.groupby(["X", "Y"]).head(n=1) + tm.assert_frame_equal(result, expected) + + result = df.groupby(["X", "Y"]).tail(n=1) + tm.assert_frame_equal(result, expected) + + result = df.groupby(["X", "Y"]).nth(n=0) + tm.assert_frame_equal(result, expected) + + +def test_head_tail_dropna_false(): + # GH#45089 + df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"]) + expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"]) + + result = df.groupby(["X", "Y"], dropna=False).head(n=1) + tm.assert_frame_equal(result, expected) + + result = df.groupby(["X", "Y"], dropna=False).tail(n=1) + tm.assert_frame_equal(result, expected) + + result = df.groupby(["X", "Y"], dropna=False).nth(n=0) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"])) +@pytest.mark.parametrize("dropna", ["any", "all", None]) +def test_nth_after_selection(selection, dropna): + # GH#11038, GH#53518 + df = DataFrame( + { + "a": [1, 1, 2], + "b": [np.nan, 3, 4], + "c": [5, 6, 7], + } + ) + gb = df.groupby("a")[selection] + result = gb.nth(0, dropna=dropna) + if dropna == "any" or (dropna == "all" and selection != ["b", "c"]): + locs = [1, 2] + else: + locs = [0, 2] + expected = df.loc[locs, selection] + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + ( + Timestamp("2011-01-15 12:50:28.502376"), + Timestamp("2011-01-20 12:50:28.593448"), + ), + (24650000000000001, 24650000000000002), + ], +) +def test_groupby_nth_int_like_precision(data): + # GH#6620, GH#9311 + df = DataFrame({"a": [1, 1], "b": data}) + + grouped = df.groupby("a") + result = grouped.nth(0) + expected = DataFrame({"a": 1, "b": [data[0]]}) + + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_quantile.py pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_quantile.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_quantile.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_quantile.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,496 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + Index, +) +import pandas._testing as tm + + +@pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] +) +@pytest.mark.parametrize( + "a_vals,b_vals", + [ + # Ints + ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), + ([1, 2, 3, 4], [4, 3, 2, 1]), + ([1, 2, 3, 4, 5], [4, 3, 2, 1]), + # Floats + ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 
4.0, 3.0, 2.0, 1.0]), + # Missing data + ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]), + ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]), + # Timestamps + ( + pd.date_range("1/1/18", freq="D", periods=5), + pd.date_range("1/1/18", freq="D", periods=5)[::-1], + ), + ( + pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"), + pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"), + ), + # All NA + ([np.nan] * 5, [np.nan] * 5), + ], +) +@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1]) +def test_quantile(interpolation, a_vals, b_vals, q, request): + if ( + interpolation == "nearest" + and q == 0.5 + and isinstance(b_vals, list) + and b_vals == [4, 3, 2, 1] + ): + request.applymarker( + pytest.mark.xfail( + reason="Unclear numpy expectation for nearest " + "result with equidistant data" + ) + ) + all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)]) + + a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation) + b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation) + + df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals}) + + expected = DataFrame( + [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key") + ) + if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M": + # TODO(non-nano): this should be unnecessary once array_to_datetime + # correctly infers non-nano from Timestamp.unit + expected = expected.astype(all_vals.dtype) + result = df.groupby("key").quantile(q, interpolation=interpolation) + + tm.assert_frame_equal(result, expected) + + +def test_quantile_array(): + # https://github.com/pandas-dev/pandas/issues/27526 + df = DataFrame({"A": [0, 1, 2, 3, 4]}) + key = np.array([0, 0, 1, 1, 1], dtype=np.int64) + result = df.groupby(key).quantile([0.25]) + + index = pd.MultiIndex.from_product([[0, 1], [0.25]]) + expected = DataFrame({"A": [0.25, 2.50]}, index=index) + tm.assert_frame_equal(result, expected) + + df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]}) + index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]) + + key = np.array([0, 0, 1, 1], dtype=np.int64) + result = df.groupby(key).quantile([0.25, 0.75]) + expected = DataFrame( + {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array2(): + # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959 + arr = np.random.default_rng(2).integers(0, 5, size=(10, 3), dtype=np.int64) + df = DataFrame(arr, columns=list("ABC")) + result = df.groupby("A").quantile([0.3, 0.7]) + expected = DataFrame( + { + "B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7], + "C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8], + }, + index=pd.MultiIndex.from_product( + [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None] + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_no_sort(): + df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) + key = np.array([1, 0, 1], dtype=np.int64) + result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75]) + expected = DataFrame( + {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]), + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby(key, sort=False).quantile([0.75, 0.25]) + expected = DataFrame( + {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]}, + 
index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_multiple_levels(): + df = DataFrame( + {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]} + ) + result = df.groupby(["c", "d"]).quantile([0.25, 0.75]) + index = pd.MultiIndex.from_tuples( + [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)], + names=["c", "d", None], + ) + expected = DataFrame( + {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)]) +@pytest.mark.parametrize("groupby", [[0], [0, 1]]) +@pytest.mark.parametrize("q", [[0.5, 0.6]]) +def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): + # GH30289 + nrow, ncol = frame_size + df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol)) + + idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q] + idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [ + list(range(len(q))) * min(nrow, 4) + ] + expected_index = pd.MultiIndex( + levels=idx_levels, codes=idx_codes, names=groupby + [None] + ) + expected_values = [ + [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q + ] + expected_columns = [x for x in range(ncol) if x not in groupby] + expected = DataFrame( + expected_values, index=expected_index, columns=expected_columns + ) + result = df.groupby(groupby).quantile(q) + + tm.assert_frame_equal(result, expected) + + +def test_quantile_raises(): + df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) + + with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): + df.groupby("key").quantile() + + +def test_quantile_out_of_bounds_q_raises(): + # https://github.com/pandas-dev/pandas/issues/27470 + df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)}) + g = df.groupby([0, 0, 0, 1, 1, 1]) + with pytest.raises(ValueError, match="Got '50.0' instead"): + g.quantile(50) + + with pytest.raises(ValueError, match="Got '-1.0' instead"): + g.quantile(-1) + + +def test_quantile_missing_group_values_no_segfaults(): + # GH 28662 + data = np.array([1.0, np.nan, 1.0]) + df = DataFrame({"key": data, "val": range(3)}) + + # Random segfaults; would have been guaranteed in loop + grp = df.groupby("key") + for _ in range(100): + grp.quantile() + + +@pytest.mark.parametrize( + "key, val, expected_key, expected_val", + [ + ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]), + ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]), + (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]), + ([0], [42], [0], [42.0]), + ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")), + ], +) +def test_quantile_missing_group_values_correct_results( + key, val, expected_key, expected_val +): + # GH 28662, GH 33200, GH 33569 + df = DataFrame({"key": key, "val": val}) + + expected = DataFrame( + expected_val, index=Index(expected_key, name="key"), columns=["val"] + ) + + grp = df.groupby("key") + + result = grp.quantile(0.5) + tm.assert_frame_equal(result, expected) + + result = grp.quantile() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + pd.array([1, 0, None] * 2, dtype="Int64"), + pd.array([True, False, None] * 2, dtype="boolean"), + ], +) +@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) +def 
test_groupby_quantile_nullable_array(values, q): + # https://github.com/pandas-dev/pandas/issues/33136 + df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values}) + result = df.groupby("a")["b"].quantile(q) + + if isinstance(q, list): + idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) + true_quantiles = [0.0, 0.5, 1.0] + else: + idx = Index(["x", "y"], name="a") + true_quantiles = [0.5] + + expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): + df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) + if numeric_only: + result = df.groupby("a").quantile(q, numeric_only=numeric_only) + expected = df.groupby("a")[["b"]].quantile(q) + tm.assert_frame_equal(result, expected) + else: + with pytest.raises( + TypeError, match="'quantile' cannot be performed against 'object' dtypes!" + ): + df.groupby("a").quantile(q, numeric_only=numeric_only) + + +def test_groupby_quantile_NA_float(any_float_dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) + result = df.groupby("x")["y"].quantile(0.5) + exp_index = Index([1.0], dtype=any_float_dtype, name="x") + + if any_float_dtype in ["Float32", "Float64"]: + expected_dtype = any_float_dtype + else: + expected_dtype = None + + expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y") + tm.assert_series_equal(result, expected) + + result = df.groupby("x")["y"].quantile([0.5, 0.75]) + expected = pd.Series( + [0.2] * 2, + index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]), + name="y", + dtype=expected_dtype, + ) + tm.assert_series_equal(result, expected) + + +def test_groupby_quantile_NA_int(any_int_ea_dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series( + [3.5], + dtype="Float64", + index=Index([1], name="x", dtype=any_int_ea_dtype), + name="y", + ) + tm.assert_series_equal(expected, result) + + result = df.groupby("x").quantile(0.5) + expected = DataFrame( + {"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype) + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)] +) +def test_groupby_quantile_all_na_group_masked( + interpolation, val1, val2, any_numeric_ea_dtype +): + # GH#37493 + df = DataFrame( + {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype + ) + result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation) + expected = DataFrame( + {"b": [val1, val2, pd.NA, pd.NA]}, + dtype=any_numeric_ea_dtype, + index=pd.MultiIndex.from_arrays( + [pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]], + names=["a", None], + ), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("interpolation", ["midpoint", "linear"]) +def test_groupby_quantile_all_na_group_masked_interp( + interpolation, any_numeric_ea_dtype +): + # GH#37493 + df = DataFrame( + {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype + ) + result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation) + + if any_numeric_ea_dtype == "Float32": + expected_dtype = any_numeric_ea_dtype + else: + 
expected_dtype = "Float64" + + expected = DataFrame( + {"b": [2.0, 2.5, pd.NA, pd.NA]}, + dtype=expected_dtype, + index=pd.MultiIndex.from_arrays( + [ + pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), + [0.5, 0.75, 0.5, 0.75], + ], + names=["a", None], + ), + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["Float64", "Float32"]) +def test_groupby_quantile_allNA_column(dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series( + [np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y" + ) + expected.index.name = "x" + tm.assert_series_equal(expected, result) + + +def test_groupby_timedelta_quantile(): + # GH: 29485 + df = DataFrame( + {"value": pd.to_timedelta(np.arange(4), unit="s"), "group": [1, 1, 2, 2]} + ) + result = df.groupby("group").quantile(0.99) + expected = DataFrame( + { + "value": [ + pd.Timedelta("0 days 00:00:00.990000"), + pd.Timedelta("0 days 00:00:02.990000"), + ] + }, + index=Index([1, 2], name="group"), + ) + tm.assert_frame_equal(result, expected) + + +def test_columns_groupby_quantile(): + # GH 33795 + df = DataFrame( + np.arange(12).reshape(3, -1), + index=list("XYZ"), + columns=pd.Series(list("ABAB"), name="col"), + ) + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby("col", axis=1) + result = gb.quantile(q=[0.8, 0.2]) + expected = DataFrame( + [ + [1.6, 0.4, 2.6, 1.4], + [5.6, 4.4, 6.6, 5.4], + [9.6, 8.4, 10.6, 9.4], + ], + index=list("XYZ"), + columns=pd.MultiIndex.from_tuples( + [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None] + ), + ) + + tm.assert_frame_equal(result, expected) + + +def test_timestamp_groupby_quantile(unit): + # GH 33168 + dti = pd.date_range( + start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit + ).floor("1h") + df = DataFrame( + { + "timestamp": dti, + "category": list(range(1, 101)), + "value": list(range(101, 201)), + } + ) + + result = df.groupby("timestamp").quantile([0.2, 0.8]) + + mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None)) + expected = DataFrame( + [ + {"category": 12.8, "value": 112.8}, + {"category": 48.2, "value": 148.2}, + {"category": 68.8, "value": 168.8}, + {"category": 92.2, "value": 192.2}, + ], + index=mi, + ) + + tm.assert_frame_equal(result, expected) + + +def test_groupby_quantile_dt64tz_period(): + # GH#51373 + dti = pd.date_range("2016-01-01", periods=1000) + df = pd.Series(dti).to_frame().copy() + df[1] = dti.tz_localize("US/Pacific") + df[2] = dti.to_period("D") + df[3] = dti - dti[0] + df.iloc[-1] = pd.NaT + + by = np.tile(np.arange(5), 200) + gb = df.groupby(by) + + result = gb.quantile(0.5) + + # Check that we match the group-by-group result + exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)} + expected = DataFrame(exp).T.infer_objects() + expected.index = expected.index.astype(int) + + tm.assert_frame_equal(result, expected) + + +def test_groupby_quantile_nonmulti_levels_order(): + # Non-regression test for GH #53009 + ind = pd.MultiIndex.from_tuples( + [ + (0, "a", "B"), + (0, "a", "A"), + (0, "b", "B"), + (0, "b", "A"), + (1, "a", "B"), + (1, "a", "A"), + (1, "b", "B"), + (1, "b", "A"), + ], + names=["sample", "cat0", "cat1"], + ) + ser = pd.Series(range(8), index=ind) + result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8]) + + qind = pd.MultiIndex.from_tuples( + [("B", 0.2), ("B", 0.8), 
("A", 0.2), ("A", 0.8)], names=["cat1", None] + ) + expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind) + + tm.assert_series_equal(result, expected) + + # We need to check that index levels are not sorted + expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]]) + tm.assert_equal(result.index.levels, expected_levels) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_rank.py pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_rank.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_rank.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_rank.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,721 @@ +from datetime import datetime + +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + NaT, + Series, + concat, +) +import pandas._testing as tm + + +def test_rank_unordered_categorical_typeerror(): + # GH#51034 should be TypeError, not NotImplementedError + cat = pd.Categorical([], ordered=False) + ser = Series(cat) + df = ser.to_frame() + + msg = "Cannot perform rank with non-ordered Categorical" + + gb = ser.groupby(cat, observed=False) + with pytest.raises(TypeError, match=msg): + gb.rank() + + gb2 = df.groupby(cat, observed=False) + with pytest.raises(TypeError, match=msg): + gb2.rank() + + +def test_rank_apply(): + lev1 = np.array(["a" * 10] * 100, dtype=object) + lev2 = np.array(["b" * 10] * 130, dtype=object) + lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int) + lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int) + + df = DataFrame( + { + "value": np.random.default_rng(2).standard_normal(500), + "key1": lev1.take(lab1), + "key2": lev2.take(lab2), + } + ) + + result = df.groupby(["key1", "key2"]).value.rank() + + expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])] + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + tm.assert_series_equal(result, expected) + + result = df.groupby(["key1", "key2"]).value.rank(pct=True) + + expected = [ + piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"]) + ] + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", + [ + np.array([2, 2, 8, 2, 6], dtype=dtype) + for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"] + ] + + [ + [ + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-08"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-06"), + ], + [ + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-08", tz="US/Pacific"), + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-06", tz="US/Pacific"), + ], + [ + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-08") - pd.Timestamp(0), + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-06") - pd.Timestamp(0), + ], + [ + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-08").to_period("D"), + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-06").to_period("D"), + ], + ], + ids=lambda x: type(x[0]), +) +@pytest.mark.parametrize( + "ties_method,ascending,pct,exp", + [ + ("average", True, False, 
[2.0, 2.0, 5.0, 2.0, 4.0]), + ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), + ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]), + ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]), + ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]), + ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), + ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), + ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]), + ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]), + ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), + ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]), + ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]), + ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]), + ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), + ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]), + ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]), + ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]), + ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]), + ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), + ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]), + ], +) +def test_rank_args(grps, vals, ties_method, ascending, pct, exp): + key = np.repeat(grps, len(vals)) + + orig_vals = vals + vals = list(vals) * len(grps) + if isinstance(orig_vals, np.ndarray): + vals = np.array(vals, dtype=orig_vals.dtype) + + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct) + + exp_df = DataFrame(exp * len(grps), columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]] +) +@pytest.mark.parametrize( + "ties_method,ascending,na_option,exp", + [ + ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), + ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]), + ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]), + ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), + ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]), + ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]), + ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]), + ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]), + ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]), + ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]), + ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]), + ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]), + ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]), + ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]), + ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]), + ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]), + ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]), + ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]), + ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]), + ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]), + ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]), + ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]), + ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]), + ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]), + ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]), + ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]), + ("dense", True, "bottom", [1.0, 
1.0, 4.0, 2.0, 4.0, 3.0, 3.0]), + ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]), + ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]), + ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]), + ], +) +def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): + # GH 20561 + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option + ) + exp_df = DataFrame(exp * len(grps), columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", + [ + np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype) + for dtype in ["f8", "f4", "f2"] + ] + + [ + [ + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-02"), + np.nan, + pd.Timestamp("2018-01-08"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-06"), + np.nan, + np.nan, + ], + [ + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-02", tz="US/Pacific"), + np.nan, + pd.Timestamp("2018-01-08", tz="US/Pacific"), + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-06", tz="US/Pacific"), + np.nan, + np.nan, + ], + [ + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + np.nan, + pd.Timestamp("2018-01-08") - pd.Timestamp(0), + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-06") - pd.Timestamp(0), + np.nan, + np.nan, + ], + [ + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-02").to_period("D"), + np.nan, + pd.Timestamp("2018-01-08").to_period("D"), + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-06").to_period("D"), + np.nan, + np.nan, + ], + ], + ids=lambda x: type(x[0]), +) +@pytest.mark.parametrize( + "ties_method,ascending,na_option,pct,exp", + [ + ( + "average", + True, + "keep", + False, + [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan], + ), + ( + "average", + True, + "keep", + True, + [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan], + ), + ( + "average", + False, + "keep", + False, + [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan], + ), + ( + "average", + False, + "keep", + True, + [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan], + ), + ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]), + ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), + ( + "min", + False, + "keep", + False, + [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], + ), + ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]), + ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ( + "max", + False, + "keep", + False, + [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], + ), + ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]), + ( + "first", + True, + "keep", + False, + [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan], + ), + ( + "first", + True, + "keep", + True, + [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan], + ), + ( + "first", + False, + "keep", + False, + [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], + ), + ( + "first", + False, + "keep", + True, + [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan], + ), + ( + "dense", + True, + "keep", + False, + [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, 
np.nan], + ), + ( + "dense", + True, + "keep", + True, + [ + 1.0 / 3.0, + 1.0 / 3.0, + np.nan, + 3.0 / 3.0, + 1.0 / 3.0, + 2.0 / 3.0, + np.nan, + np.nan, + ], + ), + ( + "dense", + False, + "keep", + False, + [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], + ), + ( + "dense", + False, + "keep", + True, + [ + 3.0 / 3.0, + 3.0 / 3.0, + np.nan, + 1.0 / 3.0, + 3.0 / 3.0, + 2.0 / 3.0, + np.nan, + np.nan, + ], + ), + ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]), + ( + "average", + True, + "bottom", + True, + [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875], + ), + ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]), + ( + "average", + False, + "bottom", + True, + [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875], + ), + ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]), + ( + "min", + True, + "bottom", + True, + [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75], + ), + ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]), + ( + "min", + False, + "bottom", + True, + [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75], + ), + ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]), + ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]), + ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]), + ( + "max", + False, + "bottom", + True, + [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0], + ), + ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]), + ( + "first", + True, + "bottom", + True, + [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0], + ), + ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]), + ( + "first", + False, + "bottom", + True, + [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0], + ), + ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]), + ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]), + ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]), + ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]), + ], +) +def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp): + key = np.repeat(grps, len(vals)) + + orig_vals = vals + vals = list(vals) * len(grps) + if isinstance(orig_vals, np.ndarray): + vals = np.array(vals, dtype=orig_vals.dtype) + + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) + + exp_df = DataFrame(exp * len(grps), columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize( + "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])] +) +def test_rank_resets_each_group(pct, exp): + df = DataFrame( + {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10} + ) + result = df.groupby("key").rank(pct=pct) + exp_df = DataFrame(exp * 2, columns=["val"]) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize( + "dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"] +) +@pytest.mark.parametrize("upper", [True, False]) +def test_rank_avg_even_vals(dtype, upper): + if upper: + # use IntegerDtype/FloatingDtype + dtype = dtype[0].upper() + dtype[1:] + dtype = dtype.replace("Ui", "UI") + df = DataFrame({"key": ["a"] * 4, "val": [1] * 4}) + df["val"] = df["val"].astype(dtype) + assert df["val"].dtype == dtype + + result = 
df.groupby("key").rank() + exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"]) + if upper: + exp_df = exp_df.astype("Float64") + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) +@pytest.mark.parametrize("pct", [True, False]) +@pytest.mark.parametrize( + "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]] +) +def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals): + df = DataFrame({"key": ["foo"] * 5, "val": vals}) + mask = df["val"].isna() + + gb = df.groupby("key") + res = gb.rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct) + + # construct our expected by using numeric values with the same ordering + if mask.any(): + df2 = DataFrame({"key": ["foo"] * 5, "val": [0, np.nan, 2, np.nan, 1]}) + else: + df2 = DataFrame({"key": ["foo"] * 5, "val": [0, 0, 2, 0, 1]}) + + gb2 = df2.groupby("key") + alt = gb2.rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) + + tm.assert_frame_equal(res, alt) + + +@pytest.mark.parametrize("na_option", [True, "bad", 1]) +@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("pct", [True, False]) +@pytest.mark.parametrize( + "vals", + [ + ["bar", "bar", "foo", "bar", "baz"], + ["bar", np.nan, "foo", np.nan, "baz"], + [1, np.nan, 2, np.nan, 3], + ], +) +def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals): + df = DataFrame({"key": ["foo"] * 5, "val": vals}) + msg = "na_option must be one of 'keep', 'top', or 'bottom'" + + with pytest.raises(ValueError, match=msg): + df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) + + +def test_rank_empty_group(): + # see gh-22519 + column = "A" + df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]}) + + result = df.groupby(column).B.rank(pct=True) + expected = Series([0.5, np.nan, 1.0], name="B") + tm.assert_series_equal(result, expected) + + result = df.groupby(column).rank(pct=True) + expected = DataFrame({"B": [0.5, np.nan, 1.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "input_key,input_value,output_value", + [ + ([1, 2], [1, 1], [1.0, 1.0]), + ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]), + ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]), + ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]), + ], +) +def test_rank_zero_div(input_key, input_value, output_value): + # GH 23666 + df = DataFrame({"A": input_key, "B": input_value}) + + result = df.groupby("A").rank(method="dense", pct=True) + expected = DataFrame({"B": output_value}) + tm.assert_frame_equal(result, expected) + + +def test_rank_min_int(): + # GH-32859 + df = DataFrame( + { + "grp": [1, 1, 2], + "int_col": [ + np.iinfo(np.int64).min, + np.iinfo(np.int64).max, + np.iinfo(np.int64).min, + ], + "datetimelike": [NaT, datetime(2001, 1, 1), NaT], + } + ) + + result = df.groupby("grp").rank() + expected = DataFrame( + {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]} + ) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("use_nan", [True, False]) +def test_rank_pct_equal_values_on_group_transition(use_nan): + # GH#40518 + fill_value = np.nan if use_nan else 3 + df = DataFrame( + [ + [-1, 1], + 
[-1, 2], + [1, fill_value], + [-1, fill_value], + ], + columns=["group", "val"], + ) + result = df.groupby(["group"])["val"].rank( + method="dense", + pct=True, + ) + if use_nan: + expected = Series([0.5, 1, np.nan, np.nan], name="val") + else: + expected = Series([1 / 3, 2 / 3, 1, 1], name="val") + + tm.assert_series_equal(result, expected) + + +def test_rank_multiindex(): + # GH27721 + df = concat( + { + "a": DataFrame({"col1": [3, 4], "col2": [1, 2]}), + "b": DataFrame({"col3": [5, 6], "col4": [7, 8]}), + }, + axis=1, + ) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby(level=0, axis=1) + msg = "DataFrameGroupBy.rank with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.rank(axis=1) + + expected = concat( + [ + df["a"].rank(axis=1), + df["b"].rank(axis=1), + ], + axis=1, + keys=["a", "b"], + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_axis0_rank_axis1(): + # GH#41320 + df = DataFrame( + {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}, + index=["a", "a", "b", "b"], + ) + msg = "The 'axis' keyword in DataFrame.groupby is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby(level=0, axis=0) + + msg = "DataFrameGroupBy.rank with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = gb.rank(axis=1) + + # This should match what we get when "manually" operating group-by-group + expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0) + tm.assert_frame_equal(res, expected) + + # check that we haven't accidentally written a case that coincidentally + # matches rank(axis=0) + msg = "The 'axis' keyword in DataFrameGroupBy.rank" + with tm.assert_produces_warning(FutureWarning, match=msg): + alt = gb.rank(axis=0) + assert not alt.equals(expected) + + +def test_groupby_axis0_cummax_axis1(): + # case where groupby axis is 0 and axis keyword in transform is 1 + + # df has mixed dtype -> multiple blocks + df = DataFrame( + {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}, + index=["a", "a", "b", "b"], + ) + msg = "The 'axis' keyword in DataFrame.groupby is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gb = df.groupby(level=0, axis=0) + + msg = "DataFrameGroupBy.cummax with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + cmax = gb.cummax(axis=1) + expected = df[[0, 1]].astype(np.float64) + expected[2] = expected[1] + tm.assert_frame_equal(cmax, expected) + + +def test_non_unique_index(): + # GH 16577 + df = DataFrame( + {"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0}, + index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, + ) + result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True) + expected = Series( + [1.0, 1.0, 1.0, np.nan], + index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, + name="value", + ) + tm.assert_series_equal(result, expected) + + +def test_rank_categorical(): + cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True) + cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True) + + df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2}) + + gb = df.groupby("col1") + + res = gb.rank() + + expected = df.astype(object).groupby("col1").rank() + tm.assert_frame_equal(res, expected) + + +@pytest.mark.parametrize("na_option", ["top", "bottom"]) +def test_groupby_op_with_nullables(na_option): + # GH 54206 + 
df = DataFrame({"x": [None]}, dtype="Float64") + result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option) + expected = Series([1.0], dtype="Float64", name=result.name) + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_sample.py pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_sample.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_sample.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_sample.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,154 @@ +import pytest + +from pandas import ( + DataFrame, + Index, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)]) +def test_groupby_sample_balanced_groups_shape(n, frac): + values = [1] * 10 + [2] * 10 + df = DataFrame({"a": values, "b": values}) + + result = df.groupby("a").sample(n=n, frac=frac) + values = [1] * 2 + [2] * 2 + expected = DataFrame({"a": values, "b": values}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=n, frac=frac) + expected = Series(values, name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_unbalanced_groups_shape(): + values = [1] * 10 + [2] * 20 + df = DataFrame({"a": values, "b": values}) + + result = df.groupby("a").sample(n=5) + values = [1] * 5 + [2] * 5 + expected = DataFrame({"a": values, "b": values}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=5) + expected = Series(values, name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_index_value_spans_groups(): + values = [1] * 3 + [2] * 3 + df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2]) + + result = df.groupby("a").sample(n=2) + values = [1] * 2 + [2] * 2 + expected = DataFrame({"a": values, "b": values}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=2) + expected = Series(values, name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_n_and_frac_raises(): + df = DataFrame({"a": [1, 2], "b": [1, 2]}) + msg = "Please enter a value for `frac` OR `n`, not both" + + with pytest.raises(ValueError, match=msg): + df.groupby("a").sample(n=1, frac=1.0) + + with pytest.raises(ValueError, match=msg): + df.groupby("a")["b"].sample(n=1, frac=1.0) + + +def test_groupby_sample_frac_gt_one_without_replacement_raises(): + df = DataFrame({"a": [1, 2], "b": [1, 2]}) + msg = "Replace has to be set to `True` when upsampling the population `frac` > 1." + + with pytest.raises(ValueError, match=msg): + df.groupby("a").sample(frac=1.5, replace=False) + + with pytest.raises(ValueError, match=msg): + df.groupby("a")["b"].sample(frac=1.5, replace=False) + + +@pytest.mark.parametrize("n", [-1, 1.5]) +def test_groupby_sample_invalid_n_raises(n): + df = DataFrame({"a": [1, 2], "b": [1, 2]}) + + if n < 0: + msg = "A negative number of rows requested. Please provide `n` >= 0." 
+ else: + msg = "Only integers accepted as `n` values" + + with pytest.raises(ValueError, match=msg): + df.groupby("a").sample(n=n) + + with pytest.raises(ValueError, match=msg): + df.groupby("a")["b"].sample(n=n) + + +def test_groupby_sample_oversample(): + values = [1] * 10 + [2] * 10 + df = DataFrame({"a": values, "b": values}) + + result = df.groupby("a").sample(frac=2.0, replace=True) + values = [1] * 20 + [2] * 20 + expected = DataFrame({"a": values, "b": values}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(frac=2.0, replace=True) + expected = Series(values, name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_without_n_or_frac(): + values = [1] * 10 + [2] * 10 + df = DataFrame({"a": values, "b": values}) + + result = df.groupby("a").sample(n=None, frac=None) + expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=None, frac=None) + expected = Series([1, 2], name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "index, expected_index", + [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])], +) +def test_groupby_sample_with_weights(index, expected_index): + # GH 39927 - tests for integer index needed + values = [1] * 2 + [2] * 2 + df = DataFrame({"a": values, "b": values}, index=Index(index)) + + result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0]) + expected = DataFrame({"a": values, "b": values}, index=Index(expected_index)) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0]) + expected = Series(values, name="b", index=Index(expected_index)) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_with_selections(): + # GH 39928 + values = [1] * 10 + [2] * 10 + df = DataFrame({"a": values, "b": values, "c": values}) + + result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None) + expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index) + tm.assert_frame_equal(result, expected) + + +def test_groupby_sample_with_empty_inputs(): + # GH48459 + df = DataFrame({"a": [], "b": []}) + groupby_df = df.groupby("a") + + result = groupby_df.sample() + expected = df + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_size.py pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_size.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_size.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_size.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,130 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas.core.dtypes.common import is_integer_dtype + +from pandas import ( + DataFrame, + Index, + PeriodIndex, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) +def test_size(df, by): + grouped = df.groupby(by=by) + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + +@pytest.mark.parametrize( + "by", + [ + [0, 0, 0, 0], + [0, 1, 1, 1], + [1, 0, 1, 1], + [0, None, None, None], + pytest.param([None, None, None, None], marks=pytest.mark.xfail), + ], +) +def test_size_axis_1(df, axis_1, by, sort, dropna): + # GH#45715 + counts = {key: sum(value == key for value in by) for key 
in dict.fromkeys(by)} + if dropna: + counts = {key: value for key, value in counts.items() if key is not None} + expected = Series(counts, dtype="int64") + if sort: + expected = expected.sort_index() + if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): + expected.index = expected.index.astype(int) + + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna) + result = grouped.size() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) +@pytest.mark.parametrize("sort", [True, False]) +def test_size_sort(sort, by): + df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC")) + left = df.groupby(by=by, sort=sort).size() + right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0]) + tm.assert_series_equal(left, right, check_names=False) + + +def test_size_series_dataframe(): + # https://github.com/pandas-dev/pandas/issues/11699 + df = DataFrame(columns=["A", "B"]) + out = Series(dtype="int64", index=Index([], name="A")) + tm.assert_series_equal(df.groupby("A").size(), out) + + +def test_size_groupby_all_null(): + # https://github.com/pandas-dev/pandas/issues/23050 + # Assert no 'Value Error : Length of passed values is 2, index implies 0' + df = DataFrame({"A": [None, None]}) # all-null groups + result = df.groupby("A").size() + expected = Series(dtype="int64", index=Index([], name="A")) + tm.assert_series_equal(result, expected) + + +def test_size_period_index(): + # https://github.com/pandas-dev/pandas/issues/34010 + ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D")) + grp = ser.groupby(level="A") + result = grp.size() + tm.assert_series_equal(result, ser) + + +@pytest.mark.parametrize("as_index", [True, False]) +def test_size_on_categorical(as_index): + df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"]) + df["A"] = df["A"].astype("category") + result = df.groupby(["A", "B"], as_index=as_index, observed=False).size() + + expected = DataFrame( + [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"] + ) + expected["A"] = expected["A"].astype("category") + if as_index: + expected = expected.set_index(["A", "B"])["size"].rename(None) + + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +def test_size_series_masked_type_returns_Int64(dtype): + # GH 54132 + ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype) + result = ser.groupby(level=0).size() + expected = Series([2, 1], dtype="Int64", index=["a", "b"]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) +def test_size_strings(dtype): + # GH#55627 + df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) + result = df.groupby("a")["b"].size() + exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" + expected = Series( + [2, 1], + index=Index(["a", "b"], name="a", dtype=dtype), + name="b", + dtype=exp_dtype, + ) + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_skew.py pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_skew.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_skew.py 1970-01-01 00:00:00.000000000 +0000 +++ 
pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_skew.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,27 @@ +import numpy as np + +import pandas as pd +import pandas._testing as tm + + +def test_groupby_skew_equivalence(): + # Test that that groupby skew method (which uses libgroupby.group_skew) + # matches the results of operating group-by-group (which uses nanops.nanskew) + nrows = 1000 + ngroups = 3 + ncols = 2 + nan_frac = 0.05 + + arr = np.random.default_rng(2).standard_normal((nrows, ncols)) + arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan + + df = pd.DataFrame(arr) + grps = np.random.default_rng(2).integers(0, ngroups, size=nrows) + gb = df.groupby(grps) + + result = gb.skew() + + grpwise = [grp.skew().to_frame(i).T for i, grp in gb] + expected = pd.concat(grpwise, axis=0) + expected.index = expected.index.astype(result.index.dtype) # 32bit builds + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_value_counts.py pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_value_counts.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/methods/test_value_counts.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/methods/test_value_counts.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,1241 @@ +""" +these are systematically testing all of the args to value_counts +with different size combinations. This is to ensure stability of the sorting +and proper parameter handling +""" + + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + Grouper, + Index, + MultiIndex, + Series, + date_range, + to_datetime, +) +import pandas._testing as tm +from pandas.util.version import Version + + +def tests_value_counts_index_names_category_column(): + # GH44324 Missing name of index category column + df = DataFrame( + { + "gender": ["female"], + "country": ["US"], + } + ) + df["gender"] = df["gender"].astype("category") + result = df.groupby("country")["gender"].value_counts() + + # Construct expected, very specific multiindex + df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"]) + df_mi_expected["gender"] = df_mi_expected["gender"].astype("category") + mi_expected = MultiIndex.from_frame(df_mi_expected) + expected = Series([1], index=mi_expected, name="count") + + tm.assert_series_equal(result, expected) + + +def seed_df(seed_nans, n, m): + days = date_range("2015-08-24", periods=10) + + frame = DataFrame( + { + "1st": np.random.default_rng(2).choice(list("abcd"), n), + "2nd": np.random.default_rng(2).choice(days, n), + "3rd": np.random.default_rng(2).integers(1, m + 1, n), + } + ) + + if seed_nans: + # Explicitly cast to float to avoid implicit cast when setting nan + frame["3rd"] = frame["3rd"].astype("float") + frame.loc[1::11, "1st"] = np.nan + frame.loc[3::17, "2nd"] = np.nan + frame.loc[7::19, "3rd"] = np.nan + frame.loc[8::19, "3rd"] = np.nan + frame.loc[9::19, "3rd"] = np.nan + + return frame + + +@pytest.mark.slow +@pytest.mark.parametrize("seed_nans", [True, False]) +@pytest.mark.parametrize("num_rows", [10, 50]) +@pytest.mark.parametrize("max_int", [5, 20]) +@pytest.mark.parametrize("keys", ["1st", "2nd", ["1st", "2nd"]], ids=repr) +@pytest.mark.parametrize("bins", [None, [0, 5]], ids=repr) +@pytest.mark.parametrize("isort", [True, False]) +@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) +@pytest.mark.parametrize("sort", [True, False]) 
+@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("dropna", [True, False]) +def test_series_groupby_value_counts( + seed_nans, + num_rows, + max_int, + keys, + bins, + isort, + normalize, + name, + sort, + ascending, + dropna, +): + df = seed_df(seed_nans, num_rows, max_int) + + def rebuild_index(df): + arr = list(map(df.index.get_level_values, range(df.index.nlevels))) + df.index = MultiIndex.from_arrays(arr, names=df.index.names) + return df + + kwargs = { + "normalize": normalize, + "sort": sort, + "ascending": ascending, + "dropna": dropna, + "bins": bins, + } + + gr = df.groupby(keys, sort=isort) + left = gr["3rd"].value_counts(**kwargs) + + gr = df.groupby(keys, sort=isort) + right = gr["3rd"].apply(Series.value_counts, **kwargs) + right.index.names = right.index.names[:-1] + ["3rd"] + # https://github.com/pandas-dev/pandas/issues/49909 + right = right.rename(name) + + # have to sort on index because of unstable sort on values + left, right = map(rebuild_index, (left, right)) # xref GH9212 + tm.assert_series_equal(left.sort_index(), right.sort_index()) + + +@pytest.mark.parametrize("utc", [True, False]) +def test_series_groupby_value_counts_with_grouper(utc): + # GH28479 + df = DataFrame( + { + "Timestamp": [ + 1565083561, + 1565083561 + 86400, + 1565083561 + 86500, + 1565083561 + 86400 * 2, + 1565083561 + 86400 * 3, + 1565083561 + 86500 * 3, + 1565083561 + 86400 * 4, + ], + "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"], + } + ).drop([3]) + + df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s") + dfg = df.groupby(Grouper(freq="1D", key="Datetime")) + + # have to sort on index because of unstable sort on values xref GH9212 + result = dfg["Food"].value_counts().sort_index() + expected = dfg["Food"].apply(Series.value_counts).sort_index() + expected.index.names = result.index.names + # https://github.com/pandas-dev/pandas/issues/49909 + expected = expected.rename("count") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]]) +def test_series_groupby_value_counts_empty(columns): + # GH39172 + df = DataFrame(columns=columns) + dfg = df.groupby(columns[:-1]) + + result = dfg[columns[-1]].value_counts() + expected = Series([], dtype=result.dtype, name="count") + expected.index = MultiIndex.from_arrays([[]] * len(columns), names=columns) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]]) +def test_series_groupby_value_counts_one_row(columns): + # GH42618 + df = DataFrame(data=[range(len(columns))], columns=columns) + dfg = df.groupby(columns[:-1]) + + result = dfg[columns[-1]].value_counts() + expected = df.value_counts() + + tm.assert_series_equal(result, expected) + + +def test_series_groupby_value_counts_on_categorical(): + # GH38672 + + s = Series(Categorical(["a"], categories=["a", "b"])) + result = s.groupby([0]).value_counts() + + expected = Series( + data=[1, 0], + index=MultiIndex.from_arrays( + [ + np.array([0, 0]), + CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, dtype="category" + ), + ] + ), + name="count", + ) + + # Expected: + # 0 a 1 + # b 0 + # dtype: int64 + + tm.assert_series_equal(result, expected) + + +def test_series_groupby_value_counts_no_sort(): + # GH#50482 + df = DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", "high", "low", "high", "low"], + "country": ["US", "FR", "US", "FR", 
"FR", "FR"], + } + ) + gb = df.groupby(["country", "gender"], sort=False)["education"] + result = gb.value_counts(sort=False) + index = MultiIndex( + levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]], + codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]], + names=["country", "gender", "education"], + ) + expected = Series([1, 1, 1, 2, 1], index=index, name="count") + tm.assert_series_equal(result, expected) + + +@pytest.fixture +def education_df(): + return DataFrame( + { + "gender": ["male", "male", "female", "male", "female", "male"], + "education": ["low", "medium", "high", "low", "high", "low"], + "country": ["US", "FR", "US", "FR", "FR", "FR"], + } + ) + + +def test_axis(education_df): + msg = "DataFrame.groupby with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + gp = education_df.groupby("country", axis=1) + with pytest.raises(NotImplementedError, match="axis"): + gp.value_counts() + + +def test_bad_subset(education_df): + gp = education_df.groupby("country") + with pytest.raises(ValueError, match="subset"): + gp.value_counts(subset=["country"]) + + +def test_basic(education_df, request): + # gh43564 + if Version(np.__version__) >= Version("1.25"): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + result = education_df.groupby("country")[["gender", "education"]].value_counts( + normalize=True + ) + expected = Series( + data=[0.5, 0.25, 0.25, 0.5, 0.5], + index=MultiIndex.from_tuples( + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("US", "female", "high"), + ("US", "male", "low"), + ], + names=["country", "gender", "education"], + ), + name="proportion", + ) + tm.assert_series_equal(result, expected) + + +def _frame_value_counts(df, keys, normalize, sort, ascending): + return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) + + +@pytest.mark.parametrize("groupby", ["column", "array", "function"]) +@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) +@pytest.mark.parametrize( + "sort, ascending", + [ + (False, None), + (True, True), + (True, False), + ], +) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("frame", [True, False]) +def test_against_frame_and_seriesgroupby( + education_df, groupby, normalize, name, sort, ascending, as_index, frame, request +): + # test all parameters: + # - Use column, array or function as by= parameter + # - Whether or not to normalize + # - Whether or not to sort and how + # - Whether or not to use the groupby as an index + # - 3-way compare against: + # - apply with :meth:`~DataFrame.value_counts` + # - `~SeriesGroupBy.value_counts` + if Version(np.__version__) >= Version("1.25") and frame and sort and normalize: + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + by = { + "column": "country", + "array": education_df["country"].values, + "function": lambda x: education_df["country"][x] == "US", + }[groupby] + + gp = education_df.groupby(by=by, as_index=as_index) + result = gp[["gender", "education"]].value_counts( + normalize=normalize, sort=sort, ascending=ascending + ) + if frame: + # compare against apply with DataFrame value_counts + warn = DeprecationWarning if groupby == "column" else None + 
msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + expected = gp.apply( + _frame_value_counts, ["gender", "education"], normalize, sort, ascending + ) + + if as_index: + tm.assert_series_equal(result, expected) + else: + name = "proportion" if normalize else "count" + expected = expected.reset_index().rename({0: name}, axis=1) + if groupby == "column": + expected = expected.rename({"level_0": "country"}, axis=1) + expected["country"] = np.where(expected["country"], "US", "FR") + elif groupby == "function": + expected["level_0"] = expected["level_0"] == 1 + else: + expected["level_0"] = np.where(expected["level_0"], "US", "FR") + tm.assert_frame_equal(result, expected) + else: + # compare against SeriesGroupBy value_counts + education_df["both"] = education_df["gender"] + "-" + education_df["education"] + expected = gp["both"].value_counts( + normalize=normalize, sort=sort, ascending=ascending + ) + expected.name = name + if as_index: + index_frame = expected.index.to_frame(index=False) + index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) + index_frame["education"] = index_frame["both"].str.split("-").str.get(1) + del index_frame["both"] + index_frame = index_frame.rename({0: None}, axis=1) + expected.index = MultiIndex.from_frame(index_frame) + tm.assert_series_equal(result, expected) + else: + expected.insert(1, "gender", expected["both"].str.split("-").str.get(0)) + expected.insert(2, "education", expected["both"].str.split("-").str.get(1)) + del expected["both"] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype", + [ + object, + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], +) +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize( + "sort, ascending, expected_rows, expected_count, expected_group_size", + [ + (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]), + (True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]), + (True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]), + ], +) +def test_compound( + education_df, + normalize, + sort, + ascending, + expected_rows, + expected_count, + expected_group_size, + dtype, +): + education_df = education_df.astype(dtype) + education_df.columns = education_df.columns.astype(dtype) + # Multiple groupby keys and as_index=False + gp = education_df.groupby(["country", "gender"], as_index=False, sort=False) + result = gp["education"].value_counts( + normalize=normalize, sort=sort, ascending=ascending + ) + expected = DataFrame() + for column in ["country", "gender", "education"]: + expected[column] = [education_df[column][row] for row in expected_rows] + expected = expected.astype(dtype) + expected.columns = expected.columns.astype(dtype) + if normalize: + expected["proportion"] = expected_count + expected["proportion"] /= expected_group_size + if dtype == "string[pyarrow]": + expected["proportion"] = expected["proportion"].convert_dtypes() + else: + expected["count"] = expected_count + if dtype == "string[pyarrow]": + expected["count"] = expected["count"].convert_dtypes() + tm.assert_frame_equal(result, expected) + + +@pytest.fixture +def animals_df(): + return DataFrame( + {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + + +@pytest.mark.parametrize( + "sort, ascending, normalize, name, expected_data, 
expected_index", + [ + (False, None, False, "count", [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]), + (True, True, False, "count", [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]), + (True, False, False, "count", [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), + ( + True, + False, + True, + "proportion", + [0.5, 0.25, 0.25], + [(1, 1, 1), (4, 2, 6), (0, 2, 0)], + ), + ], +) +def test_data_frame_value_counts( + animals_df, sort, ascending, normalize, name, expected_data, expected_index +): + # 3-way compare with :meth:`~DataFrame.value_counts` + # Tests from frame/methods/test_value_counts.py + result_frame = animals_df.value_counts( + sort=sort, ascending=ascending, normalize=normalize + ) + expected = Series( + data=expected_data, + index=MultiIndex.from_arrays( + expected_index, names=["key", "num_legs", "num_wings"] + ), + name=name, + ) + tm.assert_series_equal(result_frame, expected) + + result_frame_groupby = animals_df.groupby("key").value_counts( + sort=sort, ascending=ascending, normalize=normalize + ) + + tm.assert_series_equal(result_frame_groupby, expected) + + +@pytest.fixture +def nulls_df(): + n = np.nan + return DataFrame( + { + "A": [1, 1, n, 4, n, 6, 6, 6, 6], + "B": [1, 1, 3, n, n, 6, 6, 6, 6], + "C": [1, 2, 3, 4, 5, 6, n, 8, n], + "D": [1, 2, 3, 4, 5, 6, 7, n, n], + } + ) + + +@pytest.mark.parametrize( + "group_dropna, count_dropna, expected_rows, expected_values", + [ + ( + False, + False, + [0, 1, 3, 5, 7, 6, 8, 2, 4], + [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], + ), + (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), + (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), + (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), + ], +) +def test_dropna_combinations( + nulls_df, group_dropna, count_dropna, expected_rows, expected_values, request +): + if Version(np.__version__) >= Version("1.25") and not group_dropna: + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + gp = nulls_df.groupby(["A", "B"], dropna=group_dropna) + result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna) + columns = DataFrame() + for column in nulls_df.columns: + columns[column] = [nulls_df[column][row] for row in expected_rows] + index = MultiIndex.from_frame(columns) + expected = Series(data=expected_values, index=index, name="proportion") + tm.assert_series_equal(result, expected) + + +@pytest.fixture +def names_with_nulls_df(nulls_fixture): + return DataFrame( + { + "key": [1, 1, 1, 1], + "first_name": ["John", "Anne", "John", "Beth"], + "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) + + +@pytest.mark.parametrize( + "dropna, expected_data, expected_index", + [ + ( + True, + [1, 1], + MultiIndex.from_arrays( + [(1, 1), ("Beth", "John"), ("Louise", "Smith")], + names=["key", "first_name", "middle_name"], + ), + ), + ( + False, + [1, 1, 1, 1], + MultiIndex( + levels=[ + Index([1]), + Index(["Anne", "Beth", "John"]), + Index(["Louise", "Smith", np.nan]), + ], + codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]], + names=["key", "first_name", "middle_name"], + ), + ), + ], +) +@pytest.mark.parametrize("normalize, name", [(False, "count"), (True, "proportion")]) +def test_data_frame_value_counts_dropna( + names_with_nulls_df, dropna, normalize, name, expected_data, expected_index +): + # GH 41334 + # 3-way compare with :meth:`~DataFrame.value_counts` + # Tests with nulls from 
frame/methods/test_value_counts.py + result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize) + expected = Series( + data=expected_data, + index=expected_index, + name=name, + ) + if normalize: + expected /= float(len(expected_data)) + + tm.assert_series_equal(result_frame, expected) + + result_frame_groupby = names_with_nulls_df.groupby("key").value_counts( + dropna=dropna, normalize=normalize + ) + + tm.assert_series_equal(result_frame_groupby, expected) + + +@pytest.mark.parametrize("as_index", [False, True]) +@pytest.mark.parametrize("observed", [False, True]) +@pytest.mark.parametrize( + "normalize, name, expected_data", + [ + ( + False, + "count", + np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), + ), + ( + True, + "proportion", + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_single_grouper_with_only_observed_categories( + education_df, as_index, observed, normalize, name, expected_data, request +): + # Test single categorical grouper with only observed grouping categories + # when non-groupers are also categorical + if Version(np.__version__) >= Version("1.25"): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + + gp = education_df.astype("category").groupby( + "country", as_index=as_index, observed=observed + ) + result = gp.value_counts(normalize=normalize) + + expected_index = MultiIndex.from_tuples( + [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ], + names=["country", "gender", "education"], + ) + + expected_series = Series( + data=expected_data, + index=expected_index, + name=name, + ) + for i in range(3): + expected_series.index = expected_series.index.set_levels( + CategoricalIndex(expected_series.index.levels[i]), level=i + ) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +def assert_categorical_single_grouper( + education_df, as_index, observed, expected_index, normalize, name, expected_data +): + # Test single categorical grouper when non-groupers are also categorical + education_df = education_df.copy().astype("category") + + # Add non-observed grouping categories + education_df["country"] = education_df["country"].cat.add_categories(["ASIA"]) + + gp = education_df.groupby("country", as_index=as_index, observed=observed) + result = gp.value_counts(normalize=normalize) + + expected_series = Series( + data=expected_data, + index=MultiIndex.from_tuples( + expected_index, + names=["country", "gender", "education"], + ), + name=name, + ) + for i in range(3): + index_level = CategoricalIndex(expected_series.index.levels[i]) + if i == 0: + index_level = index_level.set_categories( + education_df["country"].cat.categories + ) + expected_series.index = expected_series.index.set_levels(index_level, level=i) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index(name=name) + tm.assert_frame_equal(result, expected) + + 
+@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize( + "normalize, name, expected_data", + [ + ( + False, + "count", + np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), + ), + ( + True, + "proportion", + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_single_grouper_observed_true( + education_df, as_index, normalize, name, expected_data, request +): + # GH#46357 + + if Version(np.__version__) >= Version("1.25"): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] + + assert_categorical_single_grouper( + education_df=education_df, + as_index=as_index, + observed=True, + expected_index=expected_index, + normalize=normalize, + name=name, + expected_data=expected_data, + ) + + +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize( + "normalize, name, expected_data", + [ + ( + False, + "count", + np.array( + [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64 + ), + ), + ( + True, + "proportion", + np.array( + [ + 0.5, + 0.25, + 0.25, + 0.0, + 0.0, + 0.0, + 0.5, + 0.5, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ] + ), + ), + ], +) +def test_categorical_single_grouper_observed_false( + education_df, as_index, normalize, name, expected_data, request +): + # GH#46357 + + if Version(np.__version__) >= Version("1.25"): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ("ASIA", "female", "high"), + ("ASIA", "female", "low"), + ("ASIA", "female", "medium"), + ("ASIA", "male", "high"), + ("ASIA", "male", "low"), + ("ASIA", "male", "medium"), + ] + + assert_categorical_single_grouper( + education_df=education_df, + as_index=as_index, + observed=False, + expected_index=expected_index, + normalize=normalize, + name=name, + expected_data=expected_data, + ) + + +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize( + "observed, expected_index", + [ + ( + False, + [ + ("FR", "high", "female"), + ("FR", "high", "male"), + ("FR", "low", "male"), + ("FR", "low", "female"), + ("FR", "medium", "male"), + ("FR", "medium", "female"), + ("US", "high", "female"), + ("US", "high", "male"), + ("US", "low", "male"), + ("US", "low", "female"), + ("US", "medium", "female"), + ("US", "medium", "male"), + ], + ), + ( + True, + [ + ("FR", "high", "female"), + ("FR", "low", "male"), + ("FR", "medium", "male"), + ("US", "high", "female"), + ("US", "low", "male"), + ], + ), + ], +) +@pytest.mark.parametrize( + "normalize, name, expected_data", + [ + ( + False, + "count", + 
np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64), + ), + ( + True, + "proportion", + # NaN values corresponds to non-observed groups + np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_multiple_groupers( + education_df, as_index, observed, expected_index, normalize, name, expected_data +): + # GH#46357 + + # Test multiple categorical groupers when non-groupers are non-categorical + education_df = education_df.copy() + education_df["country"] = education_df["country"].astype("category") + education_df["education"] = education_df["education"].astype("category") + + gp = education_df.groupby( + ["country", "education"], as_index=as_index, observed=observed + ) + result = gp.value_counts(normalize=normalize) + + expected_series = Series( + data=expected_data[expected_data > 0.0] if observed else expected_data, + index=MultiIndex.from_tuples( + expected_index, + names=["country", "education", "gender"], + ), + name=name, + ) + for i in range(2): + expected_series.index = expected_series.index.set_levels( + CategoricalIndex(expected_series.index.levels[i]), level=i + ) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("as_index", [False, True]) +@pytest.mark.parametrize("observed", [False, True]) +@pytest.mark.parametrize( + "normalize, name, expected_data", + [ + ( + False, + "count", + np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), + ), + ( + True, + "proportion", + # NaN values corresponds to non-observed groups + np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), + ), + ], +) +def test_categorical_non_groupers( + education_df, as_index, observed, normalize, name, expected_data, request +): + # GH#46357 Test non-observed categories are included in the result, + # regardless of `observed` + + if Version(np.__version__) >= Version("1.25"): + request.applymarker( + pytest.mark.xfail( + reason=( + "pandas default unstable sorting of duplicates" + "issue with numpy>=1.25 with AVX instructions" + ), + strict=False, + ) + ) + + education_df = education_df.copy() + education_df["gender"] = education_df["gender"].astype("category") + education_df["education"] = education_df["education"].astype("category") + + gp = education_df.groupby("country", as_index=as_index, observed=observed) + result = gp.value_counts(normalize=normalize) + + expected_index = [ + ("FR", "male", "low"), + ("FR", "female", "high"), + ("FR", "male", "medium"), + ("FR", "female", "low"), + ("FR", "female", "medium"), + ("FR", "male", "high"), + ("US", "female", "high"), + ("US", "male", "low"), + ("US", "female", "low"), + ("US", "female", "medium"), + ("US", "male", "high"), + ("US", "male", "medium"), + ] + expected_series = Series( + data=expected_data, + index=MultiIndex.from_tuples( + expected_index, + names=["country", "gender", "education"], + ), + name=name, + ) + for i in range(1, 3): + expected_series.index = expected_series.index.set_levels( + CategoricalIndex(expected_series.index.levels[i]), level=i + ) + + if as_index: + tm.assert_series_equal(result, expected_series) + else: + expected = expected_series.reset_index( + name="proportion" if normalize else "count" + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "normalize, expected_label, expected_values", + [ + (False, "count", [1, 1, 1]), + 
(True, "proportion", [0.5, 0.5, 1.0]), + ], +) +def test_mixed_groupings(normalize, expected_label, expected_values): + # Test multiple groupings + df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + gp = df.groupby([[4, 5, 4], "A", lambda i: 7 if i == 1 else 8], as_index=False) + result = gp.value_counts(sort=True, normalize=normalize) + expected = DataFrame( + { + "level_0": np.array([4, 4, 5], dtype=int), + "A": [1, 1, 2], + "level_2": [8, 8, 7], + "B": [1, 3, 2], + expected_label: expected_values, + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "test, columns, expected_names", + [ + ("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]), + ("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]), + ], +) +@pytest.mark.parametrize("as_index", [False, True]) +def test_column_label_duplicates(test, columns, expected_names, as_index): + # GH 44992 + # Test for duplicate input column labels and generated duplicate labels + df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns) + expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)] + keys = ["a", np.array([0, 1], dtype=np.int64), "d"] + result = df.groupby(keys, as_index=as_index).value_counts() + if as_index: + expected = Series( + data=(1, 1), + index=MultiIndex.from_tuples( + expected_data, + names=expected_names, + ), + name="count", + ) + tm.assert_series_equal(result, expected) + else: + expected_data = [list(row) + [1] for row in expected_data] + expected_columns = list(expected_names) + expected_columns[1] = "level_1" + expected_columns.append("count") + expected = DataFrame(expected_data, columns=expected_columns) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "normalize, expected_label", + [ + (False, "count"), + (True, "proportion"), + ], +) +def test_result_label_duplicates(normalize, expected_label): + # Test for result column label duplicating an input column label + gb = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label]).groupby( + "a", as_index=False + ) + msg = f"Column label '{expected_label}' is duplicate of result column" + with pytest.raises(ValueError, match=msg): + gb.value_counts(normalize=normalize) + + +def test_ambiguous_grouping(): + # Test that groupby is not confused by groupings length equal to row count + df = DataFrame({"a": [1, 1]}) + gb = df.groupby(np.array([1, 1], dtype=np.int64)) + result = gb.value_counts() + expected = Series( + [2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]), name="count" + ) + tm.assert_series_equal(result, expected) + + +def test_subset_overlaps_gb_key_raises(): + # GH 46383 + df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) + msg = "Keys {'c1'} in subset cannot be in the groupby column keys." + with pytest.raises(ValueError, match=msg): + df.groupby("c1").value_counts(subset=["c1"]) + + +def test_subset_doesnt_exist_in_frame(): + # GH 46383 + df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) + msg = "Keys {'c3'} in subset do not exist in the DataFrame." 
+ with pytest.raises(ValueError, match=msg): + df.groupby("c1").value_counts(subset=["c3"]) + + +def test_subset(): + # GH 46383 + df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) + result = df.groupby(level=0).value_counts(subset=["c2"]) + expected = Series( + [1, 2], + index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]), + name="count", + ) + tm.assert_series_equal(result, expected) + + +def test_subset_duplicate_columns(): + # GH 46383 + df = DataFrame( + [["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]], + index=[0, 1, 1], + columns=["c1", "c2", "c2"], + ) + result = df.groupby(level=0).value_counts(subset=["c2"]) + expected = Series( + [1, 2], + index=MultiIndex.from_arrays( + [[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"] + ), + name="count", + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("utc", [True, False]) +def test_value_counts_time_grouper(utc, unit): + # GH#50486 + df = DataFrame( + { + "Timestamp": [ + 1565083561, + 1565083561 + 86400, + 1565083561 + 86500, + 1565083561 + 86400 * 2, + 1565083561 + 86400 * 3, + 1565083561 + 86500 * 3, + 1565083561 + 86400 * 4, + ], + "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"], + } + ).drop([3]) + + df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s").dt.as_unit(unit) + gb = df.groupby(Grouper(freq="1D", key="Datetime")) + result = gb.value_counts() + dates = to_datetime( + ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc + ).as_unit(unit) + timestamps = df["Timestamp"].unique() + index = MultiIndex( + levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]], + codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]], + names=["Datetime", "Timestamp", "Food"], + ) + expected = Series(1, index=index, name="count") + tm.assert_series_equal(result, expected) + + +def test_value_counts_integer_columns(): + # GH#55627 + df = DataFrame({1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"]}) + gp = df.groupby([1, 2], as_index=False, sort=False) + result = gp[3].value_counts() + expected = DataFrame( + {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1} + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("vc_sort", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +def test_value_counts_sort(sort, vc_sort, normalize): + # GH#55951 + df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}) + gb = df.groupby("a", sort=sort) + result = gb.value_counts(sort=vc_sort, normalize=normalize) + + if normalize: + values = [2 / 3, 1 / 3, 1.0] + else: + values = [2, 1, 1] + index = MultiIndex( + levels=[[1, 2], [3, 4]], codes=[[0, 0, 1], [0, 1, 0]], names=["a", 0] + ) + expected = Series(values, index=index, name="proportion" if normalize else "count") + if sort and vc_sort: + taker = [0, 1, 2] + elif sort and not vc_sort: + taker = [0, 1, 2] + elif not sort and vc_sort: + taker = [0, 2, 1] + else: + taker = [2, 1, 0] + expected = expected.take(taker) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("vc_sort", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +def test_value_counts_sort_categorical(sort, vc_sort, normalize): + # GH#55951 + df = DataFrame({"a": [2, 1, 1, 1], 0: [3, 4, 3, 3]}, dtype="category") + gb = df.groupby("a", sort=sort, observed=True) + result = gb.value_counts(sort=vc_sort, normalize=normalize) + + if normalize: + values = [2 / 3, 1 / 3, 1.0, 0.0] + else: + values = [2, 
1, 1, 0] + name = "proportion" if normalize else "count" + expected = DataFrame( + { + "a": Categorical([1, 1, 2, 2]), + 0: Categorical([3, 4, 3, 4]), + name: values, + } + ).set_index(["a", 0])[name] + if sort and vc_sort: + taker = [0, 1, 2, 3] + elif sort and not vc_sort: + taker = [0, 1, 2, 3] + elif not sort and vc_sort: + taker = [0, 2, 1, 3] + else: + taker = [2, 3, 0, 1] + expected = expected.take(taker) + + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_all_methods.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_all_methods.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_all_methods.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_all_methods.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,83 @@ +""" +Tests that apply to all groupby operation methods. + +The only tests that should appear here are those that use the `groupby_func` fixture. +Even if it does use that fixture, prefer a more specific test file if it available +such as: + + - test_categorical + - test_groupby_dropna + - test_groupby_subclass + - test_raises +""" + +import pytest + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm +from pandas.tests.groupby import get_groupby_method_args + + +def test_multiindex_group_all_columns_when_empty(groupby_func): + # GH 32464 + df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"]) + gb = df.groupby(["a", "b", "c"], group_keys=False) + method = getattr(gb, groupby_func) + args = get_groupby_method_args(groupby_func, df) + + warn = FutureWarning if groupby_func == "fillna" else None + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + result = method(*args).index + expected = df.index + tm.assert_index_equal(result, expected) + + +def test_duplicate_columns(request, groupby_func, as_index): + # GH#50806 + if groupby_func == "corrwith": + msg = "GH#50845 - corrwith fails when there are duplicate columns" + request.applymarker(pytest.mark.xfail(reason=msg)) + df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb")) + args = get_groupby_method_args(groupby_func, df) + gb = df.groupby("a", as_index=as_index) + warn = FutureWarning if groupby_func == "fillna" else None + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + result = getattr(gb, groupby_func)(*args) + + expected_df = df.set_axis(["a", "b", "c"], axis=1) + expected_args = get_groupby_method_args(groupby_func, expected_df) + expected_gb = expected_df.groupby("a", as_index=as_index) + warn = FutureWarning if groupby_func == "fillna" else None + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + expected = getattr(expected_gb, groupby_func)(*expected_args) + if groupby_func not in ("size", "ngroup", "cumcount"): + expected = expected.rename(columns={"c": "b"}) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "idx", + [ + pd.Index(["a", "a"], name="foo"), + pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]), + ], +) +def test_dup_labels_output_shape(groupby_func, idx): + if groupby_func in {"size", "ngroup", "cumcount"}: + pytest.skip(f"Not applicable for {groupby_func}") + + df = DataFrame([[1, 1]], columns=idx) + grp_by = df.groupby([0]) + + args = get_groupby_method_args(groupby_func, df) + warn = FutureWarning if groupby_func == "fillna" else None + warn_msg 
= "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + result = getattr(grp_by, groupby_func)(*args) + + assert result.shape == (1, 2) + tm.assert_index_equal(result.columns, idx) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_any_all.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_any_all.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_any_all.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_any_all.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,188 +0,0 @@ -import builtins - -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - DataFrame, - Index, - Series, - isna, -) -import pandas._testing as tm - - -@pytest.mark.parametrize("agg_func", ["any", "all"]) -@pytest.mark.parametrize( - "vals", - [ - ["foo", "bar", "baz"], - ["foo", "", ""], - ["", "", ""], - [1, 2, 3], - [1, 0, 0], - [0, 0, 0], - [1.0, 2.0, 3.0], - [1.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [True, True, True], - [True, False, False], - [False, False, False], - [np.nan, np.nan, np.nan], - ], -) -def test_groupby_bool_aggs(skipna, agg_func, vals): - df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2}) - - # Figure out expectation using Python builtin - exp = getattr(builtins, agg_func)(vals) - - # edge case for missing data with skipna and 'any' - if skipna and all(isna(vals)) and agg_func == "any": - exp = False - - expected = DataFrame( - [exp] * 2, columns=["val"], index=Index(["a", "b"], name="key") - ) - result = getattr(df.groupby("key"), agg_func)(skipna=skipna) - tm.assert_frame_equal(result, expected) - - -def test_any(): - df = DataFrame( - [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], - columns=["A", "B", "C"], - ) - expected = DataFrame( - [[True, True], [False, True]], columns=["B", "C"], index=[1, 3] - ) - expected.index.name = "A" - result = df.groupby("A").any() - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_bool_aggs_dup_column_labels(bool_agg_func): - # GH#21668 - df = DataFrame([[True, True]], columns=["a", "a"]) - grp_by = df.groupby([0]) - result = getattr(grp_by, bool_agg_func)() - - expected = df.set_axis(np.array([0])) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -@pytest.mark.parametrize( - "data", - [ - [False, False, False], - [True, True, True], - [pd.NA, pd.NA, pd.NA], - [False, pd.NA, False], - [True, pd.NA, True], - [True, pd.NA, False], - ], -) -def test_masked_kleene_logic(bool_agg_func, skipna, data): - # GH#37506 - ser = Series(data, dtype="boolean") - - # The result should match aggregating on the whole series. 
Correctness - # there is verified in test_reductions.py::test_any_all_boolean_kleene_logic - expected_data = getattr(ser, bool_agg_func)(skipna=skipna) - expected = Series(expected_data, index=np.array([0]), dtype="boolean") - - result = ser.groupby([0, 0, 0]).agg(bool_agg_func, skipna=skipna) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype1,dtype2,exp_col1,exp_col2", - [ - ( - "float", - "Float64", - np.array([True], dtype=bool), - pd.array([pd.NA], dtype="boolean"), - ), - ( - "Int64", - "float", - pd.array([pd.NA], dtype="boolean"), - np.array([True], dtype=bool), - ), - ( - "Int64", - "Int64", - pd.array([pd.NA], dtype="boolean"), - pd.array([pd.NA], dtype="boolean"), - ), - ( - "Float64", - "boolean", - pd.array([pd.NA], dtype="boolean"), - pd.array([pd.NA], dtype="boolean"), - ), - ], -) -def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2): - # GH#37506 - data = [1.0, np.nan] - df = DataFrame( - {"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)} - ) - result = df.groupby([1, 1]).agg("all", skipna=False) - - expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=np.array([1])) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) -def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series): - # GH#40585 - obj = frame_or_series([pd.NA, 1], dtype=dtype) - expected_res = True - if not skipna and bool_agg_func == "all": - expected_res = pd.NA - expected = frame_or_series([expected_res], index=np.array([1]), dtype="boolean") - - result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna) - tm.assert_equal(result, expected) - - -@pytest.mark.parametrize( - "bool_agg_func,data,expected_res", - [ - ("any", [pd.NA, np.nan], False), - ("any", [pd.NA, 1, np.nan], True), - ("all", [pd.NA, pd.NaT], True), - ("all", [pd.NA, False, pd.NaT], False), - ], -) -def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_series): - # GH#37501 - obj = frame_or_series(data, dtype=object) - result = obj.groupby([1] * len(data)).agg(bool_agg_func) - expected = frame_or_series([expected_res], index=np.array([1]), dtype="bool") - tm.assert_equal(result, expected) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_object_NA_raises_with_skipna_false(bool_agg_func): - # GH#37501 - ser = Series([pd.NA], dtype=object) - with pytest.raises(TypeError, match="boolean value of NA is ambiguous"): - ser.groupby([1]).agg(bool_agg_func, skipna=False) - - -@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) -def test_empty(frame_or_series, bool_agg_func): - # GH 45231 - kwargs = {"columns": ["a"]} if frame_or_series is DataFrame else {"name": "a"} - obj = frame_or_series(**kwargs, dtype=object) - result = getattr(obj.groupby(obj.index), bool_agg_func)() - expected = frame_or_series(**kwargs, dtype=bool) - tm.assert_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_api.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_api.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_api.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_api.py 2024-04-10 17:42:52.000000000 +0000 @@ -24,8 +24,8 @@ ) -def test_tab_completion(mframe): - grp = mframe.groupby(level="second") +def test_tab_completion(multiindex_dataframe_random_data): + grp = multiindex_dataframe_random_data.groupby(level="second") results = {v for 
v in dir(grp) if not v.startswith("_")} expected = { "A", @@ -98,9 +98,13 @@ assert results == expected -def test_all_methods_categorized(mframe): - grp = mframe.groupby(mframe.iloc[:, 0]) - names = {_ for _ in dir(grp) if not _.startswith("_")} - set(mframe.columns) +def test_all_methods_categorized(multiindex_dataframe_random_data): + grp = multiindex_dataframe_random_data.groupby( + multiindex_dataframe_random_data.iloc[:, 0] + ) + names = {_ for _ in dir(grp) if not _.startswith("_")} - set( + multiindex_dataframe_random_data.columns + ) new_names = set(names) new_names -= reduction_kernels new_names -= transformation_kernels @@ -179,7 +183,7 @@ elif groupby_func in ("median", "prod", "sem"): exclude_expected = {"axis", "kwargs", "skipna"} elif groupby_func in ("backfill", "bfill", "ffill", "pad"): - exclude_expected = {"downcast", "inplace", "axis"} + exclude_expected = {"downcast", "inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): exclude_expected = {"skipna", "args"} exclude_result = {"numeric_only"} @@ -236,7 +240,7 @@ elif groupby_func in ("median", "prod", "sem"): exclude_expected = {"axis", "kwargs", "skipna"} elif groupby_func in ("backfill", "bfill", "ffill", "pad"): - exclude_expected = {"downcast", "inplace", "axis"} + exclude_expected = {"downcast", "inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): exclude_expected = {"skipna", "args"} exclude_result = {"numeric_only"} diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_apply.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_apply.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_apply.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_apply.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,7 +2,6 @@ date, datetime, ) -from io import StringIO import numpy as np import pytest @@ -28,7 +27,9 @@ def store(group): groups.append(group) - df.groupby("index").apply(store) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + df.groupby("index").apply(store) expected_value = DataFrame( {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10) ) @@ -36,55 +37,100 @@ tm.assert_frame_equal(groups[0], expected_value) -def test_apply_issues(): +def test_apply_index_date(using_infer_string): # GH 5788 - - s = """2011.05.16,00:00,1.40893 -2011.05.16,01:00,1.40760 -2011.05.16,02:00,1.40750 -2011.05.16,03:00,1.40649 -2011.05.17,02:00,1.40893 -2011.05.17,03:00,1.40760 -2011.05.17,04:00,1.40750 -2011.05.17,05:00,1.40649 -2011.05.18,02:00,1.40893 -2011.05.18,03:00,1.40760 -2011.05.18,04:00,1.40750 -2011.05.18,05:00,1.40649""" - - df = pd.read_csv( - StringIO(s), - header=None, - names=["date", "time", "value"], - parse_dates=[["date", "time"]], + ts = [ + "2011-05-16 00:00", + "2011-05-16 01:00", + "2011-05-16 02:00", + "2011-05-16 03:00", + "2011-05-17 02:00", + "2011-05-17 03:00", + "2011-05-17 04:00", + "2011-05-17 05:00", + "2011-05-18 02:00", + "2011-05-18 03:00", + "2011-05-18 04:00", + "2011-05-18 05:00", + ] + df = DataFrame( + { + "value": [ + 1.40893, + 1.40760, + 1.40750, + 1.40649, + 1.40893, + 1.40760, + 1.40750, + 1.40649, + 1.40893, + 1.40760, + 1.40750, + 1.40649, + ], + }, + index=Index(pd.to_datetime(ts), name="date_time"), ) - df = df.set_index("date_time") - expected = df.groupby(df.index.date).idxmax() result = df.groupby(df.index.date).apply(lambda x: x.idxmax()) tm.assert_frame_equal(result, expected) + +def 
test_apply_index_date_object(using_infer_string): # GH 5789 # don't auto coerce dates - df = pd.read_csv(StringIO(s), header=None, names=["date", "time", "value"]) + ts = [ + "2011-05-16 00:00", + "2011-05-16 01:00", + "2011-05-16 02:00", + "2011-05-16 03:00", + "2011-05-17 02:00", + "2011-05-17 03:00", + "2011-05-17 04:00", + "2011-05-17 05:00", + "2011-05-18 02:00", + "2011-05-18 03:00", + "2011-05-18 04:00", + "2011-05-18 05:00", + ] + df = DataFrame([row.split() for row in ts], columns=["date", "time"]) + df["value"] = [ + 1.40893, + 1.40760, + 1.40750, + 1.40649, + 1.40893, + 1.40760, + 1.40750, + 1.40649, + 1.40893, + 1.40760, + 1.40750, + 1.40649, + ] + dtype = "string[pyarrow_numpy]" if using_infer_string else object exp_idx = Index( - ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" + ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date" ) expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) - result = df.groupby("date", group_keys=False).apply( - lambda x: x["time"][x["value"].idxmax()] - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("date", group_keys=False).apply( + lambda x: x["time"][x["value"].idxmax()] + ) tm.assert_series_equal(result, expected) -def test_apply_trivial(): +def test_apply_trivial(using_infer_string): # GH 20066 # trivial apply: ignore input and return a constant dataframe. df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"]) + dtype = "string" if using_infer_string else "object" + expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -94,13 +140,14 @@ tm.assert_frame_equal(result, expected) -def test_apply_trivial_fail(): +def test_apply_trivial_fail(using_infer_string): # GH 20066 df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - expected = pd.concat([df, df], axis=1, keys=["float64", "object"]) + dtype = "string" if using_infer_string else "object" + expected = pd.concat([df, df], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): gb = df.groupby([str(x) for x in df.dtypes], axis=1, group_keys=True) @@ -179,7 +226,9 @@ for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: del names[:] - df.groupby("a", group_keys=False).apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + df.groupby("a", group_keys=False).apply(func) assert names == group_names @@ -197,9 +246,11 @@ index=["0", "2", "4", "6", "8", "10", "12", "14"], ) - df.groupby("group_by_column", group_keys=False).apply( - lambda df: print("function_called") - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + df.groupby("group_by_column", group_keys=False).apply( + lambda df: print("function_called") + ) result = capsys.readouterr().out.count("function_called") # If `groupby` behaves unexpectedly, this test will break @@ -219,8 +270,11 @@ def fast(group): return group.copy() - fast_df = df.groupby("A", 
group_keys=False).apply(fast) - slow_df = df.groupby("A", group_keys=False).apply(slow) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + fast_df = df.groupby("A", group_keys=False).apply(fast) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) @@ -242,7 +296,9 @@ df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - result = df.groupby("g", group_keys=False).apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("g", group_keys=False).apply(func) tm.assert_frame_equal(result, df) @@ -285,8 +341,11 @@ tm.assert_index_equal(res_as, exp) tm.assert_index_equal(res_not_as, exp) - res_as_apply = g_as.apply(lambda x: x.head(2)).index - res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + res_as_apply = g_as.apply(lambda x: x.head(2)).index + with tm.assert_produces_warning(DeprecationWarning, match=msg): + res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here @@ -299,7 +358,9 @@ ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -328,13 +389,19 @@ # weirdo return result - result = grouped.apply(desc) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = grouped.apply(desc) assert result.index.names == ("A", "B", "stat") - result2 = grouped.apply(desc2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result2 = grouped.apply(desc2) assert result2.index.names == ("A", "B", "stat") - result3 = grouped.apply(desc3) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -364,7 +431,9 @@ def test_apply_frame_yield_constant(df): # GH13568 - result = df.groupby(["A", "B"]).apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -375,7 +444,9 @@ def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) - result = grouped.apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) tm.assert_numpy_array_equal(result.values, expected.values) @@ -384,7 +455,9 @@ def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within 
_wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) - result = grouped.apply(len) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan tm.assert_index_equal(result.index, expected.index) @@ -407,7 +480,9 @@ } ) - result = df.groupby("A").apply(trans) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) assert result.name == "C" @@ -436,7 +511,9 @@ # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) expected = df.take([0, 1, 3, 4, 6, 7]) if group_keys: expected.index = MultiIndex.from_arrays( @@ -457,7 +534,9 @@ # it works! #2605 grouped = df.groupby(["name", "name2"]) - grouped.apply(lambda x: x.sort_values("value", inplace=True)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + grouped.apply(lambda x: x.sort_values("value", inplace=True)) def test_apply_typecast_fail(): @@ -474,7 +553,9 @@ group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d", group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -498,7 +579,9 @@ group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby("d", group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("d", group_keys=False).apply(f) expected = df.copy() expected["v2"] = np.tile([0.0, 0.5, 1], 2) @@ -536,8 +619,11 @@ else: return x[x.category == "c"] - expected = data.groupby("id_field").apply(filt1) - result = data.groupby("id_field").apply(filt2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = data.groupby("id_field").apply(filt1) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -556,7 +642,9 @@ expected = ser.sort_index() tm.assert_series_equal(result, expected) else: - result = df.groupby("Y", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("Y", group_keys=False).apply(lambda x: x) # not expecting the order to remain the same for duplicated axis result = result.sort_values("Y") @@ -601,7 +689,9 @@ g["value3"] = g["value1"] * 2 return g - result = grouped.apply(f) + msg = 
"DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = grouped.apply(f) assert "value3" in result @@ -615,9 +705,13 @@ df = DataFrame( {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) - expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) - result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) # GH 15421 @@ -628,7 +722,9 @@ def get_B(g): return g.iloc[0][["B"]] - result = df.groupby("A").apply(get_B)["B"] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A tm.assert_series_equal(result, expected) @@ -653,8 +749,11 @@ ) df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) - expected = df1.groupby("Key").apply(predictions).p1 - result = df2.groupby("Key").apply(predictions).p1 + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = df1.groupby("Key").apply(predictions).p1 + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -669,11 +768,13 @@ } ) df["time_delta_zero"] = df.datetime - df.datetime - result = df.groupby("clientid").apply( - lambda ddf: Series( - {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("clientid").apply( + lambda ddf: Series( + {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} + ) ) - ) expected = DataFrame( { "clientid": ["A", "B", "C"], @@ -716,11 +817,15 @@ def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) - dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" - dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( {"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1] ) @@ -764,7 +869,9 @@ def test_func(x): pass - result = test_df.groupby("groups").apply(test_func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = test_df.groupby("groups").apply(test_func) expected = DataFrame() tm.assert_frame_equal(result, expected) @@ -779,8 +886,11 @@ return None return 
x.iloc[[0, -1]] - result1 = test_df1.groupby("groups").apply(test_func) - result2 = test_df2.groupby("groups").apply(test_func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result1 = test_df1.groupby("groups").apply(test_func) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result2 = test_df2.groupby("groups").apply(test_func) index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1) @@ -793,7 +903,9 @@ # GH 22221: apply filter which returns some empty groups df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") - result = groups.apply(lambda group: group[group.value != 1]["value"]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], name="value", @@ -820,7 +932,9 @@ def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=Index([1, 2])) - result = df.groupby("a").apply(lambda g: g.index) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) tm.assert_series_equal(result, expected) @@ -830,18 +944,19 @@ "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], ) -def test_apply_datetime_issue(group_column_dtlike): +def test_apply_datetime_issue(group_column_dtlike, using_infer_string): # GH-28247 # groupby-apply throws an error if one of the columns in the DataFrame # is a datetime object and the column labels are different from # standard int values in range(len(num_columns)) df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) - result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - expected = DataFrame( - ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] - ) + dtype = "string" if using_infer_string else "object" + expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -876,7 +991,9 @@ def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) - result = tdf.groupby("day").apply(most_common_values)["userId"] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) @@ -906,7 +1023,7 @@ assert df.index.names == ["A", "B"] -def test_groupby_apply_datetime_result_dtypes(): +def test_groupby_apply_datetime_result_dtypes(using_infer_string): # GH 14849 data = DataFrame.from_records( [ @@ -917,9 +1034,12 @@ ], columns=["observation", "color", "mood", "intensity", "score"], ) - result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + msg = "DataFrameGroupBy.apply 
operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + dtype = "string" if using_infer_string else object expected = Series( - [np.dtype("datetime64[ns]"), object, object, np.int64, object], + [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) @@ -937,7 +1057,9 @@ def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) - result = df.groupby("group", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("group", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) @@ -960,7 +1082,9 @@ def test_apply_function_returns_non_pandas_non_scalar(function, expected_values): # GH 31441 df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) - result = df.groupby("groups").apply(function) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -972,7 +1096,9 @@ df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) - result = df.groupby("A").apply(fct) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") ) @@ -983,7 +1109,9 @@ def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) - result = df.groupby("id").apply(function) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], index=Index([1, 2, 3], name="id"), @@ -1019,7 +1147,9 @@ # We'd like to control whether the group keys end up in the index # regardless of whether the UDF happens to be a transform. 
df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) - df_result = df.groupby("A", group_keys=group_keys).apply(udf) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) if group_keys: @@ -1034,8 +1164,11 @@ # GH 34998 # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) - result = df.groupby("A", group_keys=False).apply(lambda x: x) - expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("A", group_keys=False).apply(lambda x: x) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1047,8 +1180,15 @@ df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) - result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) - result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result1 = df1.groupby("x", group_keys=False).apply( + lambda df: df[["x", "y"]].copy() + ) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result2 = df2.groupby("x", group_keys=False).apply( + lambda df: df[["x", "y"]].copy() + ) tm.assert_frame_equal(result1, result2) @@ -1065,7 +1205,7 @@ ) expected = DataFrame( - {"a": [264, 297], "b": [15, 6], "c": [150, 60]}, + {"b": [15, 6], "c": [150, 60]}, index=Index([88, 99], name="a"), ) @@ -1073,7 +1213,7 @@ grp = df.groupby(by="a") msg = "The behavior of DataFrame.sum with axis=None is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - result = grp.apply(sum) + result = grp.apply(sum, include_groups=False) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() @@ -1081,7 +1221,7 @@ args = get_groupby_method_args(reduction_func, df) _ = getattr(grp, reduction_func)(*args) with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False): - result = grp.apply(sum) + result = grp.apply(sum, include_groups=False) tm.assert_frame_equal(result, expected) @@ -1103,7 +1243,9 @@ ) grp = df.groupby(["A", "B"]) - result = grp.apply(lambda x: x.head(1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() @@ -1151,7 +1293,9 @@ }, index=list("xxyxz"), ) - result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1176,7 +1320,9 @@ def test_apply_as_index_constant_lambda(as_index, expected): # 
GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) - result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1186,7 +1332,9 @@ {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]}, index=range(5), ) - result = df.groupby("C").apply(lambda x: x.A.sort_index()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("C").apply(lambda x: x.A.sort_index()) expected = Series( range(1, 6), index=MultiIndex.from_tuples( @@ -1201,14 +1349,16 @@ # GH 21651 expected = DataFrame( { - "date": pd.date_range("2010-01-01", freq="12H", periods=5), + "date": pd.date_range("2010-01-01", freq="12h", periods=5), "vals": range(5), "let": list("abcde"), } ) - result = expected.groupby( - [expected.let, expected.date.dt.date], group_keys=False - ).apply(lambda x: x.iloc[0:]) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = expected.groupby( + [expected.let, expected.date.dt.date], group_keys=False + ).apply(lambda x: x.iloc[0:]) tm.assert_frame_equal(result, expected) @@ -1251,24 +1401,29 @@ {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]} ) dfgrp = df.groupby("grp", dropna=dropna) - result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) def test_apply_empty_string_nan_coerce_bug(): # GH#24903 - result = ( - DataFrame( - { - "a": [1, 1, 2, 2], - "b": ["", "", "", ""], - "c": pd.to_datetime([1, 2, 3, 4], unit="s"), - } + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = ( + DataFrame( + { + "a": [1, 1, 2, 2], + "b": ["", "", "", ""], + "c": pd.to_datetime([1, 2, 3, 4], unit="s"), + } + ) + .groupby(["a", "b"]) + .apply(lambda df: df.iloc[-1]) ) - .groupby(["a", "b"]) - .apply(lambda df: df.iloc[-1]) - ) expected = DataFrame( [[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]], columns=["a", "b", "c"], @@ -1293,9 +1448,11 @@ }, index=Index(["a2", "a3", "aa"], name="a"), ) - result = result.groupby("a").apply( - lambda df: Series([df["b"].mean()], index=["b_mean"]) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = result.groupby("a").apply( + lambda df: Series([df["b"].mean()], index=["b_mean"]) + ) tm.assert_frame_equal(result, expected) @@ -1343,7 +1500,9 @@ def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 expected = DataFrame({"col": arg}, index=idx) - result = expected.groupby("col", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with 
tm.assert_produces_warning(DeprecationWarning, match=msg): + result = expected.groupby("col", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, expected) @@ -1390,33 +1549,58 @@ tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "group_col", - [([0.0, np.nan, 0.0, 0.0]), ([np.nan, 0.0, 0.0, 0.0]), ([0, 0.0, 0.0, np.nan])], -) -def test_apply_inconsistent_output(group_col): - # GH 34478 - df = DataFrame({"group_col": group_col, "value_col": [2, 2, 2, 2]}) +@pytest.mark.parametrize("include_groups", [True, False]) +def test_include_groups(include_groups): + # GH#7155 + df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) + gb = df.groupby("a") + warn = DeprecationWarning if include_groups else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + result = gb.apply(lambda x: x.sum(), include_groups=include_groups) + expected = DataFrame({"a": [2, 2], "b": [7, 5]}, index=Index([1, 2], name="a")) + if not include_groups: + expected = expected[["b"]] + tm.assert_frame_equal(result, expected) - result = df.groupby("group_col").value_col.apply( - lambda x: x.value_counts().reindex(index=[1, 2, 3]) - ) - expected = Series( - [np.nan, 3.0, np.nan], - name="value_col", - index=MultiIndex.from_product([[0.0], [1, 2, 3]], names=["group_col", 0.0]), - ) - tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("f", [max, min, sum]) +@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key +def test_builtins_apply(keys, f): + # see gh-8155 + rs = np.random.default_rng(2) + df = DataFrame(rs.integers(1, 7, (10, 2)), columns=["jim", "joe"]) + df["jolie"] = rs.standard_normal(10) + gb = df.groupby(keys) -def test_apply_array_output_multi_getitem(): - # GH 18930 - df = DataFrame( - {"A": {"a": 1, "b": 2}, "B": {"a": 1, "b": 2}, "C": {"a": 1, "b": 2}} - ) - result = df.groupby("A")[["B", "C"]].apply(lambda x: np.array([0])) - expected = Series( - [np.array([0])] * 2, index=Index([1, 2], name="A"), name=("B", "C") - ) - tm.assert_series_equal(result, expected) + fname = f.__name__ + + warn = None if f is not sum else FutureWarning + msg = "The behavior of DataFrame.sum with axis=None is deprecated" + with tm.assert_produces_warning( + warn, match=msg, check_stacklevel=False, raise_on_extra_warnings=False + ): + # Also warns on deprecation GH#53425 + result = gb.apply(f) + ngroups = len(df.drop_duplicates(subset=keys)) + + assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" + assert result.shape == (ngroups, 3), assert_msg + + npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = gb.apply(npfunc) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected2 = gb.apply(lambda x: npfunc(x)) + tm.assert_frame_equal(result, expected2) + + if f != sum: + expected = gb.agg(fname).reset_index() + expected.set_index(keys, inplace=True, drop=False) + tm.assert_frame_equal(result, expected, check_dtype=False) + + tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0)) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_apply_mutate.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_apply_mutate.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_apply_mutate.py 2023-12-08 14:17:35.000000000 +0000 +++ 
pandas-2.2.2+dfsg/pandas/tests/groupby/test_apply_mutate.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,10 +13,16 @@ } ).set_index("name") - grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group) - grp_by_copy = df.groupby(["age"], group_keys=False).apply( - lambda group: group.copy() - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + grp_by_same_value = df.groupby(["age"], group_keys=False).apply( + lambda group: group + ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + grp_by_copy = df.groupby(["age"], group_keys=False).apply( + lambda group: group.copy() + ) tm.assert_frame_equal(grp_by_same_value, grp_by_copy) @@ -47,8 +53,11 @@ x["rank"] = x.val.rank(method="min") return x.groupby("cat2")["rank"].min() - grpby_copy = df.groupby("cat1").apply(f_copy) - grpby_no_copy = df.groupby("cat1").apply(f_no_copy) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + grpby_copy = df.groupby("cat1").apply(f_copy) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + grpby_no_copy = df.groupby("cat1").apply(f_no_copy) tm.assert_series_equal(grpby_copy, grpby_no_copy) @@ -58,12 +67,15 @@ # second does not, but should yield the same results df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) tm.assert_series_equal(result1, result2) -def test_apply_function_with_indexing(): +def test_apply_function_with_indexing(warn_copy_on_write): # GH: 33058 df = pd.DataFrame( {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} @@ -73,7 +85,11 @@ x.loc[x.index[-1], "col2"] = 0 return x.col2 - result = df.groupby(["col1"], as_index=False).apply(fn) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning( + DeprecationWarning, match=msg, raise_on_extra_warnings=not warn_copy_on_write + ): + result = df.groupby(["col1"], as_index=False).apply(fn) expected = pd.Series( [1, 2, 0, 4, 5, 0], index=pd.MultiIndex.from_tuples( diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_categorical.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_categorical.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_categorical.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_categorical.py 2024-04-10 17:42:52.000000000 +0000 @@ -82,7 +82,7 @@ assert result.index.names[0] == "C" -def test_basic(): # TODO: split this test +def test_basic(using_infer_string): # TODO: split this test cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], @@ -124,10 +124,13 @@ def f(x): return x.drop_duplicates("person_name").iloc[0] - result = g.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = g.apply(f) expected = x.iloc[[0, 
1]].copy() expected.index = Index([1, 2], name="person_id") - expected["person_name"] = expected["person_name"].astype("object") + dtype = "string[pyarrow_numpy]" if using_infer_string else object + expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) # GH 9921 @@ -268,7 +271,10 @@ names=["Index1", "Index2"], ), ) - result = g.get_group("a") + msg = "you will need to pass a length-1 tuple" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#25971 - warn when not passing a length-1 tuple + result = g.get_group("a") tm.assert_frame_equal(result, expected) @@ -326,7 +332,9 @@ # but for transform we should still get back the original index idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) - result = grouped.apply(lambda x: 1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, expected) @@ -1147,7 +1155,7 @@ { "key1": Categorical(list("abcbabcba")), "key2": Categorical( - list(pd.date_range("2018-06-01 00", freq="1T", periods=3)) * 3 + list(pd.date_range("2018-06-01 00", freq="1min", periods=3)) * 3 ), "values": np.arange(9), } @@ -1157,7 +1165,7 @@ idx = MultiIndex.from_product( [ Categorical(["a", "b", "c"]), - Categorical(pd.date_range("2018-06-01 00", freq="1T", periods=3)), + Categorical(pd.date_range("2018-06-01 00", freq="1min", periods=3)), ], names=["key1", "key2"], ) @@ -1409,6 +1417,15 @@ return agg = getattr(series_groupby, reduction_func) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + agg(*args) + return + result = agg(*args) assert len(result) == expected_length @@ -1427,7 +1444,7 @@ mark = pytest.mark.xfail( reason="TODO: implemented SeriesGroupBy.corrwith. 
See GH 32293" ) - request.node.add_marker(mark) + request.applymarker(mark) df = DataFrame( { @@ -1441,6 +1458,15 @@ series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] agg = getattr(series_groupby, reduction_func) + + if reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + agg(*args) + return + result = agg(*args) zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1507,6 +1533,15 @@ df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) args = get_groupby_method_args(reduction_func, df) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(df_grp, reduction_func)(*args) + return + res = getattr(df_grp, reduction_func)(*args) expected = _results_for_groupbys_with_missing_categories[reduction_func] @@ -1876,16 +1911,9 @@ request, as_index, sort, observed, reduction_func, index_kind, ordered ): # GH#48749 - if ( - reduction_func in ("idxmax", "idxmin") - and not observed - and index_kind != "multi" - ): - msg = "GH#10694 - idxmax/min fail with unused categories" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - elif reduction_func == "corrwith" and not as_index: + if reduction_func == "corrwith" and not as_index: msg = "GH#49950 - corrwith with as_index=False may not have grouping column" - request.node.add_marker(pytest.mark.xfail(reason=msg)) + request.applymarker(pytest.mark.xfail(reason=msg)) elif index_kind != "range" and not as_index: pytest.skip(reason="Result doesn't have categories, nothing to test") df = DataFrame( @@ -1905,6 +1933,15 @@ df = df.set_index(keys) args = get_groupby_method_args(reduction_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + # idxmin and idxmax are designed to fail on empty inputs + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(gb, reduction_func)(*args) + return + op_result = getattr(gb, reduction_func)(*args) if as_index: result = op_result.index.get_level_values("a").categories @@ -1939,7 +1976,10 @@ df = df.set_index(keys) args = get_groupby_method_args(transformation_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - op_result = getattr(gb, transformation_func)(*args) + warn = FutureWarning if transformation_func == "fillna" else None + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + op_result = getattr(gb, transformation_func)(*args) result = op_result.index.get_level_values("a").categories expected = Index([1, 4, 3, 2]) tm.assert_index_equal(result, expected) @@ -2010,7 +2050,10 @@ df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) + warn = DeprecationWarning if method == "apply" and index_kind == "range" else None + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) if (method == "transform" or not as_index) and index_kind == "range": result = op_result["a"].cat.categories else: @@ -2084,7 
+2127,7 @@ pytest.skip("corrwith not implemented for SeriesGroupBy") elif reduction_func == "corrwith": msg = "GH#32293: attempts to call SeriesGroupBy.corrwith" - request.node.add_marker(pytest.mark.xfail(reason=msg)) + request.applymarker(pytest.mark.xfail(reason=msg)) elif ( reduction_func == "nunique" and not test_series @@ -2093,7 +2136,7 @@ and not as_index ): msg = "GH#52848 - raises a ValueError" - request.node.add_marker(pytest.mark.xfail(reason=msg)) + request.applymarker(pytest.mark.xfail(reason=msg)) df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]}) df = df.astype({"a1": "category", "a2": "category"}) @@ -2104,6 +2147,13 @@ gb = gb["b"] args = get_groupby_method_args(reduction_func, df) + if not observed and reduction_func in ["idxmin", "idxmax"] and keys == ["a1", "a2"]: + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + gb.agg([reduction_func], *args) + return + result = gb.agg([reduction_func], *args) expected = getattr(gb, reduction_func)(*args) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_counting.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_counting.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_counting.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_counting.py 2024-04-10 17:42:52.000000000 +0000 @@ -265,7 +265,7 @@ def test_count(): n = 1 << 15 - dr = date_range("2015-08-30", periods=n // 10, freq="T") + dr = date_range("2015-08-30", periods=n // 10, freq="min") df = DataFrame( { @@ -289,7 +289,9 @@ for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) tm.assert_frame_equal(left, right) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_cumulative.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_cumulative.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_cumulative.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_cumulative.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,319 @@ +import numpy as np +import pytest + +from pandas.errors import UnsupportedFunctionCall +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +@pytest.fixture( + params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"], + ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"], +) +def dtypes_for_minmax(request): + """ + Fixture of dtypes with min and max values used for testing + cummin and cummax + """ + dtype = request.param + + np_type = dtype + if dtype == "Int64": + np_type = np.int64 + elif dtype == "Float64": + np_type = np.float64 + + min_val = ( + np.iinfo(np_type).min + if np.dtype(np_type).kind == "i" + else np.finfo(np_type).min + ) + max_val = ( + np.iinfo(np_type).max + if np.dtype(np_type).kind == "i" + else np.finfo(np_type).max + ) + + return (dtype, min_val, max_val) + + +def test_groupby_cumprod(): + # GH 4095 + df = DataFrame({"key": ["b"] * 10, "value": 2}) + + actual = df.groupby("key")["value"].cumprod() + expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) + expected.name = "value" + tm.assert_series_equal(actual, expected) + + df = DataFrame({"key": ["b"] * 100, "value": 2}) + 
df["value"] = df["value"].astype(float) + actual = df.groupby("key")["value"].cumprod() + expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) + expected.name = "value" + tm.assert_series_equal(actual, expected) + + +@pytest.mark.skip_ubsan +def test_groupby_cumprod_overflow(): + # GH#37493 if we overflow we return garbage consistent with numpy + df = DataFrame({"key": ["b"] * 4, "value": 100_000}) + actual = df.groupby("key")["value"].cumprod() + expected = Series( + [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920], + name="value", + ) + tm.assert_series_equal(actual, expected) + + numpy_result = df.groupby("key", group_keys=False)["value"].apply( + lambda x: x.cumprod() + ) + numpy_result.name = "value" + tm.assert_series_equal(actual, numpy_result) + + +def test_groupby_cumprod_nan_influences_other_columns(): + # GH#48064 + df = DataFrame( + { + "a": 1, + "b": [1, np.nan, 2], + "c": [1, 2, 3.0], + } + ) + result = df.groupby("a").cumprod(numeric_only=True, skipna=False) + expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]}) + tm.assert_frame_equal(result, expected) + + +def test_cummin(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + min_val = dtypes_for_minmax[1] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + + df = base_df.astype(dtype) + + expected = DataFrame({"B": expected_mins}).astype(dtype) + result = df.groupby("A").cummin() + tm.assert_frame_equal(result, expected) + result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test w/ min value for dtype + df.loc[[2, 6], "B"] = min_val + df.loc[[1, 5], "B"] = min_val + 1 + expected.loc[[2, 3, 6, 7], "B"] = min_val + expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val + result = df.groupby("A").cummin() + tm.assert_frame_equal(result, expected, check_exact=True) + expected = ( + df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + ) + tm.assert_frame_equal(result, expected, check_exact=True) + + # Test nan in some values + # Explicit cast to float to avoid implicit cast when setting nan + base_df = base_df.astype({"B": "float"}) + base_df.loc[[0, 2, 4, 6], "B"] = np.nan + expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) + result = base_df.groupby("A").cummin() + tm.assert_frame_equal(result, expected) + expected = ( + base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() + ) + tm.assert_frame_equal(result, expected) + + # GH 15561 + df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) + expected = Series(pd.to_datetime("2001"), index=[0], name="b") + + result = df.groupby("a")["b"].cummin() + tm.assert_series_equal(expected, result) + + # GH 15635 + df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]}) + result = df.groupby("a").b.cummin() + expected = Series([1, 2, 1], name="b") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"]) +def test_cummin_max_all_nan_column(method, dtype): + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) + base_df["B"] = base_df["B"].astype(dtype) + grouped = base_df.groupby("A") + + expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype) + result = getattr(grouped, method)() + tm.assert_frame_equal(expected, 
result) + + result = getattr(grouped["B"], method)().to_frame() + tm.assert_frame_equal(expected, result) + + +def test_cummax(dtypes_for_minmax): + dtype = dtypes_for_minmax[0] + max_val = dtypes_for_minmax[2] + + # GH 15048 + base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + df = base_df.astype(dtype) + + expected = DataFrame({"B": expected_maxs}).astype(dtype) + result = df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test w/ max value for dtype + df.loc[[2, 6], "B"] = max_val + expected.loc[[2, 3, 6, 7], "B"] = max_val + result = df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + expected = ( + df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + ) + tm.assert_frame_equal(result, expected) + + # Test nan in some values + # Explicit cast to float to avoid implicit cast when setting nan + base_df = base_df.astype({"B": "float"}) + base_df.loc[[0, 2, 4, 6], "B"] = np.nan + expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) + result = base_df.groupby("A").cummax() + tm.assert_frame_equal(result, expected) + expected = ( + base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() + ) + tm.assert_frame_equal(result, expected) + + # GH 15561 + df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) + expected = Series(pd.to_datetime("2001"), index=[0], name="b") + + result = df.groupby("a")["b"].cummax() + tm.assert_series_equal(expected, result) + + # GH 15635 + df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) + result = df.groupby("a").b.cummax() + expected = Series([2, 1, 2], name="b") + tm.assert_series_equal(result, expected) + + +def test_cummax_i8_at_implementation_bound(): + # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT + # for int64 dtype GH#46382 + ser = Series([pd.NaT._value + n for n in range(5)]) + df = DataFrame({"A": 1, "B": ser, "C": ser._values.view("M8[ns]")}) + gb = df.groupby("A") + + res = gb.cummax() + exp = df[["B", "C"]] + tm.assert_frame_equal(res, exp) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) +@pytest.mark.parametrize( + "groups,expected_data", + [ + ([1, 1, 1], [1, None, None]), + ([1, 2, 3], [1, None, 2]), + ([1, 3, 3], [1, None, None]), + ], +) +def test_cummin_max_skipna(method, dtype, groups, expected_data): + # GH-34047 + df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) + orig = df.copy() + gb = df.groupby(groups)["a"] + + result = getattr(gb, method)(skipna=False) + expected = Series(expected_data, dtype=dtype, name="a") + + # check we didn't accidentally alter df + tm.assert_frame_equal(df, orig) + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +def test_cummin_max_skipna_multiple_cols(method): + # Ensure missing value in "a" doesn't cause "b" to be nan-filled + df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]}) + gb = df.groupby([1, 1, 1])[["a", "b"]] + + result = getattr(gb, method)(skipna=False) + expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]}) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["cumprod", "cumsum"]) +def test_numpy_compat(func): + # see gh-12811 + df = DataFrame({"A": [1, 2, 
1], "B": [1, 2, 3]}) + g = df.groupby("A") + + msg = "numpy operations are not valid with groupby" + + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(g, func)(1, 2, 3) + with pytest.raises(UnsupportedFunctionCall, match=msg): + getattr(g, func)(foo=1) + + +@td.skip_if_32bit +@pytest.mark.parametrize("method", ["cummin", "cummax"]) +@pytest.mark.parametrize( + "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)] +) +def test_nullable_int_not_cast_as_float(method, dtype, val): + data = [val, pd.NA] + df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) + grouped = df.groupby("grp") + + result = grouped.transform(method) + expected = DataFrame({"b": data}, dtype=dtype) + + tm.assert_frame_equal(result, expected) + + +def test_cython_api2(): + # this takes the fast apply path + + # cumsum (GH5614) + df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]) + expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"]) + result = df.groupby("A").cumsum() + tm.assert_frame_equal(result, expected) + + # GH 5755 - cumsum is a transformer and should ignore as_index + result = df.groupby("A", as_index=False).cumsum() + tm.assert_frame_equal(result, expected) + + # GH 13994 + msg = "DataFrameGroupBy.cumsum with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").cumsum(axis=1) + expected = df.cumsum(axis=1) + tm.assert_frame_equal(result, expected) + + msg = "DataFrameGroupBy.cumprod with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.groupby("A").cumprod(axis=1) + expected = df.cumprod(axis=1) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_filters.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_filters.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_filters.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_filters.py 2024-04-10 17:42:52.000000000 +0000 @@ -190,9 +190,9 @@ tm.assert_series_equal(res, ser[[]]) -def test_filter_against_workaround(): +def test_filter_against_workaround_ints(): # Series of ints - s = Series(np.random.default_rng(2).integers(0, 100, 1000)) + s = Series(np.random.default_rng(2).integers(0, 100, 100)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 @@ -201,8 +201,10 @@ new_way = grouped.filter(f) tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) + +def test_filter_against_workaround_floats(): # Series of floats - s = 100 * Series(np.random.default_rng(2).random(1000)) + s = 100 * Series(np.random.default_rng(2).random(100)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 @@ -210,9 +212,11 @@ new_way = grouped.filter(f) tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) + +def test_filter_against_workaround_dataframe(): # Set up DataFrame of ints, floats, strings. 
letters = np.array(list(ascii_lowercase)) - N = 1000 + N = 100 random_letters = letters.take( np.random.default_rng(2).integers(0, 26, N, dtype=int) ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_function.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_function.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_function.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_function.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,1780 +0,0 @@ -import builtins -from io import StringIO -import re - -import numpy as np -import pytest - -from pandas._libs import lib -from pandas.errors import UnsupportedFunctionCall - -import pandas as pd -from pandas import ( - DataFrame, - Index, - MultiIndex, - Series, - Timestamp, - date_range, -) -import pandas._testing as tm -from pandas.tests.groupby import get_groupby_method_args -from pandas.util import _test_decorators as td - - -@pytest.fixture( - params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"], - ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"], -) -def dtypes_for_minmax(request): - """ - Fixture of dtypes with min and max values used for testing - cummin and cummax - """ - dtype = request.param - - np_type = dtype - if dtype == "Int64": - np_type = np.int64 - elif dtype == "Float64": - np_type = np.float64 - - min_val = ( - np.iinfo(np_type).min - if np.dtype(np_type).kind == "i" - else np.finfo(np_type).min - ) - max_val = ( - np.iinfo(np_type).max - if np.dtype(np_type).kind == "i" - else np.finfo(np_type).max - ) - - return (dtype, min_val, max_val) - - -def test_intercept_builtin_sum(): - s = Series([1.0, 2.0, np.nan, 3.0]) - grouped = s.groupby([0, 1, 2, 2]) - - msg = "using SeriesGroupBy.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result = grouped.agg(builtins.sum) - msg = "using np.sum" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#53425 - result2 = grouped.apply(builtins.sum) - expected = grouped.sum() - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result2, expected) - - -@pytest.mark.parametrize("f", [max, min, sum]) -@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key -def test_builtins_apply(keys, f): - # see gh-8155 - rs = np.random.default_rng(2) - df = DataFrame(rs.integers(1, 7, (10, 2)), columns=["jim", "joe"]) - df["jolie"] = rs.standard_normal(10) - - gb = df.groupby(keys) - - fname = f.__name__ - - warn = None if f is not sum else FutureWarning - msg = "The behavior of DataFrame.sum with axis=None is deprecated" - with tm.assert_produces_warning( - warn, match=msg, check_stacklevel=False, raise_on_extra_warnings=False - ): - # Also warns on deprecation GH#53425 - result = gb.apply(f) - ngroups = len(df.drop_duplicates(subset=keys)) - - assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" - assert result.shape == (ngroups, 3), assert_msg - - npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function - expected = gb.apply(npfunc) - tm.assert_frame_equal(result, expected) - - with tm.assert_produces_warning(None): - expected2 = gb.apply(lambda x: npfunc(x)) - tm.assert_frame_equal(result, expected2) - - if f != sum: - expected = gb.agg(fname).reset_index() - expected.set_index(keys, inplace=True, drop=False) - tm.assert_frame_equal(result, expected, check_dtype=False) - - tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0)) - - -class TestNumericOnly: - # make 
sure that we are passing thru kwargs to our agg functions - - @pytest.fixture - def df(self): - # GH3668 - # GH5724 - df = DataFrame( - { - "group": [1, 1, 2], - "int": [1, 2, 3], - "float": [4.0, 5.0, 6.0], - "string": list("abc"), - "category_string": Series(list("abc")).astype("category"), - "category_int": [7, 8, 9], - "datetime": date_range("20130101", periods=3), - "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"), - "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), - }, - columns=[ - "group", - "int", - "float", - "string", - "category_string", - "category_int", - "datetime", - "datetimetz", - "timedelta", - ], - ) - return df - - @pytest.mark.parametrize("method", ["mean", "median"]) - def test_averages(self, df, method): - # mean / median - expected_columns_numeric = Index(["int", "float", "category_int"]) - - gb = df.groupby("group") - expected = DataFrame( - { - "category_int": [7.5, 9], - "float": [4.5, 6.0], - "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")], - "int": [1.5, 3], - "datetime": [ - Timestamp("2013-01-01 12:00:00"), - Timestamp("2013-01-03 00:00:00"), - ], - "datetimetz": [ - Timestamp("2013-01-01 12:00:00", tz="US/Eastern"), - Timestamp("2013-01-03 00:00:00", tz="US/Eastern"), - ], - }, - index=Index([1, 2], name="group"), - columns=[ - "int", - "float", - "category_int", - ], - ) - - result = getattr(gb, method)(numeric_only=True) - tm.assert_frame_equal(result.reindex_like(expected), expected) - - expected_columns = expected.columns - - self._check(df, method, expected_columns, expected_columns_numeric) - - @pytest.mark.parametrize("method", ["min", "max"]) - def test_extrema(self, df, method): - # TODO: min, max *should* handle - # categorical (ordered) dtype - - expected_columns = Index( - [ - "int", - "float", - "string", - "category_int", - "datetime", - "datetimetz", - "timedelta", - ] - ) - expected_columns_numeric = expected_columns - - self._check(df, method, expected_columns, expected_columns_numeric) - - @pytest.mark.parametrize("method", ["first", "last"]) - def test_first_last(self, df, method): - expected_columns = Index( - [ - "int", - "float", - "string", - "category_string", - "category_int", - "datetime", - "datetimetz", - "timedelta", - ] - ) - expected_columns_numeric = expected_columns - - self._check(df, method, expected_columns, expected_columns_numeric) - - @pytest.mark.parametrize("method", ["sum", "cumsum"]) - def test_sum_cumsum(self, df, method): - expected_columns_numeric = Index(["int", "float", "category_int"]) - expected_columns = Index( - ["int", "float", "string", "category_int", "timedelta"] - ) - if method == "cumsum": - # cumsum loses string - expected_columns = Index(["int", "float", "category_int", "timedelta"]) - - self._check(df, method, expected_columns, expected_columns_numeric) - - @pytest.mark.parametrize("method", ["prod", "cumprod"]) - def test_prod_cumprod(self, df, method): - expected_columns = Index(["int", "float", "category_int"]) - expected_columns_numeric = expected_columns - - self._check(df, method, expected_columns, expected_columns_numeric) - - @pytest.mark.parametrize("method", ["cummin", "cummax"]) - def test_cummin_cummax(self, df, method): - # like min, max, but don't include strings - expected_columns = Index( - ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"] - ) - - # GH#15561: numeric_only=False set by default like min/max - expected_columns_numeric = expected_columns - - self._check(df, method, expected_columns, expected_columns_numeric) - - 
def _check(self, df, method, expected_columns, expected_columns_numeric): - gb = df.groupby("group") - - # object dtypes for transformations are not implemented in Cython and - # have no Python fallback - exception = NotImplementedError if method.startswith("cum") else TypeError - - if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"): - # The methods default to numeric_only=False and raise TypeError - msg = "|".join( - [ - "Categorical is not ordered", - f"Cannot perform {method} with non-ordered Categorical", - re.escape(f"agg function failed [how->{method},dtype->object]"), - # cumsum/cummin/cummax/cumprod - "function is not implemented for this dtype", - ] - ) - with pytest.raises(exception, match=msg): - getattr(gb, method)() - elif method in ("sum", "mean", "median", "prod"): - msg = "|".join( - [ - "category type does not support sum operations", - re.escape(f"agg function failed [how->{method},dtype->object]"), - ] - ) - with pytest.raises(exception, match=msg): - getattr(gb, method)() - else: - result = getattr(gb, method)() - tm.assert_index_equal(result.columns, expected_columns_numeric) - - if method not in ("first", "last"): - msg = "|".join( - [ - "Categorical is not ordered", - "category type does not support", - "function is not implemented for this dtype", - f"Cannot perform {method} with non-ordered Categorical", - re.escape(f"agg function failed [how->{method},dtype->object]"), - ] - ) - with pytest.raises(exception, match=msg): - getattr(gb, method)(numeric_only=False) - else: - result = getattr(gb, method)(numeric_only=False) - tm.assert_index_equal(result.columns, expected_columns) - - -class TestGroupByNonCythonPaths: - # GH#5610 non-cython calls should not include the grouper - # Tests for code not expected to go through cython paths. 
- - @pytest.fixture - def df(self): - df = DataFrame( - [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], - columns=["A", "B", "C"], - ) - return df - - @pytest.fixture - def gb(self, df): - gb = df.groupby("A") - return gb - - @pytest.fixture - def gni(self, df): - gni = df.groupby("A", as_index=False) - return gni - - def test_describe(self, df, gb, gni): - # describe - expected_index = Index([1, 3], name="A") - expected_col = MultiIndex( - levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], - codes=[[0] * 8, list(range(8))], - ) - expected = DataFrame( - [ - [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], - [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - ], - index=expected_index, - columns=expected_col, - ) - result = gb.describe() - tm.assert_frame_equal(result, expected) - - expected = expected.reset_index() - result = gni.describe() - tm.assert_frame_equal(result, expected) - - -def test_cython_api2(): - # this takes the fast apply path - - # cumsum (GH5614) - df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]) - expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"]) - result = df.groupby("A").cumsum() - tm.assert_frame_equal(result, expected) - - # GH 5755 - cumsum is a transformer and should ignore as_index - result = df.groupby("A", as_index=False).cumsum() - tm.assert_frame_equal(result, expected) - - # GH 13994 - msg = "DataFrameGroupBy.cumsum with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").cumsum(axis=1) - expected = df.cumsum(axis=1) - tm.assert_frame_equal(result, expected) - - msg = "DataFrameGroupBy.cumprod with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.groupby("A").cumprod(axis=1) - expected = df.cumprod(axis=1) - tm.assert_frame_equal(result, expected) - - -def test_cython_median(): - arr = np.random.default_rng(2).standard_normal(1000) - arr[::2] = np.nan - df = DataFrame(arr) - - labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) - labels[::17] = np.nan - - result = df.groupby(labels).median() - msg = "using DataFrameGroupBy.median" - with tm.assert_produces_warning(FutureWarning, match=msg): - exp = df.groupby(labels).agg(np.nanmedian) - tm.assert_frame_equal(result, exp) - - df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5))) - msg = "using DataFrameGroupBy.median" - with tm.assert_produces_warning(FutureWarning, match=msg): - rs = df.groupby(labels).agg(np.median) - xp = df.groupby(labels).median() - tm.assert_frame_equal(rs, xp) - - -def test_median_empty_bins(observed): - df = DataFrame(np.random.default_rng(2).integers(0, 44, 500)) - - grps = range(0, 55, 5) - bins = pd.cut(df[0], grps) - - result = df.groupby(bins, observed=observed).median() - expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"] -) -@pytest.mark.parametrize( - "method,data", - [ - ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), - ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), - ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), - ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), - ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}), - ], -) -def test_groupby_non_arithmetic_agg_types(dtype, method, data): - # GH9311, 
GH6620 - df = DataFrame( - [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}] - ) - - df["b"] = df.b.astype(dtype) - - if "args" not in data: - data["args"] = [] - - if "out_type" in data: - out_type = data["out_type"] - else: - out_type = dtype - - exp = data["df"] - df_out = DataFrame(exp) - - df_out["b"] = df_out.b.astype(out_type) - df_out.set_index("a", inplace=True) - - grpd = df.groupby("a") - t = getattr(grpd, method)(*data["args"]) - tm.assert_frame_equal(t, df_out) - - -@pytest.mark.parametrize( - "i", - [ - ( - Timestamp("2011-01-15 12:50:28.502376"), - Timestamp("2011-01-20 12:50:28.593448"), - ), - (24650000000000001, 24650000000000002), - ], -) -def test_groupby_non_arithmetic_agg_int_like_precision(i): - # see gh-6620, gh-9311 - df = DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}]) - - grp_exp = { - "first": {"expected": i[0]}, - "last": {"expected": i[1]}, - "min": {"expected": i[0]}, - "max": {"expected": i[1]}, - "nth": {"expected": i[1], "args": [1]}, - "count": {"expected": 2}, - } - - for method, data in grp_exp.items(): - if "args" not in data: - data["args"] = [] - - grouped = df.groupby("a") - res = getattr(grouped, method)(*data["args"]) - - assert res.iloc[0].b == data["expected"] - - -@pytest.mark.parametrize( - "func, values", - [ - ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}), - ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}), - ], -) -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_idxmin_idxmax_returns_int_types(func, values, numeric_only): - # GH 25444 - df = DataFrame( - { - "name": ["A", "A", "B", "B"], - "c_int": [1, 2, 3, 4], - "c_float": [4.02, 3.03, 2.04, 1.05], - "c_date": ["2019", "2018", "2016", "2017"], - } - ) - df["c_date"] = pd.to_datetime(df["c_date"]) - df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific") - df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0] - df["c_period"] = df["c_date"].dt.to_period("W") - df["c_Integer"] = df["c_int"].astype("Int64") - df["c_Floating"] = df["c_float"].astype("Float64") - - result = getattr(df.groupby("name"), func)(numeric_only=numeric_only) - - expected = DataFrame(values, index=Index(["A", "B"], name="name")) - if numeric_only: - expected = expected.drop(columns=["c_date"]) - else: - expected["c_date_tz"] = expected["c_date"] - expected["c_timedelta"] = expected["c_date"] - expected["c_period"] = expected["c_date"] - expected["c_Integer"] = expected["c_int"] - expected["c_Floating"] = expected["c_float"] - - tm.assert_frame_equal(result, expected) - - -def test_idxmin_idxmax_axis1(): - df = DataFrame( - np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] - ) - df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] - - gb = df.groupby("A") - - warn_msg = "DataFrameGroupBy.idxmax with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - res = gb.idxmax(axis=1) - - alt = df.iloc[:, 1:].idxmax(axis=1) - indexer = res.index.get_level_values(1) - - tm.assert_series_equal(alt[indexer], res.droplevel("A")) - - df["E"] = date_range("2016-01-01", periods=10) - gb2 = df.groupby("A") - - msg = "'>' not supported between instances of 'Timestamp' and 'float'" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - gb2.idxmax(axis=1) - - -@pytest.mark.parametrize("numeric_only", [True, False, None]) -def test_axis1_numeric_only(request, groupby_func, numeric_only): - if groupby_func in ("idxmax", "idxmin"): - 
pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1") - if groupby_func in ("corrwith", "skew"): - msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - - df = DataFrame( - np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] - ) - df["E"] = "x" - groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] - gb = df.groupby(groups) - method = getattr(gb, groupby_func) - args = get_groupby_method_args(groupby_func, df) - kwargs = {"axis": 1} - if numeric_only is not None: - # when numeric_only is None we don't pass any argument - kwargs["numeric_only"] = numeric_only - - # Functions without numeric_only and axis args - no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift") - # Functions with axis args - has_axis = ( - "cumprod", - "cumsum", - "diff", - "pct_change", - "rank", - "shift", - "cummax", - "cummin", - "idxmin", - "idxmax", - "fillna", - ) - warn_msg = f"DataFrameGroupBy.{groupby_func} with axis=1 is deprecated" - if numeric_only is not None and groupby_func in no_args: - msg = "got an unexpected keyword argument 'numeric_only'" - if groupby_func in ["cumprod", "cumsum"]: - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - method(*args, **kwargs) - else: - with pytest.raises(TypeError, match=msg): - method(*args, **kwargs) - elif groupby_func not in has_axis: - msg = "got an unexpected keyword argument 'axis'" - with pytest.raises(TypeError, match=msg): - method(*args, **kwargs) - # fillna and shift are successful even on object dtypes - elif (numeric_only is None or not numeric_only) and groupby_func not in ( - "fillna", - "shift", - ): - msgs = ( - # cummax, cummin, rank - "not supported between instances of", - # cumprod - "can't multiply sequence by non-int of type 'float'", - # cumsum, diff, pct_change - "unsupported operand type", - ) - with pytest.raises(TypeError, match=f"({'|'.join(msgs)})"): - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - method(*args, **kwargs) - else: - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - result = method(*args, **kwargs) - - df_expected = df.drop(columns="E").T if numeric_only else df.T - expected = getattr(df_expected, groupby_func)(*args).T - if groupby_func == "shift" and not numeric_only: - # shift with axis=1 leaves the leftmost column as numeric - # but transposing for expected gives us object dtype - expected = expected.astype(float) - - tm.assert_equal(result, expected) - - -def test_groupby_cumprod(): - # GH 4095 - df = DataFrame({"key": ["b"] * 10, "value": 2}) - - actual = df.groupby("key")["value"].cumprod() - expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) - expected.name = "value" - tm.assert_series_equal(actual, expected) - - df = DataFrame({"key": ["b"] * 100, "value": 2}) - df["value"] = df["value"].astype(float) - actual = df.groupby("key")["value"].cumprod() - expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod()) - expected.name = "value" - tm.assert_series_equal(actual, expected) - - -def test_groupby_cumprod_overflow(): - # GH#37493 if we overflow we return garbage consistent with numpy - df = DataFrame({"key": ["b"] * 4, "value": 100_000}) - actual = df.groupby("key")["value"].cumprod() - expected = Series( - [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920], - name="value", - ) - 
tm.assert_series_equal(actual, expected) - - numpy_result = df.groupby("key", group_keys=False)["value"].apply( - lambda x: x.cumprod() - ) - numpy_result.name = "value" - tm.assert_series_equal(actual, numpy_result) - - -def test_groupby_cumprod_nan_influences_other_columns(): - # GH#48064 - df = DataFrame( - { - "a": 1, - "b": [1, np.nan, 2], - "c": [1, 2, 3.0], - } - ) - result = df.groupby("a").cumprod(numeric_only=True, skipna=False) - expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]}) - tm.assert_frame_equal(result, expected) - - -def scipy_sem(*args, **kwargs): - from scipy.stats import sem - - return sem(*args, ddof=1, **kwargs) - - -@pytest.mark.parametrize( - "op,targop", - [ - ("mean", np.mean), - ("median", np.median), - ("std", np.std), - ("var", np.var), - ("sum", np.sum), - ("prod", np.prod), - ("min", np.min), - ("max", np.max), - ("first", lambda x: x.iloc[0]), - ("last", lambda x: x.iloc[-1]), - ("count", np.size), - pytest.param("sem", scipy_sem, marks=td.skip_if_no_scipy), - ], -) -def test_ops_general(op, targop): - df = DataFrame(np.random.default_rng(2).standard_normal(1000)) - labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) - - result = getattr(df.groupby(labels), op)() - warn = None if op in ("first", "last", "count", "sem") else FutureWarning - msg = f"using DataFrameGroupBy.{op}" - with tm.assert_produces_warning(warn, match=msg): - expected = df.groupby(labels).agg(targop) - tm.assert_frame_equal(result, expected) - - -def test_max_nan_bug(): - raw = """,Date,app,File --04-23,2013-04-23 00:00:00,,log080001.log --05-06,2013-05-06 00:00:00,,log.log --05-07,2013-05-07 00:00:00,OE,xlsx""" - - with tm.assert_produces_warning(UserWarning, match="Could not infer format"): - df = pd.read_csv(StringIO(raw), parse_dates=[0]) - gb = df.groupby("Date") - r = gb[["File"]].max() - e = gb["File"].max().to_frame() - tm.assert_frame_equal(r, e) - assert not r["File"].isna().any() - - -def test_nlargest(): - a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list("a" * 5 + "b" * 5)) - gb = a.groupby(b) - r = gb.nlargest(3) - e = Series( - [7, 5, 3, 10, 9, 6], - index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]), - ) - tm.assert_series_equal(r, e) - - a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) - gb = a.groupby(b) - e = Series( - [3, 2, 1, 3, 3, 2], - index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]), - ) - tm.assert_series_equal(gb.nlargest(3, keep="last"), e) - - -def test_nlargest_mi_grouper(): - # see gh-21411 - npr = np.random.default_rng(2) - - dts = date_range("20180101", periods=10) - iterables = [dts, ["one", "two"]] - - idx = MultiIndex.from_product(iterables, names=["first", "second"]) - s = Series(npr.standard_normal(20), index=idx) - - result = s.groupby("first").nlargest(1) - - exp_idx = MultiIndex.from_tuples( - [ - (dts[0], dts[0], "one"), - (dts[1], dts[1], "one"), - (dts[2], dts[2], "one"), - (dts[3], dts[3], "two"), - (dts[4], dts[4], "one"), - (dts[5], dts[5], "one"), - (dts[6], dts[6], "one"), - (dts[7], dts[7], "one"), - (dts[8], dts[8], "one"), - (dts[9], dts[9], "one"), - ], - names=["first", "first", "second"], - ) - - exp_values = [ - 0.18905338179353307, - -0.41306354339189344, - 1.799707382720902, - 0.7738065867276614, - 0.28121066979764925, - 0.9775674511260357, - -0.3288239040579627, - 0.45495807124085547, - 0.5452887139646817, - 0.12682784711186987, - ] - - expected = Series(exp_values, index=exp_idx) - tm.assert_series_equal(result, expected, check_exact=False, 
rtol=1e-3) - - -def test_nsmallest(): - a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list("a" * 5 + "b" * 5)) - gb = a.groupby(b) - r = gb.nsmallest(3) - e = Series( - [1, 2, 3, 0, 4, 6], - index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), - ) - tm.assert_series_equal(r, e) - - a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) - gb = a.groupby(b) - e = Series( - [0, 1, 1, 0, 1, 2], - index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), - ) - tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) - - -@pytest.mark.parametrize( - "data, groups", - [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])], -) -@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES]) -@pytest.mark.parametrize("method", ["nlargest", "nsmallest"]) -def test_nlargest_and_smallest_noop(data, groups, dtype, method): - # GH 15272, GH 16345, GH 29129 - # Test nlargest/smallest when it results in a noop, - # i.e. input is sorted and group size <= n - if dtype is not None: - data = np.array(data, dtype=dtype) - if method == "nlargest": - data = list(reversed(data)) - ser = Series(data, name="a") - result = getattr(ser.groupby(groups), method)(n=2) - expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups - expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a") - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("func", ["cumprod", "cumsum"]) -def test_numpy_compat(func): - # see gh-12811 - df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) - g = df.groupby("A") - - msg = "numpy operations are not valid with groupby" - - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(g, func)(1, 2, 3) - with pytest.raises(UnsupportedFunctionCall, match=msg): - getattr(g, func)(foo=1) - - -def test_cummin(dtypes_for_minmax): - dtype = dtypes_for_minmax[0] - min_val = dtypes_for_minmax[1] - - # GH 15048 - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) - expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] - - df = base_df.astype(dtype) - - expected = DataFrame({"B": expected_mins}).astype(dtype) - result = df.groupby("A").cummin() - tm.assert_frame_equal(result, expected) - result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test w/ min value for dtype - df.loc[[2, 6], "B"] = min_val - df.loc[[1, 5], "B"] = min_val + 1 - expected.loc[[2, 3, 6, 7], "B"] = min_val - expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val - result = df.groupby("A").cummin() - tm.assert_frame_equal(result, expected, check_exact=True) - expected = ( - df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() - ) - tm.assert_frame_equal(result, expected, check_exact=True) - - # Test nan in some values - # Explicit cast to float to avoid implicit cast when setting nan - base_df = base_df.astype({"B": "float"}) - base_df.loc[[0, 2, 4, 6], "B"] = np.nan - expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) - result = base_df.groupby("A").cummin() - tm.assert_frame_equal(result, expected) - expected = ( - base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame() - ) - tm.assert_frame_equal(result, expected) - - # GH 15561 - df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) - expected = Series(pd.to_datetime("2001"), index=[0], name="b") - - result = df.groupby("a")["b"].cummin() - tm.assert_series_equal(expected, result) - - # GH 15635 - df = 
DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]}) - result = df.groupby("a").b.cummin() - expected = Series([1, 2, 1], name="b") - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"]) -def test_cummin_max_all_nan_column(method, dtype): - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) - base_df["B"] = base_df["B"].astype(dtype) - grouped = base_df.groupby("A") - - expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype) - result = getattr(grouped, method)() - tm.assert_frame_equal(expected, result) - - result = getattr(grouped["B"], method)().to_frame() - tm.assert_frame_equal(expected, result) - - -def test_cummax(dtypes_for_minmax): - dtype = dtypes_for_minmax[0] - max_val = dtypes_for_minmax[2] - - # GH 15048 - base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}) - expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] - - df = base_df.astype(dtype) - - expected = DataFrame({"B": expected_maxs}).astype(dtype) - result = df.groupby("A").cummax() - tm.assert_frame_equal(result, expected) - result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test w/ max value for dtype - df.loc[[2, 6], "B"] = max_val - expected.loc[[2, 3, 6, 7], "B"] = max_val - result = df.groupby("A").cummax() - tm.assert_frame_equal(result, expected) - expected = ( - df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() - ) - tm.assert_frame_equal(result, expected) - - # Test nan in some values - # Explicit cast to float to avoid implicit cast when setting nan - base_df = base_df.astype({"B": "float"}) - base_df.loc[[0, 2, 4, 6], "B"] = np.nan - expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) - result = base_df.groupby("A").cummax() - tm.assert_frame_equal(result, expected) - expected = ( - base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame() - ) - tm.assert_frame_equal(result, expected) - - # GH 15561 - df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])}) - expected = Series(pd.to_datetime("2001"), index=[0], name="b") - - result = df.groupby("a")["b"].cummax() - tm.assert_series_equal(expected, result) - - # GH 15635 - df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]}) - result = df.groupby("a").b.cummax() - expected = Series([2, 1, 2], name="b") - tm.assert_series_equal(result, expected) - - -def test_cummax_i8_at_implementation_bound(): - # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT - # for int64 dtype GH#46382 - ser = Series([pd.NaT._value + n for n in range(5)]) - df = DataFrame({"A": 1, "B": ser, "C": ser.view("M8[ns]")}) - gb = df.groupby("A") - - res = gb.cummax() - exp = df[["B", "C"]] - tm.assert_frame_equal(res, exp) - - -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"]) -@pytest.mark.parametrize( - "groups,expected_data", - [ - ([1, 1, 1], [1, None, None]), - ([1, 2, 3], [1, None, 2]), - ([1, 3, 3], [1, None, None]), - ], -) -def test_cummin_max_skipna(method, dtype, groups, expected_data): - # GH-34047 - df = DataFrame({"a": Series([1, None, 2], dtype=dtype)}) - orig = df.copy() - gb = df.groupby(groups)["a"] - - result = getattr(gb, method)(skipna=False) - expected = Series(expected_data, dtype=dtype, name="a") - - # check we didn't accidentally alter df - tm.assert_frame_equal(df, orig) 
- - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -def test_cummin_max_skipna_multiple_cols(method): - # Ensure missing value in "a" doesn't cause "b" to be nan-filled - df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]}) - gb = df.groupby([1, 1, 1])[["a", "b"]] - - result = getattr(gb, method)(skipna=False) - expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]}) - - tm.assert_frame_equal(result, expected) - - -@td.skip_if_32bit -@pytest.mark.parametrize("method", ["cummin", "cummax"]) -@pytest.mark.parametrize( - "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)] -) -def test_nullable_int_not_cast_as_float(method, dtype, val): - data = [val, pd.NA] - df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype) - grouped = df.groupby("grp") - - result = grouped.transform(method) - expected = DataFrame({"b": data}, dtype=dtype) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "in_vals, out_vals", - [ - # Basics: strictly increasing (T), strictly decreasing (F), - # abs val increasing (F), non-strictly increasing (T) - ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), - # Test with inf vals - ( - [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], - [True, False, True, False], - ), - # Test with nan vals; should always be False - ( - [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False], - ), - ], -) -def test_is_monotonic_increasing(in_vals, out_vals): - # GH 17015 - source_dict = { - "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], - "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], - "C": in_vals, - } - df = DataFrame(source_dict) - result = df.groupby("B").C.is_monotonic_increasing - index = Index(list("abcd"), name="B") - expected = Series(index=index, data=out_vals, name="C") - tm.assert_series_equal(result, expected) - - # Also check result equal to manually taking x.is_monotonic_increasing. - expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "in_vals, out_vals", - [ - # Basics: strictly decreasing (T), strictly increasing (F), - # abs val decreasing (F), non-strictly increasing (T) - ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), - # Test with inf vals - ( - [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], - [True, True, False, True], - ), - # Test with nan vals; should always be False - ( - [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False], - ), - ], -) -def test_is_monotonic_decreasing(in_vals, out_vals): - # GH 17015 - source_dict = { - "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], - "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], - "C": in_vals, - } - - df = DataFrame(source_dict) - result = df.groupby("B").C.is_monotonic_decreasing - index = Index(list("abcd"), name="B") - expected = Series(index=index, data=out_vals, name="C") - tm.assert_series_equal(result, expected) - - -# describe -# -------------------------------- - - -def test_apply_describe_bug(mframe): - grouped = mframe.groupby(level="first") - grouped.describe() # it works! 
- - -def test_series_describe_multikey(): - ts = tm.makeTimeSeries() - grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) - tm.assert_series_equal(result["std"], grouped.std(), check_names=False) - tm.assert_series_equal(result["min"], grouped.min(), check_names=False) - - -def test_series_describe_single(): - ts = tm.makeTimeSeries() - grouped = ts.groupby(lambda x: x.month) - result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack(future_stack=True) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]]) -def test_series_describe_as_index(as_index, keys): - # GH#49256 - df = DataFrame( - { - "key1": ["one", "two", "two", "three", "two"], - "key2": ["one", "two", "two", "three", "two"], - "foo2": [1, 2, 4, 4, 6], - } - ) - gb = df.groupby(keys, as_index=as_index)["foo2"] - result = gb.describe() - expected = DataFrame( - { - "key1": ["one", "three", "two"], - "count": [1.0, 1.0, 3.0], - "mean": [1.0, 4.0, 4.0], - "std": [np.nan, np.nan, 2.0], - "min": [1.0, 4.0, 2.0], - "25%": [1.0, 4.0, 3.0], - "50%": [1.0, 4.0, 4.0], - "75%": [1.0, 4.0, 5.0], - "max": [1.0, 4.0, 6.0], - } - ) - if len(keys) == 2: - expected.insert(1, "key2", expected["key1"]) - if as_index: - expected = expected.set_index(keys) - tm.assert_frame_equal(result, expected) - - -def test_series_index_name(df): - grouped = df.loc[:, ["C"]].groupby(df["A"]) - result = grouped.agg(lambda x: x.mean()) - assert result.index.name == "A" - - -def test_frame_describe_multikey(tsframe): - grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - desc_groups = [] - for col in tsframe: - group = grouped[col].describe() - # GH 17464 - Remove duplicate MultiIndex levels - group_col = MultiIndex( - levels=[[col], group.columns], - codes=[[0] * len(group.columns), range(len(group.columns))], - ) - group = DataFrame(group.values, columns=group_col, index=group.index) - desc_groups.append(group) - expected = pd.concat(desc_groups, axis=1) - tm.assert_frame_equal(result, expected) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) - result = groupedT.describe() - expected = tsframe.describe().T - # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/ - expected.index = MultiIndex( - levels=[[0, 1], expected.index], - codes=[[0, 0, 1, 1], range(len(expected.index))], - ) - tm.assert_frame_equal(result, expected) - - -def test_frame_describe_tupleindex(): - # GH 14848 - regression from 0.19.0 to 0.19.1 - df1 = DataFrame( - { - "x": [1, 2, 3, 4, 5] * 3, - "y": [10, 20, 30, 40, 50] * 3, - "z": [100, 200, 300, 400, 500] * 3, - } - ) - df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 - df2 = df1.rename(columns={"k": "key"}) - msg = "Names should be list-like for a MultiIndex" - with pytest.raises(ValueError, match=msg): - df1.groupby("k").describe() - with pytest.raises(ValueError, match=msg): - df2.groupby("key").describe() - - -def test_frame_describe_unstacked_format(): - # GH 4792 - prices = { - Timestamp("2011-01-06 10:59:05", tz=None): 24990, - Timestamp("2011-01-06 12:43:33", tz=None): 25499, - Timestamp("2011-01-06 12:54:09", tz=None): 25499, - } - volumes = { - Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, - Timestamp("2011-01-06 12:43:33", 
tz=None): 5000000000, - Timestamp("2011-01-06 12:54:09", tz=None): 100000000, - } - df = DataFrame({"PRICE": prices, "VOLUME": volumes}) - result = df.groupby("PRICE").VOLUME.describe() - data = [ - df[df.PRICE == 24990].VOLUME.describe().values.tolist(), - df[df.PRICE == 25499].VOLUME.describe().values.tolist(), - ] - expected = DataFrame( - data, - index=Index([24990, 25499], name="PRICE"), - columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.filterwarnings( - "ignore:" - "indexing past lexsort depth may impact performance:" - "pandas.errors.PerformanceWarning" -) -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) -def test_describe_with_duplicate_output_column_names(as_index, keys): - # GH 35314 - df = DataFrame( - { - "a1": [99, 99, 99, 88, 88, 88], - "a2": [99, 99, 99, 88, 88, 88], - "b": [1, 2, 3, 4, 5, 6], - "c": [10, 20, 30, 40, 50, 60], - }, - columns=["a1", "a2", "b", "b"], - copy=False, - ) - if keys == ["a1"]: - df = df.drop(columns="a2") - - expected = ( - DataFrame.from_records( - [ - ("b", "count", 3.0, 3.0), - ("b", "mean", 5.0, 2.0), - ("b", "std", 1.0, 1.0), - ("b", "min", 4.0, 1.0), - ("b", "25%", 4.5, 1.5), - ("b", "50%", 5.0, 2.0), - ("b", "75%", 5.5, 2.5), - ("b", "max", 6.0, 3.0), - ("b", "count", 3.0, 3.0), - ("b", "mean", 5.0, 2.0), - ("b", "std", 1.0, 1.0), - ("b", "min", 4.0, 1.0), - ("b", "25%", 4.5, 1.5), - ("b", "50%", 5.0, 2.0), - ("b", "75%", 5.5, 2.5), - ("b", "max", 6.0, 3.0), - ], - ) - .set_index([0, 1]) - .T - ) - expected.columns.names = [None, None] - if len(keys) == 2: - expected.index = MultiIndex( - levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"] - ) - else: - expected.index = Index([88, 99], name="a1") - - if not as_index: - expected = expected.reset_index() - - result = df.groupby(keys, as_index=as_index).describe() - - tm.assert_frame_equal(result, expected) - - -def test_describe_duplicate_columns(): - # GH#50806 - df = DataFrame([[0, 1, 2, 3]]) - df.columns = [0, 1, 2, 0] - gb = df.groupby(df[1]) - result = gb.describe(percentiles=[]) - - columns = ["count", "mean", "std", "min", "50%", "max"] - frames = [ - DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns) - for val in (0.0, 2.0, 3.0) - ] - expected = pd.concat(frames, axis=1) - expected.columns = MultiIndex( - levels=[[0, 2], columns], - codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))], - ) - expected.index.names = [1] - tm.assert_frame_equal(result, expected) - - -def test_groupby_mean_no_overflow(): - # Regression test for (#22487) - df = DataFrame( - { - "user": ["A", "A", "A", "A", "A"], - "connections": [4970, 4749, 4719, 4704, 18446744073699999744], - } - ) - assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 - - -@pytest.mark.parametrize( - "values", - [ - { - "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], - "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2], - }, - {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]}, - ], -) -@pytest.mark.parametrize("function", ["mean", "median", "var"]) -def test_apply_to_nullable_integer_returns_float(values, function): - # https://github.com/pandas-dev/pandas/issues/32219 - output = 0.5 if function == "var" else 1.5 - arr = np.array([output] * 3, dtype=float) - idx = Index([1, 2, 3], name="a", dtype="Int64") - expected = DataFrame({"b": arr}, index=idx).astype("Float64") - - groups = DataFrame(values, dtype="Int64").groupby("a") - - result = 
getattr(groups, function)() - tm.assert_frame_equal(result, expected) - - result = groups.agg(function) - tm.assert_frame_equal(result, expected) - - result = groups.agg([function]) - expected.columns = MultiIndex.from_tuples([("b", function)]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("min_count", [0, 10]) -def test_groupby_sum_mincount_boolean(min_count): - b = True - a = False - na = np.nan - dfg = pd.array([b, b, na, na, a, a, b], dtype="boolean") - - df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": dfg}) - result = df.groupby("A").sum(min_count=min_count) - if min_count == 0: - expected = DataFrame( - {"B": pd.array([3, 0, 0], dtype="Int64")}, - index=Index([1, 2, 3], name="A"), - ) - tm.assert_frame_equal(result, expected) - else: - expected = DataFrame( - {"B": pd.array([pd.NA] * 3, dtype="Int64")}, - index=Index([1, 2, 3], name="A"), - ) - tm.assert_frame_equal(result, expected) - - -def test_groupby_sum_below_mincount_nullable_integer(): - # https://github.com/pandas-dev/pandas/issues/32861 - df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") - grouped = df.groupby("a") - idx = Index([0, 1, 2], name="a", dtype="Int64") - - result = grouped["b"].sum(min_count=2) - expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") - tm.assert_series_equal(result, expected) - - result = grouped.sum(min_count=2) - expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx) - tm.assert_frame_equal(result, expected) - - -def test_mean_on_timedelta(): - # GH 17382 - df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5}) - result = df.groupby("cat")["time"].mean() - expected = Series( - pd.to_timedelta([4, 5]), name="time", index=Index(["A", "B"], name="cat") - ) - tm.assert_series_equal(result, expected) - - -def test_groupby_sum_timedelta_with_nat(): - # GH#42659 - df = DataFrame( - { - "a": [1, 1, 2, 2], - "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], - } - ) - td3 = pd.Timedelta(days=3) - - gb = df.groupby("a") - - res = gb.sum() - expected = DataFrame({"b": [td3, td3]}, index=Index([1, 2], name="a")) - tm.assert_frame_equal(res, expected) - - res = gb["b"].sum() - tm.assert_series_equal(res, expected["b"]) - - res = gb["b"].sum(min_count=2) - expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) - tm.assert_series_equal(res, expected) - - -@pytest.mark.parametrize( - "kernel, has_arg", - [ - ("all", False), - ("any", False), - ("bfill", False), - ("corr", True), - ("corrwith", True), - ("cov", True), - ("cummax", True), - ("cummin", True), - ("cumprod", True), - ("cumsum", True), - ("diff", False), - ("ffill", False), - ("fillna", False), - ("first", True), - ("idxmax", True), - ("idxmin", True), - ("last", True), - ("max", True), - ("mean", True), - ("median", True), - ("min", True), - ("nth", False), - ("nunique", False), - ("pct_change", False), - ("prod", True), - ("quantile", True), - ("sem", True), - ("skew", True), - ("std", True), - ("sum", True), - ("var", True), - ], -) -@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default]) -@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) -def test_numeric_only(kernel, has_arg, numeric_only, keys): - # GH#46072 - # drops_nuisance: Whether the op drops nuisance columns even when numeric_only=False - # has_arg: Whether the op has a numeric_only arg - df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]}) - - args = 
get_groupby_method_args(kernel, df) - kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only} - - gb = df.groupby(keys) - method = getattr(gb, kernel) - if has_arg and numeric_only is True: - # Cases where b does not appear in the result - result = method(*args, **kwargs) - assert "b" not in result.columns - elif ( - # kernels that work on any dtype and have numeric_only arg - kernel in ("first", "last") - or ( - # kernels that work on any dtype and don't have numeric_only arg - kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique") - and numeric_only is lib.no_default - ) - ): - result = method(*args, **kwargs) - assert "b" in result.columns - elif has_arg: - assert numeric_only is not True - # kernels that are successful on any dtype were above; this will fail - - # object dtypes for transformations are not implemented in Cython and - # have no Python fallback - exception = NotImplementedError if kernel.startswith("cum") else TypeError - - msg = "|".join( - [ - "not allowed for this dtype", - "cannot be performed against 'object' dtypes", - # On PY39 message is "a number"; on PY310 and after is "a real number" - "must be a string or a.* number", - "unsupported operand type", - "function is not implemented for this dtype", - re.escape(f"agg function failed [how->{kernel},dtype->object]"), - ] - ) - if kernel == "idxmin": - msg = "'<' not supported between instances of 'type' and 'type'" - elif kernel == "idxmax": - msg = "'>' not supported between instances of 'type' and 'type'" - with pytest.raises(exception, match=msg): - method(*args, **kwargs) - elif not has_arg and numeric_only is not lib.no_default: - with pytest.raises( - TypeError, match="got an unexpected keyword argument 'numeric_only'" - ): - method(*args, **kwargs) - else: - assert kernel in ("diff", "pct_change") - assert numeric_only is lib.no_default - # Doesn't have numeric_only argument and fails on nuisance columns - with pytest.raises(TypeError, match=r"unsupported operand type"): - method(*args, **kwargs) - - -@pytest.mark.parametrize("dtype", [bool, int, float, object]) -def test_deprecate_numeric_only_series(dtype, groupby_func, request): - # GH#46560 - grouper = [0, 0, 1] - - ser = Series([1, 0, 0], dtype=dtype) - gb = ser.groupby(grouper) - - if groupby_func == "corrwith": - # corrwith is not implemented on SeriesGroupBy - assert not hasattr(gb, groupby_func) - return - - method = getattr(gb, groupby_func) - - expected_ser = Series([1, 0, 0]) - expected_gb = expected_ser.groupby(grouper) - expected_method = getattr(expected_gb, groupby_func) - - args = get_groupby_method_args(groupby_func, ser) - - fails_on_numeric_object = ( - "corr", - "cov", - "cummax", - "cummin", - "cumprod", - "cumsum", - "quantile", - ) - # ops that give an object result on object input - obj_result = ( - "first", - "last", - "nth", - "bfill", - "ffill", - "shift", - "sum", - "diff", - "pct_change", - "var", - "mean", - "median", - "min", - "max", - "prod", - "skew", - ) - - # Test default behavior; kernels that fail may be enabled in the future but kernels - # that succeed should not be allowed to fail (without deprecation, at least) - if groupby_func in fails_on_numeric_object and dtype is object: - if groupby_func == "quantile": - msg = "cannot be performed against 'object' dtypes" - else: - msg = "is not supported for object dtype" - with pytest.raises(TypeError, match=msg): - method(*args) - elif dtype is object: - result = method(*args) - expected = expected_method(*args) - if groupby_func in 
obj_result: - expected = expected.astype(object) - tm.assert_series_equal(result, expected) - - has_numeric_only = ( - "first", - "last", - "max", - "mean", - "median", - "min", - "prod", - "quantile", - "sem", - "skew", - "std", - "sum", - "var", - "cummax", - "cummin", - "cumprod", - "cumsum", - ) - if groupby_func not in has_numeric_only: - msg = "got an unexpected keyword argument 'numeric_only'" - with pytest.raises(TypeError, match=msg): - method(*args, numeric_only=True) - elif dtype is object: - msg = "|".join( - [ - "SeriesGroupBy.sem called with numeric_only=True and dtype object", - "Series.skew does not allow numeric_only=True with non-numeric", - "cum(sum|prod|min|max) is not supported for object dtype", - r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric", - ] - ) - with pytest.raises(TypeError, match=msg): - method(*args, numeric_only=True) - elif dtype == bool and groupby_func == "quantile": - msg = "Allowing bool dtype in SeriesGroupBy.quantile" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#51424 - result = method(*args, numeric_only=True) - expected = method(*args, numeric_only=False) - tm.assert_series_equal(result, expected) - else: - result = method(*args, numeric_only=True) - expected = method(*args, numeric_only=False) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("dtype", [int, float, object]) -@pytest.mark.parametrize( - "kwargs", - [ - {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None}, - {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]}, - {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None}, - ], -) -def test_groupby_empty_dataset(dtype, kwargs): - # GH#41575 - df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype) - df["B"] = df["B"].astype(int) - df["C"] = df["C"].astype(float) - - result = df.iloc[:0].groupby("A").describe(**kwargs) - expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0] - tm.assert_frame_equal(result, expected) - - result = df.iloc[:0].groupby("A").B.describe(**kwargs) - expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0] - expected.index = Index([]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("func", ["min", "max"]) -def test_min_empty_string_dtype(func): - # GH#55619 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] - result = getattr(df.groupby("a"), func)() - expected = DataFrame( - columns=["b", "c"], dtype=dtype, index=Index([], dtype=dtype, name="a") - ) - tm.assert_frame_equal(result, expected) - - -def test_corrwith_with_1_axis(): - # GH 47723 - df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]}) - gb = df.groupby("a") - - msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.corrwith(df, axis=1) - index = Index( - data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)], - name=("a", None), - ) - expected = Series([np.nan] * 6, index=index) - tm.assert_series_equal(result, expected) - - -def test_multiindex_group_all_columns_when_empty(groupby_func): - # GH 32464 - df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"]) - gb = df.groupby(["a", "b", "c"], group_keys=False) - method = getattr(gb, groupby_func) - args = get_groupby_method_args(groupby_func, df) - - result = method(*args).index - expected = df.index - 
tm.assert_index_equal(result, expected) - - -def test_duplicate_columns(request, groupby_func, as_index): - # GH#50806 - if groupby_func == "corrwith": - msg = "GH#50845 - corrwith fails when there are duplicate columns" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb")) - args = get_groupby_method_args(groupby_func, df) - gb = df.groupby("a", as_index=as_index) - result = getattr(gb, groupby_func)(*args) - - expected_df = df.set_axis(["a", "b", "c"], axis=1) - expected_args = get_groupby_method_args(groupby_func, expected_df) - expected_gb = expected_df.groupby("a", as_index=as_index) - expected = getattr(expected_gb, groupby_func)(*expected_args) - if groupby_func not in ("size", "ngroup", "cumcount"): - expected = expected.rename(columns={"c": "b"}) - tm.assert_equal(result, expected) - - -@pytest.mark.parametrize( - "op", - [ - "sum", - "prod", - "min", - "max", - "median", - "mean", - "skew", - "std", - "var", - "sem", - ], -) -@pytest.mark.parametrize("axis", [0, 1]) -@pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -def test_regression_allowlist_methods(op, axis, skipna, sort): - # GH6944 - # GH 17537 - # explicitly test the allowlist methods - raw_frame = DataFrame([0]) - if axis == 0: - frame = raw_frame - msg = "The 'axis' keyword in DataFrame.groupby is deprecated and will be" - else: - frame = raw_frame.T - msg = "DataFrame.groupby with axis=1 is deprecated" - - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped = frame.groupby(level=0, axis=axis, sort=sort) - - if op == "skew": - # skew has skipna - result = getattr(grouped, op)(skipna=skipna) - expected = frame.groupby(level=0).apply( - lambda h: getattr(h, op)(axis=axis, skipna=skipna) - ) - if sort: - expected = expected.sort_index(axis=axis) - tm.assert_frame_equal(result, expected) - else: - result = getattr(grouped, op)() - expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(axis=axis)) - if sort: - expected = expected.sort_index(axis=axis) - tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_groupby.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_groupby.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_groupby.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_groupby.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,5 @@ from datetime import datetime +import decimal from decimal import Decimal import re @@ -11,6 +12,8 @@ ) import pandas.util._test_decorators as td +from pandas.core.dtypes.common import is_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -29,7 +32,6 @@ import pandas._testing as tm from pandas.core.arrays import BooleanArray import pandas.core.common as com -from pandas.tests.groupby import get_groupby_method_args pytestmark = pytest.mark.filterwarnings("ignore:Mean of empty slice:RuntimeWarning") @@ -41,13 +43,13 @@ assert result == expected -def test_groupby_std_datetimelike(): +def test_groupby_std_datetimelike(warn_copy_on_write): # GH#48481 tdi = pd.timedelta_range("1 Day", periods=10000) ser = Series(tdi) ser[::5] *= 2 # get different std for different groups - df = ser.to_frame("A") + df = ser.to_frame("A").copy() df["B"] = ser + Timestamp(0) df["C"] = ser + Timestamp(0, tz="UTC") @@ -134,24 +136,35 @@ grouped.aggregate(lambda x: x * 2) -def test_groupby_nonobject_dtype(mframe, df_mixed_floats): - key = mframe.index.codes[0] - grouped = 
mframe.groupby(key) +def test_groupby_nonobject_dtype(multiindex_dataframe_random_data): + key = multiindex_dataframe_random_data.index.codes[0] + grouped = multiindex_dataframe_random_data.groupby(key) result = grouped.sum() - expected = mframe.groupby(key.astype("O")).sum() + expected = multiindex_dataframe_random_data.groupby(key.astype("O")).sum() assert result.index.dtype == np.int8 assert expected.index.dtype == np.int64 tm.assert_frame_equal(result, expected, check_index_type=False) + +def test_groupby_nonobject_dtype_mixed(): # GH 3911, mixed frame non-conversion - df = df_mixed_floats.copy() + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.default_rng(2).standard_normal(8), + "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), + } + ) df["value"] = range(len(df)) def max_value(group): return group.loc[group["value"].idxmax()] - applied = df.groupby("A").apply(max_value) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + applied = df.groupby("A").apply(max_value) result = applied.dtypes expected = df.dtypes tm.assert_series_equal(result, expected) @@ -172,7 +185,9 @@ return grp.iloc[0] expected = df.groupby("A").first()[["B"]] - result = df.groupby("A").apply(f_0)[["B"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("A").apply(f_0)[["B"]] tm.assert_frame_equal(result, expected) def f_1(grp): @@ -180,9 +195,10 @@ return None return grp.iloc[0] - result = df.groupby("A").apply(f_1)[["B"]] - # Cast to avoid upcast when setting nan below - e = expected.copy().astype("float64") + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("A").apply(f_1)[["B"]] + e = expected.copy() e.loc["Tiger"] = np.nan tm.assert_frame_equal(result, e) @@ -191,9 +207,10 @@ return None return grp.iloc[0] - result = df.groupby("A").apply(f_2)[["B"]] - # Explicit cast to float to avoid implicit cast when setting nan - e = expected.copy().astype({"B": "float"}) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("A").apply(f_2)[["B"]] + e = expected.copy() e.loc["Pony"] = np.nan tm.assert_frame_equal(result, e) @@ -203,7 +220,9 @@ return None return grp.iloc[0] - result = df.groupby("A").apply(f_3)[["C"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("A").apply(f_3)[["C"]] e = df.groupby("A").first()[["C"]] e.loc["Pony"] = pd.NaT tm.assert_frame_equal(result, e) @@ -214,7 +233,9 @@ return None return grp.iloc[0].loc["C"] - result = df.groupby("A").apply(f_4) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("A").apply(f_4) e = df.groupby("A").first()["C"].copy() e.loc["Pony"] = np.nan e.name = None @@ -298,7 +319,11 @@ def test_len(): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) grouped = df.groupby([lambda x: 
x.year, lambda x: x.month, lambda x: x.day]) assert len(grouped) == len(df) @@ -306,6 +331,8 @@ expected = len({(x.year, x.month) for x in df.index}) assert len(grouped) == expected + +def test_len_nan_group(): # issue 11016 df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]}) assert len(df.groupby("a")) == 0 @@ -393,8 +420,11 @@ depr_msg = "The behavior of array concatenation with empty entries is deprecated" # correct result - result1 = df.groupby("a").apply(f1) - result2 = df2.groupby("a").apply(f1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result1 = df.groupby("a").apply(f1) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -660,7 +690,7 @@ grouped = data.groupby(["A", "B"]) funcs = ["mean", "std"] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -916,7 +946,11 @@ def test_groupby_multiple_key(): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]) agged = grouped.sum() tm.assert_almost_equal(df.values, agged.values) @@ -949,7 +983,7 @@ def test_raises_on_nuisance(df): grouped = df.groupby("A") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1005,7 +1039,7 @@ msg = "could not convert string to float: 'one'" else: klass = TypeError - msg = re.escape(f"agg function failed [how->{agg_function},dtype->object]") + msg = re.escape(f"agg function failed [how->{agg_function},dtype->") with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: @@ -1030,14 +1064,14 @@ def test_raise_on_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): grouped.mean() -def test_empty_groups_corner(mframe): +def test_empty_groups_corner(multiindex_dataframe_random_data): # handle empty groups df = DataFrame( { @@ -1054,7 +1088,7 @@ expected = grouped.mean(numeric_only=True) tm.assert_frame_equal(result, expected) - grouped = mframe[3:5].groupby(level=0) + grouped = multiindex_dataframe_random_data[3:5].groupby(level=0) agged = grouped.apply(lambda x: x.mean()) agged_A = grouped["A"].apply("mean") tm.assert_series_equal(agged["A"], agged_A) @@ -1068,12 +1102,12 @@ df.groupby(lambda x: x + "foo") -def test_wrap_aggregated_output_multindex(mframe): - df = mframe.T +def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data): + df = multiindex_dataframe_random_data.T df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): 
df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") @@ -1088,24 +1122,24 @@ df.groupby(keys).aggregate(aggfun) -def test_groupby_level_apply(mframe): - result = mframe.groupby(level=0).count() +def test_groupby_level_apply(multiindex_dataframe_random_data): + result = multiindex_dataframe_random_data.groupby(level=0).count() assert result.index.name == "first" - result = mframe.groupby(level=1).count() + result = multiindex_dataframe_random_data.groupby(level=1).count() assert result.index.name == "second" - result = mframe["A"].groupby(level=0).count() + result = multiindex_dataframe_random_data["A"].groupby(level=0).count() assert result.index.name == "first" -def test_groupby_level_mapper(mframe): - deleveled = mframe.reset_index() +def test_groupby_level_mapper(multiindex_dataframe_random_data): + deleveled = multiindex_dataframe_random_data.reset_index() mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1} mapper1 = {"one": 0, "two": 0, "three": 1} - result0 = mframe.groupby(mapper0, level=0).sum() - result1 = mframe.groupby(mapper1, level=1).sum() + result0 = multiindex_dataframe_random_data.groupby(mapper0, level=0).sum() + result1 = multiindex_dataframe_random_data.groupby(mapper1, level=1).sum() mapped_level0 = np.array( [mapper0.get(x) for x in deleveled["first"]], dtype=np.int64 @@ -1113,8 +1147,8 @@ mapped_level1 = np.array( [mapper1.get(x) for x in deleveled["second"]], dtype=np.int64 ) - expected0 = mframe.groupby(mapped_level0).sum() - expected1 = mframe.groupby(mapped_level1).sum() + expected0 = multiindex_dataframe_random_data.groupby(mapped_level0).sum() + expected1 = multiindex_dataframe_random_data.groupby(mapped_level1).sum() expected0.index.name, expected1.index.name = "first", "second" tm.assert_frame_equal(result0, expected0) @@ -1162,7 +1196,25 @@ tm.assert_series_equal(result, expected) -def test_groupby_complex_numbers(): +def test_groupby_complex_mean(): + # GH 26475 + df = DataFrame( + [ + {"a": 2, "b": 1 + 2j}, + {"a": 1, "b": 1 + 1j}, + {"a": 1, "b": 1 + 2j}, + ] + ) + result = df.groupby("b").mean() + expected = DataFrame( + [[1.0], [1.5]], + index=Index([(1 + 1j), (1 + 2j)], name="b"), + columns=Index(["a"]), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_complex_numbers(using_infer_string): # GH 17927 df = DataFrame( [ @@ -1171,10 +1223,11 @@ {"a": 4, "b": 1}, ] ) + dtype = "string[pyarrow_numpy]" if using_infer_string else object expected = DataFrame( np.array([1, 1, 1], dtype=np.int64), index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"), - columns=Index(["a"], dtype="object"), + columns=Index(["a"], dtype=dtype), ) result = df.groupby("b", sort=False).count() tm.assert_frame_equal(result, expected) @@ -1323,11 +1376,15 @@ # inconsistent. 
return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) - metrics = df.groupby("A").apply(summarize) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - metrics = df.groupby("A").apply(summarize, "metrics") + with tm.assert_produces_warning(DeprecationWarning, match=msg): + metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - metrics = df.groupby("A").apply(summarize_random_name) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1499,7 +1556,7 @@ tm.assert_index_equal(grouped.groups[k], e) # confirm obj is not filtered - tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) + tm.assert_frame_equal(grouped._grouper.groupings[0].obj, df) assert grouped.ngroups == 2 expected = { @@ -1620,12 +1677,18 @@ {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} ) - result = df.groupby("key", group_keys=False).apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("key", group_keys=False).apply(lambda x: x) tm.assert_frame_equal(result, df) def test_skip_group_keys(): - tsf = tm.makeTimeDataFrame() + tsf = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) grouped = tsf.groupby(lambda x: x.month, group_keys=False) result = grouped.apply(lambda x: x.sort_values(by="A")[:3]) @@ -1679,14 +1742,18 @@ @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) -def test_set_group_name(df, grouper): +def test_set_group_name(df, grouper, using_infer_string): def f(group): assert group.name is not None return group def freduce(group): assert group.name is not None - return group.sum() + if using_infer_string and grouper == "A" and is_string_dtype(group.dtype): + with pytest.raises(TypeError, match="does not support"): + group.sum() + else: + return group.sum() def freducex(x): return freduce(x) @@ -1694,7 +1761,9 @@ grouped = df.groupby(grouper, group_keys=False) # make sure all these work - grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) grouped.transform(f) @@ -1715,7 +1784,9 @@ names.append(group.name) return group.copy() - df.groupby("a", sort=False, group_keys=False).apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] assert names == expected_names @@ -1921,15 +1992,17 @@ def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) - g.apply(test_sort) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + g.apply(test_sort) def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 df = DataFrame( { - "eventDate": date_range(datetime.today(), periods=20, freq="M").tolist(), - "thename": range(0, 20), + "eventDate": 
date_range(datetime.today(), periods=20, freq="ME").tolist(), + "thename": range(20), } ) @@ -1978,21 +2051,11 @@ "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] ) def test_empty_groupby( - columns, keys, values, method, op, request, using_array_manager, dropna + columns, keys, values, method, op, using_array_manager, dropna, using_infer_string ): # GH8093 & GH26411 override_dtype = None - if ( - isinstance(values, Categorical) - and len(keys) == 1 - and op in ["idxmax", "idxmin"] - ): - mark = pytest.mark.xfail( - raises=ValueError, match="attempt to get arg(min|max) of an empty sequence" - ) - request.node.add_marker(mark) - if isinstance(values, BooleanArray) and op in ["sum", "prod"]: # We expect to get Int64 back for these override_dtype = "Int64" @@ -2030,19 +2093,32 @@ # Categorical is special without 'observed=True' idx = Index(lev, name=keys[0]) - expected = DataFrame([], columns=[], index=idx) + if using_infer_string: + columns = Index([], dtype="string[pyarrow_numpy]") + else: + columns = [] + expected = DataFrame([], columns=columns, index=idx) return expected is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype) is_dt64 = df.dtypes.iloc[0].kind == "M" is_cat = isinstance(values, Categorical) - if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]: - msg = f"Cannot perform {op} with non-ordered Categorical" - with pytest.raises(TypeError, match=msg): + if ( + isinstance(values, Categorical) + and not values.ordered + and op in ["min", "max", "idxmin", "idxmax"] + ): + if op in ["min", "max"]: + msg = f"Cannot perform {op} with non-ordered Categorical" + klass = TypeError + else: + msg = f"Can't get {op} of an empty group due to unobserved categories" + klass = ValueError + with pytest.raises(klass, match=msg): get_result() - if isinstance(columns, list): + if op in ["min", "max", "idxmin", "idxmax"] and isinstance(columns, list): # i.e. 
DataframeGroupBy, not SeriesGroupBy result = get_result(numeric_only=True) expected = get_categorical_invalid_expected() @@ -2103,7 +2179,9 @@ df[3] = df[3].astype(np.int64) df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) - res = gb.apply(lambda x: x) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + res = gb.apply(lambda x: x) assert (res.dtypes == df.dtypes).all() @@ -2344,44 +2422,37 @@ args = ("ffill",) else: args = () - result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) - expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] + warn = FutureWarning if transformation_func == "fillna" else None + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) + with tm.assert_produces_warning(warn, match=warn_msg): + expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) - result = ( - df["col_3"].iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) - ) - expected = ( - df["col_3"].groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] - ) + warn_msg = "SeriesGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + result = ( + df["col_3"] + .iloc[:0] + .groupby(["col_1"]) + .transform(transformation_func, *args) + ) + warn_msg = "SeriesGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + expected = ( + df["col_3"] + .groupby(["col_1"]) + .transform(transformation_func, *args) + .iloc[:0] + ) if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) -@pytest.mark.parametrize( - "idx", - [ - Index(["a", "a"], name="foo"), - MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]), - ], -) -def test_dup_labels_output_shape(groupby_func, idx): - if groupby_func in {"size", "ngroup", "cumcount"}: - pytest.skip(f"Not applicable for {groupby_func}") - - df = DataFrame([[1, 1]], columns=idx) - grp_by = df.groupby([0]) - - args = get_groupby_method_args(groupby_func, df) - result = getattr(grp_by, groupby_func)(*args) - - assert result.shape == (1, 2) - tm.assert_index_equal(result.columns, idx) - - def test_groupby_crash_on_nunique(axis): # Fix following 30253 dti = date_range("2016-01-01", periods=2, name="foo") @@ -2523,6 +2594,8 @@ ) def test_groupby_duplicate_columns(infer_string): # GH: 31735 + if infer_string: + pytest.importorskip("pyarrow") df = DataFrame( {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]} ).astype(object) @@ -3130,7 +3203,9 @@ g_exp = df[["C"]].groupby(df["A"]) # methods which aren't just .foo() - tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0)) + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0)) msg = "DataFrameGroupBy.dtypes is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): tm.assert_frame_equal(g.dtypes, g_exp.dtypes) @@ -3144,28 +3219,32 @@ ) -def test_groupby_with_Time_Grouper(): - idx2 = [ - to_datetime("2016-08-31 22:08:12.000"), - to_datetime("2016-08-31 22:09:12.200"), - to_datetime("2016-08-31 22:20:12.400"), - ] +def 
test_groupby_with_Time_Grouper(unit): + idx2 = to_datetime( + [ + "2016-08-31 22:08:12.000", + "2016-08-31 22:09:12.200", + "2016-08-31 22:20:12.400", + ] + ).as_unit(unit) test_data = DataFrame( {"quant": [1.0, 1.0, 3.0], "quant2": [1.0, 1.0, 3.0], "time2": idx2} ) + time2 = date_range("2016-08-31 22:08:00", periods=13, freq="1min", unit=unit) expected_output = DataFrame( { - "time2": date_range("2016-08-31 22:08:00", periods=13, freq="1T"), + "time2": time2, "quant": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], "quant2": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], } ) - df = test_data.groupby(Grouper(key="time2", freq="1T")).count().reset_index() + gb = test_data.groupby(Grouper(key="time2", freq="1min")) + result = gb.count().reset_index() - tm.assert_frame_equal(df, expected_output) + tm.assert_frame_equal(result, expected_output) def test_groupby_series_with_datetimeindex_month_name(): @@ -3177,6 +3256,42 @@ tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("test_series", [True, False]) +@pytest.mark.parametrize( + "kwarg, value, name, warn", + [ + ("by", "a", 1, None), + ("by", ["a"], 1, FutureWarning), + ("by", ["a"], (1,), None), + ("level", 0, 1, None), + ("level", [0], 1, FutureWarning), + ("level", [0], (1,), None), + ], +) +def test_depr_get_group_len_1_list_likes(test_series, kwarg, value, name, warn): + # GH#25971 + obj = DataFrame({"b": [3, 4, 5]}, index=Index([1, 1, 2], name="a")) + if test_series: + obj = obj["b"] + gb = obj.groupby(**{kwarg: value}) + msg = "you will need to pass a length-1 tuple" + with tm.assert_produces_warning(warn, match=msg): + result = gb.get_group(name) + if test_series: + expected = Series([3, 4], index=Index([1, 1], name="a"), name="b") + else: + expected = DataFrame({"b": [3, 4]}, index=Index([1, 1], name="a")) + tm.assert_equal(result, expected) + + +def test_groupby_ngroup_with_nan(): + # GH#50100 + df = DataFrame({"a": Categorical([np.nan]), "b": [1]}) + result = df.groupby(["a", "b"], dropna=False, observed=False).ngroup() + expected = Series([0]) + tm.assert_series_equal(result, expected) + + def test_get_group_axis_1(): # GH#54858 df = DataFrame( @@ -3198,3 +3313,32 @@ } ) tm.assert_frame_equal(result, expected) + + +def test_groupby_ffill_with_duplicated_index(): + # GH#43412 + df = DataFrame({"a": [1, 2, 3, 4, np.nan, np.nan]}, index=[0, 1, 2, 0, 1, 2]) + + result = df.groupby(level=0).ffill() + expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2]) + tm.assert_frame_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize("test_series", [True, False]) +def test_decimal_na_sort(test_series): + # GH#54847 + # We catch both TypeError and decimal.InvalidOperation exceptions in safe_sort. 
+ # If this next assert raises, we can just catch TypeError + assert not isinstance(decimal.InvalidOperation, TypeError) + df = DataFrame( + { + "key": [Decimal(1), Decimal(1), None, None], + "value": [Decimal(2), Decimal(3), Decimal(4), Decimal(5)], + } + ) + gb = df.groupby("key", dropna=False) + if test_series: + gb = gb["value"] + result = gb._grouper.result_index + expected = Index([Decimal(1), None], name="key") + tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_groupby_dropna.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_groupby_dropna.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_groupby_dropna.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_groupby_dropna.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas.compat.pyarrow import pa_version_under7p0 +from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import na_value_for_dtype @@ -172,7 +172,7 @@ # GH 36604 df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) gb = df.groupby("A", dropna=dropna) - assert gb.grouper.dropna == dropna + assert gb._grouper.dropna == dropna @pytest.mark.parametrize( @@ -324,7 +324,9 @@ df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) - result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) @@ -416,7 +418,7 @@ pytest.param( "string[pyarrow]", marks=pytest.mark.skipif( - pa_version_under7p0, reason="pyarrow is not installed" + pa_version_under10p1, reason="pyarrow is not installed" ), ), "datetime64[ns]", @@ -501,18 +503,7 @@ @pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) -def test_categorical_reducers( - request, reduction_func, observed, sort, as_index, index_kind -): - # GH#36327 - if ( - reduction_func in ("idxmin", "idxmax") - and not observed - and index_kind != "multi" - ): - msg = "GH#10694 - idxmin/max broken for categorical with observed=False" - request.node.add_marker(pytest.mark.xfail(reason=msg)) - +def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind): # Ensure there is at least one null value by appending to the end values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None) df = pd.DataFrame( @@ -542,11 +533,22 @@ args = (args[0].drop(columns=keys),) args_filled = (args_filled[0].drop(columns=keys),) + gb_keepna = df.groupby( + keys, dropna=False, observed=observed, sort=sort, as_index=as_index + ) + + if not observed and reduction_func in ["idxmin", "idxmax"]: + with pytest.raises( + ValueError, match="empty group due to unobserved categories" + ): + getattr(gb_keepna, reduction_func)(*args) + return + gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True) expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index() - expected["x"] = expected["x"].replace(4, None) + expected["x"] = expected["x"].cat.remove_categories([4]) if index_kind == "multi": - expected["x2"] = expected["x2"].replace(4, None) + expected["x2"] = expected["x2"].cat.remove_categories([4]) if as_index: if index_kind == "multi": expected = expected.set_index(["x", 
"x2"]) @@ -562,18 +564,16 @@ values = expected["y"].values.tolist() if index_kind == "single": values = [np.nan if e == 4 else e for e in values] + expected["y"] = pd.Categorical(values, categories=[1, 2, 3]) else: values = [(np.nan, np.nan) if e == (4, 4) else e for e in values] - expected["y"] = values + expected["y"] = values if reduction_func == "size": # size, unlike other methods, has the desired behavior in GH#49519 expected = expected.rename(columns={0: "size"}) if as_index: expected = expected["size"].rename(None) - gb_keepna = df.groupby( - keys, dropna=False, observed=observed, sort=sort, as_index=as_index - ) if as_index or index_kind == "range" or reduction_func == "size": warn = None else: @@ -592,7 +592,7 @@ # GH#36327 if transformation_func == "fillna": msg = "GH#49651 fillna may incorrectly reorders results when dropna=False" - request.node.add_marker(pytest.mark.xfail(reason=msg, strict=False)) + request.applymarker(pytest.mark.xfail(reason=msg, strict=False)) values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None) df = pd.DataFrame( diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_groupby_shift_diff.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_groupby_shift_diff.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_groupby_shift_diff.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_groupby_shift_diff.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,254 +0,0 @@ -import numpy as np -import pytest - -from pandas import ( - DataFrame, - NaT, - Series, - Timedelta, - Timestamp, - date_range, -) -import pandas._testing as tm - - -def test_group_shift_with_null_key(): - # This test is designed to replicate the segfault in issue #13813. - n_rows = 1200 - - # Generate a moderately large dataframe with occasional missing - # values in column `B`, and then group by [`A`, `B`]. This should - # force `-1` in `labels` array of `g.grouper.group_info` exactly - # at those places, where the group-by key is partially missing. 
- df = DataFrame( - [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], - dtype=float, - columns=["A", "B", "Z"], - index=None, - ) - g = df.groupby(["A", "B"]) - - expected = DataFrame( - [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], - dtype=float, - columns=["Z"], - index=None, - ) - result = g.shift(-1) - - tm.assert_frame_equal(result, expected) - - -def test_group_shift_with_fill_value(): - # GH #24128 - n_rows = 24 - df = DataFrame( - [(i % 12, i % 3, i) for i in range(n_rows)], - dtype=float, - columns=["A", "B", "Z"], - index=None, - ) - g = df.groupby(["A", "B"]) - - expected = DataFrame( - [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], - dtype=float, - columns=["Z"], - index=None, - ) - result = g.shift(-1, fill_value=0) - - tm.assert_frame_equal(result, expected) - - -def test_group_shift_lose_timezone(): - # GH 30134 - now_dt = Timestamp.utcnow().as_unit("ns") - df = DataFrame({"a": [1, 1], "date": now_dt}) - result = df.groupby("a").shift(0).iloc[0] - expected = Series({"date": now_dt}, name=result.name) - tm.assert_series_equal(result, expected) - - -def test_group_diff_real_series(any_real_numpy_dtype): - df = DataFrame( - {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, - dtype=any_real_numpy_dtype, - ) - result = df.groupby("a")["b"].diff() - exp_dtype = "float" - if any_real_numpy_dtype in ["int8", "int16", "float32"]: - exp_dtype = "float32" - expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b") - tm.assert_series_equal(result, expected) - - -def test_group_diff_real_frame(any_real_numpy_dtype): - df = DataFrame( - { - "a": [1, 2, 3, 3, 2], - "b": [1, 2, 3, 4, 5], - "c": [1, 2, 3, 4, 6], - }, - dtype=any_real_numpy_dtype, - ) - result = df.groupby("a").diff() - exp_dtype = "float" - if any_real_numpy_dtype in ["int8", "int16", "float32"]: - exp_dtype = "float32" - expected = DataFrame( - { - "b": [np.nan, np.nan, np.nan, 1.0, 3.0], - "c": [np.nan, np.nan, np.nan, 1.0, 4.0], - }, - dtype=exp_dtype, - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data", - [ - [ - Timestamp("2013-01-01"), - Timestamp("2013-01-02"), - Timestamp("2013-01-03"), - ], - [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")], - ], -) -def test_group_diff_datetimelike(data): - df = DataFrame({"a": [1, 2, 2], "b": data}) - result = df.groupby("a")["b"].diff() - expected = Series([NaT, NaT, Timedelta("1 days")], name="b") - tm.assert_series_equal(result, expected) - - -def test_group_diff_bool(): - df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}) - result = df.groupby("a")["b"].diff() - expected = Series([np.nan, np.nan, np.nan, False, False], name="b") - tm.assert_series_equal(result, expected) - - -def test_group_diff_object_raises(object_dtype): - df = DataFrame( - {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype - ) - with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"): - df.groupby("a")["b"].diff() - - -def test_empty_shift_with_fill(): - # GH 41264, single-index check - df = DataFrame(columns=["a", "b", "c"]) - shifted = df.groupby(["a"]).shift(1) - shifted_with_fill = df.groupby(["a"]).shift(1, fill_value=0) - tm.assert_frame_equal(shifted, shifted_with_fill) - tm.assert_index_equal(shifted.index, shifted_with_fill.index) - - -def test_multindex_empty_shift_with_fill(): - # GH 41264, multi-index check - df = DataFrame(columns=["a", "b", "c"]) - shifted = df.groupby(["a", "b"]).shift(1) - 
shifted_with_fill = df.groupby(["a", "b"]).shift(1, fill_value=0) - tm.assert_frame_equal(shifted, shifted_with_fill) - tm.assert_index_equal(shifted.index, shifted_with_fill.index) - - -def test_shift_periods_freq(): - # GH 54093 - data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} - df = DataFrame(data, index=date_range(start="20100101", periods=6)) - result = df.groupby(df.index).shift(periods=-2, freq="D") - expected = DataFrame(data, index=date_range(start="2009-12-30", periods=6)) - tm.assert_frame_equal(result, expected) - - -def test_shift_deprecate_freq_and_fill_value(): - # GH 53832 - data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} - df = DataFrame(data, index=date_range(start="20100101", periods=6)) - msg = ( - "Passing a 'freq' together with a 'fill_value' silently ignores the fill_value" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1") - - -def test_shift_disallow_suffix_if_periods_is_int(): - # GH#44424 - data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]} - df = DataFrame(data) - msg = "Cannot specify `suffix` if `periods` is an int." - with pytest.raises(ValueError, match=msg): - df.groupby("b").shift(1, suffix="fails") - - -def test_group_shift_with_multiple_periods(): - # GH#44424 - df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}) - - shifted_df = df.groupby("b")[["a"]].shift([0, 1]) - expected_df = DataFrame( - {"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]} - ) - tm.assert_frame_equal(shifted_df, expected_df) - - # series - shifted_series = df.groupby("b")["a"].shift([0, 1]) - tm.assert_frame_equal(shifted_series, expected_df) - - -def test_group_shift_with_multiple_periods_and_freq(): - # GH#44424 - df = DataFrame( - {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, - index=date_range("1/1/2000", periods=5, freq="H"), - ) - shifted_df = df.groupby("b")[["a"]].shift( - [0, 1], - freq="H", - ) - expected_df = DataFrame( - { - "a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan], - "a_1": [ - np.nan, - 1.0, - 2.0, - 3.0, - 4.0, - 5.0, - ], - }, - index=date_range("1/1/2000", periods=6, freq="H"), - ) - tm.assert_frame_equal(shifted_df, expected_df) - - -def test_group_shift_with_multiple_periods_and_fill_value(): - # GH#44424 - df = DataFrame( - {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, - ) - shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1) - expected_df = DataFrame( - {"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]}, - ) - tm.assert_frame_equal(shifted_df, expected_df) - - -def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated(): - # GH#44424 - df = DataFrame( - {"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]}, - index=date_range("1/1/2000", periods=5, freq="H"), - ) - msg = ( - "Passing a 'freq' together with a 'fill_value' silently ignores the " - "fill_value" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="H") diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_groupby_subclass.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_groupby_subclass.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_groupby_subclass.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_groupby_subclass.py 2024-04-10 17:42:52.000000000 +0000 @@ -36,8 +36,12 @@ args = get_groupby_method_args(groupby_func, obj) - result1 = getattr(grouped, groupby_func)(*args) - 
result2 = grouped.agg(groupby_func, *args) + warn = FutureWarning if groupby_func == "fillna" else None + msg = f"{type(grouped).__name__}.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): + result1 = getattr(grouped, groupby_func)(*args) + with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): + result2 = grouped.agg(groupby_func, *args) # Reduction or transformation kernels should preserve type slices = {"ngroup", "cumcount", "size"} @@ -65,12 +69,27 @@ def func(group): assert isinstance(group, tm.SubclassedDataFrame) assert hasattr(group, "testattr") + assert group.testattr == "hello" return group.testattr - result = custom_df.groupby("c").apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning( + DeprecationWarning, + match=msg, + raise_on_extra_warnings=False, + check_stacklevel=False, + ): + result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) + result = custom_df.groupby("c").apply(func, include_groups=False) + tm.assert_series_equal(result, expected) + + # https://github.com/pandas-dev/pandas/pull/56761 + result = custom_df.groupby("c")[["a", "b"]].apply(func) + tm.assert_series_equal(result, expected) + def func2(group): assert isinstance(group, tm.SubclassedSeries) assert hasattr(group, "testattr") @@ -105,5 +124,12 @@ df = df.set_index("Date") # Confirm groupby.resample() preserves dataframe type - result = df.groupby("Buyer").resample("5D").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning( + DeprecationWarning, + match=msg, + raise_on_extra_warnings=False, + check_stacklevel=False, + ): + result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_grouping.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_grouping.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_grouping.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_grouping.py 2024-04-10 17:42:52.000000000 +0000 @@ -19,6 +19,7 @@ Series, Timestamp, date_range, + period_range, ) import pandas._testing as tm from pandas.core.groupby.grouper import Grouping @@ -131,6 +132,20 @@ tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "func", [lambda x: x.sum(), lambda x: x.agg(lambda y: y.sum())] + ) + def test_getitem_from_grouper(self, func): + # GH 50383 + df = DataFrame({"a": [1, 1, 2], "b": 3, "c": 4, "d": 5}) + gb = df.groupby(["a", "b"])[["a", "c"]] + + idx = MultiIndex.from_tuples([(1, 3), (2, 3)], names=["a", "b"]) + expected = DataFrame({"a": [2, 2], "c": [8, 4]}, index=idx) + result = func(gb) + + tm.assert_frame_equal(result, expected) + def test_indices_grouped_by_tuple_with_lambda(self): # GH 36158 df = DataFrame( @@ -160,23 +175,21 @@ @pytest.mark.parametrize( "index", [ - tm.makeFloatIndex, - tm.makeStringIndex, - tm.makeIntIndex, - tm.makeDateIndex, - tm.makePeriodIndex, + Index(list("abcde")), + Index(np.arange(5)), + Index(np.arange(5, dtype=float)), + date_range("2020-01-01", periods=5), + period_range("2020-01-01", periods=5), ], ) - @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_grouper_index_types(self, index): # related GH5375 # groupby misbehaving when using a Floatlike index - df = DataFrame(np.arange(10).reshape(5, 2), 
columns=list("AB")) + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"), index=index) - df.index = index(len(df)) df.groupby(list("abcde"), group_keys=False).apply(lambda x: x) - df.index = list(reversed(df.index.tolist())) + df.index = df.index[::-1] df.groupby(list("abcde"), group_keys=False).apply(lambda x: x) def test_grouper_multilevel_freq(self): @@ -224,11 +237,14 @@ result = g.sum() tm.assert_frame_equal(result, expected) - result = g.apply(lambda x: x.sum()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = g.apply(lambda x: x.sum()) expected["A"] = [0, 2, 4] expected = expected.loc[:, ["A", "B"]] tm.assert_frame_equal(result, expected) + def test_grouper_creation_bug2(self): # GH14334 # Grouper(key=...) may be passed in a list df = DataFrame( @@ -259,26 +275,49 @@ result = g.sum() tm.assert_frame_equal(result, expected) + def test_grouper_creation_bug3(self, unit): # GH8866 - s = Series( + dti = date_range("20130101", periods=2, unit=unit) + mi = MultiIndex.from_product( + [list("ab"), range(2), dti], + names=["one", "two", "three"], + ) + ser = Series( np.arange(8, dtype="int64"), - index=MultiIndex.from_product( - [list("ab"), range(2), date_range("20130101", periods=2)], - names=["one", "two", "three"], - ), + index=mi, ) - result = s.groupby(Grouper(level="three", freq="M")).sum() + result = ser.groupby(Grouper(level="three", freq="ME")).sum() + exp_dti = pd.DatetimeIndex( + [Timestamp("2013-01-31")], freq="ME", name="three" + ).as_unit(unit) expected = Series( [28], - index=pd.DatetimeIndex([Timestamp("2013-01-31")], freq="M", name="three"), + index=exp_dti, ) tm.assert_series_equal(result, expected) # just specifying a level breaks - result = s.groupby(Grouper(level="one")).sum() - expected = s.groupby(level="one").sum() + result = ser.groupby(Grouper(level="one")).sum() + expected = ser.groupby(level="one").sum() tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("func", [False, True]) + def test_grouper_returning_tuples(self, func): + # GH 22257 , both with dict and with callable + df = DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}) + mapping = dict(zip(range(4), [("C", 5), ("D", 6)] * 2)) + + if func: + gb = df.groupby(by=lambda idx: mapping[idx], sort=False) + else: + gb = df.groupby(by=mapping, sort=False) + + name, expected = next(iter(gb)) + assert name == ("C", 5) + result = gb.get_group(name) + + tm.assert_frame_equal(result, expected) + def test_grouper_column_and_index(self): # GH 14327 @@ -377,19 +416,25 @@ ), ) result = df.groupby( - [Grouper(level="one"), Grouper(level="two", freq="M")] + [Grouper(level="one"), Grouper(level="two", freq="ME")] ).sum() expected = DataFrame( {"A": [31, 28, 21, 31, 28, 21]}, index=MultiIndex.from_product( - [list("ab"), date_range("20130101", freq="M", periods=3)], + [list("ab"), date_range("20130101", freq="ME", periods=3)], names=["one", "two"], ), ) tm.assert_frame_equal(result, expected) def test_grouper_iter(self, df): - assert sorted(df.groupby("A").grouper) == ["bar", "foo"] + gb = df.groupby("A") + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouper = gb.grouper + result = sorted(grouper) + expected = ["bar", "foo"] + assert result == expected def test_empty_groups(self, df): # see gh-1048 @@ -398,8 +443,10 @@ def test_groupby_grouper(self, df): grouped = df.groupby("A") - - result = 
df.groupby(grouped.grouper).mean(numeric_only=True) + msg = "DataFrameGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouper = grouped.grouper + result = df.groupby(grouper).mean(numeric_only=True) expected = grouped.mean(numeric_only=True) tm.assert_frame_equal(result, expected) @@ -459,7 +506,7 @@ df = DataFrame( { "id": ["a", "b"] * 3, - "b": date_range("2000-01-01", "2000-01-03", freq="9H"), + "b": date_range("2000-01-01", "2000-01-03", freq="9h"), } ) grouper = Grouper(key="b", freq="D") @@ -494,22 +541,24 @@ result = gb.first() tm.assert_frame_equal(result, df) - def test_multiindex_negative_level(self, mframe): + def test_multiindex_negative_level(self, multiindex_dataframe_random_data): # GH 13901 - result = mframe.groupby(level=-1).sum() - expected = mframe.groupby(level="second").sum() + result = multiindex_dataframe_random_data.groupby(level=-1).sum() + expected = multiindex_dataframe_random_data.groupby(level="second").sum() tm.assert_frame_equal(result, expected) - result = mframe.groupby(level=-2).sum() - expected = mframe.groupby(level="first").sum() + result = multiindex_dataframe_random_data.groupby(level=-2).sum() + expected = multiindex_dataframe_random_data.groupby(level="first").sum() tm.assert_frame_equal(result, expected) - result = mframe.groupby(level=[-2, -1]).sum() - expected = mframe.sort_index() + result = multiindex_dataframe_random_data.groupby(level=[-2, -1]).sum() + expected = multiindex_dataframe_random_data.sort_index() tm.assert_frame_equal(result, expected) - result = mframe.groupby(level=[-1, "first"]).sum() - expected = mframe.groupby(level=["second", "first"]).sum() + result = multiindex_dataframe_random_data.groupby(level=[-1, "first"]).sum() + expected = multiindex_dataframe_random_data.groupby( + level=["second", "first"] + ).sum() tm.assert_frame_equal(result, expected) def test_multifunc_select_col_integer_cols(self, df): @@ -601,9 +650,9 @@ tm.assert_dict_equal(expected_groups, result_groups) @pytest.mark.parametrize("sort", [True, False]) - def test_groupby_level(self, sort, mframe, df): + def test_groupby_level(self, sort, multiindex_dataframe_random_data, df): # GH 17537 - frame = mframe + frame = multiindex_dataframe_random_data deleveled = frame.reset_index() result0 = frame.groupby(level=0, sort=sort).sum() @@ -684,9 +733,9 @@ expected = Series([6.0, 18.0], index=[0.0, 1.0]) tm.assert_series_equal(result, expected) - def test_groupby_args(self, mframe): + def test_groupby_args(self, multiindex_dataframe_random_data): # PR8618 and issue 8015 - frame = mframe + frame = multiindex_dataframe_random_data msg = "You have to supply one of 'by' and 'level'" with pytest.raises(TypeError, match=msg): @@ -703,22 +752,24 @@ [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]], ], ) - def test_level_preserve_order(self, sort, labels, mframe): + def test_level_preserve_order(self, sort, labels, multiindex_dataframe_random_data): # GH 17537 - grouped = mframe.groupby(level=0, sort=sort) + grouped = multiindex_dataframe_random_data.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) - tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) + tm.assert_almost_equal(grouped._grouper.codes[0], exp_labels) - def test_grouping_labels(self, mframe): - grouped = mframe.groupby(mframe.index.get_level_values(0)) + def test_grouping_labels(self, multiindex_dataframe_random_data): + grouped = multiindex_dataframe_random_data.groupby( + multiindex_dataframe_random_data.index.get_level_values(0) + ) exp_labels = 
np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) - tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) + tm.assert_almost_equal(grouped._grouper.codes[0], exp_labels) def test_list_grouper_with_nat(self): # GH 14715 df = DataFrame({"date": date_range("1/1/2011", periods=365, freq="D")}) df.iloc[-1] = pd.NaT - grouper = Grouper(key="date", freq="AS") + grouper = Grouper(key="date", freq="YS") # Grouper in a list grouping result = df.groupby([grouper]) @@ -771,19 +822,25 @@ tm.assert_series_equal(result, expected) # check group properties - assert len(gr.grouper.groupings) == 1 + assert len(gr._grouper.groupings) == 1 tm.assert_numpy_array_equal( - gr.grouper.group_info[0], np.array([], dtype=np.dtype(np.intp)) + gr._grouper.group_info[0], np.array([], dtype=np.dtype(np.intp)) ) tm.assert_numpy_array_equal( - gr.grouper.group_info[1], np.array([], dtype=np.dtype(np.intp)) + gr._grouper.group_info[1], np.array([], dtype=np.dtype(np.intp)) ) - assert gr.grouper.group_info[2] == 0 + assert gr._grouper.group_info[2] == 0 # check name - assert s.groupby(s).grouper.names == ["name"] + gb = s.groupby(s) + msg = "SeriesGroupBy.grouper is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + grouper = gb.grouper + result = grouper.names + expected = ["name"] + assert result == expected def test_groupby_level_index_value_all_na(self): # issue 20519 @@ -981,7 +1038,7 @@ grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year]) # test it works - for g in grouped.grouper.groupings[0]: + for g in grouped._grouper.groupings[0]: pass def test_multi_iter(self): @@ -1067,7 +1124,7 @@ {"event": ["start", "start"], "change": [1234, 5678]}, index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]), ) - grouped = df.groupby([Grouper(freq="M"), "event"]) + grouped = df.groupby([Grouper(freq="ME"), "event"]) assert len(grouped.groups) == 2 assert grouped.ngroups == 2 assert (Timestamp("2014-09-30"), "start") in grouped.groups @@ -1082,7 +1139,7 @@ {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]), ) - grouped = df.groupby([Grouper(freq="M"), "event"]) + grouped = df.groupby([Grouper(freq="ME"), "event"]) assert len(grouped.groups) == 2 assert grouped.ngroups == 2 assert (Timestamp("2014-09-30"), "start") in grouped.groups @@ -1098,7 +1155,7 @@ {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]), ) - grouped = df.groupby([Grouper(freq="M"), "event"]) + grouped = df.groupby([Grouper(freq="ME"), "event"]) assert len(grouped.groups) == 3 assert grouped.ngroups == 3 assert (Timestamp("2014-09-30"), "start") in grouped.groups @@ -1118,7 +1175,7 @@ df = DataFrame([[1, 2, 3]], columns=mi) gr = df.groupby(df[("A", "a")]) - result = gr.grouper.groupings[0].__repr__() + result = gr._grouper.groupings[0].__repr__() expected = "Grouping(('A', 'a'))" assert result == expected @@ -1127,8 +1184,8 @@ # GH#50413 - Groupers specified by key are in-axis df = DataFrame({"a": [1, 1, 2], "b": [1, 1, 2], "c": [3, 4, 5]}).set_index("a") gb = df.groupby([Grouper(level="a"), Grouper(key="b")], as_index=False) - assert not gb.grouper.groupings[0].in_axis - assert gb.grouper.groupings[1].in_axis + assert not gb._grouper.groupings[0].in_axis + assert gb._grouper.groupings[1].in_axis # Currently only in-axis groupings are including in the result when as_index=False; # This is likely to change in the future. 
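The surrounding hunks exercise the same handful of user-visible pandas 2.2 changes over and over: DataFrameGroupBy.apply now emits a DeprecationWarning when the grouping columns are handed to the applied function (with include_groups=False as the opt-out, see the test_groupby_subclass.py hunk above), the public GroupBy.grouper attribute is deprecated while the tests move to the private _grouper, and offset aliases such as "M" and "1T" are respelled "ME" and "1min". A minimal sketch of that behaviour, assuming pandas 2.2 and an illustrative frame that is not taken from the patch:

    import pandas as pd
    import pandas._testing as tm

    df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1, 2, 3]})
    gb = df.groupby("key")

    # apply() warns when the grouping column reaches the applied function;
    # the updated tests assert exactly this message.
    with tm.assert_produces_warning(
        DeprecationWarning,
        match="DataFrameGroupBy.apply operated on the grouping columns",
    ):
        gb.apply(lambda g: g["x"].sum())

    # Opting out keeps the grouping column out of the applied frame and
    # silences the warning.
    gb.apply(lambda g: g["x"].sum(), include_groups=False)

    # The public .grouper attribute is deprecated; the tests switch to the
    # private ._grouper and assert this FutureWarning.
    with tm.assert_produces_warning(
        FutureWarning, match="DataFrameGroupBy.grouper is deprecated"
    ):
        gb.grouper

    # Month-end and minute offset aliases are respelled ("M" -> "ME", "1T" -> "1min").
    idx = pd.date_range("2013-01-01", periods=3, freq="ME")

This sketch only restates what the updated assertions check; it is not part of the patch itself.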
@@ -1153,7 +1210,7 @@ msg = "Use GroupBy.grouper instead" with tm.assert_produces_warning(FutureWarning, match=msg): res = grper.grouper - assert res is gb.grouper + assert res is gb._grouper msg = "Grouper.obj is deprecated and will be removed" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -1167,3 +1224,13 @@ msg = "Grouper.indexer is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): grper.indexer + + +@pytest.mark.parametrize("attr", ["group_index", "result_index", "group_arraylike"]) +def test_depr_grouping_attrs(attr): + # GH#56148 + df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) + gb = df.groupby("a") + msg = f"{attr} is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + getattr(gb._grouper.groupings[0], attr) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_min_max.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_min_max.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_min_max.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_min_max.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,272 +0,0 @@ -import numpy as np -import pytest - -from pandas._libs.tslibs import iNaT - -import pandas as pd -from pandas import ( - DataFrame, - Index, - Series, -) -import pandas._testing as tm - - -def test_max_min_non_numeric(): - # #2700 - aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) - - result = aa.groupby("nn").max() - assert "ss" in result - - result = aa.groupby("nn").max(numeric_only=False) - assert "ss" in result - - result = aa.groupby("nn").min() - assert "ss" in result - - result = aa.groupby("nn").min(numeric_only=False) - assert "ss" in result - - -def test_max_min_object_multiple_columns(using_array_manager): - # GH#41111 case where the aggregation is valid for some columns but not - # others; we split object blocks column-wise, consistent with - # DataFrame._reduce - - df = DataFrame( - { - "A": [1, 1, 2, 2, 3], - "B": [1, "foo", 2, "bar", False], - "C": ["a", "b", "c", "d", "e"], - } - ) - df._consolidate_inplace() # should already be consolidate, but double-check - if not using_array_manager: - assert len(df._mgr.blocks) == 2 - - gb = df.groupby("A") - - result = gb[["C"]].max() - # "max" is valid for column "C" but not for "B" - ei = Index([1, 2, 3], name="A") - expected = DataFrame({"C": ["b", "d", "e"]}, index=ei) - tm.assert_frame_equal(result, expected) - - result = gb[["C"]].min() - # "min" is valid for column "C" but not for "B" - ei = Index([1, 2, 3], name="A") - expected = DataFrame({"C": ["a", "c", "e"]}, index=ei) - tm.assert_frame_equal(result, expected) - - -def test_min_date_with_nans(): - # GH26321 - dates = pd.to_datetime( - Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d" - ).dt.date - df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates}) - - result = df.groupby("b", as_index=False)["c"].min()["c"] - expected = pd.to_datetime( - Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d" - ).dt.date - tm.assert_series_equal(result, expected) - - result = df.groupby("b")["c"].min() - expected.index.name = "b" - tm.assert_series_equal(result, expected) - - -def test_max_inat(): - # GH#40767 dont interpret iNaT as NaN - ser = Series([1, iNaT]) - key = np.array([1, 1], dtype=np.int64) - gb = ser.groupby(key) - - result = gb.max(min_count=2) - expected = Series({1: 1}, dtype=np.int64) - tm.assert_series_equal(result, expected, check_exact=True) - - result = gb.min(min_count=2) - expected = 
Series({1: iNaT}, dtype=np.int64) - tm.assert_series_equal(result, expected, check_exact=True) - - # not enough entries -> gets masked to NaN - result = gb.min(min_count=3) - expected = Series({1: np.nan}) - tm.assert_series_equal(result, expected, check_exact=True) - - -def test_max_inat_not_all_na(): - # GH#40767 dont interpret iNaT as NaN - - # make sure we dont round iNaT+1 to iNaT - ser = Series([1, iNaT, 2, iNaT + 1]) - gb = ser.groupby([1, 2, 3, 3]) - result = gb.min(min_count=2) - - # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy - expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) - expected.index = expected.index.astype(int) - tm.assert_series_equal(result, expected, check_exact=True) - - -@pytest.mark.parametrize("func", ["min", "max"]) -def test_groupby_aggregate_period_column(func): - # GH 31471 - groups = [1, 2] - periods = pd.period_range("2020", periods=2, freq="Y") - df = DataFrame({"a": groups, "b": periods}) - - result = getattr(df.groupby("a")["b"], func)() - idx = Index([1, 2], name="a") - expected = Series(periods, index=idx, name="b") - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("func", ["min", "max"]) -def test_groupby_aggregate_period_frame(func): - # GH 31471 - groups = [1, 2] - periods = pd.period_range("2020", periods=2, freq="Y") - df = DataFrame({"a": groups, "b": periods}) - - result = getattr(df.groupby("a"), func)() - idx = Index([1, 2], name="a") - expected = DataFrame({"b": periods}, index=idx) - - tm.assert_frame_equal(result, expected) - - -def test_aggregate_numeric_object_dtype(): - # https://github.com/pandas-dev/pandas/issues/39329 - # simplified case: multiple object columns where one is all-NaN - # -> gets split as the all-NaN is inferred as float - df = DataFrame( - {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}, - ).astype(object) - result = df.groupby("key").min() - expected = ( - DataFrame( - {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}, - ) - .set_index("key") - .astype(object) - ) - tm.assert_frame_equal(result, expected) - - # same but with numbers - df = DataFrame( - {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)}, - ).astype(object) - result = df.groupby("key").min() - expected = ( - DataFrame({"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}) - .set_index("key") - .astype(object) - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("func", ["min", "max"]) -def test_aggregate_categorical_lost_index(func: str): - # GH: 28641 groupby drops index, when grouping over categorical column with min/max - ds = Series(["b"], dtype="category").cat.as_ordered() - df = DataFrame({"A": [1997], "B": ds}) - result = df.groupby("A").agg({"B": func}) - expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A")) - - # ordered categorical dtype should be preserved - expected["B"] = expected["B"].astype(ds.dtype) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"]) -def test_groupby_min_max_nullable(dtype): - if dtype == "Int64": - # GH#41743 avoid precision loss - ts = 1618556707013635762 - elif dtype == "boolean": - ts = 0 - else: - ts = 4.0 - - df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]}) - df["ts"] = df["ts"].astype(dtype) - - gb = df.groupby("id") - - result = gb.min() - expected = df.iloc[:1].set_index("id") - tm.assert_frame_equal(result, expected) - - res_max = gb.max() - expected_max = 
df.iloc[1:].set_index("id") - tm.assert_frame_equal(res_max, expected_max) - - result2 = gb.min(min_count=3) - expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype) - tm.assert_frame_equal(result2, expected2) - - res_max2 = gb.max(min_count=3) - tm.assert_frame_equal(res_max2, expected2) - - # Case with NA values - df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]}) - df2["ts"] = df2["ts"].astype(dtype) - gb2 = df2.groupby("id") - - result3 = gb2.min() - tm.assert_frame_equal(result3, expected) - - res_max3 = gb2.max() - tm.assert_frame_equal(res_max3, expected_max) - - result4 = gb2.min(min_count=100) - tm.assert_frame_equal(result4, expected2) - - res_max4 = gb2.max(min_count=100) - tm.assert_frame_equal(res_max4, expected2) - - -def test_min_max_nullable_uint64_empty_group(): - # don't raise NotImplementedError from libgroupby - cat = pd.Categorical([0] * 10, categories=[0, 1]) - df = DataFrame({"A": cat, "B": pd.array(np.arange(10, dtype=np.uint64))}) - gb = df.groupby("A", observed=False) - - res = gb.min() - - idx = pd.CategoricalIndex([0, 1], dtype=cat.dtype, name="A") - expected = DataFrame({"B": pd.array([0, pd.NA], dtype="UInt64")}, index=idx) - tm.assert_frame_equal(res, expected) - - res = gb.max() - expected.iloc[0, 0] = 9 - tm.assert_frame_equal(res, expected) - - -@pytest.mark.parametrize("func", ["first", "last", "min", "max"]) -def test_groupby_min_max_categorical(func): - # GH: 52151 - df = DataFrame( - { - "col1": pd.Categorical(["A"], categories=list("AB"), ordered=True), - "col2": pd.Categorical([1], categories=[1, 2], ordered=True), - "value": 0.1, - } - ) - result = getattr(df.groupby("col1", observed=False), func)() - - idx = pd.CategoricalIndex(data=["A", "B"], name="col1", ordered=True) - expected = DataFrame( - { - "col2": pd.Categorical([1, None], categories=[1, 2], ordered=True), - "value": [0.1, None], - }, - index=idx, - ) - tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_missing.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_missing.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_missing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_missing.py 2024-04-10 17:42:52.000000000 +0000 @@ -39,8 +39,10 @@ def test_ffill_missing_arguments(): # GH 14955 df = DataFrame({"a": [1, 2], "b": [1, 1]}) - with pytest.raises(ValueError, match="Must specify a fill"): - df.groupby("b").fillna() + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match="Must specify a fill"): + df.groupby("b").fillna() @pytest.mark.parametrize( @@ -50,7 +52,7 @@ # GH 40250 df = DataFrame({"a": pd.array([None, "a", None], dtype="string"), "b": [0, 0, 0]}) grp = df.groupby("b") - msg = "DataFrameGroupBy.fillna with 'method' is deprecated" + msg = "DataFrameGroupBy.fillna is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): result = grp.fillna(method=method) expected = DataFrame({"a": pd.array(expected, dtype="string")}) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_nth.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_nth.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_nth.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_nth.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,875 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - DataFrame, - Index, - MultiIndex, - Series, - Timestamp, - isna, 
-) -import pandas._testing as tm - - -def test_first_last_nth(df): - # tests for first / last / nth - grouped = df.groupby("A") - first = grouped.first() - expected = df.loc[[1, 0], ["B", "C", "D"]] - expected.index = Index(["bar", "foo"], name="A") - expected = expected.sort_index() - tm.assert_frame_equal(first, expected) - - nth = grouped.nth(0) - expected = df.loc[[0, 1]] - tm.assert_frame_equal(nth, expected) - - last = grouped.last() - expected = df.loc[[5, 7], ["B", "C", "D"]] - expected.index = Index(["bar", "foo"], name="A") - tm.assert_frame_equal(last, expected) - - nth = grouped.nth(-1) - expected = df.iloc[[5, 7]] - tm.assert_frame_equal(nth, expected) - - nth = grouped.nth(1) - expected = df.iloc[[2, 3]] - tm.assert_frame_equal(nth, expected) - - # it works! - grouped["B"].first() - grouped["B"].last() - grouped["B"].nth(0) - - df.loc[df["A"] == "foo", "B"] = np.nan - assert isna(grouped["B"].first()["foo"]) - assert isna(grouped["B"].last()["foo"]) - assert isna(grouped["B"].nth(0).iloc[0]) - - # v0.14.0 whatsnew - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) - g = df.groupby("A") - result = g.first() - expected = df.iloc[[1, 2]].set_index("A") - tm.assert_frame_equal(result, expected) - - expected = df.iloc[[1, 2]] - result = g.nth(0, dropna="any") - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("method", ["first", "last"]) -def test_first_last_with_na_object(method, nulls_fixture): - # https://github.com/pandas-dev/pandas/issues/32123 - groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a") - result = getattr(groups, method)() - - if method == "first": - values = [1, 3] - else: - values = [2, 3] - - values = np.array(values, dtype=result["b"].dtype) - idx = Index([1, 2], name="a") - expected = DataFrame({"b": values}, index=idx) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("index", [0, -1]) -def test_nth_with_na_object(index, nulls_fixture): - # https://github.com/pandas-dev/pandas/issues/32123 - df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}) - groups = df.groupby("a") - result = groups.nth(index) - expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]] - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("method", ["first", "last"]) -def test_first_last_with_None(method): - # https://github.com/pandas-dev/pandas/issues/32800 - # None should be preserved as object dtype - df = DataFrame.from_dict({"id": ["a"], "value": [None]}) - groups = df.groupby("id", as_index=False) - result = getattr(groups, method)() - - tm.assert_frame_equal(result, df) - - -@pytest.mark.parametrize("method", ["first", "last"]) -@pytest.mark.parametrize( - "df, expected", - [ - ( - DataFrame({"id": "a", "value": [None, "foo", np.nan]}), - DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")), - ), - ( - DataFrame({"id": "a", "value": [np.nan]}, dtype=object), - DataFrame({"value": [None]}, index=Index(["a"], name="id")), - ), - ], -) -def test_first_last_with_None_expanded(method, df, expected): - # GH 32800, 38286 - result = getattr(df.groupby("id"), method)() - tm.assert_frame_equal(result, expected) - - -def test_first_last_nth_dtypes(df_mixed_floats): - df = df_mixed_floats.copy() - df["E"] = True - df["F"] = 1 - - # tests for first / last / nth - grouped = df.groupby("A") - first = grouped.first() - expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]] - expected.index = Index(["bar", "foo"], name="A") - expected = expected.sort_index() - 
tm.assert_frame_equal(first, expected) - - last = grouped.last() - expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]] - expected.index = Index(["bar", "foo"], name="A") - expected = expected.sort_index() - tm.assert_frame_equal(last, expected) - - nth = grouped.nth(1) - expected = df.iloc[[2, 3]] - tm.assert_frame_equal(nth, expected) - - # GH 2763, first/last shifting dtypes - idx = list(range(10)) - idx.append(9) - s = Series(data=range(11), index=idx, name="IntCol") - assert s.dtype == "int64" - f = s.groupby(level=0).first() - assert f.dtype == "int64" - - -def test_first_last_nth_nan_dtype(): - # GH 33591 - df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)}) - grouped = df.groupby("data") - - expected = df.set_index("data").nans - tm.assert_series_equal(grouped.nans.first(), expected) - tm.assert_series_equal(grouped.nans.last(), expected) - - expected = df.nans - tm.assert_series_equal(grouped.nans.nth(-1), expected) - tm.assert_series_equal(grouped.nans.nth(0), expected) - - -def test_first_strings_timestamps(): - # GH 11244 - test = DataFrame( - { - Timestamp("2012-01-01 00:00:00"): ["a", "b"], - Timestamp("2012-01-02 00:00:00"): ["c", "d"], - "name": ["e", "e"], - "aaaa": ["f", "g"], - } - ) - result = test.groupby("name").first() - expected = DataFrame( - [["a", "c", "f"]], - columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]), - index=Index(["e"], name="name"), - ) - tm.assert_frame_equal(result, expected) - - -def test_nth(): - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) - g = df.groupby("A") - - tm.assert_frame_equal(g.nth(0), df.iloc[[0, 2]]) - tm.assert_frame_equal(g.nth(1), df.iloc[[1]]) - tm.assert_frame_equal(g.nth(2), df.loc[[]]) - tm.assert_frame_equal(g.nth(-1), df.iloc[[1, 2]]) - tm.assert_frame_equal(g.nth(-2), df.iloc[[0]]) - tm.assert_frame_equal(g.nth(-3), df.loc[[]]) - tm.assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]]) - tm.assert_series_equal(g.B.nth(1), df.B.iloc[[1]]) - tm.assert_frame_equal(g[["B"]].nth(0), df[["B"]].iloc[[0, 2]]) - - tm.assert_frame_equal(g.nth(0, dropna="any"), df.iloc[[1, 2]]) - tm.assert_frame_equal(g.nth(-1, dropna="any"), df.iloc[[1, 2]]) - - tm.assert_frame_equal(g.nth(7, dropna="any"), df.iloc[:0]) - tm.assert_frame_equal(g.nth(2, dropna="any"), df.iloc[:0]) - - # out of bounds, regression from 0.13.1 - # GH 6621 - df = DataFrame( - { - "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"}, - "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"}, - "two": { - 0: 1.5456590000000001, - 1: -0.070345000000000005, - 2: -2.4004539999999999, - 3: 0.46206000000000003, - 4: 0.52350799999999997, - }, - "one": { - 0: 0.56573799999999996, - 1: -0.9742360000000001, - 2: 1.033801, - 3: -0.78543499999999999, - 4: 0.70422799999999997, - }, - } - ).set_index(["color", "food"]) - - result = df.groupby(level=0, as_index=False).nth(2) - expected = df.iloc[[-1]] - tm.assert_frame_equal(result, expected) - - result = df.groupby(level=0, as_index=False).nth(3) - expected = df.loc[[]] - tm.assert_frame_equal(result, expected) - - # GH 7559 - # from the vbench - df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64") - s = df[1] - g = df[0] - expected = s.groupby(g).first() - expected2 = s.groupby(g).apply(lambda x: x.iloc[0]) - tm.assert_series_equal(expected2, expected, check_names=False) - assert expected.name == 1 - assert expected2.name == 1 - - # validate first - v = s[g == 1].iloc[0] - assert expected.iloc[0] == v - assert expected2.iloc[0] == 
v - - with pytest.raises(ValueError, match="For a DataFrame"): - s.groupby(g, sort=False).nth(0, dropna=True) - - # doc example - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) - g = df.groupby("A") - result = g.B.nth(0, dropna="all") - expected = df.B.iloc[[1, 2]] - tm.assert_series_equal(result, expected) - - # test multiple nth values - df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"]) - g = df.groupby("A") - - tm.assert_frame_equal(g.nth(0), df.iloc[[0, 3]]) - tm.assert_frame_equal(g.nth([0]), df.iloc[[0, 3]]) - tm.assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]]) - tm.assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]]) - tm.assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]]) - tm.assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]]) - tm.assert_frame_equal(g.nth([2]), df.iloc[[2]]) - tm.assert_frame_equal(g.nth([3, 4]), df.loc[[]]) - - business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B") - df = DataFrame(1, index=business_dates, columns=["a", "b"]) - # get the first, fourth and last two business days for each month - key = [df.index.year, df.index.month] - result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) - expected_dates = pd.to_datetime( - [ - "2014/4/1", - "2014/4/4", - "2014/4/29", - "2014/4/30", - "2014/5/1", - "2014/5/6", - "2014/5/29", - "2014/5/30", - "2014/6/2", - "2014/6/5", - "2014/6/27", - "2014/6/30", - ] - ) - expected = DataFrame(1, columns=["a", "b"], index=expected_dates) - tm.assert_frame_equal(result, expected) - - -def test_nth_multi_grouper(three_group): - # PR 9090, related to issue 8979 - # test nth on multiple groupers - grouped = three_group.groupby(["A", "B"]) - result = grouped.nth(0) - expected = three_group.iloc[[0, 3, 4, 7]] - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data, expected_first, expected_last", - [ - ( - { - "id": ["A"], - "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), - "foo": [1], - }, - { - "id": ["A"], - "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), - "foo": [1], - }, - { - "id": ["A"], - "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), - "foo": [1], - }, - ), - ( - { - "id": ["A", "B", "A"], - "time": [ - Timestamp("2012-01-01 13:00:00", tz="America/New_York"), - Timestamp("2012-02-01 14:00:00", tz="US/Central"), - Timestamp("2012-03-01 12:00:00", tz="Europe/London"), - ], - "foo": [1, 2, 3], - }, - { - "id": ["A", "B"], - "time": [ - Timestamp("2012-01-01 13:00:00", tz="America/New_York"), - Timestamp("2012-02-01 14:00:00", tz="US/Central"), - ], - "foo": [1, 2], - }, - { - "id": ["A", "B"], - "time": [ - Timestamp("2012-03-01 12:00:00", tz="Europe/London"), - Timestamp("2012-02-01 14:00:00", tz="US/Central"), - ], - "foo": [3, 2], - }, - ), - ], -) -def test_first_last_tz(data, expected_first, expected_last): - # GH15884 - # Test that the timezone is retained when calling first - # or last on groupby with as_index=False - - df = DataFrame(data) - - result = df.groupby("id", as_index=False).first() - expected = DataFrame(expected_first) - cols = ["id", "time", "foo"] - tm.assert_frame_equal(result[cols], expected[cols]) - - result = df.groupby("id", as_index=False)["time"].first() - tm.assert_frame_equal(result, expected[["id", "time"]]) - - result = df.groupby("id", as_index=False).last() - expected = DataFrame(expected_last) - cols = ["id", "time", "foo"] - tm.assert_frame_equal(result[cols], expected[cols]) - - result = df.groupby("id", 
as_index=False)["time"].last() - tm.assert_frame_equal(result, expected[["id", "time"]]) - - -@pytest.mark.parametrize( - "method, ts, alpha", - [ - ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"], - ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"], - ], -) -def test_first_last_tz_multi_column(method, ts, alpha): - # GH 21603 - category_string = Series(list("abc")).astype("category") - df = DataFrame( - { - "group": [1, 1, 2], - "category_string": category_string, - "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), - } - ) - result = getattr(df.groupby("group"), method)() - expected = DataFrame( - { - "category_string": pd.Categorical( - [alpha, "c"], dtype=category_string.dtype - ), - "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")], - }, - index=Index([1, 2], name="group"), - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "values", - [ - pd.array([True, False], dtype="boolean"), - pd.array([1, 2], dtype="Int64"), - pd.to_datetime(["2020-01-01", "2020-02-01"]), - pd.to_timedelta([1, 2], unit="D"), - ], -) -@pytest.mark.parametrize("function", ["first", "last", "min", "max"]) -def test_first_last_extension_array_keeps_dtype(values, function): - # https://github.com/pandas-dev/pandas/issues/33071 - # https://github.com/pandas-dev/pandas/issues/32194 - df = DataFrame({"a": [1, 2], "b": values}) - grouped = df.groupby("a") - idx = Index([1, 2], name="a") - expected_series = Series(values, name="b", index=idx) - expected_frame = DataFrame({"b": values}, index=idx) - - result_series = getattr(grouped["b"], function)() - tm.assert_series_equal(result_series, expected_series) - - result_frame = grouped.agg({"b": function}) - tm.assert_frame_equal(result_frame, expected_frame) - - -def test_nth_multi_index_as_expected(): - # PR 9090, related to issue 8979 - # test nth on MultiIndex - three_group = DataFrame( - { - "A": [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - "B": [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - "C": [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - } - ) - grouped = three_group.groupby(["A", "B"]) - result = grouped.nth(0) - expected = three_group.iloc[[0, 3, 4, 7]] - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "op, n, expected_rows", - [ - ("head", -1, [0]), - ("head", 0, []), - ("head", 1, [0, 2]), - ("head", 7, [0, 1, 2]), - ("tail", -1, [1]), - ("tail", 0, []), - ("tail", 1, [1, 2]), - ("tail", 7, [0, 1, 2]), - ], -) -@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]]) -@pytest.mark.parametrize("as_index", [True, False]) -def test_groupby_head_tail(op, n, expected_rows, columns, as_index): - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) - g = df.groupby("A", as_index=as_index) - expected = df.iloc[expected_rows] - if columns is not None: - g = g[columns] - expected = expected[columns] - result = getattr(g, op)(n) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "op, n, expected_cols", - [ - ("head", -1, [0]), - ("head", 0, []), - ("head", 1, [0, 2]), - ("head", 7, [0, 1, 2]), - ("tail", -1, [1]), - ("tail", 0, []), - ("tail", 1, [1, 2]), - ("tail", 7, [0, 1, 2]), - ], -) -def test_groupby_head_tail_axis_1(op, n, expected_cols): - # GH 9772 - df = DataFrame( - [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], 
columns=["A", "B", "C"] - ) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - g = df.groupby([0, 0, 1], axis=1) - expected = df.iloc[:, expected_cols] - result = getattr(g, op)(n) - tm.assert_frame_equal(result, expected) - - -def test_group_selection_cache(): - # GH 12839 nth, head, and tail should return same result consistently - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) - expected = df.iloc[[0, 2]] - - g = df.groupby("A") - result1 = g.head(n=2) - result2 = g.nth(0) - tm.assert_frame_equal(result1, df) - tm.assert_frame_equal(result2, expected) - - g = df.groupby("A") - result1 = g.tail(n=2) - result2 = g.nth(0) - tm.assert_frame_equal(result1, df) - tm.assert_frame_equal(result2, expected) - - g = df.groupby("A") - result1 = g.nth(0) - result2 = g.head(n=2) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, df) - - g = df.groupby("A") - result1 = g.nth(0) - result2 = g.tail(n=2) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, df) - - -def test_nth_empty(): - # GH 16064 - df = DataFrame(index=[0], columns=["a", "b", "c"]) - result = df.groupby("a").nth(10) - expected = df.iloc[:0] - tm.assert_frame_equal(result, expected) - - result = df.groupby(["a", "b"]).nth(10) - expected = df.iloc[:0] - tm.assert_frame_equal(result, expected) - - -def test_nth_column_order(): - # GH 20760 - # Check that nth preserves column order - df = DataFrame( - [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]], - columns=["A", "C", "B"], - ) - result = df.groupby("A").nth(0) - expected = df.iloc[[0, 3]] - tm.assert_frame_equal(result, expected) - - result = df.groupby("A").nth(-1, dropna="any") - expected = df.iloc[[1, 4]] - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("dropna", [None, "any", "all"]) -def test_nth_nan_in_grouper(dropna): - # GH 26011 - df = DataFrame( - { - "a": [np.nan, "a", np.nan, "b", np.nan], - "b": [0, 2, 4, 6, 8], - "c": [1, 3, 5, 7, 9], - } - ) - result = df.groupby("a").nth(0, dropna=dropna) - expected = df.iloc[[1, 3]] - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("dropna", [None, "any", "all"]) -def test_nth_nan_in_grouper_series(dropna): - # GH 26454 - df = DataFrame( - { - "a": [np.nan, "a", np.nan, "b", np.nan], - "b": [0, 2, 4, 6, 8], - } - ) - result = df.groupby("a")["b"].nth(0, dropna=dropna) - expected = df["b"].iloc[[1, 3]] - - tm.assert_series_equal(result, expected) - - -def test_first_categorical_and_datetime_data_nat(): - # GH 20520 - df = DataFrame( - { - "group": ["first", "first", "second", "third", "third"], - "time": 5 * [np.datetime64("NaT")], - "categories": Series(["a", "b", "c", "a", "b"], dtype="category"), - } - ) - result = df.groupby("group").first() - expected = DataFrame( - { - "time": 3 * [np.datetime64("NaT")], - "categories": Series(["a", "c", "a"]).astype( - pd.CategoricalDtype(["a", "b", "c"]) - ), - } - ) - expected.index = Index(["first", "second", "third"], name="group") - tm.assert_frame_equal(result, expected) - - -def test_first_multi_key_groupby_categorical(): - # GH 22512 - df = DataFrame( - { - "A": [1, 1, 1, 2, 2], - "B": [100, 100, 200, 100, 100], - "C": ["apple", "orange", "mango", "mango", "orange"], - "D": ["jupiter", "mercury", "mars", "venus", "venus"], - } - ) - df = df.astype({"D": "category"}) - result = df.groupby(by=["A", "B"]).first() - expected = DataFrame( - { - "C": ["apple", "mango", "mango"], - "D": 
Series(["jupiter", "mars", "venus"]).astype( - pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"]) - ), - } - ) - expected.index = MultiIndex.from_tuples( - [(1, 100), (1, 200), (2, 100)], names=["A", "B"] - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("method", ["first", "last", "nth"]) -def test_groupby_last_first_nth_with_none(method, nulls_fixture): - # GH29645 - expected = Series(["y"]) - data = Series( - [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture], - index=[0, 0, 0, 0, 0], - ).groupby(level=0) - - if method == "nth": - result = getattr(data, method)(3) - else: - result = getattr(data, method)() - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "arg, expected_rows", - [ - [slice(None, 3, 2), [0, 1, 4, 5]], - [slice(None, -2), [0, 2, 5]], - [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], - [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]], - ], -) -def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): - # Test slices GH #42947 - - result = slice_test_grouped.nth[arg] - equivalent = slice_test_grouped.nth(arg) - expected = slice_test_df.iloc[expected_rows] - - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(equivalent, expected) - - -def test_nth_indexed(slice_test_df, slice_test_grouped): - # Test index notation GH #44688 - - result = slice_test_grouped.nth[0, 1, -2:] - equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)]) - expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] - - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(equivalent, expected) - - -def test_invalid_argument(slice_test_grouped): - # Test for error on invalid argument - - with pytest.raises(TypeError, match="Invalid index"): - slice_test_grouped.nth(3.14) - - -def test_negative_step(slice_test_grouped): - # Test for error on negative slice step - - with pytest.raises(ValueError, match="Invalid step"): - slice_test_grouped.nth(slice(None, None, -1)) - - -def test_np_ints(slice_test_df, slice_test_grouped): - # Test np ints work - - result = slice_test_grouped.nth(np.array([0, 1])) - expected = slice_test_df.iloc[[0, 1, 2, 3, 4]] - tm.assert_frame_equal(result, expected) - - -def test_groupby_nth_with_column_axis(): - # GH43926 - df = DataFrame( - [ - [4, 5, 6], - [8, 8, 7], - ], - index=["z", "y"], - columns=["C", "B", "A"], - ) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(df.iloc[1], axis=1) - result = gb.nth(0) - expected = df.iloc[:, [0, 2]] - tm.assert_frame_equal(result, expected) - - -def test_groupby_nth_interval(): - # GH#24205 - idx_result = MultiIndex( - [ - pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]), - pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]), - ], - [[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]], - ) - df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result) - result = df_result.groupby(level=[0, 1], observed=False).nth(0) - val_expected = [0, 1, 3] - idx_expected = MultiIndex( - [ - pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]), - pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]), - ], - [[0, 0, 1], [0, 1, 0]], - ) - expected = DataFrame(val_expected, index=idx_expected, columns=["col"]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "start, stop, expected_values, expected_columns", - [ - (None, None, [0, 1, 2, 3, 4], list("ABCDE")), - (None, 1, [0, 3], list("AD")), - 
(None, 9, [0, 1, 2, 3, 4], list("ABCDE")), - (None, -1, [0, 1, 3], list("ABD")), - (1, None, [1, 2, 4], list("BCE")), - (1, -1, [1], list("B")), - (-1, None, [2, 4], list("CE")), - (-1, 2, [4], list("E")), - ], -) -@pytest.mark.parametrize("method", ["call", "index"]) -def test_nth_slices_with_column_axis( - start, stop, expected_values, expected_columns, method -): - df = DataFrame([range(5)], columns=[list("ABCDE")]) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby([5, 5, 5, 6, 6], axis=1) - result = { - "call": lambda start, stop: gb.nth(slice(start, stop)), - "index": lambda start, stop: gb.nth[start:stop], - }[method](start, stop) - expected = DataFrame([expected_values], columns=[expected_columns]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.filterwarnings( - "ignore:invalid value encountered in remainder:RuntimeWarning" -) -def test_head_tail_dropna_true(): - # GH#45089 - df = DataFrame( - [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"] - ) - expected = DataFrame([["a", "z"]], columns=["X", "Y"]) - - result = df.groupby(["X", "Y"]).head(n=1) - tm.assert_frame_equal(result, expected) - - result = df.groupby(["X", "Y"]).tail(n=1) - tm.assert_frame_equal(result, expected) - - result = df.groupby(["X", "Y"]).nth(n=0) - tm.assert_frame_equal(result, expected) - - -def test_head_tail_dropna_false(): - # GH#45089 - df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"]) - expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"]) - - result = df.groupby(["X", "Y"], dropna=False).head(n=1) - tm.assert_frame_equal(result, expected) - - result = df.groupby(["X", "Y"], dropna=False).tail(n=1) - tm.assert_frame_equal(result, expected) - - result = df.groupby(["X", "Y"], dropna=False).nth(n=0) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"])) -@pytest.mark.parametrize("dropna", ["any", "all", None]) -def test_nth_after_selection(selection, dropna): - # GH#11038, GH#53518 - df = DataFrame( - { - "a": [1, 1, 2], - "b": [np.nan, 3, 4], - "c": [5, 6, 7], - } - ) - gb = df.groupby("a")[selection] - result = gb.nth(0, dropna=dropna) - if dropna == "any" or (dropna == "all" and selection != ["b", "c"]): - locs = [1, 2] - else: - locs = [0, 2] - expected = df.loc[locs, selection] - tm.assert_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_numeric_only.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_numeric_only.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_numeric_only.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_numeric_only.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,521 @@ +import re + +import numpy as np +import pytest + +from pandas._libs import lib + +import pandas as pd +from pandas import ( + DataFrame, + Index, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm +from pandas.tests.groupby import get_groupby_method_args + + +class TestNumericOnly: + # make sure that we are passing thru kwargs to our agg functions + + @pytest.fixture + def df(self): + # GH3668 + # GH5724 + df = DataFrame( + { + "group": [1, 1, 2], + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "category_string": Series(list("abc")).astype("category"), + "category_int": [7, 8, 9], + "datetime": date_range("20130101", periods=3), + "datetimetz": date_range("20130101", 
periods=3, tz="US/Eastern"), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + }, + columns=[ + "group", + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ], + ) + return df + + @pytest.mark.parametrize("method", ["mean", "median"]) + def test_averages(self, df, method): + # mean / median + expected_columns_numeric = Index(["int", "float", "category_int"]) + + gb = df.groupby("group") + expected = DataFrame( + { + "category_int": [7.5, 9], + "float": [4.5, 6.0], + "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")], + "int": [1.5, 3], + "datetime": [ + Timestamp("2013-01-01 12:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + "datetimetz": [ + Timestamp("2013-01-01 12:00:00", tz="US/Eastern"), + Timestamp("2013-01-03 00:00:00", tz="US/Eastern"), + ], + }, + index=Index([1, 2], name="group"), + columns=[ + "int", + "float", + "category_int", + ], + ) + + result = getattr(gb, method)(numeric_only=True) + tm.assert_frame_equal(result.reindex_like(expected), expected) + + expected_columns = expected.columns + + self._check(df, method, expected_columns, expected_columns_numeric) + + @pytest.mark.parametrize("method", ["min", "max"]) + def test_extrema(self, df, method): + # TODO: min, max *should* handle + # categorical (ordered) dtype + + expected_columns = Index( + [ + "int", + "float", + "string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ] + ) + expected_columns_numeric = expected_columns + + self._check(df, method, expected_columns, expected_columns_numeric) + + @pytest.mark.parametrize("method", ["first", "last"]) + def test_first_last(self, df, method): + expected_columns = Index( + [ + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ] + ) + expected_columns_numeric = expected_columns + + self._check(df, method, expected_columns, expected_columns_numeric) + + @pytest.mark.parametrize("method", ["sum", "cumsum"]) + def test_sum_cumsum(self, df, method): + expected_columns_numeric = Index(["int", "float", "category_int"]) + expected_columns = Index( + ["int", "float", "string", "category_int", "timedelta"] + ) + if method == "cumsum": + # cumsum loses string + expected_columns = Index(["int", "float", "category_int", "timedelta"]) + + self._check(df, method, expected_columns, expected_columns_numeric) + + @pytest.mark.parametrize("method", ["prod", "cumprod"]) + def test_prod_cumprod(self, df, method): + expected_columns = Index(["int", "float", "category_int"]) + expected_columns_numeric = expected_columns + + self._check(df, method, expected_columns, expected_columns_numeric) + + @pytest.mark.parametrize("method", ["cummin", "cummax"]) + def test_cummin_cummax(self, df, method): + # like min, max, but don't include strings + expected_columns = Index( + ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"] + ) + + # GH#15561: numeric_only=False set by default like min/max + expected_columns_numeric = expected_columns + + self._check(df, method, expected_columns, expected_columns_numeric) + + def _check(self, df, method, expected_columns, expected_columns_numeric): + gb = df.groupby("group") + + # object dtypes for transformations are not implemented in Cython and + # have no Python fallback + exception = NotImplementedError if method.startswith("cum") else TypeError + + if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"): + # The methods default to numeric_only=False and raise 
TypeError + msg = "|".join( + [ + "Categorical is not ordered", + f"Cannot perform {method} with non-ordered Categorical", + re.escape(f"agg function failed [how->{method},dtype->object]"), + # cumsum/cummin/cummax/cumprod + "function is not implemented for this dtype", + ] + ) + with pytest.raises(exception, match=msg): + getattr(gb, method)() + elif method in ("sum", "mean", "median", "prod"): + msg = "|".join( + [ + "category type does not support sum operations", + re.escape(f"agg function failed [how->{method},dtype->object]"), + re.escape(f"agg function failed [how->{method},dtype->string]"), + ] + ) + with pytest.raises(exception, match=msg): + getattr(gb, method)() + else: + result = getattr(gb, method)() + tm.assert_index_equal(result.columns, expected_columns_numeric) + + if method not in ("first", "last"): + msg = "|".join( + [ + "Categorical is not ordered", + "category type does not support", + "function is not implemented for this dtype", + f"Cannot perform {method} with non-ordered Categorical", + re.escape(f"agg function failed [how->{method},dtype->object]"), + re.escape(f"agg function failed [how->{method},dtype->string]"), + ] + ) + with pytest.raises(exception, match=msg): + getattr(gb, method)(numeric_only=False) + else: + result = getattr(gb, method)(numeric_only=False) + tm.assert_index_equal(result.columns, expected_columns) + + +@pytest.mark.parametrize("numeric_only", [True, False, None]) +def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string): + if groupby_func in ("idxmax", "idxmin"): + pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1") + if groupby_func in ("corrwith", "skew"): + msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1" + request.applymarker(pytest.mark.xfail(reason=msg)) + + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] + ) + df["E"] = "x" + groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] + gb = df.groupby(groups) + method = getattr(gb, groupby_func) + args = get_groupby_method_args(groupby_func, df) + kwargs = {"axis": 1} + if numeric_only is not None: + # when numeric_only is None we don't pass any argument + kwargs["numeric_only"] = numeric_only + + # Functions without numeric_only and axis args + no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift") + # Functions with axis args + has_axis = ( + "cumprod", + "cumsum", + "diff", + "pct_change", + "rank", + "shift", + "cummax", + "cummin", + "idxmin", + "idxmax", + "fillna", + ) + warn_msg = f"DataFrameGroupBy.{groupby_func} with axis=1 is deprecated" + if numeric_only is not None and groupby_func in no_args: + msg = "got an unexpected keyword argument 'numeric_only'" + if groupby_func in ["cumprod", "cumsum"]: + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + method(*args, **kwargs) + else: + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) + elif groupby_func not in has_axis: + msg = "got an unexpected keyword argument 'axis'" + with pytest.raises(TypeError, match=msg): + method(*args, **kwargs) + # fillna and shift are successful even on object dtypes + elif (numeric_only is None or not numeric_only) and groupby_func not in ( + "fillna", + "shift", + ): + msgs = ( + # cummax, cummin, rank + "not supported between instances of", + # cumprod + "can't multiply sequence by non-int of type 'float'", + # cumsum, diff, pct_change + "unsupported operand type", + "has no kernel", + ) 
+ if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError) + else: + errs = TypeError + with pytest.raises(errs, match=f"({'|'.join(msgs)})"): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + method(*args, **kwargs) + else: + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + result = method(*args, **kwargs) + + df_expected = df.drop(columns="E").T if numeric_only else df.T + expected = getattr(df_expected, groupby_func)(*args).T + if groupby_func == "shift" and not numeric_only: + # shift with axis=1 leaves the leftmost column as numeric + # but transposing for expected gives us object dtype + expected = expected.astype(float) + + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "kernel, has_arg", + [ + ("all", False), + ("any", False), + ("bfill", False), + ("corr", True), + ("corrwith", True), + ("cov", True), + ("cummax", True), + ("cummin", True), + ("cumprod", True), + ("cumsum", True), + ("diff", False), + ("ffill", False), + ("fillna", False), + ("first", True), + ("idxmax", True), + ("idxmin", True), + ("last", True), + ("max", True), + ("mean", True), + ("median", True), + ("min", True), + ("nth", False), + ("nunique", False), + ("pct_change", False), + ("prod", True), + ("quantile", True), + ("sem", True), + ("skew", True), + ("std", True), + ("sum", True), + ("var", True), + ], +) +@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default]) +@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]]) +def test_numeric_only(kernel, has_arg, numeric_only, keys): + # GH#46072 + # drops_nuisance: Whether the op drops nuisance columns even when numeric_only=False + # has_arg: Whether the op has a numeric_only arg + df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]}) + + args = get_groupby_method_args(kernel, df) + kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only} + + gb = df.groupby(keys) + method = getattr(gb, kernel) + if has_arg and numeric_only is True: + # Cases where b does not appear in the result + result = method(*args, **kwargs) + assert "b" not in result.columns + elif ( + # kernels that work on any dtype and have numeric_only arg + kernel in ("first", "last") + or ( + # kernels that work on any dtype and don't have numeric_only arg + kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique") + and numeric_only is lib.no_default + ) + ): + warn = FutureWarning if kernel == "fillna" else None + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = method(*args, **kwargs) + assert "b" in result.columns + elif has_arg: + assert numeric_only is not True + # kernels that are successful on any dtype were above; this will fail + + # object dtypes for transformations are not implemented in Cython and + # have no Python fallback + exception = NotImplementedError if kernel.startswith("cum") else TypeError + + msg = "|".join( + [ + "not allowed for this dtype", + "cannot be performed against 'object' dtypes", + # On PY39 message is "a number"; on PY310 and after is "a real number" + "must be a string or a.* number", + "unsupported operand type", + "function is not implemented for this dtype", + re.escape(f"agg function failed [how->{kernel},dtype->object]"), + ] + ) + if kernel == "idxmin": + msg = "'<' not supported between instances of 'type' and 'type'" + elif kernel == "idxmax": + msg = "'>' not supported between instances of 'type' and 'type'" + with 
pytest.raises(exception, match=msg): + method(*args, **kwargs) + elif not has_arg and numeric_only is not lib.no_default: + with pytest.raises( + TypeError, match="got an unexpected keyword argument 'numeric_only'" + ): + method(*args, **kwargs) + else: + assert kernel in ("diff", "pct_change") + assert numeric_only is lib.no_default + # Doesn't have numeric_only argument and fails on nuisance columns + with pytest.raises(TypeError, match=r"unsupported operand type"): + method(*args, **kwargs) + + +@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") +@pytest.mark.parametrize("dtype", [bool, int, float, object]) +def test_deprecate_numeric_only_series(dtype, groupby_func, request): + # GH#46560 + grouper = [0, 0, 1] + + ser = Series([1, 0, 0], dtype=dtype) + gb = ser.groupby(grouper) + + if groupby_func == "corrwith": + # corrwith is not implemented on SeriesGroupBy + assert not hasattr(gb, groupby_func) + return + + method = getattr(gb, groupby_func) + + expected_ser = Series([1, 0, 0]) + expected_gb = expected_ser.groupby(grouper) + expected_method = getattr(expected_gb, groupby_func) + + args = get_groupby_method_args(groupby_func, ser) + + fails_on_numeric_object = ( + "corr", + "cov", + "cummax", + "cummin", + "cumprod", + "cumsum", + "quantile", + ) + # ops that give an object result on object input + obj_result = ( + "first", + "last", + "nth", + "bfill", + "ffill", + "shift", + "sum", + "diff", + "pct_change", + "var", + "mean", + "median", + "min", + "max", + "prod", + "skew", + ) + + # Test default behavior; kernels that fail may be enabled in the future but kernels + # that succeed should not be allowed to fail (without deprecation, at least) + if groupby_func in fails_on_numeric_object and dtype is object: + if groupby_func == "quantile": + msg = "cannot be performed against 'object' dtypes" + else: + msg = "is not supported for object dtype" + warn = FutureWarning if groupby_func == "fillna" else None + warn_msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + with pytest.raises(TypeError, match=msg): + method(*args) + elif dtype is object: + warn = FutureWarning if groupby_func == "fillna" else None + warn_msg = "SeriesGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=warn_msg): + result = method(*args) + with tm.assert_produces_warning(warn, match=warn_msg): + expected = expected_method(*args) + if groupby_func in obj_result: + expected = expected.astype(object) + tm.assert_series_equal(result, expected) + + has_numeric_only = ( + "first", + "last", + "max", + "mean", + "median", + "min", + "prod", + "quantile", + "sem", + "skew", + "std", + "sum", + "var", + "cummax", + "cummin", + "cumprod", + "cumsum", + ) + if groupby_func not in has_numeric_only: + msg = "got an unexpected keyword argument 'numeric_only'" + with pytest.raises(TypeError, match=msg): + method(*args, numeric_only=True) + elif dtype is object: + msg = "|".join( + [ + "SeriesGroupBy.sem called with numeric_only=True and dtype object", + "Series.skew does not allow numeric_only=True with non-numeric", + "cum(sum|prod|min|max) is not supported for object dtype", + r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric", + ] + ) + with pytest.raises(TypeError, match=msg): + method(*args, numeric_only=True) + elif dtype == bool and groupby_func == "quantile": + msg = "Allowing bool dtype in SeriesGroupBy.quantile" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#51424 + 
result = method(*args, numeric_only=True) + expected = method(*args, numeric_only=False) + tm.assert_series_equal(result, expected) + else: + result = method(*args, numeric_only=True) + expected = method(*args, numeric_only=False) + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_nunique.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_nunique.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_nunique.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_nunique.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,190 +0,0 @@ -import datetime as dt -from string import ascii_lowercase - -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - DataFrame, - MultiIndex, - NaT, - Series, - Timestamp, - date_range, -) -import pandas._testing as tm - - -@pytest.mark.slow -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("dropna", [False, True]) -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize("with_nan", [True, False]) -@pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]]) -def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys): - n = 100 - m = 10 - days = date_range("2015-08-23", periods=10) - df = DataFrame( - { - "jim": np.random.default_rng(2).choice(list(ascii_lowercase), n), - "joe": np.random.default_rng(2).choice(days, n), - "julie": np.random.default_rng(2).integers(0, m, n), - } - ) - if with_nan: - df = df.astype({"julie": float}) # Explicit cast to avoid implicit cast below - df.loc[1::17, "jim"] = None - df.loc[3::37, "joe"] = None - df.loc[7::19, "julie"] = None - df.loc[8::19, "julie"] = None - df.loc[9::19, "julie"] = None - original_df = df.copy() - gr = df.groupby(keys, as_index=as_index, sort=sort) - left = gr["julie"].nunique(dropna=dropna) - - gr = df.groupby(keys, as_index=as_index, sort=sort) - right = gr["julie"].apply(Series.nunique, dropna=dropna) - if not as_index: - right = right.reset_index(drop=True) - - if as_index: - tm.assert_series_equal(left, right, check_names=False) - else: - tm.assert_frame_equal(left, right, check_names=False) - tm.assert_frame_equal(df, original_df) - - -def test_nunique(): - df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")}) - - expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]}) - result = df.groupby("A", as_index=False).nunique() - tm.assert_frame_equal(result, expected) - - # as_index - expected.index = list("abc") - expected.index.name = "A" - expected = expected.drop(columns="A") - result = df.groupby("A").nunique() - tm.assert_frame_equal(result, expected) - - # with na - result = df.replace({"x": None}).groupby("A").nunique(dropna=False) - tm.assert_frame_equal(result, expected) - - # dropna - expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc")) - expected.index.name = "A" - result = df.replace({"x": None}).groupby("A").nunique() - tm.assert_frame_equal(result, expected) - - -def test_nunique_with_object(): - # GH 11077 - data = DataFrame( - [ - [100, 1, "Alice"], - [200, 2, "Bob"], - [300, 3, "Charlie"], - [-400, 4, "Dan"], - [500, 5, "Edith"], - ], - columns=["amount", "id", "name"], - ) - - result = data.groupby(["id", "amount"])["name"].nunique() - index = MultiIndex.from_arrays([data.id, data.amount]) - expected = Series([1] * 5, name="name", index=index) - tm.assert_series_equal(result, expected) - - -def test_nunique_with_empty_series(): - # GH 12553 - data = Series(name="name", 
dtype=object) - result = data.groupby(level=0).nunique() - expected = Series(name="name", dtype="int64") - tm.assert_series_equal(result, expected) - - -def test_nunique_with_timegrouper(): - # GH 13453 - test = DataFrame( - { - "time": [ - Timestamp("2016-06-28 09:35:35"), - Timestamp("2016-06-28 16:09:30"), - Timestamp("2016-06-28 16:46:28"), - ], - "data": ["1", "2", "3"], - } - ).set_index("time") - result = test.groupby(pd.Grouper(freq="h"))["data"].nunique() - expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(Series.nunique) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "key, data, dropna, expected", - [ - ( - ["x", "x", "x"], - [Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")], - True, - Series([1], index=pd.Index(["x"], name="key"), name="data"), - ), - ( - ["x", "x", "x"], - [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], - True, - Series([1], index=pd.Index(["x"], name="key"), name="data"), - ), - ( - ["x", "x", "x", "y", "y"], - [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], - False, - Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"), - ), - ( - ["x", "x", "x", "x", "y"], - [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], - False, - Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"), - ), - ], -) -def test_nunique_with_NaT(key, data, dropna, expected): - # GH 27951 - df = DataFrame({"key": key, "data": data}) - result = df.groupby(["key"])["data"].nunique(dropna=dropna) - tm.assert_series_equal(result, expected) - - -def test_nunique_preserves_column_level_names(): - # GH 23222 - test = DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) - result = test.groupby([0, 0, 0]).nunique() - expected = DataFrame([2], index=np.array([0]), columns=test.columns) - tm.assert_frame_equal(result, expected) - - -def test_nunique_transform_with_datetime(): - # GH 35109 - transform with nunique on datetimes results in integers - df = DataFrame(date_range("2008-12-31", "2009-01-02"), columns=["date"]) - result = df.groupby([0, 0, 1])["date"].transform("nunique") - expected = Series([2, 2, 1], name="date") - tm.assert_series_equal(result, expected) - - -def test_empty_categorical(observed): - # GH#21334 - cat = Series([1]).astype("category") - ser = cat[:0] - gb = ser.groupby(ser, observed=observed) - result = gb.nunique() - if observed: - expected = Series([], index=cat[:0], dtype="int64") - else: - expected = Series([0], index=cat, dtype="int64") - tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_quantile.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_quantile.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_quantile.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_quantile.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,503 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - DataFrame, - Index, -) -import pandas._testing as tm - - -@pytest.mark.parametrize( - "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] -) -@pytest.mark.parametrize( - "a_vals,b_vals", - [ - # Ints - ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), - ([1, 2, 3, 4], [4, 3, 2, 1]), - ([1, 2, 3, 4, 5], [4, 3, 2, 1]), - # Floats - ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]), - # Missing data - ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]), - ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]), - # 
Timestamps - ( - pd.date_range("1/1/18", freq="D", periods=5), - pd.date_range("1/1/18", freq="D", periods=5)[::-1], - ), - ( - pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"), - pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"), - ), - # All NA - ([np.nan] * 5, [np.nan] * 5), - ], -) -@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1]) -def test_quantile(interpolation, a_vals, b_vals, q, request): - if ( - interpolation == "nearest" - and q == 0.5 - and isinstance(b_vals, list) - and b_vals == [4, 3, 2, 1] - ): - request.node.add_marker( - pytest.mark.xfail( - reason="Unclear numpy expectation for nearest " - "result with equidistant data" - ) - ) - all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)]) - - a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation) - b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation) - - df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals}) - - expected = DataFrame( - [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key") - ) - if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M": - # TODO(non-nano): this should be unnecessary once array_to_datetime - # correctly infers non-nano from Timestamp.unit - expected = expected.astype(all_vals.dtype) - result = df.groupby("key").quantile(q, interpolation=interpolation) - - tm.assert_frame_equal(result, expected) - - -def test_quantile_array(): - # https://github.com/pandas-dev/pandas/issues/27526 - df = DataFrame({"A": [0, 1, 2, 3, 4]}) - key = np.array([0, 0, 1, 1, 1], dtype=np.int64) - result = df.groupby(key).quantile([0.25]) - - index = pd.MultiIndex.from_product([[0, 1], [0.25]]) - expected = DataFrame({"A": [0.25, 2.50]}, index=index) - tm.assert_frame_equal(result, expected) - - df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]}) - index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]) - - key = np.array([0, 0, 1, 1], dtype=np.int64) - result = df.groupby(key).quantile([0.25, 0.75]) - expected = DataFrame( - {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index - ) - tm.assert_frame_equal(result, expected) - - -def test_quantile_array2(): - # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959 - arr = np.random.default_rng(2).integers(0, 5, size=(10, 3), dtype=np.int64) - df = DataFrame(arr, columns=list("ABC")) - result = df.groupby("A").quantile([0.3, 0.7]) - expected = DataFrame( - { - "B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7], - "C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8], - }, - index=pd.MultiIndex.from_product( - [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None] - ), - ) - tm.assert_frame_equal(result, expected) - - -def test_quantile_array_no_sort(): - df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) - key = np.array([1, 0, 1], dtype=np.int64) - result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75]) - expected = DataFrame( - {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]}, - index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]), - ) - tm.assert_frame_equal(result, expected) - - result = df.groupby(key, sort=False).quantile([0.75, 0.25]) - expected = DataFrame( - {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]}, - index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]), - ) - tm.assert_frame_equal(result, expected) - - -def test_quantile_array_multiple_levels(): - df = DataFrame( - {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", 
"a", "a"], "d": ["a", "a", "b"]} - ) - result = df.groupby(["c", "d"]).quantile([0.25, 0.75]) - index = pd.MultiIndex.from_tuples( - [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)], - names=["c", "d", None], - ) - expected = DataFrame( - {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)]) -@pytest.mark.parametrize("groupby", [[0], [0, 1]]) -@pytest.mark.parametrize("q", [[0.5, 0.6]]) -def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): - # GH30289 - nrow, ncol = frame_size - df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol)) - - idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q] - idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [ - list(range(len(q))) * min(nrow, 4) - ] - expected_index = pd.MultiIndex( - levels=idx_levels, codes=idx_codes, names=groupby + [None] - ) - expected_values = [ - [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q - ] - expected_columns = [x for x in range(ncol) if x not in groupby] - expected = DataFrame( - expected_values, index=expected_index, columns=expected_columns - ) - result = df.groupby(groupby).quantile(q) - - tm.assert_frame_equal(result, expected) - - -def test_quantile_raises(): - df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) - - with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): - df.groupby("key").quantile() - - -def test_quantile_out_of_bounds_q_raises(): - # https://github.com/pandas-dev/pandas/issues/27470 - df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)}) - g = df.groupby([0, 0, 0, 1, 1, 1]) - with pytest.raises(ValueError, match="Got '50.0' instead"): - g.quantile(50) - - with pytest.raises(ValueError, match="Got '-1.0' instead"): - g.quantile(-1) - - -def test_quantile_missing_group_values_no_segfaults(): - # GH 28662 - data = np.array([1.0, np.nan, 1.0]) - df = DataFrame({"key": data, "val": range(3)}) - - # Random segfaults; would have been guaranteed in loop - grp = df.groupby("key") - for _ in range(100): - grp.quantile() - - -@pytest.mark.parametrize( - "key, val, expected_key, expected_val", - [ - ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]), - ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]), - (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]), - ([0], [42], [0], [42.0]), - ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")), - ], -) -def test_quantile_missing_group_values_correct_results( - key, val, expected_key, expected_val -): - # GH 28662, GH 33200, GH 33569 - df = DataFrame({"key": key, "val": val}) - - expected = DataFrame( - expected_val, index=Index(expected_key, name="key"), columns=["val"] - ) - - grp = df.groupby("key") - - result = grp.quantile(0.5) - tm.assert_frame_equal(result, expected) - - result = grp.quantile() - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "values", - [ - pd.array([1, 0, None] * 2, dtype="Int64"), - pd.array([True, False, None] * 2, dtype="boolean"), - ], -) -@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) -def test_groupby_quantile_nullable_array(values, q): - # https://github.com/pandas-dev/pandas/issues/33136 - df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values}) - result = df.groupby("a")["b"].quantile(q) - - if isinstance(q, list): - idx = 
pd.MultiIndex.from_product((["x", "y"], q), names=["a", None]) - true_quantiles = [0.0, 0.5, 1.0] - else: - idx = Index(["x", "y"], name="a") - true_quantiles = [0.5] - - expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64") - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): - df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) - if numeric_only: - result = df.groupby("a").quantile(q, numeric_only=numeric_only) - expected = df.groupby("a")[["b"]].quantile(q) - tm.assert_frame_equal(result, expected) - else: - with pytest.raises( - TypeError, match="'quantile' cannot be performed against 'object' dtypes!" - ): - df.groupby("a").quantile(q, numeric_only=numeric_only) - - -def test_groupby_quantile_NA_float(any_float_dtype): - # GH#42849 - df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) - result = df.groupby("x")["y"].quantile(0.5) - exp_index = Index([1.0], dtype=any_float_dtype, name="x") - - if any_float_dtype in ["Float32", "Float64"]: - expected_dtype = any_float_dtype - else: - expected_dtype = None - - expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y") - tm.assert_series_equal(result, expected) - - result = df.groupby("x")["y"].quantile([0.5, 0.75]) - expected = pd.Series( - [0.2] * 2, - index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]), - name="y", - dtype=expected_dtype, - ) - tm.assert_series_equal(result, expected) - - -def test_groupby_quantile_NA_int(any_int_ea_dtype): - # GH#42849 - df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype) - result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series( - [3.5], - dtype="Float64", - index=Index([1], name="x", dtype=any_int_ea_dtype), - name="y", - ) - tm.assert_series_equal(expected, result) - - result = df.groupby("x").quantile(0.5) - expected = DataFrame( - {"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype) - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)] -) -def test_groupby_quantile_all_na_group_masked( - interpolation, val1, val2, any_numeric_ea_dtype -): - # GH#37493 - df = DataFrame( - {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype - ) - result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation) - expected = DataFrame( - {"b": [val1, val2, pd.NA, pd.NA]}, - dtype=any_numeric_ea_dtype, - index=pd.MultiIndex.from_arrays( - [pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]], - names=["a", None], - ), - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("interpolation", ["midpoint", "linear"]) -def test_groupby_quantile_all_na_group_masked_interp( - interpolation, any_numeric_ea_dtype -): - # GH#37493 - df = DataFrame( - {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype - ) - result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation) - - if any_numeric_ea_dtype == "Float32": - expected_dtype = any_numeric_ea_dtype - else: - expected_dtype = "Float64" - - expected = DataFrame( - {"b": [2.0, 2.5, pd.NA, pd.NA]}, - dtype=expected_dtype, - index=pd.MultiIndex.from_arrays( - [ - pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), - [0.5, 0.75, 0.5, 0.75], - ], - names=["a", 
None], - ), - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("dtype", ["Float64", "Float32"]) -def test_groupby_quantile_allNA_column(dtype): - # GH#42849 - df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype) - result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series( - [np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y" - ) - expected.index.name = "x" - tm.assert_series_equal(expected, result) - - -def test_groupby_timedelta_quantile(): - # GH: 29485 - df = DataFrame( - {"value": pd.to_timedelta(np.arange(4), unit="s"), "group": [1, 1, 2, 2]} - ) - result = df.groupby("group").quantile(0.99) - expected = DataFrame( - { - "value": [ - pd.Timedelta("0 days 00:00:00.990000"), - pd.Timedelta("0 days 00:00:02.990000"), - ] - }, - index=Index([1, 2], name="group"), - ) - tm.assert_frame_equal(result, expected) - - -def test_columns_groupby_quantile(): - # GH 33795 - df = DataFrame( - np.arange(12).reshape(3, -1), - index=list("XYZ"), - columns=pd.Series(list("ABAB"), name="col"), - ) - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby("col", axis=1) - result = gb.quantile(q=[0.8, 0.2]) - expected = DataFrame( - [ - [1.6, 0.4, 2.6, 1.4], - [5.6, 4.4, 6.6, 5.4], - [9.6, 8.4, 10.6, 9.4], - ], - index=list("XYZ"), - columns=pd.MultiIndex.from_tuples( - [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None] - ), - ) - - tm.assert_frame_equal(result, expected) - - -def test_timestamp_groupby_quantile(): - # GH 33168 - df = DataFrame( - { - "timestamp": pd.date_range( - start="2020-04-19 00:00:00", freq="1T", periods=100, tz="UTC" - ).floor("1H"), - "category": list(range(1, 101)), - "value": list(range(101, 201)), - } - ) - - result = df.groupby("timestamp").quantile([0.2, 0.8]) - - expected = DataFrame( - [ - {"category": 12.8, "value": 112.8}, - {"category": 48.2, "value": 148.2}, - {"category": 68.8, "value": 168.8}, - {"category": 92.2, "value": 192.2}, - ], - index=pd.MultiIndex.from_tuples( - [ - (pd.Timestamp("2020-04-19 00:00:00+00:00"), 0.2), - (pd.Timestamp("2020-04-19 00:00:00+00:00"), 0.8), - (pd.Timestamp("2020-04-19 01:00:00+00:00"), 0.2), - (pd.Timestamp("2020-04-19 01:00:00+00:00"), 0.8), - ], - names=("timestamp", None), - ), - ) - - tm.assert_frame_equal(result, expected) - - -def test_groupby_quantile_dt64tz_period(): - # GH#51373 - dti = pd.date_range("2016-01-01", periods=1000) - ser = pd.Series(dti) - df = ser.to_frame() - df[1] = dti.tz_localize("US/Pacific") - df[2] = dti.to_period("D") - df[3] = dti - dti[0] - df.iloc[-1] = pd.NaT - - by = np.tile(np.arange(5), 200) - gb = df.groupby(by) - - result = gb.quantile(0.5) - - # Check that we match the group-by-group result - exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)} - expected = DataFrame(exp).T.infer_objects() - expected.index = expected.index.astype(int) - - tm.assert_frame_equal(result, expected) - - -def test_groupby_quantile_nonmulti_levels_order(): - # Non-regression test for GH #53009 - ind = pd.MultiIndex.from_tuples( - [ - (0, "a", "B"), - (0, "a", "A"), - (0, "b", "B"), - (0, "b", "A"), - (1, "a", "B"), - (1, "a", "A"), - (1, "b", "B"), - (1, "b", "A"), - ], - names=["sample", "cat0", "cat1"], - ) - ser = pd.Series(range(8), index=ind) - result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8]) - - qind = pd.MultiIndex.from_tuples( - [("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None] - ) - expected = pd.Series([1.2, 4.8, 
2.2, 5.8], index=qind) - - tm.assert_series_equal(result, expected) - - # We need to check that index levels are not sorted - expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]]) - tm.assert_equal(result.index.levels, expected_levels) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_raises.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_raises.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_raises.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_raises.py 2024-04-10 17:42:52.000000000 +0000 @@ -97,22 +97,24 @@ return df -def _call_and_check(klass, msg, how, gb, groupby_func, args): - if klass is None: - if how == "method": - getattr(gb, groupby_func)(*args) - elif how == "agg": - gb.agg(groupby_func, *args) - else: - gb.transform(groupby_func, *args) - else: - with pytest.raises(klass, match=msg): +def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): + warn_klass = None if warn_msg == "" else FutureWarning + with tm.assert_produces_warning(warn_klass, match=warn_msg): + if klass is None: if how == "method": getattr(gb, groupby_func)(*args) elif how == "agg": gb.agg(groupby_func, *args) else: gb.transform(groupby_func, *args) + else: + with pytest.raises(klass, match=msg): + if how == "method": + getattr(gb, groupby_func)(*args) + elif how == "agg": + gb.agg(groupby_func, *args) + else: + gb.transform(groupby_func, *args) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -187,11 +189,16 @@ "sum": (None, ""), "var": ( TypeError, - re.escape("agg function failed [how->var,dtype->object]"), + re.escape("agg function failed [how->var,dtype->"), ), }[groupby_func] - _call_and_check(klass, msg, how, gb, groupby_func, args) + if groupby_func == "fillna": + kind = "Series" if groupby_series else "DataFrame" + warn_msg = f"{kind}GroupBy.fillna is deprecated" + else: + warn_msg = "" + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -233,8 +240,7 @@ warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -297,13 +303,14 @@ "var": (TypeError, "datetime64 type does not support var operations"), }[groupby_func] - warn = None - warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" if groupby_func in ["any", "all"]: - warn = FutureWarning - - with tm.assert_produces_warning(warn, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func, args) + warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated" + elif groupby_func == "fillna": + kind = "Series" if groupby_series else "DataFrame" + warn_msg = f"{kind}GroupBy.fillna is deprecated" + else: + warn_msg = "" + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -342,8 +349,7 @@ warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"]) @@ -497,7 
+503,12 @@ ), }[groupby_func] - _call_and_check(klass, msg, how, gb, groupby_func, args) + if groupby_func == "fillna": + kind = "Series" if groupby_series else "DataFrame" + warn_msg = f"{kind}GroupBy.fillna is deprecated" + else: + warn_msg = "" + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) @pytest.mark.parametrize("how", ["agg", "transform"]) @@ -540,8 +551,7 @@ warn_msg = "using SeriesGroupBy.[sum|mean]" else: warn_msg = "using DataFrameGroupBy.[sum|mean]" - with tm.assert_produces_warning(FutureWarning, match=warn_msg): - _call_and_check(klass, msg, how, gb, groupby_func_np, ()) + _call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) @@ -571,7 +581,20 @@ assert not hasattr(gb, "corrwith") return - empty_groups = any(group.empty for group in gb.groups.values()) + empty_groups = not observed and any(group.empty for group in gb.groups.values()) + if ( + not observed + and how != "transform" + and isinstance(by, list) + and isinstance(by[0], str) + and by == ["a", "b"] + ): + assert not empty_groups + # TODO: empty_groups should be true due to unobserved categorical combinations + empty_groups = True + if how == "transform": + # empty groups will be ignored + empty_groups = False klass, msg = { "all": (None, ""), @@ -617,10 +640,10 @@ if not using_copy_on_write else (None, ""), # no-op with CoW "first": (None, ""), - "idxmax": (ValueError, "attempt to get argmax of an empty sequence") + "idxmax": (ValueError, "empty group due to unobserved categories") if empty_groups else (None, ""), - "idxmin": (ValueError, "attempt to get argmin of an empty sequence") + "idxmin": (ValueError, "empty group due to unobserved categories") if empty_groups else (None, ""), "last": (None, ""), @@ -675,7 +698,12 @@ ), }[groupby_func] - _call_and_check(klass, msg, how, gb, groupby_func, args) + if groupby_func == "fillna": + kind = "Series" if groupby_series else "DataFrame" + warn_msg = f"{kind}GroupBy.fillna is deprecated" + else: + warn_msg = "" + _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg) def test_subsetting_columns_axis_1_raises(): diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_rank.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_rank.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_rank.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_rank.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,712 +0,0 @@ -from datetime import datetime - -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - DataFrame, - NaT, - Series, - concat, -) -import pandas._testing as tm - - -def test_rank_unordered_categorical_typeerror(): - # GH#51034 should be TypeError, not NotImplementedError - cat = pd.Categorical([], ordered=False) - ser = Series(cat) - df = ser.to_frame() - - msg = "Cannot perform rank with non-ordered Categorical" - - gb = ser.groupby(cat, observed=False) - with pytest.raises(TypeError, match=msg): - gb.rank() - - gb2 = df.groupby(cat, observed=False) - with pytest.raises(TypeError, match=msg): - gb2.rank() - - -def test_rank_apply(): - lev1 = np.array(["a" * 10] * 100, dtype=object) - lev2 = np.array(["b" * 10] * 130, dtype=object) - lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int) - lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int) - - df = DataFrame( - { - "value": np.random.default_rng(2).standard_normal(500), - "key1": lev1.take(lab1), - "key2": lev2.take(lab2), - } 
- ) - - result = df.groupby(["key1", "key2"]).value.rank() - - expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])] - expected = concat(expected, axis=0) - expected = expected.reindex(result.index) - tm.assert_series_equal(result, expected) - - result = df.groupby(["key1", "key2"]).value.rank(pct=True) - - expected = [ - piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"]) - ] - expected = concat(expected, axis=0) - expected = expected.reindex(result.index) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) -@pytest.mark.parametrize( - "vals", - [ - np.array([2, 2, 8, 2, 6], dtype=dtype) - for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"] - ] - + [ - [ - pd.Timestamp("2018-01-02"), - pd.Timestamp("2018-01-02"), - pd.Timestamp("2018-01-08"), - pd.Timestamp("2018-01-02"), - pd.Timestamp("2018-01-06"), - ], - [ - pd.Timestamp("2018-01-02", tz="US/Pacific"), - pd.Timestamp("2018-01-02", tz="US/Pacific"), - pd.Timestamp("2018-01-08", tz="US/Pacific"), - pd.Timestamp("2018-01-02", tz="US/Pacific"), - pd.Timestamp("2018-01-06", tz="US/Pacific"), - ], - [ - pd.Timestamp("2018-01-02") - pd.Timestamp(0), - pd.Timestamp("2018-01-02") - pd.Timestamp(0), - pd.Timestamp("2018-01-08") - pd.Timestamp(0), - pd.Timestamp("2018-01-02") - pd.Timestamp(0), - pd.Timestamp("2018-01-06") - pd.Timestamp(0), - ], - [ - pd.Timestamp("2018-01-02").to_period("D"), - pd.Timestamp("2018-01-02").to_period("D"), - pd.Timestamp("2018-01-08").to_period("D"), - pd.Timestamp("2018-01-02").to_period("D"), - pd.Timestamp("2018-01-06").to_period("D"), - ], - ], - ids=lambda x: type(x[0]), -) -@pytest.mark.parametrize( - "ties_method,ascending,pct,exp", - [ - ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]), - ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), - ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]), - ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]), - ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]), - ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), - ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), - ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]), - ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]), - ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), - ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]), - ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]), - ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]), - ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), - ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]), - ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]), - ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]), - ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]), - ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), - ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]), - ], -) -def test_rank_args(grps, vals, ties_method, ascending, pct, exp): - key = np.repeat(grps, len(vals)) - - orig_vals = vals - vals = list(vals) * len(grps) - if isinstance(orig_vals, np.ndarray): - vals = np.array(vals, dtype=orig_vals.dtype) - - df = DataFrame({"key": key, "val": vals}) - result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct) - - exp_df = DataFrame(exp * len(grps), columns=["val"]) - tm.assert_frame_equal(result, exp_df) - - -@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) -@pytest.mark.parametrize( - "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]] -) 
-@pytest.mark.parametrize( - "ties_method,ascending,na_option,exp", - [ - ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), - ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]), - ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]), - ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), - ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]), - ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]), - ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]), - ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]), - ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]), - ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]), - ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]), - ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]), - ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]), - ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]), - ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]), - ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]), - ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]), - ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]), - ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]), - ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]), - ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]), - ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]), - ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]), - ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]), - ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]), - ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]), - ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]), - ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]), - ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]), - ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]), - ], -) -def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): - # GH 20561 - key = np.repeat(grps, len(vals)) - vals = vals * len(grps) - df = DataFrame({"key": key, "val": vals}) - result = df.groupby("key").rank( - method=ties_method, ascending=ascending, na_option=na_option - ) - exp_df = DataFrame(exp * len(grps), columns=["val"]) - tm.assert_frame_equal(result, exp_df) - - -@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) -@pytest.mark.parametrize( - "vals", - [ - np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype) - for dtype in ["f8", "f4", "f2"] - ] - + [ - [ - pd.Timestamp("2018-01-02"), - pd.Timestamp("2018-01-02"), - np.nan, - pd.Timestamp("2018-01-08"), - pd.Timestamp("2018-01-02"), - pd.Timestamp("2018-01-06"), - np.nan, - np.nan, - ], - [ - pd.Timestamp("2018-01-02", tz="US/Pacific"), - pd.Timestamp("2018-01-02", tz="US/Pacific"), - np.nan, - pd.Timestamp("2018-01-08", tz="US/Pacific"), - pd.Timestamp("2018-01-02", tz="US/Pacific"), - pd.Timestamp("2018-01-06", tz="US/Pacific"), - np.nan, - np.nan, - ], - [ - pd.Timestamp("2018-01-02") - pd.Timestamp(0), - pd.Timestamp("2018-01-02") - pd.Timestamp(0), - np.nan, - pd.Timestamp("2018-01-08") - pd.Timestamp(0), - pd.Timestamp("2018-01-02") - pd.Timestamp(0), - pd.Timestamp("2018-01-06") - pd.Timestamp(0), - np.nan, - np.nan, - ], - [ - pd.Timestamp("2018-01-02").to_period("D"), - pd.Timestamp("2018-01-02").to_period("D"), - np.nan, - 
pd.Timestamp("2018-01-08").to_period("D"), - pd.Timestamp("2018-01-02").to_period("D"), - pd.Timestamp("2018-01-06").to_period("D"), - np.nan, - np.nan, - ], - ], - ids=lambda x: type(x[0]), -) -@pytest.mark.parametrize( - "ties_method,ascending,na_option,pct,exp", - [ - ( - "average", - True, - "keep", - False, - [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan], - ), - ( - "average", - True, - "keep", - True, - [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan], - ), - ( - "average", - False, - "keep", - False, - [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan], - ), - ( - "average", - False, - "keep", - True, - [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan], - ), - ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]), - ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), - ( - "min", - False, - "keep", - False, - [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], - ), - ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), - ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]), - ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), - ( - "max", - False, - "keep", - False, - [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], - ), - ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]), - ( - "first", - True, - "keep", - False, - [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan], - ), - ( - "first", - True, - "keep", - True, - [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan], - ), - ( - "first", - False, - "keep", - False, - [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], - ), - ( - "first", - False, - "keep", - True, - [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan], - ), - ( - "dense", - True, - "keep", - False, - [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan], - ), - ( - "dense", - True, - "keep", - True, - [ - 1.0 / 3.0, - 1.0 / 3.0, - np.nan, - 3.0 / 3.0, - 1.0 / 3.0, - 2.0 / 3.0, - np.nan, - np.nan, - ], - ), - ( - "dense", - False, - "keep", - False, - [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], - ), - ( - "dense", - False, - "keep", - True, - [ - 3.0 / 3.0, - 3.0 / 3.0, - np.nan, - 1.0 / 3.0, - 3.0 / 3.0, - 2.0 / 3.0, - np.nan, - np.nan, - ], - ), - ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]), - ( - "average", - True, - "bottom", - True, - [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875], - ), - ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]), - ( - "average", - False, - "bottom", - True, - [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875], - ), - ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]), - ( - "min", - True, - "bottom", - True, - [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75], - ), - ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]), - ( - "min", - False, - "bottom", - True, - [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75], - ), - ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]), - ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]), - ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]), - ( - "max", - False, - "bottom", - True, - [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0], - ), - ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]), - ( - "first", - True, - "bottom", - True, - [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0], - ), - ("first", False, "bottom", False, 
[3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]), - ( - "first", - False, - "bottom", - True, - [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0], - ), - ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]), - ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]), - ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]), - ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]), - ], -) -def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp): - key = np.repeat(grps, len(vals)) - - orig_vals = vals - vals = list(vals) * len(grps) - if isinstance(orig_vals, np.ndarray): - vals = np.array(vals, dtype=orig_vals.dtype) - - df = DataFrame({"key": key, "val": vals}) - result = df.groupby("key").rank( - method=ties_method, ascending=ascending, na_option=na_option, pct=pct - ) - - exp_df = DataFrame(exp * len(grps), columns=["val"]) - tm.assert_frame_equal(result, exp_df) - - -@pytest.mark.parametrize( - "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])] -) -def test_rank_resets_each_group(pct, exp): - df = DataFrame( - {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10} - ) - result = df.groupby("key").rank(pct=pct) - exp_df = DataFrame(exp * 2, columns=["val"]) - tm.assert_frame_equal(result, exp_df) - - -@pytest.mark.parametrize( - "dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"] -) -@pytest.mark.parametrize("upper", [True, False]) -def test_rank_avg_even_vals(dtype, upper): - if upper: - # use IntegerDtype/FloatingDtype - dtype = dtype[0].upper() + dtype[1:] - dtype = dtype.replace("Ui", "UI") - df = DataFrame({"key": ["a"] * 4, "val": [1] * 4}) - df["val"] = df["val"].astype(dtype) - assert df["val"].dtype == dtype - - result = df.groupby("key").rank() - exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"]) - if upper: - exp_df = exp_df.astype("Float64") - tm.assert_frame_equal(result, exp_df) - - -@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) -@pytest.mark.parametrize("pct", [True, False]) -@pytest.mark.parametrize( - "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]] -) -def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals): - df = DataFrame({"key": ["foo"] * 5, "val": vals}) - mask = df["val"].isna() - - gb = df.groupby("key") - res = gb.rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct) - - # construct our expected by using numeric values with the same ordering - if mask.any(): - df2 = DataFrame({"key": ["foo"] * 5, "val": [0, np.nan, 2, np.nan, 1]}) - else: - df2 = DataFrame({"key": ["foo"] * 5, "val": [0, 0, 2, 0, 1]}) - - gb2 = df2.groupby("key") - alt = gb2.rank( - method=ties_method, ascending=ascending, na_option=na_option, pct=pct - ) - - tm.assert_frame_equal(res, alt) - - -@pytest.mark.parametrize("na_option", [True, "bad", 1]) -@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("pct", [True, False]) -@pytest.mark.parametrize( - "vals", - [ - ["bar", "bar", "foo", "bar", "baz"], - ["bar", np.nan, "foo", np.nan, "baz"], - [1, np.nan, 2, np.nan, 3], - ], -) -def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals): - df = 
DataFrame({"key": ["foo"] * 5, "val": vals}) - msg = "na_option must be one of 'keep', 'top', or 'bottom'" - - with pytest.raises(ValueError, match=msg): - df.groupby("key").rank( - method=ties_method, ascending=ascending, na_option=na_option, pct=pct - ) - - -def test_rank_empty_group(): - # see gh-22519 - column = "A" - df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]}) - - result = df.groupby(column).B.rank(pct=True) - expected = Series([0.5, np.nan, 1.0], name="B") - tm.assert_series_equal(result, expected) - - result = df.groupby(column).rank(pct=True) - expected = DataFrame({"B": [0.5, np.nan, 1.0]}) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "input_key,input_value,output_value", - [ - ([1, 2], [1, 1], [1.0, 1.0]), - ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]), - ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]), - ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]), - ], -) -def test_rank_zero_div(input_key, input_value, output_value): - # GH 23666 - df = DataFrame({"A": input_key, "B": input_value}) - - result = df.groupby("A").rank(method="dense", pct=True) - expected = DataFrame({"B": output_value}) - tm.assert_frame_equal(result, expected) - - -def test_rank_min_int(): - # GH-32859 - df = DataFrame( - { - "grp": [1, 1, 2], - "int_col": [ - np.iinfo(np.int64).min, - np.iinfo(np.int64).max, - np.iinfo(np.int64).min, - ], - "datetimelike": [NaT, datetime(2001, 1, 1), NaT], - } - ) - - result = df.groupby("grp").rank() - expected = DataFrame( - {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]} - ) - - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("use_nan", [True, False]) -def test_rank_pct_equal_values_on_group_transition(use_nan): - # GH#40518 - fill_value = np.nan if use_nan else 3 - df = DataFrame( - [ - [-1, 1], - [-1, 2], - [1, fill_value], - [-1, fill_value], - ], - columns=["group", "val"], - ) - result = df.groupby(["group"])["val"].rank( - method="dense", - pct=True, - ) - if use_nan: - expected = Series([0.5, 1, np.nan, np.nan], name="val") - else: - expected = Series([1 / 3, 2 / 3, 1, 1], name="val") - - tm.assert_series_equal(result, expected) - - -def test_rank_multiindex(): - # GH27721 - df = concat( - { - "a": DataFrame({"col1": [3, 4], "col2": [1, 2]}), - "b": DataFrame({"col3": [5, 6], "col4": [7, 8]}), - }, - axis=1, - ) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(level=0, axis=1) - msg = "DataFrameGroupBy.rank with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.rank(axis=1) - - expected = concat( - [ - df["a"].rank(axis=1), - df["b"].rank(axis=1), - ], - axis=1, - keys=["a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -def test_groupby_axis0_rank_axis1(): - # GH#41320 - df = DataFrame( - {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}, - index=["a", "a", "b", "b"], - ) - msg = "The 'axis' keyword in DataFrame.groupby is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(level=0, axis=0) - - msg = "DataFrameGroupBy.rank with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = gb.rank(axis=1) - - # This should match what we get when "manually" operating group-by-group - expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0) - tm.assert_frame_equal(res, expected) - - # check that we haven't accidentally written 
a case that coincidentally - # matches rank(axis=0) - msg = "The 'axis' keyword in DataFrameGroupBy.rank" - with tm.assert_produces_warning(FutureWarning, match=msg): - alt = gb.rank(axis=0) - assert not alt.equals(expected) - - -def test_groupby_axis0_cummax_axis1(): - # case where groupby axis is 0 and axis keyword in transform is 1 - - # df has mixed dtype -> multiple blocks - df = DataFrame( - {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}, - index=["a", "a", "b", "b"], - ) - msg = "The 'axis' keyword in DataFrame.groupby is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - gb = df.groupby(level=0, axis=0) - - msg = "DataFrameGroupBy.cummax with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - cmax = gb.cummax(axis=1) - expected = df[[0, 1]].astype(np.float64) - expected[2] = expected[1] - tm.assert_frame_equal(cmax, expected) - - -def test_non_unique_index(): - # GH 16577 - df = DataFrame( - {"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0}, - index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, - ) - result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True) - expected = Series( - [1.0, 1.0, 1.0, np.nan], - index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4, - name="value", - ) - tm.assert_series_equal(result, expected) - - -def test_rank_categorical(): - cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True) - cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True) - - df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2}) - - gb = df.groupby("col1") - - res = gb.rank() - - expected = df.astype(object).groupby("col1").rank() - tm.assert_frame_equal(res, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_reductions.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_reductions.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_reductions.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_reductions.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,1176 @@ +import builtins +import datetime as dt +from string import ascii_lowercase + +import numpy as np +import pytest + +from pandas._libs.tslibs import iNaT + +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.missing import na_value_for_dtype + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, + isna, +) +import pandas._testing as tm +from pandas.util import _test_decorators as td + + +@pytest.mark.parametrize("agg_func", ["any", "all"]) +@pytest.mark.parametrize( + "vals", + [ + ["foo", "bar", "baz"], + ["foo", "", ""], + ["", "", ""], + [1, 2, 3], + [1, 0, 0], + [0, 0, 0], + [1.0, 2.0, 3.0], + [1.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + [True, True, True], + [True, False, False], + [False, False, False], + [np.nan, np.nan, np.nan], + ], +) +def test_groupby_bool_aggs(skipna, agg_func, vals): + df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2}) + + # Figure out expectation using Python builtin + exp = getattr(builtins, agg_func)(vals) + + # edge case for missing data with skipna and 'any' + if skipna and all(isna(vals)) and agg_func == "any": + exp = False + + expected = DataFrame( + [exp] * 2, columns=["val"], index=pd.Index(["a", "b"], name="key") + ) + result = getattr(df.groupby("key"), agg_func)(skipna=skipna) + tm.assert_frame_equal(result, expected) + + +def test_any(): + df = DataFrame( + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], + columns=["A", "B", "C"], 
+ ) + expected = DataFrame( + [[True, True], [False, True]], columns=["B", "C"], index=[1, 3] + ) + expected.index.name = "A" + result = df.groupby("A").any() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_bool_aggs_dup_column_labels(bool_agg_func): + # GH#21668 + df = DataFrame([[True, True]], columns=["a", "a"]) + grp_by = df.groupby([0]) + result = getattr(grp_by, bool_agg_func)() + + expected = df.set_axis(np.array([0])) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +@pytest.mark.parametrize( + "data", + [ + [False, False, False], + [True, True, True], + [pd.NA, pd.NA, pd.NA], + [False, pd.NA, False], + [True, pd.NA, True], + [True, pd.NA, False], + ], +) +def test_masked_kleene_logic(bool_agg_func, skipna, data): + # GH#37506 + ser = Series(data, dtype="boolean") + + # The result should match aggregating on the whole series. Correctness + # there is verified in test_reductions.py::test_any_all_boolean_kleene_logic + expected_data = getattr(ser, bool_agg_func)(skipna=skipna) + expected = Series(expected_data, index=np.array([0]), dtype="boolean") + + result = ser.groupby([0, 0, 0]).agg(bool_agg_func, skipna=skipna) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dtype1,dtype2,exp_col1,exp_col2", + [ + ( + "float", + "Float64", + np.array([True], dtype=bool), + pd.array([pd.NA], dtype="boolean"), + ), + ( + "Int64", + "float", + pd.array([pd.NA], dtype="boolean"), + np.array([True], dtype=bool), + ), + ( + "Int64", + "Int64", + pd.array([pd.NA], dtype="boolean"), + pd.array([pd.NA], dtype="boolean"), + ), + ( + "Float64", + "boolean", + pd.array([pd.NA], dtype="boolean"), + pd.array([pd.NA], dtype="boolean"), + ), + ], +) +def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2): + # GH#37506 + data = [1.0, np.nan] + df = DataFrame( + {"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)} + ) + result = df.groupby([1, 1]).agg("all", skipna=False) + + expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=np.array([1])) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series): + # GH#40585 + obj = frame_or_series([pd.NA, 1], dtype=dtype) + expected_res = True + if not skipna and bool_agg_func == "all": + expected_res = pd.NA + expected = frame_or_series([expected_res], index=np.array([1]), dtype="boolean") + + result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "bool_agg_func,data,expected_res", + [ + ("any", [pd.NA, np.nan], False), + ("any", [pd.NA, 1, np.nan], True), + ("all", [pd.NA, pd.NaT], True), + ("all", [pd.NA, False, pd.NaT], False), + ], +) +def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_series): + # GH#37501 + obj = frame_or_series(data, dtype=object) + result = obj.groupby([1] * len(data)).agg(bool_agg_func) + expected = frame_or_series([expected_res], index=np.array([1]), dtype="bool") + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_object_NA_raises_with_skipna_false(bool_agg_func): + # GH#37501 + ser = Series([pd.NA], dtype=object) + with pytest.raises(TypeError, match="boolean value of NA is ambiguous"): + 
ser.groupby([1]).agg(bool_agg_func, skipna=False) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_empty(frame_or_series, bool_agg_func): + # GH 45231 + kwargs = {"columns": ["a"]} if frame_or_series is DataFrame else {"name": "a"} + obj = frame_or_series(**kwargs, dtype=object) + result = getattr(obj.groupby(obj.index), bool_agg_func)() + expected = frame_or_series(**kwargs, dtype=bool) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype): + # GH#57040 + if any_real_numpy_dtype is int or any_real_numpy_dtype is float: + # No need to test + return + info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo + min_value = info(any_real_numpy_dtype).min + max_value = info(any_real_numpy_dtype).max + df = DataFrame( + {"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]}, + dtype=any_real_numpy_dtype, + ) + gb = df.groupby("a") + result = getattr(gb, how)() + expected = DataFrame( + {"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype) + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): + # GH#57040 + min_value = np.finfo(float_numpy_dtype).min + max_value = np.finfo(float_numpy_dtype).max + df = DataFrame( + { + "a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"), + "b": Series( + [ + np.nan, + min_value, + np.nan, + max_value, + min_value, + np.nan, + max_value, + np.nan, + np.nan, + np.nan, + ], + dtype=float_numpy_dtype, + ), + }, + ) + gb = df.groupby("a") + + warn = None if skipna else FutureWarning + msg = f"The behavior of DataFrameGroupBy.{how} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, how)(skipna=skipna) + if skipna: + values = [1, 3, 4, 6, np.nan] + else: + values = np.nan + expected = DataFrame( + {"b": values}, index=pd.Index(range(1, 6), name="a", dtype="intp") + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "func, values", + [ + ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}), + ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}), + ], +) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_idxmin_idxmax_returns_int_types(func, values, numeric_only): + # GH 25444 + df = DataFrame( + { + "name": ["A", "A", "B", "B"], + "c_int": [1, 2, 3, 4], + "c_float": [4.02, 3.03, 2.04, 1.05], + "c_date": ["2019", "2018", "2016", "2017"], + } + ) + df["c_date"] = pd.to_datetime(df["c_date"]) + df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific") + df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0] + df["c_period"] = df["c_date"].dt.to_period("W") + df["c_Integer"] = df["c_int"].astype("Int64") + df["c_Floating"] = df["c_float"].astype("Float64") + + result = getattr(df.groupby("name"), func)(numeric_only=numeric_only) + + expected = DataFrame(values, index=pd.Index(["A", "B"], name="name")) + if numeric_only: + expected = expected.drop(columns=["c_date"]) + else: + expected["c_date_tz"] = expected["c_date"] + expected["c_timedelta"] = expected["c_date"] + expected["c_period"] = expected["c_date"] + expected["c_Integer"] = expected["c_int"] + expected["c_Floating"] = expected["c_float"] + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + ( + Timestamp("2011-01-15 12:50:28.502376"), + Timestamp("2011-01-20 
12:50:28.593448"), + ), + (24650000000000001, 24650000000000002), + ], +) +@pytest.mark.parametrize("method", ["count", "min", "max", "first", "last"]) +def test_groupby_non_arithmetic_agg_int_like_precision(method, data): + # GH#6620, GH#9311 + df = DataFrame({"a": [1, 1], "b": data}) + + grouped = df.groupby("a") + result = getattr(grouped, method)() + if method == "count": + expected_value = 2 + elif method == "first": + expected_value = data[0] + elif method == "last": + expected_value = data[1] + else: + expected_value = getattr(df["b"], method)() + expected = DataFrame({"b": [expected_value]}, index=pd.Index([1], name="a")) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): + # GH#57019 + na_value = na_value_for_dtype(pandas_dtype(any_real_nullable_dtype)) + df = DataFrame( + { + "a": [2, 1, 1, 2, 3, 3], + "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + }, + dtype=any_real_nullable_dtype, + ) + gb = df.groupby("a", sort=sort) + method = getattr(gb, how) + result = method(skipna=skipna) + + ilocs = { + ("first", True): [3, 1, 4], + ("first", False): [0, 1, 4], + ("last", True): [3, 1, 5], + ("last", False): [3, 2, 5], + }[how, skipna] + expected = df.iloc[ilocs].set_index("a") + if sort: + expected = expected.sort_index() + tm.assert_frame_equal(result, expected) + + +def test_idxmin_idxmax_axis1(): + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] + ) + df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4] + + gb = df.groupby("A") + + warn_msg = "DataFrameGroupBy.idxmax with axis=1 is deprecated" + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + res = gb.idxmax(axis=1) + + alt = df.iloc[:, 1:].idxmax(axis=1) + indexer = res.index.get_level_values(1) + + tm.assert_series_equal(alt[indexer], res.droplevel("A")) + + df["E"] = date_range("2016-01-01", periods=10) + gb2 = df.groupby("A") + + msg = "'>' not supported between instances of 'Timestamp' and 'float'" + with pytest.raises(TypeError, match=msg): + with tm.assert_produces_warning(FutureWarning, match=warn_msg): + gb2.idxmax(axis=1) + + +def test_groupby_mean_no_overflow(): + # Regression test for (#22487) + df = DataFrame( + { + "user": ["A", "A", "A", "A", "A"], + "connections": [4970, 4749, 4719, 4704, 18446744073699999744], + } + ) + assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 + + +def test_mean_on_timedelta(): + # GH 17382 + df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5}) + result = df.groupby("cat")["time"].mean() + expected = Series( + pd.to_timedelta([4, 5]), name="time", index=pd.Index(["A", "B"], name="cat") + ) + tm.assert_series_equal(result, expected) + + +def test_cython_median(): + arr = np.random.default_rng(2).standard_normal(1000) + arr[::2] = np.nan + df = DataFrame(arr) + + labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) + labels[::17] = np.nan + + result = df.groupby(labels).median() + msg = "using DataFrameGroupBy.median" + with tm.assert_produces_warning(FutureWarning, match=msg): + exp = df.groupby(labels).agg(np.nanmedian) + tm.assert_frame_equal(result, exp) + + df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5))) + msg = "using DataFrameGroupBy.median" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = df.groupby(labels).agg(np.median) + xp = 
df.groupby(labels).median() + tm.assert_frame_equal(rs, xp) + + +def test_median_empty_bins(observed): + df = DataFrame(np.random.default_rng(2).integers(0, 44, 500)) + + grps = range(0, 55, 5) + bins = pd.cut(df[0], grps) + + result = df.groupby(bins, observed=observed).median() + expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) + tm.assert_frame_equal(result, expected) + + +def test_max_min_non_numeric(): + # #2700 + aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) + + result = aa.groupby("nn").max() + assert "ss" in result + + result = aa.groupby("nn").max(numeric_only=False) + assert "ss" in result + + result = aa.groupby("nn").min() + assert "ss" in result + + result = aa.groupby("nn").min(numeric_only=False) + assert "ss" in result + + +def test_max_min_object_multiple_columns(using_array_manager): + # GH#41111 case where the aggregation is valid for some columns but not + # others; we split object blocks column-wise, consistent with + # DataFrame._reduce + + df = DataFrame( + { + "A": [1, 1, 2, 2, 3], + "B": [1, "foo", 2, "bar", False], + "C": ["a", "b", "c", "d", "e"], + } + ) + df._consolidate_inplace() # should already be consolidate, but double-check + if not using_array_manager: + assert len(df._mgr.blocks) == 2 + + gb = df.groupby("A") + + result = gb[["C"]].max() + # "max" is valid for column "C" but not for "B" + ei = pd.Index([1, 2, 3], name="A") + expected = DataFrame({"C": ["b", "d", "e"]}, index=ei) + tm.assert_frame_equal(result, expected) + + result = gb[["C"]].min() + # "min" is valid for column "C" but not for "B" + ei = pd.Index([1, 2, 3], name="A") + expected = DataFrame({"C": ["a", "c", "e"]}, index=ei) + tm.assert_frame_equal(result, expected) + + +def test_min_date_with_nans(): + # GH26321 + dates = pd.to_datetime( + Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d" + ).dt.date + df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates}) + + result = df.groupby("b", as_index=False)["c"].min()["c"] + expected = pd.to_datetime( + Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d" + ).dt.date + tm.assert_series_equal(result, expected) + + result = df.groupby("b")["c"].min() + expected.index.name = "b" + tm.assert_series_equal(result, expected) + + +def test_max_inat(): + # GH#40767 dont interpret iNaT as NaN + ser = Series([1, iNaT]) + key = np.array([1, 1], dtype=np.int64) + gb = ser.groupby(key) + + result = gb.max(min_count=2) + expected = Series({1: 1}, dtype=np.int64) + tm.assert_series_equal(result, expected, check_exact=True) + + result = gb.min(min_count=2) + expected = Series({1: iNaT}, dtype=np.int64) + tm.assert_series_equal(result, expected, check_exact=True) + + # not enough entries -> gets masked to NaN + result = gb.min(min_count=3) + expected = Series({1: np.nan}) + tm.assert_series_equal(result, expected, check_exact=True) + + +def test_max_inat_not_all_na(): + # GH#40767 dont interpret iNaT as NaN + + # make sure we dont round iNaT+1 to iNaT + ser = Series([1, iNaT, 2, iNaT + 1]) + gb = ser.groupby([1, 2, 3, 3]) + result = gb.min(min_count=2) + + # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. 
is lossy + expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1}) + expected.index = expected.index.astype(int) + tm.assert_series_equal(result, expected, check_exact=True) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_groupby_aggregate_period_column(func): + # GH 31471 + groups = [1, 2] + periods = pd.period_range("2020", periods=2, freq="Y") + df = DataFrame({"a": groups, "b": periods}) + + result = getattr(df.groupby("a")["b"], func)() + idx = pd.Index([1, 2], name="a") + expected = Series(periods, index=idx, name="b") + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_groupby_aggregate_period_frame(func): + # GH 31471 + groups = [1, 2] + periods = pd.period_range("2020", periods=2, freq="Y") + df = DataFrame({"a": groups, "b": periods}) + + result = getattr(df.groupby("a"), func)() + idx = pd.Index([1, 2], name="a") + expected = DataFrame({"b": periods}, index=idx) + + tm.assert_frame_equal(result, expected) + + +def test_aggregate_numeric_object_dtype(): + # https://github.com/pandas-dev/pandas/issues/39329 + # simplified case: multiple object columns where one is all-NaN + # -> gets split as the all-NaN is inferred as float + df = DataFrame( + {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4}, + ).astype(object) + result = df.groupby("key").min() + expected = ( + DataFrame( + {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}, + ) + .set_index("key") + .astype(object) + ) + tm.assert_frame_equal(result, expected) + + # same but with numbers + df = DataFrame( + {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)}, + ).astype(object) + result = df.groupby("key").min() + expected = ( + DataFrame({"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}) + .set_index("key") + .astype(object) + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_aggregate_categorical_lost_index(func: str): + # GH: 28641 groupby drops index, when grouping over categorical column with min/max + ds = Series(["b"], dtype="category").cat.as_ordered() + df = DataFrame({"A": [1997], "B": ds}) + result = df.groupby("A").agg({"B": func}) + expected = DataFrame({"B": ["b"]}, index=pd.Index([1997], name="A")) + + # ordered categorical dtype should be preserved + expected["B"] = expected["B"].astype(ds.dtype) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"]) +def test_groupby_min_max_nullable(dtype): + if dtype == "Int64": + # GH#41743 avoid precision loss + ts = 1618556707013635762 + elif dtype == "boolean": + ts = 0 + else: + ts = 4.0 + + df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]}) + df["ts"] = df["ts"].astype(dtype) + + gb = df.groupby("id") + + result = gb.min() + expected = df.iloc[:1].set_index("id") + tm.assert_frame_equal(result, expected) + + res_max = gb.max() + expected_max = df.iloc[1:].set_index("id") + tm.assert_frame_equal(res_max, expected_max) + + result2 = gb.min(min_count=3) + expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype) + tm.assert_frame_equal(result2, expected2) + + res_max2 = gb.max(min_count=3) + tm.assert_frame_equal(res_max2, expected2) + + # Case with NA values + df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]}) + df2["ts"] = df2["ts"].astype(dtype) + gb2 = df2.groupby("id") + + result3 = gb2.min() + tm.assert_frame_equal(result3, expected) + + res_max3 = gb2.max() + 
tm.assert_frame_equal(res_max3, expected_max) + + result4 = gb2.min(min_count=100) + tm.assert_frame_equal(result4, expected2) + + res_max4 = gb2.max(min_count=100) + tm.assert_frame_equal(res_max4, expected2) + + +def test_min_max_nullable_uint64_empty_group(): + # don't raise NotImplementedError from libgroupby + cat = pd.Categorical([0] * 10, categories=[0, 1]) + df = DataFrame({"A": cat, "B": pd.array(np.arange(10, dtype=np.uint64))}) + gb = df.groupby("A", observed=False) + + res = gb.min() + + idx = pd.CategoricalIndex([0, 1], dtype=cat.dtype, name="A") + expected = DataFrame({"B": pd.array([0, pd.NA], dtype="UInt64")}, index=idx) + tm.assert_frame_equal(res, expected) + + res = gb.max() + expected.iloc[0, 0] = 9 + tm.assert_frame_equal(res, expected) + + +@pytest.mark.parametrize("func", ["first", "last", "min", "max"]) +def test_groupby_min_max_categorical(func): + # GH: 52151 + df = DataFrame( + { + "col1": pd.Categorical(["A"], categories=list("AB"), ordered=True), + "col2": pd.Categorical([1], categories=[1, 2], ordered=True), + "value": 0.1, + } + ) + result = getattr(df.groupby("col1", observed=False), func)() + + idx = pd.CategoricalIndex(data=["A", "B"], name="col1", ordered=True) + expected = DataFrame( + { + "col2": pd.Categorical([1, None], categories=[1, 2], ordered=True), + "value": [0.1, None], + }, + index=idx, + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_min_empty_string_dtype(func): + # GH#55619 + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] + result = getattr(df.groupby("a"), func)() + expected = DataFrame( + columns=["b", "c"], dtype=dtype, index=pd.Index([], dtype=dtype, name="a") + ) + tm.assert_frame_equal(result, expected) + + +def test_max_nan_bug(): + df = DataFrame( + { + "Unnamed: 0": ["-04-23", "-05-06", "-05-07"], + "Date": [ + "2013-04-23 00:00:00", + "2013-05-06 00:00:00", + "2013-05-07 00:00:00", + ], + "app": Series([np.nan, np.nan, "OE"]), + "File": ["log080001.log", "log.log", "xlsx"], + } + ) + gb = df.groupby("Date") + r = gb[["File"]].max() + e = gb["File"].max().to_frame() + tm.assert_frame_equal(r, e) + assert not r["File"].isna().any() + + +@pytest.mark.slow +@pytest.mark.parametrize("sort", [False, True]) +@pytest.mark.parametrize("dropna", [False, True]) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("with_nan", [True, False]) +@pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]]) +def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys): + n = 100 + m = 10 + days = date_range("2015-08-23", periods=10) + df = DataFrame( + { + "jim": np.random.default_rng(2).choice(list(ascii_lowercase), n), + "joe": np.random.default_rng(2).choice(days, n), + "julie": np.random.default_rng(2).integers(0, m, n), + } + ) + if with_nan: + df = df.astype({"julie": float}) # Explicit cast to avoid implicit cast below + df.loc[1::17, "jim"] = None + df.loc[3::37, "joe"] = None + df.loc[7::19, "julie"] = None + df.loc[8::19, "julie"] = None + df.loc[9::19, "julie"] = None + original_df = df.copy() + gr = df.groupby(keys, as_index=as_index, sort=sort) + left = gr["julie"].nunique(dropna=dropna) + + gr = df.groupby(keys, as_index=as_index, sort=sort) + right = gr["julie"].apply(Series.nunique, dropna=dropna) + if not as_index: + right = right.reset_index(drop=True) + + if as_index: + tm.assert_series_equal(left, right, check_names=False) + else: + 
tm.assert_frame_equal(left, right, check_names=False) + tm.assert_frame_equal(df, original_df) + + +def test_nunique(): + df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")}) + + expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]}) + result = df.groupby("A", as_index=False).nunique() + tm.assert_frame_equal(result, expected) + + # as_index + expected.index = list("abc") + expected.index.name = "A" + expected = expected.drop(columns="A") + result = df.groupby("A").nunique() + tm.assert_frame_equal(result, expected) + + # with na + result = df.replace({"x": None}).groupby("A").nunique(dropna=False) + tm.assert_frame_equal(result, expected) + + # dropna + expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc")) + expected.index.name = "A" + result = df.replace({"x": None}).groupby("A").nunique() + tm.assert_frame_equal(result, expected) + + +def test_nunique_with_object(): + # GH 11077 + data = DataFrame( + [ + [100, 1, "Alice"], + [200, 2, "Bob"], + [300, 3, "Charlie"], + [-400, 4, "Dan"], + [500, 5, "Edith"], + ], + columns=["amount", "id", "name"], + ) + + result = data.groupby(["id", "amount"])["name"].nunique() + index = MultiIndex.from_arrays([data.id, data.amount]) + expected = Series([1] * 5, name="name", index=index) + tm.assert_series_equal(result, expected) + + +def test_nunique_with_empty_series(): + # GH 12553 + data = Series(name="name", dtype=object) + result = data.groupby(level=0).nunique() + expected = Series(name="name", dtype="int64") + tm.assert_series_equal(result, expected) + + +def test_nunique_with_timegrouper(): + # GH 13453 + test = DataFrame( + { + "time": [ + Timestamp("2016-06-28 09:35:35"), + Timestamp("2016-06-28 16:09:30"), + Timestamp("2016-06-28 16:46:28"), + ], + "data": ["1", "2", "3"], + } + ).set_index("time") + result = test.groupby(pd.Grouper(freq="h"))["data"].nunique() + expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(Series.nunique) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "key, data, dropna, expected", + [ + ( + ["x", "x", "x"], + [Timestamp("2019-01-01"), pd.NaT, Timestamp("2019-01-01")], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x"], + [dt.date(2019, 1, 1), pd.NaT, dt.date(2019, 1, 1)], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "y", "y"], + [ + dt.date(2019, 1, 1), + pd.NaT, + dt.date(2019, 1, 1), + pd.NaT, + dt.date(2019, 1, 1), + ], + False, + Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "x", "y"], + [ + dt.date(2019, 1, 1), + pd.NaT, + dt.date(2019, 1, 1), + pd.NaT, + dt.date(2019, 1, 1), + ], + False, + Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ], +) +def test_nunique_with_NaT(key, data, dropna, expected): + # GH 27951 + df = DataFrame({"key": key, "data": data}) + result = df.groupby(["key"])["data"].nunique(dropna=dropna) + tm.assert_series_equal(result, expected) + + +def test_nunique_preserves_column_level_names(): + # GH 23222 + test = DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) + result = test.groupby([0, 0, 0]).nunique() + expected = DataFrame([2], index=np.array([0]), columns=test.columns) + tm.assert_frame_equal(result, expected) + + +def test_nunique_transform_with_datetime(): + # GH 35109 - transform with nunique on datetimes results in integers + df = DataFrame(date_range("2008-12-31", "2009-01-02"), columns=["date"]) 
+ result = df.groupby([0, 0, 1])["date"].transform("nunique") + expected = Series([2, 2, 1], name="date") + tm.assert_series_equal(result, expected) + + +def test_empty_categorical(observed): + # GH#21334 + cat = Series([1]).astype("category") + ser = cat[:0] + gb = ser.groupby(ser, observed=observed) + result = gb.nunique() + if observed: + expected = Series([], index=cat[:0], dtype="int64") + else: + expected = Series([0], index=cat, dtype="int64") + tm.assert_series_equal(result, expected) + + +def test_intercept_builtin_sum(): + s = Series([1.0, 2.0, np.nan, 3.0]) + grouped = s.groupby([0, 1, 2, 2]) + + msg = "using SeriesGroupBy.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result = grouped.agg(builtins.sum) + msg = "using np.sum" + with tm.assert_produces_warning(FutureWarning, match=msg): + # GH#53425 + result2 = grouped.apply(builtins.sum) + expected = grouped.sum() + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) + + +@pytest.mark.parametrize("min_count", [0, 10]) +def test_groupby_sum_mincount_boolean(min_count): + b = True + a = False + na = np.nan + dfg = pd.array([b, b, na, na, a, a, b], dtype="boolean") + + df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": dfg}) + result = df.groupby("A").sum(min_count=min_count) + if min_count == 0: + expected = DataFrame( + {"B": pd.array([3, 0, 0], dtype="Int64")}, + index=pd.Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + else: + expected = DataFrame( + {"B": pd.array([pd.NA] * 3, dtype="Int64")}, + index=pd.Index([1, 2, 3], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_below_mincount_nullable_integer(): + # https://github.com/pandas-dev/pandas/issues/32861 + df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64") + grouped = df.groupby("a") + idx = pd.Index([0, 1, 2], name="a", dtype="Int64") + + result = grouped["b"].sum(min_count=2) + expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b") + tm.assert_series_equal(result, expected) + + result = grouped.sum(min_count=2) + expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx) + tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_timedelta_with_nat(): + # GH#42659 + df = DataFrame( + { + "a": [1, 1, 2, 2], + "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT], + } + ) + td3 = pd.Timedelta(days=3) + + gb = df.groupby("a") + + res = gb.sum() + expected = DataFrame({"b": [td3, td3]}, index=pd.Index([1, 2], name="a")) + tm.assert_frame_equal(res, expected) + + res = gb["b"].sum() + tm.assert_series_equal(res, expected["b"]) + + res = gb["b"].sum(min_count=2) + expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index) + tm.assert_series_equal(res, expected) + + +@pytest.mark.parametrize( + "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"] +) +@pytest.mark.parametrize( + "method,data", + [ + ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), + ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), + ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), + ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), + ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}), + ], +) +def test_groupby_non_arithmetic_agg_types(dtype, method, data): + # GH9311, GH6620 + df = DataFrame( + [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}] + ) + + df["b"] = 
df.b.astype(dtype) + + if "args" not in data: + data["args"] = [] + + if "out_type" in data: + out_type = data["out_type"] + else: + out_type = dtype + + exp = data["df"] + df_out = DataFrame(exp) + + df_out["b"] = df_out.b.astype(out_type) + df_out.set_index("a", inplace=True) + + grpd = df.groupby("a") + t = getattr(grpd, method)(*data["args"]) + tm.assert_frame_equal(t, df_out) + + +def scipy_sem(*args, **kwargs): + from scipy.stats import sem + + return sem(*args, ddof=1, **kwargs) + + +@pytest.mark.parametrize( + "op,targop", + [ + ("mean", np.mean), + ("median", np.median), + ("std", np.std), + ("var", np.var), + ("sum", np.sum), + ("prod", np.prod), + ("min", np.min), + ("max", np.max), + ("first", lambda x: x.iloc[0]), + ("last", lambda x: x.iloc[-1]), + ("count", np.size), + pytest.param("sem", scipy_sem, marks=td.skip_if_no("scipy")), + ], +) +def test_ops_general(op, targop): + df = DataFrame(np.random.default_rng(2).standard_normal(1000)) + labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) + + result = getattr(df.groupby(labels), op)() + warn = None if op in ("first", "last", "count", "sem") else FutureWarning + msg = f"using DataFrameGroupBy.{op}" + with tm.assert_produces_warning(warn, match=msg): + expected = df.groupby(labels).agg(targop) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + { + "a": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2], + }, + {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]}, + ], +) +@pytest.mark.parametrize("function", ["mean", "median", "var"]) +def test_apply_to_nullable_integer_returns_float(values, function): + # https://github.com/pandas-dev/pandas/issues/32219 + output = 0.5 if function == "var" else 1.5 + arr = np.array([output] * 3, dtype=float) + idx = pd.Index([1, 2, 3], name="a", dtype="Int64") + expected = DataFrame({"b": arr}, index=idx).astype("Float64") + + groups = DataFrame(values, dtype="Int64").groupby("a") + + result = getattr(groups, function)() + tm.assert_frame_equal(result, expected) + + result = groups.agg(function) + tm.assert_frame_equal(result, expected) + + result = groups.agg([function]) + expected.columns = MultiIndex.from_tuples([("b", function)]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "op", + [ + "sum", + "prod", + "min", + "max", + "median", + "mean", + "skew", + "std", + "var", + "sem", + ], +) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +def test_regression_allowlist_methods(op, axis, skipna, sort): + # GH6944 + # GH 17537 + # explicitly test the allowlist methods + raw_frame = DataFrame([0]) + if axis == 0: + frame = raw_frame + msg = "The 'axis' keyword in DataFrame.groupby is deprecated and will be" + else: + frame = raw_frame.T + msg = "DataFrame.groupby with axis=1 is deprecated" + + with tm.assert_produces_warning(FutureWarning, match=msg): + grouped = frame.groupby(level=0, axis=axis, sort=sort) + + if op == "skew": + # skew has skipna + result = getattr(grouped, op)(skipna=skipna) + expected = frame.groupby(level=0).apply( + lambda h: getattr(h, op)(axis=axis, skipna=skipna) + ) + if sort: + expected = expected.sort_index(axis=axis) + tm.assert_frame_equal(result, expected) + else: + result = getattr(grouped, op)() + expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(axis=axis)) + if sort: + expected = expected.sort_index(axis=axis) + tm.assert_frame_equal(result, 
expected) + + +def test_groupby_prod_with_int64_dtype(): + # GH#46573 + data = [ + [1, 11], + [1, 41], + [1, 17], + [1, 37], + [1, 7], + [1, 29], + [1, 31], + [1, 2], + [1, 3], + [1, 43], + [1, 5], + [1, 47], + [1, 19], + [1, 88], + ] + df = DataFrame(data, columns=["A", "B"], dtype="int64") + result = df.groupby(["A"]).prod().reset_index() + expected = DataFrame({"A": [1], "B": [180970905912331920]}, dtype="int64") + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_sample.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_sample.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_sample.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_sample.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,154 +0,0 @@ -import pytest - -from pandas import ( - DataFrame, - Index, - Series, -) -import pandas._testing as tm - - -@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)]) -def test_groupby_sample_balanced_groups_shape(n, frac): - values = [1] * 10 + [2] * 10 - df = DataFrame({"a": values, "b": values}) - - result = df.groupby("a").sample(n=n, frac=frac) - values = [1] * 2 + [2] * 2 - expected = DataFrame({"a": values, "b": values}, index=result.index) - tm.assert_frame_equal(result, expected) - - result = df.groupby("a")["b"].sample(n=n, frac=frac) - expected = Series(values, name="b", index=result.index) - tm.assert_series_equal(result, expected) - - -def test_groupby_sample_unbalanced_groups_shape(): - values = [1] * 10 + [2] * 20 - df = DataFrame({"a": values, "b": values}) - - result = df.groupby("a").sample(n=5) - values = [1] * 5 + [2] * 5 - expected = DataFrame({"a": values, "b": values}, index=result.index) - tm.assert_frame_equal(result, expected) - - result = df.groupby("a")["b"].sample(n=5) - expected = Series(values, name="b", index=result.index) - tm.assert_series_equal(result, expected) - - -def test_groupby_sample_index_value_spans_groups(): - values = [1] * 3 + [2] * 3 - df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2]) - - result = df.groupby("a").sample(n=2) - values = [1] * 2 + [2] * 2 - expected = DataFrame({"a": values, "b": values}, index=result.index) - tm.assert_frame_equal(result, expected) - - result = df.groupby("a")["b"].sample(n=2) - expected = Series(values, name="b", index=result.index) - tm.assert_series_equal(result, expected) - - -def test_groupby_sample_n_and_frac_raises(): - df = DataFrame({"a": [1, 2], "b": [1, 2]}) - msg = "Please enter a value for `frac` OR `n`, not both" - - with pytest.raises(ValueError, match=msg): - df.groupby("a").sample(n=1, frac=1.0) - - with pytest.raises(ValueError, match=msg): - df.groupby("a")["b"].sample(n=1, frac=1.0) - - -def test_groupby_sample_frac_gt_one_without_replacement_raises(): - df = DataFrame({"a": [1, 2], "b": [1, 2]}) - msg = "Replace has to be set to `True` when upsampling the population `frac` > 1." - - with pytest.raises(ValueError, match=msg): - df.groupby("a").sample(frac=1.5, replace=False) - - with pytest.raises(ValueError, match=msg): - df.groupby("a")["b"].sample(frac=1.5, replace=False) - - -@pytest.mark.parametrize("n", [-1, 1.5]) -def test_groupby_sample_invalid_n_raises(n): - df = DataFrame({"a": [1, 2], "b": [1, 2]}) - - if n < 0: - msg = "A negative number of rows requested. Please provide `n` >= 0." 
- else: - msg = "Only integers accepted as `n` values" - - with pytest.raises(ValueError, match=msg): - df.groupby("a").sample(n=n) - - with pytest.raises(ValueError, match=msg): - df.groupby("a")["b"].sample(n=n) - - -def test_groupby_sample_oversample(): - values = [1] * 10 + [2] * 10 - df = DataFrame({"a": values, "b": values}) - - result = df.groupby("a").sample(frac=2.0, replace=True) - values = [1] * 20 + [2] * 20 - expected = DataFrame({"a": values, "b": values}, index=result.index) - tm.assert_frame_equal(result, expected) - - result = df.groupby("a")["b"].sample(frac=2.0, replace=True) - expected = Series(values, name="b", index=result.index) - tm.assert_series_equal(result, expected) - - -def test_groupby_sample_without_n_or_frac(): - values = [1] * 10 + [2] * 10 - df = DataFrame({"a": values, "b": values}) - - result = df.groupby("a").sample(n=None, frac=None) - expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index) - tm.assert_frame_equal(result, expected) - - result = df.groupby("a")["b"].sample(n=None, frac=None) - expected = Series([1, 2], name="b", index=result.index) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "index, expected_index", - [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])], -) -def test_groupby_sample_with_weights(index, expected_index): - # GH 39927 - tests for integer index needed - values = [1] * 2 + [2] * 2 - df = DataFrame({"a": values, "b": values}, index=Index(index)) - - result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0]) - expected = DataFrame({"a": values, "b": values}, index=Index(expected_index)) - tm.assert_frame_equal(result, expected) - - result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0]) - expected = Series(values, name="b", index=Index(expected_index)) - tm.assert_series_equal(result, expected) - - -def test_groupby_sample_with_selections(): - # GH 39928 - values = [1] * 10 + [2] * 10 - df = DataFrame({"a": values, "b": values, "c": values}) - - result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None) - expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index) - tm.assert_frame_equal(result, expected) - - -def test_groupby_sample_with_empty_inputs(): - # GH48459 - df = DataFrame({"a": [], "b": []}) - groupby_df = df.groupby("a") - - result = groupby_df.sample() - expected = df - tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_size.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_size.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_size.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_size.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,130 +0,0 @@ -import numpy as np -import pytest - -import pandas.util._test_decorators as td - -from pandas.core.dtypes.common import is_integer_dtype - -from pandas import ( - DataFrame, - Index, - PeriodIndex, - Series, -) -import pandas._testing as tm - - -@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) -def test_size(df, by): - grouped = df.groupby(by=by) - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - -@pytest.mark.parametrize( - "by", - [ - [0, 0, 0, 0], - [0, 1, 1, 1], - [1, 0, 1, 1], - [0, None, None, None], - pytest.param([None, None, None, None], marks=pytest.mark.xfail), - ], -) -def test_size_axis_1(df, axis_1, by, sort, dropna): - # GH#45715 - counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)} - if 
dropna: - counts = {key: value for key, value in counts.items() if key is not None} - expected = Series(counts, dtype="int64") - if sort: - expected = expected.sort_index() - if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by): - expected.index = expected.index.astype(int) - - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna) - result = grouped.size() - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) -@pytest.mark.parametrize("sort", [True, False]) -def test_size_sort(sort, by): - df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC")) - left = df.groupby(by=by, sort=sort).size() - right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0]) - tm.assert_series_equal(left, right, check_names=False) - - -def test_size_series_dataframe(): - # https://github.com/pandas-dev/pandas/issues/11699 - df = DataFrame(columns=["A", "B"]) - out = Series(dtype="int64", index=Index([], name="A")) - tm.assert_series_equal(df.groupby("A").size(), out) - - -def test_size_groupby_all_null(): - # https://github.com/pandas-dev/pandas/issues/23050 - # Assert no 'Value Error : Length of passed values is 2, index implies 0' - df = DataFrame({"A": [None, None]}) # all-null groups - result = df.groupby("A").size() - expected = Series(dtype="int64", index=Index([], name="A")) - tm.assert_series_equal(result, expected) - - -def test_size_period_index(): - # https://github.com/pandas-dev/pandas/issues/34010 - ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D")) - grp = ser.groupby(level="A") - result = grp.size() - tm.assert_series_equal(result, ser) - - -@pytest.mark.parametrize("as_index", [True, False]) -def test_size_on_categorical(as_index): - df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"]) - df["A"] = df["A"].astype("category") - result = df.groupby(["A", "B"], as_index=as_index, observed=False).size() - - expected = DataFrame( - [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"] - ) - expected["A"] = expected["A"].astype("category") - if as_index: - expected = expected.set_index(["A", "B"])["size"].rename(None) - - tm.assert_equal(result, expected) - - -@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) -def test_size_series_masked_type_returns_Int64(dtype): - # GH 54132 - ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype) - result = ser.groupby(level=0).size() - expected = Series([2, 1], dtype="Int64", index=["a", "b"]) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_size_strings(dtype): - # GH#55627 - df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) - result = df.groupby("a")["b"].size() - exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" - expected = Series( - [2, 1], - index=Index(["a", "b"], name="a", dtype=dtype), - name="b", - dtype=exp_dtype, - ) - tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_skew.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_skew.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_skew.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_skew.py 1970-01-01 
00:00:00.000000000 +0000 @@ -1,27 +0,0 @@ -import numpy as np - -import pandas as pd -import pandas._testing as tm - - -def test_groupby_skew_equivalence(): - # Test that that groupby skew method (which uses libgroupby.group_skew) - # matches the results of operating group-by-group (which uses nanops.nanskew) - nrows = 1000 - ngroups = 3 - ncols = 2 - nan_frac = 0.05 - - arr = np.random.default_rng(2).standard_normal((nrows, ncols)) - arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan - - df = pd.DataFrame(arr) - grps = np.random.default_rng(2).integers(0, ngroups, size=nrows) - gb = df.groupby(grps) - - result = gb.skew() - - grpwise = [grp.skew().to_frame(i).T for i, grp in gb] - expected = pd.concat(grpwise, axis=0) - expected.index = expected.index.astype(result.index.dtype) # 32bit builds - tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/test_timegrouper.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_timegrouper.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_timegrouper.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_timegrouper.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,7 +5,6 @@ datetime, timedelta, ) -from io import StringIO import numpy as np import pytest @@ -31,7 +30,7 @@ def frame_for_truncated_bingrouper(): """ DataFrame used by groupby_with_truncated_bingrouper, made into - a separate fixture for easier re-use in + a separate fixture for easier reuse in test_groupby_apply_timegrouper_with_nat_apply_squeeze """ df = DataFrame( @@ -53,8 +52,8 @@ @pytest.fixture def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): """ - GroupBy object such that gb.grouper is a BinGrouper and - len(gb.grouper.result_index) < len(gb.grouper.group_keys_seq) + GroupBy object such that gb._grouper is a BinGrouper and + len(gb._grouper.result_index) < len(gb._grouper.group_keys_seq) Aggregations on this groupby should have @@ -68,7 +67,7 @@ gb = df.groupby(tdg) # check we're testing the case we're interested in - assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq) + assert len(gb._grouper.result_index) != len(gb._grouper.group_keys_seq) return gb @@ -99,11 +98,17 @@ for df in [df_original, df_reordered]: df = df.set_index(["Date"]) + exp_dti = date_range( + "20130901", + "20131205", + freq="5D", + name="Date", + inclusive="left", + unit=df.index.unit, + ) expected = DataFrame( {"Buyer": 0, "Quantity": 0}, - index=date_range( - "20130901", "20131205", freq="5D", name="Date", inclusive="left" - ), + index=exp_dti, ) # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl" expected = expected.astype({"Buyer": object}) @@ -147,10 +152,10 @@ df = df.sort_values(by="Quantity", ascending=False) df = df.set_index("Date", drop=False) - g = df.groupby(Grouper(freq="6M")) + g = df.groupby(Grouper(freq="6ME")) assert g.group_keys - assert isinstance(g.grouper, BinGrouper) + assert isinstance(g._grouper, BinGrouper) groups = g.groups assert isinstance(groups, dict) assert len(groups) == 3 @@ -193,7 +198,7 @@ ).set_index(["Date", "Buyer"]) msg = "The default value of numeric_only" - result = df.groupby([Grouper(freq="A"), "Buyer"]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="YE"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) expected = DataFrame( @@ -248,7 +253,7 @@ result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M"), 
"Buyer"]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1ME"), "Buyer"]).sum(numeric_only=True) expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), @@ -264,32 +269,32 @@ # passing the name df = df.reset_index() - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum( + result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum( numeric_only=True ) tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match="'The grouper name foo is not found'"): - df.groupby([Grouper(freq="1M", key="foo"), "Buyer"]).sum() + df.groupby([Grouper(freq="1ME", key="foo"), "Buyer"]).sum() # passing the level df = df.set_index("Date") - result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum( + result = df.groupby([Grouper(freq="1ME", level="Date"), "Buyer"]).sum( numeric_only=True ) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum( + result = df.groupby([Grouper(freq="1ME", level=0), "Buyer"]).sum( numeric_only=True ) tm.assert_frame_equal(result, expected) with pytest.raises(ValueError, match="The level foo is not valid"): - df.groupby([Grouper(freq="1M", level="foo"), "Buyer"]).sum() + df.groupby([Grouper(freq="1ME", level="foo"), "Buyer"]).sum() # multi names df = df.copy() df["Date"] = df.index + offsets.MonthEnd(2) - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum( + result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum( numeric_only=True ) expected = DataFrame( @@ -309,7 +314,7 @@ msg = "The Grouper cannot specify both a key and a level!" with pytest.raises(ValueError, match=msg): df.groupby( - [Grouper(freq="1M", key="Date", level="Date"), "Buyer"] + [Grouper(freq="1ME", key="Date", level="Date"), "Buyer"] ).sum() # single groupers @@ -320,21 +325,23 @@ [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date" ), ) - result = df.groupby(Grouper(freq="1M")).sum(numeric_only=True) + result = df.groupby(Grouper(freq="1ME")).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M")]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1ME")]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) expected.index = expected.index.shift(1) assert expected.index.freq == offsets.MonthEnd() - result = df.groupby(Grouper(freq="1M", key="Date")).sum(numeric_only=True) + result = df.groupby(Grouper(freq="1ME", key="Date")).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", key="Date")]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1ME", key="Date")]).sum( + numeric_only=True + ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"]) + @pytest.mark.parametrize("freq", ["D", "ME", "YE", "QE-APR"]) def test_timegrouper_with_reg_groups_freq(self, freq): # GH 6764 multiple grouping with/without sort df = DataFrame( @@ -421,7 +428,7 @@ dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"] for df in [df_original, df_reordered]: - grouped = df.groupby(Grouper(freq="M", key="Date")) + grouped = df.groupby(Grouper(freq="ME", key="Date")) for t, expected in zip(dt_list, expected_list): dt = Timestamp(t) result = grouped.get_group(dt) @@ -436,7 +443,7 @@ g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")] for df in [df_original, df_reordered]: - grouped = df.groupby(["Buyer", Grouper(freq="M", key="Date")]) + grouped = df.groupby(["Buyer", Grouper(freq="ME", 
key="Date")]) for (b, t), expected in zip(g_list, expected_list): dt = Timestamp(t) result = grouped.get_group((b, dt)) @@ -453,7 +460,7 @@ ] for df in [df_original, df_reordered]: - grouped = df.groupby(Grouper(freq="M")) + grouped = df.groupby(Grouper(freq="ME")) for t, expected in zip(dt_list, expected_list): dt = Timestamp(t) result = grouped.get_group(dt) @@ -470,8 +477,12 @@ def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) - expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) - result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -487,8 +498,11 @@ def sumfunc_value(x): return x.value.sum() - expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -506,6 +520,7 @@ groups = grouped.groups assert isinstance(next(iter(groups.keys())), datetime) + def test_groupby_groups_datetimeindex2(self): # GH#11442 index = date_range("2015/01/01", periods=5, name="date") df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index) @@ -520,7 +535,9 @@ for date in dates: result = grouped.get_group(date) data = [[df.loc[date, "A"], df.loc[date, "B"]]] - expected_index = DatetimeIndex([date], name="date", freq="D") + expected_index = DatetimeIndex( + [date], name="date", freq="D", dtype=index.dtype + ) expected = DataFrame(data, columns=list("AB"), index=expected_index) tm.assert_frame_equal(result, expected) @@ -598,14 +615,26 @@ def test_groupby_multi_timezone(self): # combining multiple / different timezones yields UTC + df = DataFrame( + { + "value": range(5), + "date": [ + "2000-01-28 16:47:00", + "2000-01-29 16:48:00", + "2000-01-30 16:49:00", + "2000-01-31 16:50:00", + "2000-01-01 16:50:00", + ], + "tz": [ + "America/Chicago", + "America/Chicago", + "America/Los_Angeles", + "America/Chicago", + "America/New_York", + ], + } + ) - data = """0,2000-01-28 16:47:00,America/Chicago -1,2000-01-29 16:48:00,America/Chicago -2,2000-01-30 16:49:00,America/Los_Angeles -3,2000-01-31 16:50:00,America/Chicago -4,2000-01-01 16:50:00,America/New_York""" - - df = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"]) result = df.groupby("tz", group_keys=False).date.apply( lambda x: pd.to_datetime(x).dt.tz_localize(x.name) ) @@ -646,7 +675,7 @@ df = DataFrame( { "label": ["a", "a", "a", "b", "b", "b"], - "period": [pd.Period(d, freq="H") for d in dates], + "period": [pd.Period(d, freq="h") for d in dates], "value1": np.arange(6, dtype="int64"), "value2": [1, 2] * 3, } @@ -661,7 +690,7 @@ "2011-07-19 09:00:00", "2011-07-19 09:00:00", 
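# [Editor's illustrative sketch -- not part of the patch] The frequency strings
# in these hunks change because pandas 2.2 renames the offset aliases:
# "ME"/"YE"/"QE-APR" replace "M"/"A"/"Q-APR" for month/year/quarter end, and
# lowercase "h"/"min" replace "H"/"T"; the old spellings emit a FutureWarning.
import pandas as pd

monthly = pd.date_range("2013-01-01", periods=3, freq="ME")          # was "M"
hourly = pd.period_range("2011-07-19 07:00", periods=3, freq="h")    # was "H"
df = pd.DataFrame({"x": range(3)}, index=monthly)
yearly = df.groupby(pd.Grouper(freq="YE")).sum()                     # was "A"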
], - freq="H", + freq="h", name="period", ) exp_idx2 = Index(["a", "b"] * 3, name="label") @@ -676,7 +705,7 @@ tm.assert_frame_equal(result, expected) # by level - didx = pd.PeriodIndex(dates, freq="H") + didx = pd.PeriodIndex(dates, freq="h") df = DataFrame( {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]}, index=didx, @@ -684,7 +713,7 @@ exp_idx = pd.PeriodIndex( ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], - freq="H", + freq="h", ) expected = DataFrame( {"value1": [3, 5, 7], "value2": [2, 4, 6]}, @@ -697,7 +726,7 @@ def test_groupby_first_datetime64(self): df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) - df[1] = df[1].view("M8[ns]") + df[1] = df[1].astype("M8[ns]") assert issubclass(df[1].dtype.type, np.datetime64) @@ -753,7 +782,7 @@ def test_datetime_count(self): df = DataFrame( - {"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="T")} + {"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="min")} ) result = df.groupby("a").dates.count() expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates") @@ -842,21 +871,23 @@ result = period_series.groupby(period_series.index.month).sum() expected = Series( - range(0, periods), index=Index(range(1, periods + 1), name=index.name) + range(periods), index=Index(range(1, periods + 1), name=index.name) ) tm.assert_series_equal(result, expected) def test_groupby_apply_timegrouper_with_nat_dict_returns( self, groupby_with_truncated_bingrouper ): - # GH#43500 case where gb.grouper.result_index and gb.grouper.group_keys_seq + # GH#43500 case where gb._grouper.result_index and gb._grouper.group_keys_seq # have different lengths that goes through the `isinstance(values[0], dict)` # path gb = groupby_with_truncated_bingrouper res = gb["Quantity"].apply(lambda x: {"foo": len(x)}) - dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date") + df = gb.obj + unit = df["Date"]._values.unit + dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit) mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)]) expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity") tm.assert_series_equal(res, expected) @@ -870,7 +901,9 @@ res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan) - dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date") + df = gb.obj + unit = df["Date"]._values.unit + dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit) expected = Series( [18, np.nan, np.nan, np.nan, np.nan, np.nan, 5], index=dti._with_freq(None), @@ -886,7 +919,7 @@ # We need to create a GroupBy object with only one non-NaT group, # so use a huge freq so that all non-NaT dates will be grouped together - tdg = Grouper(key="Date", freq="100Y") + tdg = Grouper(key="Date", freq="100YE") gb = df.groupby(tdg) # check that we will go through the singular_series path @@ -895,11 +928,14 @@ assert gb._selected_obj._get_axis(gb.axis).nlevels == 1 # function that returns a Series - res = gb.apply(lambda x: x["Quantity"] * 2) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + res = gb.apply(lambda x: x["Quantity"] * 2) + dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date") expected = DataFrame( [[36, 6, 6, 10, 2]], - index=Index([Timestamp("2013-12-31")], name="Date"), + index=dti, columns=Index([0, 1, 5, 2, 3], name="Quantity"), ) tm.assert_frame_equal(res, expected) diff -Nru 
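# [Editor's illustrative sketch -- not part of the patch] Two recurring updates
# in the hunk above: expected DatetimeIndexes are now built with the same
# resolution as the data (the `unit` keyword of date_range, available since
# pandas 2.0), and GroupBy.apply calls are wrapped in assert_produces_warning
# because applying a function over groups that still include the grouping
# columns is deprecated. User code can opt out with include_groups=False
# (added in 2.2), assuming the applied function does not need the key column:
import pandas as pd

df = pd.DataFrame(
    {"Date": pd.date_range("2013-09-01", periods=6, freq="D", unit="s"),
     "Quantity": range(6)}
)
gb = df.groupby(pd.Grouper(key="Date", freq="100YE"))
res = gb.apply(lambda g: g["Quantity"] * 2, include_groups=False)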
pandas-2.1.4+dfsg/pandas/tests/groupby/test_value_counts.py pandas-2.2.2+dfsg/pandas/tests/groupby/test_value_counts.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/test_value_counts.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/test_value_counts.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,1175 +0,0 @@ -""" -these are systematically testing all of the args to value_counts -with different size combinations. This is to ensure stability of the sorting -and proper parameter handling -""" - -from itertools import product - -import numpy as np -import pytest - -import pandas.util._test_decorators as td - -from pandas import ( - Categorical, - CategoricalIndex, - DataFrame, - Grouper, - Index, - MultiIndex, - Series, - date_range, - to_datetime, -) -import pandas._testing as tm -from pandas.util.version import Version - - -def tests_value_counts_index_names_category_column(): - # GH44324 Missing name of index category column - df = DataFrame( - { - "gender": ["female"], - "country": ["US"], - } - ) - df["gender"] = df["gender"].astype("category") - result = df.groupby("country")["gender"].value_counts() - - # Construct expected, very specific multiindex - df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"]) - df_mi_expected["gender"] = df_mi_expected["gender"].astype("category") - mi_expected = MultiIndex.from_frame(df_mi_expected) - expected = Series([1], index=mi_expected, name="count") - - tm.assert_series_equal(result, expected) - - -# our starting frame -def seed_df(seed_nans, n, m): - days = date_range("2015-08-24", periods=10) - - frame = DataFrame( - { - "1st": np.random.default_rng(2).choice(list("abcd"), n), - "2nd": np.random.default_rng(2).choice(days, n), - "3rd": np.random.default_rng(2).integers(1, m + 1, n), - } - ) - - if seed_nans: - # Explicitly cast to float to avoid implicit cast when setting nan - frame["3rd"] = frame["3rd"].astype("float") - frame.loc[1::11, "1st"] = np.nan - frame.loc[3::17, "2nd"] = np.nan - frame.loc[7::19, "3rd"] = np.nan - frame.loc[8::19, "3rd"] = np.nan - frame.loc[9::19, "3rd"] = np.nan - - return frame - - -# create input df, keys, and the bins -binned = [] -ids = [] -for seed_nans in [True, False]: - for n, m in product((100, 1000), (5, 20)): - df = seed_df(seed_nans, n, m) - bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2) - keys = "1st", "2nd", ["1st", "2nd"] - for k, b in product(keys, bins): - binned.append((df, k, b, n, m)) - ids.append(f"{k}-{n}-{m}") - - -@pytest.mark.slow -@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) -@pytest.mark.parametrize("isort", [True, False]) -@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("dropna", [True, False]) -def test_series_groupby_value_counts( - df, keys, bins, n, m, isort, normalize, name, sort, ascending, dropna -): - def rebuild_index(df): - arr = list(map(df.index.get_level_values, range(df.index.nlevels))) - df.index = MultiIndex.from_arrays(arr, names=df.index.names) - return df - - kwargs = { - "normalize": normalize, - "sort": sort, - "ascending": ascending, - "dropna": dropna, - "bins": bins, - } - - gr = df.groupby(keys, sort=isort) - left = gr["3rd"].value_counts(**kwargs) - - gr = df.groupby(keys, sort=isort) - right = gr["3rd"].apply(Series.value_counts, **kwargs) - right.index.names = right.index.names[:-1] + ["3rd"] - # 
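# [Editor's illustrative sketch -- not part of the patch] The deleted
# test_value_counts.py systematically compared SeriesGroupBy.value_counts with
# applying Series.value_counts group by group; a minimal version of that check
# (the result column has been named "count" since pandas 2.0):
import pandas as pd

df = pd.DataFrame({"key": list("aabba"), "val": [1, 1, 2, 2, 3]})
left = df.groupby("key")["val"].value_counts()
right = df.groupby("key")["val"].apply(pd.Series.value_counts)
right.index.names = ["key", "val"]
right = right.rename("count")
pd.testing.assert_series_equal(left.sort_index(), right.sort_index())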
https://github.com/pandas-dev/pandas/issues/49909 - right = right.rename(name) - - # have to sort on index because of unstable sort on values - left, right = map(rebuild_index, (left, right)) # xref GH9212 - tm.assert_series_equal(left.sort_index(), right.sort_index()) - - -@pytest.mark.parametrize("utc", [True, False]) -def test_series_groupby_value_counts_with_grouper(utc): - # GH28479 - df = DataFrame( - { - "Timestamp": [ - 1565083561, - 1565083561 + 86400, - 1565083561 + 86500, - 1565083561 + 86400 * 2, - 1565083561 + 86400 * 3, - 1565083561 + 86500 * 3, - 1565083561 + 86400 * 4, - ], - "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"], - } - ).drop([3]) - - df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s") - dfg = df.groupby(Grouper(freq="1D", key="Datetime")) - - # have to sort on index because of unstable sort on values xref GH9212 - result = dfg["Food"].value_counts().sort_index() - expected = dfg["Food"].apply(Series.value_counts).sort_index() - expected.index.names = result.index.names - # https://github.com/pandas-dev/pandas/issues/49909 - expected = expected.rename("count") - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]]) -def test_series_groupby_value_counts_empty(columns): - # GH39172 - df = DataFrame(columns=columns) - dfg = df.groupby(columns[:-1]) - - result = dfg[columns[-1]].value_counts() - expected = Series([], dtype=result.dtype, name="count") - expected.index = MultiIndex.from_arrays([[]] * len(columns), names=columns) - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]]) -def test_series_groupby_value_counts_one_row(columns): - # GH42618 - df = DataFrame(data=[range(len(columns))], columns=columns) - dfg = df.groupby(columns[:-1]) - - result = dfg[columns[-1]].value_counts() - expected = df.value_counts() - - tm.assert_series_equal(result, expected) - - -def test_series_groupby_value_counts_on_categorical(): - # GH38672 - - s = Series(Categorical(["a"], categories=["a", "b"])) - result = s.groupby([0]).value_counts() - - expected = Series( - data=[1, 0], - index=MultiIndex.from_arrays( - [ - np.array([0, 0]), - CategoricalIndex( - ["a", "b"], categories=["a", "b"], ordered=False, dtype="category" - ), - ] - ), - name="count", - ) - - # Expected: - # 0 a 1 - # b 0 - # dtype: int64 - - tm.assert_series_equal(result, expected) - - -def test_series_groupby_value_counts_no_sort(): - # GH#50482 - df = DataFrame( - { - "gender": ["male", "male", "female", "male", "female", "male"], - "education": ["low", "medium", "high", "low", "high", "low"], - "country": ["US", "FR", "US", "FR", "FR", "FR"], - } - ) - gb = df.groupby(["country", "gender"], sort=False)["education"] - result = gb.value_counts(sort=False) - index = MultiIndex( - levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]], - codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]], - names=["country", "gender", "education"], - ) - expected = Series([1, 1, 1, 2, 1], index=index, name="count") - tm.assert_series_equal(result, expected) - - -@pytest.fixture -def education_df(): - return DataFrame( - { - "gender": ["male", "male", "female", "male", "female", "male"], - "education": ["low", "medium", "high", "low", "high", "low"], - "country": ["US", "FR", "US", "FR", "FR", "FR"], - } - ) - - -def test_axis(education_df): - msg = "DataFrame.groupby with axis=1 is deprecated" - with tm.assert_produces_warning(FutureWarning, 
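# [Editor's illustrative sketch -- not part of the patch] One of the deleted
# cases (GH38672): unused categories of a Categorical still appear in a grouped
# value_counts result, with a count of 0.
import pandas as pd

ser = pd.Series(pd.Categorical(["a"], categories=["a", "b"]))
counts = ser.groupby([0]).value_counts()
# 0  a    1
#    b    0
# Name: count, dtype: int64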
match=msg): - gp = education_df.groupby("country", axis=1) - with pytest.raises(NotImplementedError, match="axis"): - gp.value_counts() - - -def test_bad_subset(education_df): - gp = education_df.groupby("country") - with pytest.raises(ValueError, match="subset"): - gp.value_counts(subset=["country"]) - - -def test_basic(education_df, request): - # gh43564 - if Version(np.__version__) >= Version("1.25"): - request.node.add_marker( - pytest.mark.xfail( - reason=( - "pandas default unstable sorting of duplicates" - "issue with numpy>=1.25 with AVX instructions" - ), - strict=False, - ) - ) - result = education_df.groupby("country")[["gender", "education"]].value_counts( - normalize=True - ) - expected = Series( - data=[0.5, 0.25, 0.25, 0.5, 0.5], - index=MultiIndex.from_tuples( - [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("US", "female", "high"), - ("US", "male", "low"), - ], - names=["country", "gender", "education"], - ), - name="proportion", - ) - tm.assert_series_equal(result, expected) - - -def _frame_value_counts(df, keys, normalize, sort, ascending): - return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending) - - -@pytest.mark.parametrize("groupby", ["column", "array", "function"]) -@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) -@pytest.mark.parametrize( - "sort, ascending", - [ - (False, None), - (True, True), - (True, False), - ], -) -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize("frame", [True, False]) -def test_against_frame_and_seriesgroupby( - education_df, groupby, normalize, name, sort, ascending, as_index, frame, request -): - # test all parameters: - # - Use column, array or function as by= parameter - # - Whether or not to normalize - # - Whether or not to sort and how - # - Whether or not to use the groupby as an index - # - 3-way compare against: - # - apply with :meth:`~DataFrame.value_counts` - # - `~SeriesGroupBy.value_counts` - if Version(np.__version__) >= Version("1.25") and frame and sort and normalize: - request.node.add_marker( - pytest.mark.xfail( - reason=( - "pandas default unstable sorting of duplicates" - "issue with numpy>=1.25 with AVX instructions" - ), - strict=False, - ) - ) - by = { - "column": "country", - "array": education_df["country"].values, - "function": lambda x: education_df["country"][x] == "US", - }[groupby] - - gp = education_df.groupby(by=by, as_index=as_index) - result = gp[["gender", "education"]].value_counts( - normalize=normalize, sort=sort, ascending=ascending - ) - if frame: - # compare against apply with DataFrame value_counts - expected = gp.apply( - _frame_value_counts, ["gender", "education"], normalize, sort, ascending - ) - - if as_index: - tm.assert_series_equal(result, expected) - else: - name = "proportion" if normalize else "count" - expected = expected.reset_index().rename({0: name}, axis=1) - if groupby == "column": - expected = expected.rename({"level_0": "country"}, axis=1) - expected["country"] = np.where(expected["country"], "US", "FR") - elif groupby == "function": - expected["level_0"] = expected["level_0"] == 1 - else: - expected["level_0"] = np.where(expected["level_0"], "US", "FR") - tm.assert_frame_equal(result, expected) - else: - # compare against SeriesGroupBy value_counts - education_df["both"] = education_df["gender"] + "-" + education_df["education"] - expected = gp["both"].value_counts( - normalize=normalize, sort=sort, ascending=ascending - ) - expected.name = name - 
if as_index: - index_frame = expected.index.to_frame(index=False) - index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) - index_frame["education"] = index_frame["both"].str.split("-").str.get(1) - del index_frame["both"] - index_frame = index_frame.rename({0: None}, axis=1) - expected.index = MultiIndex.from_frame(index_frame) - tm.assert_series_equal(result, expected) - else: - expected.insert(1, "gender", expected["both"].str.split("-").str.get(0)) - expected.insert(2, "education", expected["both"].str.split("-").str.get(1)) - del expected["both"] - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) -@pytest.mark.parametrize("normalize", [True, False]) -@pytest.mark.parametrize( - "sort, ascending, expected_rows, expected_count, expected_group_size", - [ - (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]), - (True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]), - (True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]), - ], -) -def test_compound( - education_df, - normalize, - sort, - ascending, - expected_rows, - expected_count, - expected_group_size, - dtype, -): - education_df = education_df.astype(dtype) - education_df.columns = education_df.columns.astype(dtype) - # Multiple groupby keys and as_index=False - gp = education_df.groupby(["country", "gender"], as_index=False, sort=False) - result = gp["education"].value_counts( - normalize=normalize, sort=sort, ascending=ascending - ) - expected = DataFrame() - for column in ["country", "gender", "education"]: - expected[column] = [education_df[column][row] for row in expected_rows] - expected = expected.astype(dtype) - expected.columns = expected.columns.astype(dtype) - if normalize: - expected["proportion"] = expected_count - expected["proportion"] /= expected_group_size - if dtype == "string[pyarrow]": - expected["proportion"] = expected["proportion"].convert_dtypes() - else: - expected["count"] = expected_count - if dtype == "string[pyarrow]": - expected["count"] = expected["count"].convert_dtypes() - tm.assert_frame_equal(result, expected) - - -@pytest.fixture -def animals_df(): - return DataFrame( - {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, - index=["falcon", "dog", "cat", "ant"], - ) - - -@pytest.mark.parametrize( - "sort, ascending, normalize, name, expected_data, expected_index", - [ - (False, None, False, "count", [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]), - (True, True, False, "count", [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]), - (True, False, False, "count", [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]), - ( - True, - False, - True, - "proportion", - [0.5, 0.25, 0.25], - [(1, 1, 1), (4, 2, 6), (0, 2, 0)], - ), - ], -) -def test_data_frame_value_counts( - animals_df, sort, ascending, normalize, name, expected_data, expected_index -): - # 3-way compare with :meth:`~DataFrame.value_counts` - # Tests from frame/methods/test_value_counts.py - result_frame = animals_df.value_counts( - sort=sort, ascending=ascending, normalize=normalize - ) - expected = Series( - data=expected_data, - index=MultiIndex.from_arrays( - expected_index, names=["key", "num_legs", "num_wings"] - ), - name=name, - ) - tm.assert_series_equal(result_frame, expected) - - result_frame_groupby = animals_df.groupby("key").value_counts( - sort=sort, ascending=ascending, 
normalize=normalize - ) - - tm.assert_series_equal(result_frame_groupby, expected) - - -@pytest.fixture -def nulls_df(): - n = np.nan - return DataFrame( - { - "A": [1, 1, n, 4, n, 6, 6, 6, 6], - "B": [1, 1, 3, n, n, 6, 6, 6, 6], - "C": [1, 2, 3, 4, 5, 6, n, 8, n], - "D": [1, 2, 3, 4, 5, 6, 7, n, n], - } - ) - - -@pytest.mark.parametrize( - "group_dropna, count_dropna, expected_rows, expected_values", - [ - ( - False, - False, - [0, 1, 3, 5, 7, 6, 8, 2, 4], - [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], - ), - (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), - (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), - (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), - ], -) -def test_dropna_combinations( - nulls_df, group_dropna, count_dropna, expected_rows, expected_values, request -): - if Version(np.__version__) >= Version("1.25") and not group_dropna: - request.node.add_marker( - pytest.mark.xfail( - reason=( - "pandas default unstable sorting of duplicates" - "issue with numpy>=1.25 with AVX instructions" - ), - strict=False, - ) - ) - gp = nulls_df.groupby(["A", "B"], dropna=group_dropna) - result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna) - columns = DataFrame() - for column in nulls_df.columns: - columns[column] = [nulls_df[column][row] for row in expected_rows] - index = MultiIndex.from_frame(columns) - expected = Series(data=expected_values, index=index, name="proportion") - tm.assert_series_equal(result, expected) - - -@pytest.fixture -def names_with_nulls_df(nulls_fixture): - return DataFrame( - { - "key": [1, 1, 1, 1], - "first_name": ["John", "Anne", "John", "Beth"], - "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"], - }, - ) - - -@pytest.mark.parametrize( - "dropna, expected_data, expected_index", - [ - ( - True, - [1, 1], - MultiIndex.from_arrays( - [(1, 1), ("Beth", "John"), ("Louise", "Smith")], - names=["key", "first_name", "middle_name"], - ), - ), - ( - False, - [1, 1, 1, 1], - MultiIndex( - levels=[ - Index([1]), - Index(["Anne", "Beth", "John"]), - Index(["Louise", "Smith", np.nan]), - ], - codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]], - names=["key", "first_name", "middle_name"], - ), - ), - ], -) -@pytest.mark.parametrize("normalize, name", [(False, "count"), (True, "proportion")]) -def test_data_frame_value_counts_dropna( - names_with_nulls_df, dropna, normalize, name, expected_data, expected_index -): - # GH 41334 - # 3-way compare with :meth:`~DataFrame.value_counts` - # Tests with nulls from frame/methods/test_value_counts.py - result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize) - expected = Series( - data=expected_data, - index=expected_index, - name=name, - ) - if normalize: - expected /= float(len(expected_data)) - - tm.assert_series_equal(result_frame, expected) - - result_frame_groupby = names_with_nulls_df.groupby("key").value_counts( - dropna=dropna, normalize=normalize - ) - - tm.assert_series_equal(result_frame_groupby, expected) - - -@pytest.mark.parametrize("as_index", [False, True]) -@pytest.mark.parametrize("observed", [False, True]) -@pytest.mark.parametrize( - "normalize, name, expected_data", - [ - ( - False, - "count", - np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), - ), - ( - True, - "proportion", - np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), - ), - ], -) -def test_categorical_single_grouper_with_only_observed_categories( - education_df, as_index, observed, normalize, name, expected_data, 
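# [Editor's illustrative sketch -- not part of the patch] The deleted dropna
# tests combined two independent switches: dropna on groupby() controls NA
# group keys, while dropna on value_counts() controls NA values within groups.
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1, 1, np.nan], "B": [1, np.nan, 2]})
keep_all = df.groupby("A", dropna=False)["B"].value_counts(dropna=False)
drop_all = df.groupby("A", dropna=True)["B"].value_counts(dropna=True)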
request -): - # Test single categorical grouper with only observed grouping categories - # when non-groupers are also categorical - if Version(np.__version__) >= Version("1.25"): - request.node.add_marker( - pytest.mark.xfail( - reason=( - "pandas default unstable sorting of duplicates" - "issue with numpy>=1.25 with AVX instructions" - ), - strict=False, - ) - ) - - gp = education_df.astype("category").groupby( - "country", as_index=as_index, observed=observed - ) - result = gp.value_counts(normalize=normalize) - - expected_index = MultiIndex.from_tuples( - [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ], - names=["country", "gender", "education"], - ) - - expected_series = Series( - data=expected_data, - index=expected_index, - name=name, - ) - for i in range(3): - expected_series.index = expected_series.index.set_levels( - CategoricalIndex(expected_series.index.levels[i]), level=i - ) - - if as_index: - tm.assert_series_equal(result, expected_series) - else: - expected = expected_series.reset_index( - name="proportion" if normalize else "count" - ) - tm.assert_frame_equal(result, expected) - - -def assert_categorical_single_grouper( - education_df, as_index, observed, expected_index, normalize, name, expected_data -): - # Test single categorical grouper when non-groupers are also categorical - education_df = education_df.copy().astype("category") - - # Add non-observed grouping categories - education_df["country"] = education_df["country"].cat.add_categories(["ASIA"]) - - gp = education_df.groupby("country", as_index=as_index, observed=observed) - result = gp.value_counts(normalize=normalize) - - expected_series = Series( - data=expected_data, - index=MultiIndex.from_tuples( - expected_index, - names=["country", "gender", "education"], - ), - name=name, - ) - for i in range(3): - index_level = CategoricalIndex(expected_series.index.levels[i]) - if i == 0: - index_level = index_level.set_categories( - education_df["country"].cat.categories - ) - expected_series.index = expected_series.index.set_levels(index_level, level=i) - - if as_index: - tm.assert_series_equal(result, expected_series) - else: - expected = expected_series.reset_index(name=name) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize( - "normalize, name, expected_data", - [ - ( - False, - "count", - np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), - ), - ( - True, - "proportion", - np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), - ), - ], -) -def test_categorical_single_grouper_observed_true( - education_df, as_index, normalize, name, expected_data, request -): - # GH#46357 - - if Version(np.__version__) >= Version("1.25"): - request.node.add_marker( - pytest.mark.xfail( - reason=( - "pandas default unstable sorting of duplicates" - "issue with numpy>=1.25 with AVX instructions" - ), - strict=False, - ) - ) - - expected_index = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - 
("US", "male", "medium"), - ] - - assert_categorical_single_grouper( - education_df=education_df, - as_index=as_index, - observed=True, - expected_index=expected_index, - normalize=normalize, - name=name, - expected_data=expected_data, - ) - - -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize( - "normalize, name, expected_data", - [ - ( - False, - "count", - np.array( - [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64 - ), - ), - ( - True, - "proportion", - np.array( - [ - 0.5, - 0.25, - 0.25, - 0.0, - 0.0, - 0.0, - 0.5, - 0.5, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - ] - ), - ), - ], -) -def test_categorical_single_grouper_observed_false( - education_df, as_index, normalize, name, expected_data, request -): - # GH#46357 - - if Version(np.__version__) >= Version("1.25"): - request.node.add_marker( - pytest.mark.xfail( - reason=( - "pandas default unstable sorting of duplicates" - "issue with numpy>=1.25 with AVX instructions" - ), - strict=False, - ) - ) - - expected_index = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "male", "high"), - ("FR", "female", "medium"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "male", "medium"), - ("US", "male", "high"), - ("US", "female", "medium"), - ("US", "female", "low"), - ("ASIA", "male", "low"), - ("ASIA", "male", "high"), - ("ASIA", "female", "medium"), - ("ASIA", "female", "low"), - ("ASIA", "female", "high"), - ("ASIA", "male", "medium"), - ] - - assert_categorical_single_grouper( - education_df=education_df, - as_index=as_index, - observed=False, - expected_index=expected_index, - normalize=normalize, - name=name, - expected_data=expected_data, - ) - - -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize( - "observed, expected_index", - [ - ( - False, - [ - ("FR", "high", "female"), - ("FR", "high", "male"), - ("FR", "low", "male"), - ("FR", "low", "female"), - ("FR", "medium", "male"), - ("FR", "medium", "female"), - ("US", "high", "female"), - ("US", "high", "male"), - ("US", "low", "male"), - ("US", "low", "female"), - ("US", "medium", "female"), - ("US", "medium", "male"), - ], - ), - ( - True, - [ - ("FR", "high", "female"), - ("FR", "low", "male"), - ("FR", "medium", "male"), - ("US", "high", "female"), - ("US", "low", "male"), - ], - ), - ], -) -@pytest.mark.parametrize( - "normalize, name, expected_data", - [ - ( - False, - "count", - np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64), - ), - ( - True, - "proportion", - # NaN values corresponds to non-observed groups - np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]), - ), - ], -) -def test_categorical_multiple_groupers( - education_df, as_index, observed, expected_index, normalize, name, expected_data -): - # GH#46357 - - # Test multiple categorical groupers when non-groupers are non-categorical - education_df = education_df.copy() - education_df["country"] = education_df["country"].astype("category") - education_df["education"] = education_df["education"].astype("category") - - gp = education_df.groupby( - ["country", "education"], as_index=as_index, observed=observed - ) - result = gp.value_counts(normalize=normalize) - - expected_series = Series( - data=expected_data[expected_data > 0.0] if observed else expected_data, - index=MultiIndex.from_tuples( - expected_index, - names=["country", "education", "gender"], - ), - name=name, - ) - for i in range(2): - 
expected_series.index = expected_series.index.set_levels( - CategoricalIndex(expected_series.index.levels[i]), level=i - ) - - if as_index: - tm.assert_series_equal(result, expected_series) - else: - expected = expected_series.reset_index( - name="proportion" if normalize else "count" - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("as_index", [False, True]) -@pytest.mark.parametrize("observed", [False, True]) -@pytest.mark.parametrize( - "normalize, name, expected_data", - [ - ( - False, - "count", - np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64), - ), - ( - True, - "proportion", - # NaN values corresponds to non-observed groups - np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]), - ), - ], -) -def test_categorical_non_groupers( - education_df, as_index, observed, normalize, name, expected_data, request -): - # GH#46357 Test non-observed categories are included in the result, - # regardless of `observed` - - if Version(np.__version__) >= Version("1.25"): - request.node.add_marker( - pytest.mark.xfail( - reason=( - "pandas default unstable sorting of duplicates" - "issue with numpy>=1.25 with AVX instructions" - ), - strict=False, - ) - ) - - education_df = education_df.copy() - education_df["gender"] = education_df["gender"].astype("category") - education_df["education"] = education_df["education"].astype("category") - - gp = education_df.groupby("country", as_index=as_index, observed=observed) - result = gp.value_counts(normalize=normalize) - - expected_index = [ - ("FR", "male", "low"), - ("FR", "female", "high"), - ("FR", "male", "medium"), - ("FR", "female", "low"), - ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), - ("US", "male", "low"), - ("US", "female", "low"), - ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ] - expected_series = Series( - data=expected_data, - index=MultiIndex.from_tuples( - expected_index, - names=["country", "gender", "education"], - ), - name=name, - ) - for i in range(1, 3): - expected_series.index = expected_series.index.set_levels( - CategoricalIndex(expected_series.index.levels[i]), level=i - ) - - if as_index: - tm.assert_series_equal(result, expected_series) - else: - expected = expected_series.reset_index( - name="proportion" if normalize else "count" - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "normalize, expected_label, expected_values", - [ - (False, "count", [1, 1, 1]), - (True, "proportion", [0.5, 0.5, 1.0]), - ], -) -def test_mixed_groupings(normalize, expected_label, expected_values): - # Test multiple groupings - df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) - gp = df.groupby([[4, 5, 4], "A", lambda i: 7 if i == 1 else 8], as_index=False) - result = gp.value_counts(sort=True, normalize=normalize) - expected = DataFrame( - { - "level_0": np.array([4, 4, 5], dtype=int), - "A": [1, 1, 2], - "level_2": [8, 8, 7], - "B": [1, 3, 2], - expected_label: expected_values, - } - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "test, columns, expected_names", - [ - ("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]), - ("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]), - ], -) -@pytest.mark.parametrize("as_index", [False, True]) -def test_column_label_duplicates(test, columns, expected_names, as_index): - # GH 44992 - # Test for duplicate input column labels and generated duplicate labels - df = DataFrame([[1, 3, 5, 7, 
9], [2, 4, 6, 8, 10]], columns=columns) - expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)] - keys = ["a", np.array([0, 1], dtype=np.int64), "d"] - result = df.groupby(keys, as_index=as_index).value_counts() - if as_index: - expected = Series( - data=(1, 1), - index=MultiIndex.from_tuples( - expected_data, - names=expected_names, - ), - name="count", - ) - tm.assert_series_equal(result, expected) - else: - expected_data = [list(row) + [1] for row in expected_data] - expected_columns = list(expected_names) - expected_columns[1] = "level_1" - expected_columns.append("count") - expected = DataFrame(expected_data, columns=expected_columns) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "normalize, expected_label", - [ - (False, "count"), - (True, "proportion"), - ], -) -def test_result_label_duplicates(normalize, expected_label): - # Test for result column label duplicating an input column label - gb = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label]).groupby( - "a", as_index=False - ) - msg = f"Column label '{expected_label}' is duplicate of result column" - with pytest.raises(ValueError, match=msg): - gb.value_counts(normalize=normalize) - - -def test_ambiguous_grouping(): - # Test that groupby is not confused by groupings length equal to row count - df = DataFrame({"a": [1, 1]}) - gb = df.groupby(np.array([1, 1], dtype=np.int64)) - result = gb.value_counts() - expected = Series( - [2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]), name="count" - ) - tm.assert_series_equal(result, expected) - - -def test_subset_overlaps_gb_key_raises(): - # GH 46383 - df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) - msg = "Keys {'c1'} in subset cannot be in the groupby column keys." - with pytest.raises(ValueError, match=msg): - df.groupby("c1").value_counts(subset=["c1"]) - - -def test_subset_doesnt_exist_in_frame(): - # GH 46383 - df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) - msg = "Keys {'c3'} in subset do not exist in the DataFrame." 
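# [Editor's illustrative sketch -- not part of the patch] The deleted subset
# tests (GH 46383) covered two rules: only the columns listed in `subset` are
# counted, and a subset column that is also a groupby key raises.
import pandas as pd

df = pd.DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
counts = df.groupby(level=0).value_counts(subset=["c2"])
# df.groupby("c1").value_counts(subset=["c1"])  # ValueError: keys in subset
#                                               # cannot be groupby column keys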
- with pytest.raises(ValueError, match=msg): - df.groupby("c1").value_counts(subset=["c3"]) - - -def test_subset(): - # GH 46383 - df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) - result = df.groupby(level=0).value_counts(subset=["c2"]) - expected = Series( - [1, 2], - index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]), - name="count", - ) - tm.assert_series_equal(result, expected) - - -def test_subset_duplicate_columns(): - # GH 46383 - df = DataFrame( - [["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]], - index=[0, 1, 1], - columns=["c1", "c2", "c2"], - ) - result = df.groupby(level=0).value_counts(subset=["c2"]) - expected = Series( - [1, 2], - index=MultiIndex.from_arrays( - [[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"] - ), - name="count", - ) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("utc", [True, False]) -def test_value_counts_time_grouper(utc): - # GH#50486 - df = DataFrame( - { - "Timestamp": [ - 1565083561, - 1565083561 + 86400, - 1565083561 + 86500, - 1565083561 + 86400 * 2, - 1565083561 + 86400 * 3, - 1565083561 + 86500 * 3, - 1565083561 + 86400 * 4, - ], - "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"], - } - ).drop([3]) - - df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s") - gb = df.groupby(Grouper(freq="1D", key="Datetime")) - result = gb.value_counts() - dates = to_datetime( - ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc - ) - timestamps = df["Timestamp"].unique() - index = MultiIndex( - levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]], - codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]], - names=["Datetime", "Timestamp", "Food"], - ) - expected = Series(1, index=index, name="count") - tm.assert_series_equal(result, expected) - - -def test_value_counts_integer_columns(): - # GH#55627 - df = DataFrame({1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"]}) - gp = df.groupby([1, 2], as_index=False, sort=False) - result = gp[3].value_counts() - expected = DataFrame( - {1: ["a", "a", "a"], 2: ["a", "a", "d"], 3: ["a", "b", "c"], "count": 1} - ) - tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/groupby/transform/test_transform.py pandas-2.2.2+dfsg/pandas/tests/groupby/transform/test_transform.py --- pandas-2.1.4+dfsg/pandas/tests/groupby/transform/test_transform.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/groupby/transform/test_transform.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,4 @@ """ test with the .transform """ -from io import StringIO - import numpy as np import pytest @@ -12,6 +10,7 @@ from pandas import ( Categorical, DataFrame, + Index, MultiIndex, Series, Timestamp, @@ -69,8 +68,12 @@ tm.assert_frame_equal(result, expected) # GH 8430 - df = tm.makeTimeDataFrame() - g = df.groupby(pd.Grouper(freq="M")) + df = DataFrame( + np.random.default_rng(2).standard_normal((50, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=50, freq="B"), + ) + g = df.groupby(pd.Grouper(freq="ME")) g.transform(lambda x: x - 1) # GH 9700 @@ -103,6 +106,8 @@ result = grp.transform("mean") tm.assert_series_equal(result, expected) + +def test_transform_fast2(): # GH 12737 df = DataFrame( { @@ -115,12 +120,15 @@ ) result = df.groupby("grouping").transform("first") - dates = [ - Timestamp("2014-1-1"), - Timestamp("2014-1-2"), - Timestamp("2014-1-2"), - Timestamp("2014-1-4"), - ] + dates = Index( + [ + 
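# [Editor's illustrative sketch -- not part of the patch] The removed
# tm.makeTimeDataFrame() helper is replaced throughout these tests by building
# the frame explicitly, as the transform hunk below does:
import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.random.default_rng(2).standard_normal((50, 4)),
    columns=pd.Index(list("ABCD"), dtype=object),
    index=pd.date_range("2000-01-01", periods=50, freq="B"),
)
g = df.groupby(pd.Grouper(freq="ME"))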
Timestamp("2014-1-1"), + Timestamp("2014-1-2"), + Timestamp("2014-1-2"), + Timestamp("2014-1-4"), + ], + dtype="M8[ns]", + ) expected = DataFrame( {"f": [1.1, 2.1, 2.1, 4.5], "d": dates, "i": [1, 2, 2, 4]}, columns=["f", "i", "d"], @@ -132,6 +140,8 @@ expected = expected[["f", "i"]] tm.assert_frame_equal(result, expected) + +def test_transform_fast3(): # dup columns df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["g", "a", "a"]) result = df.groupby("g").transform("first") @@ -184,8 +194,13 @@ msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): gb = df.groupby([0, 0, 1], axis=1) - result = gb.transform(transformation_func, *args) - expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T + warn = FutureWarning if transformation_func == "fillna" else None + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = gb.transform(transformation_func, *args) + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T if transformation_func in ["diff", "shift"]: # Result contains nans, so transpose coerces to float @@ -203,7 +218,7 @@ "nth", ): marker = pytest.mark.xfail(reason="transform incorrectly fails - GH#45986") - request.node.add_marker(marker) + request.applymarker(marker) df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) msg = "DataFrame.groupby with axis=1 is deprecated" @@ -337,22 +352,28 @@ def test_transform_casting(): # 13046 - data = """ - idx A ID3 DATETIME - 0 B-028 b76cd912ff "2014-10-08 13:43:27" - 1 B-054 4a57ed0b02 "2014-10-08 14:26:19" - 2 B-076 1a682034f8 "2014-10-08 14:29:01" - 3 B-023 b76cd912ff "2014-10-08 18:39:34" - 4 B-023 f88g8d7sds "2014-10-08 18:40:18" - 5 B-033 b76cd912ff "2014-10-08 18:44:30" - 6 B-032 b76cd912ff "2014-10-08 18:46:00" - 7 B-037 b76cd912ff "2014-10-08 18:52:15" - 8 B-046 db959faf02 "2014-10-08 18:59:59" - 9 B-053 b76cd912ff "2014-10-08 19:17:48" - 10 B-065 b76cd912ff "2014-10-08 19:21:38" - """ - df = pd.read_csv( - StringIO(data), sep=r"\s+", index_col=[0], parse_dates=["DATETIME"] + times = [ + "13:43:27", + "14:26:19", + "14:29:01", + "18:39:34", + "18:40:18", + "18:44:30", + "18:46:00", + "18:52:15", + "18:59:59", + "19:17:48", + "19:21:38", + ] + df = DataFrame( + { + "A": [f"B-{i}" for i in range(11)], + "ID3": np.take( + ["a", "b", "c", "d", "e"], [0, 1, 2, 1, 3, 1, 1, 1, 4, 1, 1] + ), + "DATETIME": pd.to_datetime([f"2014-10-08 {time}" for time in times]), + }, + index=pd.RangeIndex(11, name="idx"), ) result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff()) @@ -377,7 +398,7 @@ grouped = df.groupby(lambda x: x.month) - msg = "DataFrameGroupBy.fillna with 'method' is deprecated" + msg = "DataFrameGroupBy.fillna is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): filled = grouped.fillna(method="pad") msg = "Series.fillna with 'method' is deprecated" @@ -395,10 +416,13 @@ "cost": (100, 200, 300, 400, 500, 600), } ) - with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"): - df.groupby(["price"]).transform("fillna") - with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"): - df.groupby(["price"]).fillna() + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"): + 
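# [Editor's illustrative sketch -- not part of the patch] The new FutureWarning
# checks reflect that DataFrameGroupBy.fillna is deprecated in pandas 2.2;
# grouped forward/backward filling moves to GroupBy.ffill()/bfill().
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2], "val": [1.0, np.nan, np.nan]})
filled = df.groupby("key").ffill()            # preferred
# df.groupby("key").fillna(method="pad")      # emits FutureWarning in 2.2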
df.groupby(["price"]).transform("fillna") + with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(ValueError, match="Must specify a fill 'value' or 'method'"): + df.groupby(["price"]).fillna() def test_transform_transformation_func(transformation_func): @@ -429,23 +453,30 @@ test_op = lambda x: x.transform(transformation_func) mock_op = lambda x: getattr(x, transformation_func)() - msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated" - groupby_msg = ( - "The default fill_method='ffill' in DataFrameGroupBy.pct_change is deprecated" - ) if transformation_func == "pct_change": - with tm.assert_produces_warning(FutureWarning, match=groupby_msg): - result = test_op(df.groupby("A")) + msg = "The default fill_method='pad' in DataFrame.pct_change is deprecated" + groupby_msg = ( + "The default fill_method='ffill' in DataFrameGroupBy.pct_change " + "is deprecated" + ) + warn = FutureWarning + groupby_warn = FutureWarning + elif transformation_func == "fillna": + msg = "" + groupby_msg = "DataFrameGroupBy.fillna is deprecated" + warn = None + groupby_warn = FutureWarning else: + msg = groupby_msg = "" + warn = groupby_warn = None + + with tm.assert_produces_warning(groupby_warn, match=groupby_msg): result = test_op(df.groupby("A")) # pass the group in same order as iterating `for ... in df.groupby(...)` # but reorder to match df's index since this is a transform groups = [df[["B"]].iloc[4:6], df[["B"]].iloc[6:], df[["B"]].iloc[:4]] - if transformation_func == "pct_change": - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = concat([mock_op(g) for g in groups]).sort_index() - else: + with tm.assert_produces_warning(warn, match=msg): expected = concat([mock_op(g) for g in groups]).sort_index() # sort_index does not preserve the freq expected = expected.set_axis(df.index) @@ -509,7 +540,7 @@ Timestamp("2014-1-2"), Timestamp("2014-1-4"), ] - expected = Series(dates, name="d") + expected = Series(dates, name="d", dtype="M8[ns]") tm.assert_series_equal(result, expected) @@ -636,7 +667,9 @@ return group[:1] grouped = df.groupby("c") - result = grouped.apply(f) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = grouped.apply(f) assert result["d"].dtype == np.float64 @@ -790,7 +823,13 @@ f = gb[["float", "float_missing"]].apply(targop) expected = concat([f, i], axis=1) else: - expected = gb.apply(targop) + if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): + warn = None + else: + warn = DeprecationWarning + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(warn, match=msg): + expected = gb.apply(targop) expected = expected.sort_index(axis=1) if op == "shift": @@ -1173,7 +1212,9 @@ result = stocks.groupby(stocks["week_id"])["price"].transform(func) - expected = Series(data=pd.to_datetime(values), index=dates, name="price") + expected = Series( + data=pd.to_datetime(values).as_unit("ns"), index=dates, name="price" + ) tm.assert_series_equal(result, expected) @@ -1312,7 +1353,7 @@ # Check that the fastpath raises, see _transform_general obj = gb._obj_with_exclusions - gen = gb.grouper.get_iterator(obj, axis=gb.axis) + gen = gb._grouper.get_iterator(obj, axis=gb.axis) fast_path, slow_path = gb._define_paths(func) _, group = next(gen) @@ -1447,7 +1488,7 @@ # GH 17093 if reduction_func == "corrwith": msg = "incorrectly raises" - request.node.add_marker(pytest.mark.xfail(reason=msg)) + 
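# [Editor's illustrative sketch -- not part of the patch] The pct_change
# warnings come from the deprecated implicit ffill; passing fill_method=None
# (and filling NAs explicitly beforehand if desired) is, as far as I can tell,
# the forward-compatible spelling.
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": [1, 1, 1], "val": [1.0, np.nan, 3.0]})
res = df.groupby("key")["val"].pct_change(fill_method=None)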
request.applymarker(pytest.mark.xfail(reason=msg)) index = [1, 2, 3, 4] # test transform preserves non-standard index df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index) @@ -1511,11 +1552,19 @@ # ngroup/cumcount always returns a Series as it counts the groups, not values expected = expected["B"].rename(None) - msg = "The default fill_method='ffill' in DataFrameGroupBy.pct_change is deprecated" if transformation_func == "pct_change" and not dropna: - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.transform("pct_change", *args) + warn = FutureWarning + msg = ( + "The default fill_method='ffill' in DataFrameGroupBy.pct_change " + "is deprecated" + ) + elif transformation_func == "fillna": + warn = FutureWarning + msg = "DataFrameGroupBy.fillna is deprecated" else: + warn = None + msg = "" + with tm.assert_produces_warning(warn, match=msg): result = gb.transform(transformation_func, *args) tm.assert_equal(result, expected) @@ -1583,7 +1632,9 @@ buffer.append(Series([np.nan], index=[3], dtype=dtype)) expected = concat(buffer) - with tm.assert_produces_warning(None): + warn = FutureWarning if transformation_func == "fillna" else None + msg = "SeriesGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): result = gb.transform(transformation_func, *args) tm.assert_equal(result, expected) @@ -1626,6 +1677,26 @@ args = get_groupby_method_args(groupby_func, df) gb_as_index_true = df.groupby(keys, as_index=True) gb_as_index_false = df.groupby(keys, as_index=False) - result = gb_as_index_true.transform(groupby_func, *args) - expected = gb_as_index_false.transform(groupby_func, *args) + warn = FutureWarning if groupby_func == "fillna" else None + msg = "DataFrameGroupBy.fillna is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = gb_as_index_true.transform(groupby_func, *args) + with tm.assert_produces_warning(warn, match=msg): + expected = gb_as_index_false.transform(groupby_func, *args) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmax", "idxmin"]) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_idxmin_idxmax_transform_args(how, skipna, numeric_only): + # GH#55268 - ensure *args are passed through when calling transform + df = DataFrame({"a": [1, 1, 1, 2], "b": [3.0, 4.0, np.nan, 6.0], "c": list("abcd")}) + gb = df.groupby("a") + msg = f"'axis' keyword in DataFrameGroupBy.{how} is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = gb.transform(how, 0, skipna, numeric_only) + warn = None if skipna else FutureWarning + msg = f"The behavior of DataFrameGroupBy.{how} with .* any-NA and skipna=False" + with tm.assert_produces_warning(warn, match=msg): + expected = gb.transform(how, skipna=skipna, numeric_only=numeric_only) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/base_class/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/indexes/base_class/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/base_class/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/base_class/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,7 @@ from pandas import ( Index, MultiIndex, + Series, ) import pandas._testing as tm @@ -57,3 +58,23 @@ with pd.option_context("future.infer_string", True): ser = Index(["a", 1]) tm.assert_index_equal(ser, expected) + + def test_inference_on_pandas_objects(self): + # GH#56012 + idx = 
Index([pd.Timestamp("2019-12-31")], dtype=object) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = Index(idx) + assert result.dtype != np.object_ + + ser = Series([pd.Timestamp("2019-12-31")], dtype=object) + + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = Index(ser) + assert result.dtype != np.object_ + + def test_constructor_not_read_only(self): + # GH#57130 + ser = Series([1, 2], dtype=object) + with pd.option_context("mode.copy_on_write", True): + idx = Index(ser) + assert idx._values.flags.writeable diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/base_class/test_formats.py pandas-2.2.2+dfsg/pandas/tests/indexes/base_class/test_formats.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/base_class/test_formats.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/base_class/test_formats.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,12 +1,22 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype import pandas._config.config as cf from pandas import Index +import pandas._testing as tm class TestIndexRendering: + def test_repr_is_valid_construction_code(self): + # for the case of Index, where the repr is traditional rather than + # stylized + idx = Index(["a", "b"]) + res = eval(repr(idx)) + tm.assert_index_equal(res, idx) + + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -71,6 +81,7 @@ result = repr(index) assert result == expected + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -133,7 +144,9 @@ def test_index_repr_bool_nan(self): # GH32146 arr = Index([True, False, np.nan], dtype=object) - exp1 = arr.format() + msg = "Index.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + exp1 = arr.format() out1 = ["True", "False", "NaN"] assert out1 == exp1 @@ -145,4 +158,6 @@ # GH#35439 idx = Index(["aaaaaaaaa", "b"]) expected = ["aaaaaaaaa", "b"] - assert idx.format() == expected + msg = r"Index\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert idx.format() == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/base_class/test_reshape.py pandas-2.2.2+dfsg/pandas/tests/indexes/base_class/test_reshape.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/base_class/test_reshape.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/base_class/test_reshape.py 2024-04-10 17:42:52.000000000 +0000 @@ -33,13 +33,15 @@ # test empty null_index = Index([]) - tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a")) + tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a")) - def test_insert_missing(self, nulls_fixture): + def test_insert_missing(self, nulls_fixture, using_infer_string): # GH#22295 # test there is no mangling of NA values - expected = Index(["a", nulls_fixture, "b", "c"]) - result = Index(list("abc")).insert(1, nulls_fixture) + expected = Index(["a", nulls_fixture, "b", "c"], dtype=object) + result = Index(list("abc"), dtype=object).insert( + 1, Index([nulls_fixture], dtype=object) + ) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/base_class/test_setops.py pandas-2.2.2+dfsg/pandas/tests/indexes/base_class/test_setops.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/base_class/test_setops.py 2023-12-08 14:17:35.000000000 
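# [Editor's illustrative sketch -- not part of the patch] Index.format() is
# deprecated in pandas 2.2 (hence the new assert_produces_warning wrappers);
# an explicit conversion produces the same list of strings here.
import pandas as pd

idx = pd.Index(["aaaaaaaaa", "b"])
formatted = idx.astype(str).tolist()          # replacement for idx.format()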
+0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/base_class/test_setops.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,6 +12,13 @@ from pandas.core.algorithms import safe_sort +def equal_contents(arr1, arr2) -> bool: + """ + Checks if the set of unique elements of arr1 and arr2 are equivalent. + """ + return frozenset(arr1) == frozenset(arr2) + + class TestIndexSetOps: @pytest.mark.parametrize( "method", ["union", "intersection", "difference", "symmetric_difference"] @@ -71,7 +78,7 @@ result = first.union(klass(second.values)) - assert tm.equalContents(result, index) + assert equal_contents(result, index) def test_union_sort_other_incomparable(self): # https://github.com/pandas-dev/pandas/issues/24959 @@ -119,7 +126,7 @@ second = index[:3] result = first.intersection(klass(second.values), sort=sort) - assert tm.equalContents(result, second) + assert equal_contents(result, second) def test_intersection_nosort(self): result = Index(["c", "b", "a"]).intersection(["b", "a"]) @@ -147,7 +154,7 @@ def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique index1 = Index(["A", "B", "A", "C"]) - expected = Index(expected_arr, dtype="object") + expected = Index(expected_arr) result = index1.intersection(index2, sort=sort) if sort is None: expected = expected.sort_values() @@ -244,7 +251,7 @@ tm.assert_index_equal(union, expected) else: expected = Index(vals, name=expected_name) - tm.equalContents(union, expected) + tm.assert_index_equal(union.sort_values(), expected.sort_values()) @pytest.mark.parametrize( "diff_type, expected", diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/categorical/test_astype.py pandas-2.2.2+dfsg/pandas/tests/indexes/categorical/test_astype.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/categorical/test_astype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/categorical/test_astype.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,7 +18,7 @@ ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) result = ci.astype(object) - tm.assert_index_equal(result, Index(np.array(ci))) + tm.assert_index_equal(result, Index(np.array(ci), dtype=object)) # this IS equal, but not the same class assert result.equals(ci) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/categorical/test_category.py pandas-2.2.2+dfsg/pandas/tests/indexes/categorical/test_category.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/categorical/test_category.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/categorical/test_category.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import index as libindex from pandas._libs.arrays import NDArrayBacked @@ -47,7 +49,7 @@ # invalid -> cast to object expected = ci.astype(object).insert(0, "d") - result = ci.insert(0, "d") + result = ci.insert(0, "d").astype(object) tm.assert_index_equal(result, expected, exact=True) # GH 18295 (test missing) @@ -194,6 +196,7 @@ expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) @@ -228,6 +231,13 @@ expected = np.array([False] * 5 + [True]) tm.assert_numpy_array_equal(result, expected) + def test_isin_overlapping_intervals(self): + # GH 34974 + idx = 
pd.IntervalIndex([pd.Interval(0, 2), pd.Interval(0, 1)]) + result = CategoricalIndex(idx).isin(idx) + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) + def test_identical(self): ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) @@ -241,7 +251,7 @@ # # Must be tested separately from other indexes because # self.values is not an ndarray. - index = tm.makeCategoricalIndex(10) + index = CategoricalIndex(list("ab") * 5) result = CategoricalIndex(index.values, copy=True) tm.assert_index_equal(index, result) @@ -250,17 +260,11 @@ result = CategoricalIndex(index.values, copy=False) assert result._data._codes is index._data._codes - def test_frame_repr(self): - df = pd.DataFrame({"A": [1, 2, 3]}, index=CategoricalIndex(["a", "b", "c"])) - result = repr(df) - expected = " A\na 1\nb 2\nc 3" - assert result == expected - class TestCategoricalIndex2: def test_view_i8(self): # GH#25464 - ci = tm.makeCategoricalIndex(100) + ci = CategoricalIndex(list("ab") * 50) msg = "When changing to a larger dtype, its size must be a divisor" with pytest.raises(ValueError, match=msg): ci.view("i8") diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/categorical/test_formats.py pandas-2.2.2+dfsg/pandas/tests/indexes/categorical/test_formats.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/categorical/test_formats.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/categorical/test_formats.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,9 +1,13 @@ """ Tests for CategoricalIndex.__repr__ and related methods. """ +import pytest + +from pandas._config import using_pyarrow_string_dtype import pandas._config.config as cf from pandas import CategoricalIndex +import pandas._testing as tm class TestCategoricalIndexRepr: @@ -11,8 +15,11 @@ # GH#35439 idx = CategoricalIndex(["aaaaaaaaa", "b"]) expected = ["aaaaaaaaa", "b"] - assert idx.format() == expected + msg = r"CategoricalIndex\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert idx.format() == expected + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/categorical/test_reindex.py pandas-2.2.2+dfsg/pandas/tests/indexes/categorical/test_reindex.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/categorical/test_reindex.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/categorical/test_reindex.py 2024-04-10 17:42:52.000000000 +0000 @@ -40,7 +40,7 @@ # See GH25459 cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) res, indexer = cat.reindex(["a", "c", "c"]) - exp = Index(["a", "c", "c"], dtype="object") + exp = Index(["a", "c", "c"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/categorical/test_setops.py pandas-2.2.2+dfsg/pandas/tests/indexes/categorical/test_setops.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/categorical/test_setops.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/categorical/test_setops.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,18 @@ +import numpy as np +import pytest + +from pandas import ( + CategoricalIndex, + Index, +) +import pandas._testing as tm + + 
+@pytest.mark.parametrize("na_value", [None, np.nan]) +def test_difference_with_na(na_value): + # GH 57318 + ci = CategoricalIndex(["a", "b", "c", None]) + other = Index(["c", na_value]) + result = ci.difference(other) + expected = CategoricalIndex(["a", "b"], categories=["a", "b", "c"]) + tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/conftest.py pandas-2.2.2+dfsg/pandas/tests/indexes/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,7 +5,6 @@ Series, array, ) -import pandas._testing as tm @pytest.fixture(params=[None, False]) @@ -25,7 +24,7 @@ return request.param -@pytest.fixture(params=["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"]) +@pytest.fixture(params=["D", "3D", "-3D", "h", "2h", "-2h", "min", "2min", "s", "-3s"]) def freq_sample(request): """ Valid values for 'freq' parameter used to create date_range and @@ -40,22 +39,3 @@ Types that may be passed as the indexer to searchsorted. """ return request.param - - -@pytest.fixture( - params=tm.ALL_REAL_NUMPY_DTYPES - + [ - "object", - "category", - "datetime64[ns]", - "timedelta64[ns]", - ] -) -def any_dtype_for_small_pos_integer_indexes(request): - """ - Dtypes that can be given to an Index with small positive integers. - - This means that for any dtype `x` in the params list, `Index([1, 2, 3], dtype=x)` is - valid and gives the correct Index (sub-)class. - """ - return request.param diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimelike_/test_drop_duplicates.py 2024-04-10 17:42:52.000000000 +0000 @@ -68,7 +68,7 @@ class TestDropDuplicatesPeriodIndex(DropDuplicates): - @pytest.fixture(params=["D", "3D", "H", "2H", "T", "2T", "S", "3S"]) + @pytest.fixture(params=["D", "3D", "h", "2h", "min", "2min", "s", "3s"]) def freq(self, request): return request.param diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimelike_/test_equals.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimelike_/test_equals.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimelike_/test_equals.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimelike_/test_equals.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,6 +18,7 @@ TimedeltaIndex, date_range, period_range, + timedelta_range, ) import pandas._testing as tm @@ -65,7 +66,7 @@ assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) - idx2 = PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq="H") + idx2 = PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq="h") assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) assert not idx.equals(idx2.astype(object)) @@ -75,7 +76,7 @@ # same internal, different tz idx3 = PeriodIndex._simple_new( - idx._values._simple_new(idx._values.asi8, dtype=pd.PeriodDtype("H")) + idx._values._simple_new(idx._values.asi8, dtype=pd.PeriodDtype("h")) ) tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) assert not idx.equals(idx3) @@ -141,7 +142,7 @@ class TestTimedeltaIndexEquals(EqualsTests): @pytest.fixture def index(self): - return tm.makeTimedeltaIndex(10) + return timedelta_range("1 day", periods=10) def test_equals2(self): # GH#13107 diff 
-Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimelike_/test_is_monotonic.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimelike_/test_is_monotonic.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimelike_/test_is_monotonic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimelike_/test_is_monotonic.py 2024-04-10 17:42:52.000000000 +0000 @@ -34,7 +34,7 @@ assert obj.is_unique dti2 = dti.insert(3, NaT) - pi2 = dti2.to_period("H") + pi2 = dti2.to_period("h") tdi2 = Index(dti2.view("timedelta64[ns]")) for obj in [pi2, pi2._engine, dti2, dti2._engine, tdi2, tdi2._engine]: diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimelike_/test_sort_values.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimelike_/test_sort_values.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimelike_/test_sort_values.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimelike_/test_sort_values.py 2024-04-10 17:42:52.000000000 +0000 @@ -92,7 +92,7 @@ tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0], dtype=np.intp)) check_freq_ascending(ordered, idx, False) - @pytest.mark.parametrize("freq", ["D", "H"]) + @pytest.mark.parametrize("freq", ["D", "h"]) def test_sort_values_with_freq_timedeltaindex(self, freq): # GH#10295 idx = timedelta_range(start=f"1{freq}", periods=3, freq=freq).rename("idx") @@ -107,7 +107,7 @@ ), DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", + freq="h", name="tzidx", tz="Asia/Tokyo", ), @@ -127,7 +127,7 @@ @pytest.mark.parametrize( "idx", [ - PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="A"), + PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="Y"), Index([2011, 2012, 2013], name="idx"), # for compatibility check ], ) @@ -275,10 +275,10 @@ ), ( PeriodIndex( - ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A" + ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="Y" ), PeriodIndex( - ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="A" + ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="Y" ), ), ( @@ -308,7 +308,7 @@ def test_order_stability_compat(): # GH#35922. 
sort_values is stable both for normal and datetime-like Index - pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A") + pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="Y") iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False) ordered2, indexer2 = iidx.sort_values(return_indexer=True, ascending=False) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimelike_/test_value_counts.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimelike_/test_value_counts.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimelike_/test_value_counts.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimelike_/test_value_counts.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,15 +18,15 @@ def test_value_counts_unique_datetimeindex(self, tz_naive_fixture): tz = tz_naive_fixture - orig = date_range("2011-01-01 09:00", freq="H", periods=10, tz=tz) + orig = date_range("2011-01-01 09:00", freq="h", periods=10, tz=tz) self._check_value_counts_with_repeats(orig) def test_value_counts_unique_timedeltaindex(self): - orig = timedelta_range("1 days 09:00:00", freq="H", periods=10) + orig = timedelta_range("1 days 09:00:00", freq="h", periods=10) self._check_value_counts_with_repeats(orig) def test_value_counts_unique_periodindex(self): - orig = period_range("2011-01-01 09:00", freq="H", periods=10) + orig = period_range("2011-01-01 09:00", freq="h", periods=10) self._check_value_counts_with_repeats(orig) def _check_value_counts_with_repeats(self, orig): @@ -83,7 +83,7 @@ "2013-01-01 08:00", NaT, ], - freq="H", + freq="h", ) self._check_value_counts_dropna(idx) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_asof.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_asof.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_asof.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_asof.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,30 @@ +from datetime import timedelta + +from pandas import ( + Index, + Timestamp, + date_range, + isna, +) + + +class TestAsOf: + def test_asof_partial(self): + index = date_range("2010-01-01", periods=2, freq="ME") + expected = Timestamp("2010-02-28") + result = index.asof("2010-02") + assert result == expected + assert not isinstance(result, Index) + + def test_asof(self): + index = date_range("2020-01-01", periods=10) + + dt = index[0] + assert index.asof(dt) == dt + assert isna(index.asof(dt - timedelta(1))) + + dt = index[-1] + assert index.asof(dt + timedelta(1)) == dt + + dt = index[0].to_pydatetime() + assert isinstance(index.asof(dt), Timestamp) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_astype.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_astype.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_astype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_astype.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,9 +18,30 @@ class TestDatetimeIndex: + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_astype_asobject_around_dst_transition(self, tzstr): + # GH#1345 + + # dates around a dst transition + rng = date_range("2/13/2010", "5/6/2010", tz=tzstr) + + objs = rng.astype(object) + for i, x in enumerate(objs): + exval = rng[i] + assert x == exval + assert x.tzinfo 
== exval.tzinfo + + objs = rng.astype(object) + for i, x in enumerate(objs): + exval = rng[i] + assert x == exval + assert x.tzinfo == exval.tzinfo + def test_astype(self): # GH 13149, GH 13209 - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan], name="idx") + idx = DatetimeIndex( + ["2016-05-16", "NaT", NaT, np.nan], dtype="M8[ns]", name="idx" + ) result = idx.astype(object) expected = Index( @@ -36,6 +57,7 @@ ) tm.assert_index_equal(result, expected) + def test_astype2(self): rng = date_range("1/1/2000", periods=10, name="idx") result = rng.astype("i8") tm.assert_index_equal(result, Index(rng.asi8, name="idx")) @@ -117,7 +139,7 @@ def test_astype_str_freq_and_name(self): # test astype string with freqH and name - dti = date_range("1/1/2011", periods=3, freq="H", name="test_name") + dti = date_range("1/1/2011", periods=3, freq="h", name="test_name") result = dti.astype(str) expected = Index( ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], @@ -129,7 +151,7 @@ def test_astype_str_freq_and_tz(self): # test astype string with freqH and timezone dti = date_range( - "3/6/2012 00:00", periods=2, freq="H", tz="Europe/London", name="test_name" + "3/6/2012 00:00", periods=2, freq="h", tz="Europe/London", name="test_name" ) result = dti.astype(str) expected = Index( @@ -141,7 +163,9 @@ def test_astype_datetime64(self): # GH 13149, GH 13209 - idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan], name="idx") + idx = DatetimeIndex( + ["2016-05-16", "NaT", NaT, np.nan], dtype="M8[ns]", name="idx" + ) result = idx.astype("datetime64[ns]") tm.assert_index_equal(result, idx) @@ -168,7 +192,7 @@ @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) def test_astype_object_tz(self, tz): - idx = date_range(start="2013-01-01", periods=4, freq="M", name="idx", tz=tz) + idx = date_range(start="2013-01-01", periods=4, freq="ME", name="idx", tz=tz) expected_list = [ Timestamp("2013-01-31", tz=tz), Timestamp("2013-02-28", tz=tz), @@ -268,7 +292,7 @@ # GH 20997, 20964, 24559 val = [Timestamp("2018-01-01", tz=tz).as_unit("ns")._value] result = Index(val, name="idx").astype(dtype) - expected = DatetimeIndex(["2018-01-01"], tz=tz, name="idx") + expected = DatetimeIndex(["2018-01-01"], tz=tz, name="idx").as_unit("ns") tm.assert_index_equal(result, expected) def test_dti_astype_period(self): @@ -288,8 +312,9 @@ def test_astype_category(self, tz): obj = date_range("2000", periods=2, tz=tz, name="idx") result = obj.astype("category") + dti = DatetimeIndex(["2000-01-01", "2000-01-02"], tz=tz).as_unit("ns") expected = pd.CategoricalIndex( - [Timestamp("2000-01-01", tz=tz), Timestamp("2000-01-02", tz=tz)], + dti, name="idx", ) tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_delete.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_delete.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_delete.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_delete.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,141 @@ +import pytest + +from pandas import ( + DatetimeIndex, + Series, + date_range, +) +import pandas._testing as tm + + +class TestDelete: + def test_delete(self, unit): + idx = date_range( + start="2000-01-01", periods=5, freq="ME", name="idx", unit=unit + ) + + # preserve freq + expected_0 = date_range( + start="2000-02-01", periods=4, freq="ME", name="idx", unit=unit + ) + expected_4 = date_range( + start="2000-01-01", periods=4, freq="ME", 
name="idx", unit=unit + ) + + # reset freq to None + expected_1 = DatetimeIndex( + ["2000-01-31", "2000-03-31", "2000-04-30", "2000-05-31"], + freq=None, + name="idx", + ).as_unit(unit) + + cases = { + 0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + with pytest.raises((IndexError, ValueError), match="out of bounds"): + # either depending on numpy version + idx.delete(5) + + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo", "US/Pacific"]) + def test_delete2(self, tz): + idx = date_range( + start="2000-01-01 09:00", periods=10, freq="h", name="idx", tz=tz + ) + + expected = date_range( + start="2000-01-01 10:00", periods=9, freq="h", name="idx", tz=tz + ) + result = idx.delete(0) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freqstr == "h" + assert result.tz == expected.tz + + expected = date_range( + start="2000-01-01 09:00", periods=9, freq="h", name="idx", tz=tz + ) + result = idx.delete(-1) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freqstr == "h" + assert result.tz == expected.tz + + def test_delete_slice(self, unit): + idx = date_range( + start="2000-01-01", periods=10, freq="D", name="idx", unit=unit + ) + + # preserve freq + expected_0_2 = date_range( + start="2000-01-04", periods=7, freq="D", name="idx", unit=unit + ) + expected_7_9 = date_range( + start="2000-01-01", periods=7, freq="D", name="idx", unit=unit + ) + + # reset freq to None + expected_3_5 = DatetimeIndex( + [ + "2000-01-01", + "2000-01-02", + "2000-01-03", + "2000-01-07", + "2000-01-08", + "2000-01-09", + "2000-01-10", + ], + freq=None, + name="idx", + ).as_unit(unit) + + cases = { + (0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5, + } + for n, expected in cases.items(): + result = idx.delete(n) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + result = idx.delete(slice(n[0], n[-1] + 1)) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + + # TODO: belongs in Series.drop tests? 
+ @pytest.mark.parametrize("tz", [None, "Asia/Tokyo", "US/Pacific"]) + def test_delete_slice2(self, tz, unit): + dti = date_range( + "2000-01-01 09:00", periods=10, freq="h", name="idx", tz=tz, unit=unit + ) + ts = Series( + 1, + index=dti, + ) + # preserve freq + result = ts.drop(ts.index[:5]).index + expected = dti[5:] + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz + + # reset freq to None + result = ts.drop(ts.index[[1, 3, 5, 7, 9]]).index + expected = dti[::2]._with_freq(None) + tm.assert_index_equal(result, expected) + assert result.name == expected.name + assert result.freq == expected.freq + assert result.tz == expected.tz diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_factorize.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_factorize.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_factorize.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_factorize.py 2024-04-10 17:42:52.000000000 +0000 @@ -58,7 +58,7 @@ def test_factorize_preserves_freq(self): # GH#38120 freq should be preserved - idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo") + idx3 = date_range("2000-01", periods=4, freq="ME", tz="Asia/Tokyo") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() @@ -74,7 +74,7 @@ def test_factorize_tz(self, tz_naive_fixture, index_or_series): tz = tz_naive_fixture # GH#13750 - base = date_range("2016-11-05", freq="H", periods=100, tz=tz) + base = date_range("2016-11-05", freq="h", periods=100, tz=tz) idx = base.repeat(5) exp_arr = np.arange(100, dtype=np.intp).repeat(5) @@ -89,7 +89,7 @@ def test_factorize_dst(self, index_or_series): # GH#13750 - idx = date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern") + idx = date_range("2016-11-06", freq="h", periods=12, tz="US/Eastern") obj = index_or_series(idx) arr, res = obj.factorize() @@ -98,7 +98,7 @@ if index_or_series is Index: assert res.freq == idx.freq - idx = date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern") + idx = date_range("2016-06-13", freq="h", periods=12, tz="US/Eastern") obj = index_or_series(idx) arr, res = obj.factorize() @@ -112,7 +112,7 @@ # GH#51978 case that does not go through the fastpath based on # non-None freq tz = tz_naive_fixture - idx = date_range("2016-11-06", freq="H", periods=5, tz=tz)[[0, 4, 1, 3, 2]] + idx = date_range("2016-11-06", freq="h", periods=5, tz=tz)[[0, 4, 1, 3, 2]] exp_codes, exp_uniques = idx.factorize(sort=sort) res_codes, res_uniques = idx.as_unit("s").factorize(sort=sort) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_insert.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_insert.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_insert.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_insert.py 2024-04-10 17:42:52.000000000 +0000 @@ -52,13 +52,15 @@ result = dti.insert(0, item) assert result.freq is None - def test_insert(self): - idx = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"], name="idx") + def test_insert(self, unit): + idx = DatetimeIndex( + ["2000-01-04", "2000-01-01", "2000-01-02"], name="idx" + ).as_unit(unit) result = idx.insert(2, datetime(2000, 1, 5)) exp = DatetimeIndex( ["2000-01-04", "2000-01-01", "2000-01-05", "2000-01-02"], name="idx" - ) + ).as_unit(unit) 
tm.assert_index_equal(result, exp) # insertion of non-datetime should coerce to object index @@ -76,31 +78,32 @@ tm.assert_index_equal(result, expected) assert result.name == expected.name - idx = date_range("1/1/2000", periods=3, freq="M", name="idx") + def test_insert2(self, unit): + idx = date_range("1/1/2000", periods=3, freq="ME", name="idx", unit=unit) # preserve freq expected_0 = DatetimeIndex( ["1999-12-31", "2000-01-31", "2000-02-29", "2000-03-31"], name="idx", - freq="M", - ) + freq="ME", + ).as_unit(unit) expected_3 = DatetimeIndex( ["2000-01-31", "2000-02-29", "2000-03-31", "2000-04-30"], name="idx", - freq="M", - ) + freq="ME", + ).as_unit(unit) # reset freq to None expected_1_nofreq = DatetimeIndex( ["2000-01-31", "2000-01-31", "2000-02-29", "2000-03-31"], name="idx", freq=None, - ) + ).as_unit(unit) expected_3_nofreq = DatetimeIndex( ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], name="idx", freq=None, - ) + ).as_unit(unit) cases = [ (0, datetime(1999, 12, 31), expected_0), @@ -116,22 +119,28 @@ assert result.name == expected.name assert result.freq == expected.freq + def test_insert3(self, unit): + idx = date_range("1/1/2000", periods=3, freq="ME", name="idx", unit=unit) + # reset freq to None result = idx.insert(3, datetime(2000, 1, 2)) expected = DatetimeIndex( ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], name="idx", freq=None, - ) + ).as_unit(unit) tm.assert_index_equal(result, expected) assert result.name == expected.name assert result.freq is None + def test_insert4(self, unit): for tz in ["US/Pacific", "Asia/Singapore"]: - idx = date_range("1/1/2000 09:00", periods=6, freq="H", tz=tz, name="idx") + idx = date_range( + "1/1/2000 09:00", periods=6, freq="h", tz=tz, name="idx", unit=unit + ) # preserve freq expected = date_range( - "1/1/2000 09:00", periods=7, freq="H", tz=tz, name="idx" + "1/1/2000 09:00", periods=7, freq="h", tz=tz, name="idx", unit=unit ) for d in [ Timestamp("2000-01-01 15:00", tz=tz), @@ -156,7 +165,7 @@ name="idx", tz=tz, freq=None, - ) + ).as_unit(unit) # reset freq to None for d in [ Timestamp("2000-01-01 10:00", tz=tz), diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_isocalendar.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_isocalendar.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_isocalendar.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_isocalendar.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,7 @@ from pandas import ( DataFrame, DatetimeIndex, + date_range, ) import pandas._testing as tm @@ -18,3 +19,10 @@ dtype="UInt32", ) tm.assert_frame_equal(result, expected_data_frame) + + +def test_dti_timestamp_isocalendar_fields(): + idx = date_range("2020-01-01", periods=10) + expected = tuple(idx.isocalendar().iloc[-1].to_list()) + result = idx[-1].isocalendar() + assert result == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_map.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_map.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_map.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_map.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,47 @@ +import pytest + +from pandas import ( + DatetimeIndex, + Index, + MultiIndex, + Period, + date_range, +) +import pandas._testing as tm + + +class TestMap: + def test_map(self): + rng = date_range("1/1/2000", periods=10) + + f = lambda x: 
x.strftime("%Y%m%d") + result = rng.map(f) + exp = Index([f(x) for x in rng]) + tm.assert_index_equal(result, exp) + + def test_map_fallthrough(self, capsys): + # GH#22067, check we don't get warnings about silently ignored errors + dti = date_range("2017-01-01", "2018-01-01", freq="B") + + dti.map(lambda x: Period(year=x.year, month=x.month, freq="M")) + + captured = capsys.readouterr() + assert captured.err == "" + + def test_map_bug_1677(self): + index = DatetimeIndex(["2012-04-25 09:30:00.393000"]) + f = index.asof + + result = index.map(f) + expected = Index([f(index[0])]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("name", [None, "name"]) + def test_index_map(self, name): + # see GH#20990 + count = 6 + index = date_range("2018-01-01", periods=count, freq="ME", name=name).map( + lambda x: (x.year, x.month) + ) + exp_index = MultiIndex.from_product(((2018,), range(1, 7)), names=[name, name]) + tm.assert_index_equal(index, exp_index) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_normalize.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_normalize.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_normalize.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_normalize.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,95 @@ +from dateutil.tz import tzlocal +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import ( + DatetimeIndex, + NaT, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestNormalize: + def test_normalize(self): + rng = date_range("1/1/2000 9:30", periods=10, freq="D") + + result = rng.normalize() + expected = date_range("1/1/2000", periods=10, freq="D") + tm.assert_index_equal(result, expected) + + arr_ns = np.array([1380585623454345752, 1380585612343234312]).astype( + "datetime64[ns]" + ) + rng_ns = DatetimeIndex(arr_ns) + rng_ns_normalized = rng_ns.normalize() + + arr_ns = np.array([1380585600000000000, 1380585600000000000]).astype( + "datetime64[ns]" + ) + expected = DatetimeIndex(arr_ns) + tm.assert_index_equal(rng_ns_normalized, expected) + + assert result.is_normalized + assert not rng.is_normalized + + def test_normalize_nat(self): + dti = DatetimeIndex([NaT, Timestamp("2018-01-01 01:00:00")]) + result = dti.normalize() + expected = DatetimeIndex([NaT, Timestamp("2018-01-01")]) + tm.assert_index_equal(result, expected) + + def test_normalize_tz(self): + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz="US/Eastern") + + result = rng.normalize() # does not preserve freq + expected = date_range("1/1/2000", periods=10, freq="D", tz="US/Eastern") + tm.assert_index_equal(result, expected._with_freq(None)) + + assert result.is_normalized + assert not rng.is_normalized + + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz="UTC") + + result = rng.normalize() + expected = date_range("1/1/2000", periods=10, freq="D", tz="UTC") + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized + + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz=tzlocal()) + result = rng.normalize() # does not preserve freq + expected = date_range("1/1/2000", periods=10, freq="D", tz=tzlocal()) + tm.assert_index_equal(result, expected._with_freq(None)) + + assert result.is_normalized + assert not rng.is_normalized + + @td.skip_if_windows + @pytest.mark.parametrize( + "timezone", + [ + "US/Pacific", + "US/Eastern", + "UTC", + 
"Asia/Kolkata", + "Asia/Shanghai", + "Australia/Canberra", + ], + ) + def test_normalize_tz_local(self, timezone): + # GH#13459 + with tm.set_timezone(timezone): + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz=tzlocal()) + + result = rng.normalize() + expected = date_range("1/1/2000", periods=10, freq="D", tz=tzlocal()) + expected = expected._with_freq(None) + tm.assert_index_equal(result, expected) + + assert result.is_normalized + assert not rng.is_normalized diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_repeat.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_repeat.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_repeat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_repeat.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,30 +11,35 @@ class TestRepeat: def test_repeat_range(self, tz_naive_fixture): - tz = tz_naive_fixture rng = date_range("1/1/2000", "1/1/2001") result = rng.repeat(5) assert result.freq is None assert len(result) == 5 * len(rng) - index = date_range("2001-01-01", periods=2, freq="D", tz=tz) + def test_repeat_range2(self, tz_naive_fixture, unit): + tz = tz_naive_fixture + index = date_range("2001-01-01", periods=2, freq="D", tz=tz, unit=unit) exp = DatetimeIndex( ["2001-01-01", "2001-01-01", "2001-01-02", "2001-01-02"], tz=tz - ) + ).as_unit(unit) for res in [index.repeat(2), np.repeat(index, 2)]: tm.assert_index_equal(res, exp) assert res.freq is None - index = date_range("2001-01-01", periods=2, freq="2D", tz=tz) + def test_repeat_range3(self, tz_naive_fixture, unit): + tz = tz_naive_fixture + index = date_range("2001-01-01", periods=2, freq="2D", tz=tz, unit=unit) exp = DatetimeIndex( ["2001-01-01", "2001-01-01", "2001-01-03", "2001-01-03"], tz=tz - ) + ).as_unit(unit) for res in [index.repeat(2), np.repeat(index, 2)]: tm.assert_index_equal(res, exp) assert res.freq is None - index = DatetimeIndex(["2001-01-01", "NaT", "2003-01-01"], tz=tz) + def test_repeat_range4(self, tz_naive_fixture, unit): + tz = tz_naive_fixture + index = DatetimeIndex(["2001-01-01", "NaT", "2003-01-01"], tz=tz).as_unit(unit) exp = DatetimeIndex( [ "2001-01-01", @@ -48,17 +53,17 @@ "2003-01-01", ], tz=tz, - ) + ).as_unit(unit) for res in [index.repeat(3), np.repeat(index, 3)]: tm.assert_index_equal(res, exp) assert res.freq is None - def test_repeat(self, tz_naive_fixture): + def test_repeat(self, tz_naive_fixture, unit): tz = tz_naive_fixture reps = 2 msg = "the 'axis' parameter is not supported" - rng = date_range(start="2016-01-01", periods=2, freq="30Min", tz=tz) + rng = date_range(start="2016-01-01", periods=2, freq="30Min", tz=tz, unit=unit) expected_rng = DatetimeIndex( [ @@ -67,7 +72,7 @@ Timestamp("2016-01-01 00:30:00", tz=tz), Timestamp("2016-01-01 00:30:00", tz=tz), ] - ) + ).as_unit(unit) res = rng.repeat(reps) tm.assert_index_equal(res, expected_rng) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_resolution.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_resolution.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_resolution.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_resolution.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,31 @@ +from dateutil.tz import tzlocal +import pytest + +from pandas.compat import IS64 + +from pandas import date_range + + +@pytest.mark.parametrize( + "freq,expected", + [ + ("YE", "day"), + ("QE", "day"), + 
("ME", "day"), + ("D", "day"), + ("h", "hour"), + ("min", "minute"), + ("s", "second"), + ("ms", "millisecond"), + ("us", "microsecond"), + ], +) +def test_dti_resolution(request, tz_naive_fixture, freq, expected): + tz = tz_naive_fixture + if freq == "YE" and not IS64 and isinstance(tz, tzlocal): + request.applymarker( + pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038") + ) + + idx = date_range(start="2013-04-01", periods=30, freq=freq, tz=tz) + assert idx.resolution == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_round.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_round.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_round.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_round.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,221 @@ +import pytest + +from pandas._libs.tslibs import to_offset +from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG + +from pandas import ( + DatetimeIndex, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestDatetimeIndexRound: + def test_round_daily(self): + dti = date_range("20130101 09:10:11", periods=5) + result = dti.round("D") + expected = date_range("20130101", periods=5) + tm.assert_index_equal(result, expected) + + dti = dti.tz_localize("UTC").tz_convert("US/Eastern") + result = dti.round("D") + expected = date_range("20130101", periods=5).tz_localize("US/Eastern") + tm.assert_index_equal(result, expected) + + result = dti.round("s") + tm.assert_index_equal(result, dti) + + @pytest.mark.parametrize( + "freq, error_msg", + [ + ("YE", " is a non-fixed frequency"), + ("ME", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ], + ) + def test_round_invalid(self, freq, error_msg): + dti = date_range("20130101 09:10:11", periods=5) + dti = dti.tz_localize("UTC").tz_convert("US/Eastern") + with pytest.raises(ValueError, match=error_msg): + dti.round(freq) + + def test_round(self, tz_naive_fixture, unit): + tz = tz_naive_fixture + rng = date_range(start="2016-01-01", periods=5, freq="30Min", tz=tz, unit=unit) + elt = rng[1] + + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz), + Timestamp("2016-01-01 00:00:00", tz=tz), + Timestamp("2016-01-01 01:00:00", tz=tz), + Timestamp("2016-01-01 02:00:00", tz=tz), + Timestamp("2016-01-01 02:00:00", tz=tz), + ] + ).as_unit(unit) + expected_elt = expected_rng[1] + + result = rng.round(freq="h") + tm.assert_index_equal(result, expected_rng) + assert elt.round(freq="h") == expected_elt + + msg = INVALID_FREQ_ERR_MSG + with pytest.raises(ValueError, match=msg): + rng.round(freq="foo") + with pytest.raises(ValueError, match=msg): + elt.round(freq="foo") + + msg = " is a non-fixed frequency" + with pytest.raises(ValueError, match=msg): + rng.round(freq="ME") + with pytest.raises(ValueError, match=msg): + elt.round(freq="ME") + + def test_round2(self, tz_naive_fixture): + tz = tz_naive_fixture + # GH#14440 & GH#15578 + index = DatetimeIndex(["2016-10-17 12:00:00.0015"], tz=tz).as_unit("ns") + result = index.round("ms") + expected = DatetimeIndex(["2016-10-17 12:00:00.002000"], tz=tz).as_unit("ns") + tm.assert_index_equal(result, expected) + + for freq in ["us", "ns"]: + tm.assert_index_equal(index, index.round(freq)) + + def test_round3(self, tz_naive_fixture): + tz = tz_naive_fixture + index = DatetimeIndex(["2016-10-17 12:00:00.00149"], tz=tz).as_unit("ns") + result = index.round("ms") + expected = 
DatetimeIndex(["2016-10-17 12:00:00.001000"], tz=tz).as_unit("ns") + tm.assert_index_equal(result, expected) + + def test_round4(self, tz_naive_fixture): + index = DatetimeIndex(["2016-10-17 12:00:00.001501031"], dtype="M8[ns]") + result = index.round("10ns") + expected = DatetimeIndex(["2016-10-17 12:00:00.001501030"], dtype="M8[ns]") + tm.assert_index_equal(result, expected) + + ts = "2016-10-17 12:00:00.001501031" + dti = DatetimeIndex([ts], dtype="M8[ns]") + with tm.assert_produces_warning(False): + dti.round("1010ns") + + def test_no_rounding_occurs(self, tz_naive_fixture): + # GH 21262 + tz = tz_naive_fixture + rng = date_range(start="2016-01-01", periods=5, freq="2Min", tz=tz) + + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz), + Timestamp("2016-01-01 00:02:00", tz=tz), + Timestamp("2016-01-01 00:04:00", tz=tz), + Timestamp("2016-01-01 00:06:00", tz=tz), + Timestamp("2016-01-01 00:08:00", tz=tz), + ] + ).as_unit("ns") + + result = rng.round(freq="2min") + tm.assert_index_equal(result, expected_rng) + + @pytest.mark.parametrize( + "test_input, rounder, freq, expected", + [ + (["2117-01-01 00:00:45"], "floor", "15s", ["2117-01-01 00:00:45"]), + (["2117-01-01 00:00:45"], "ceil", "15s", ["2117-01-01 00:00:45"]), + ( + ["2117-01-01 00:00:45.000000012"], + "floor", + "10ns", + ["2117-01-01 00:00:45.000000010"], + ), + ( + ["1823-01-01 00:00:01.000000012"], + "ceil", + "10ns", + ["1823-01-01 00:00:01.000000020"], + ), + (["1823-01-01 00:00:01"], "floor", "1s", ["1823-01-01 00:00:01"]), + (["1823-01-01 00:00:01"], "ceil", "1s", ["1823-01-01 00:00:01"]), + (["2018-01-01 00:15:00"], "ceil", "15min", ["2018-01-01 00:15:00"]), + (["2018-01-01 00:15:00"], "floor", "15min", ["2018-01-01 00:15:00"]), + (["1823-01-01 03:00:00"], "ceil", "3h", ["1823-01-01 03:00:00"]), + (["1823-01-01 03:00:00"], "floor", "3h", ["1823-01-01 03:00:00"]), + ( + ("NaT", "1823-01-01 00:00:01"), + "floor", + "1s", + ("NaT", "1823-01-01 00:00:01"), + ), + ( + ("NaT", "1823-01-01 00:00:01"), + "ceil", + "1s", + ("NaT", "1823-01-01 00:00:01"), + ), + ], + ) + def test_ceil_floor_edge(self, test_input, rounder, freq, expected): + dt = DatetimeIndex(list(test_input)) + func = getattr(dt, rounder) + result = func(freq) + expected = DatetimeIndex(list(expected)) + assert expected.equals(result) + + @pytest.mark.parametrize( + "start, index_freq, periods", + [("2018-01-01", "12h", 25), ("2018-01-01 0:0:0.124999", "1ns", 1000)], + ) + @pytest.mark.parametrize( + "round_freq", + [ + "2ns", + "3ns", + "4ns", + "5ns", + "6ns", + "7ns", + "250ns", + "500ns", + "750ns", + "1us", + "19us", + "250us", + "500us", + "750us", + "1s", + "2s", + "3s", + "12h", + "1D", + ], + ) + def test_round_int64(self, start, index_freq, periods, round_freq): + dt = date_range(start=start, freq=index_freq, periods=periods) + unit = to_offset(round_freq).nanos + + # test floor + result = dt.floor(round_freq) + diff = dt.asi8 - result.asi8 + mod = result.asi8 % unit + assert (mod == 0).all(), f"floor not a {round_freq} multiple" + assert (0 <= diff).all() and (diff < unit).all(), "floor error" + + # test ceil + result = dt.ceil(round_freq) + diff = result.asi8 - dt.asi8 + mod = result.asi8 % unit + assert (mod == 0).all(), f"ceil not a {round_freq} multiple" + assert (0 <= diff).all() and (diff < unit).all(), "ceil error" + + # test round + result = dt.round(round_freq) + diff = abs(result.asi8 - dt.asi8) + mod = result.asi8 % unit + assert (mod == 0).all(), f"round not a {round_freq} multiple" + assert (diff <= unit // 
2).all(), "round error" + if unit % 2 == 0: + assert ( + result.asi8[diff == unit // 2] % 2 == 0 + ).all(), "round half to even error" diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_shift.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_shift.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_shift.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_shift.py 2024-04-10 17:42:52.000000000 +0000 @@ -20,42 +20,43 @@ # ------------------------------------------------------------- # DatetimeIndex.shift is used in integer addition - def test_dti_shift_tzaware(self, tz_naive_fixture): + def test_dti_shift_tzaware(self, tz_naive_fixture, unit): # GH#9903 tz = tz_naive_fixture - idx = DatetimeIndex([], name="xxx", tz=tz) - tm.assert_index_equal(idx.shift(0, freq="H"), idx) - tm.assert_index_equal(idx.shift(3, freq="H"), idx) + idx = DatetimeIndex([], name="xxx", tz=tz).as_unit(unit) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) + tm.assert_index_equal(idx.shift(3, freq="h"), idx) idx = DatetimeIndex( ["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], name="xxx", tz=tz, - freq="H", - ) - tm.assert_index_equal(idx.shift(0, freq="H"), idx) + freq="h", + ).as_unit(unit) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) exp = DatetimeIndex( ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], name="xxx", tz=tz, - freq="H", - ) - tm.assert_index_equal(idx.shift(3, freq="H"), exp) + freq="h", + ).as_unit(unit) + tm.assert_index_equal(idx.shift(3, freq="h"), exp) exp = DatetimeIndex( ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], name="xxx", tz=tz, - freq="H", - ) - tm.assert_index_equal(idx.shift(-3, freq="H"), exp) + freq="h", + ).as_unit(unit) + tm.assert_index_equal(idx.shift(-3, freq="h"), exp) - def test_dti_shift_freqs(self): + def test_dti_shift_freqs(self, unit): # test shift for DatetimeIndex and non DatetimeIndex # GH#8083 - drange = date_range("20130101", periods=5) + drange = date_range("20130101", periods=5, unit=unit) result = drange.shift(1) expected = DatetimeIndex( ["2013-01-02", "2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"], + dtype=f"M8[{unit}]", freq="D", ) tm.assert_index_equal(result, expected) @@ -63,6 +64,7 @@ result = drange.shift(-1) expected = DatetimeIndex( ["2012-12-31", "2013-01-01", "2013-01-02", "2013-01-03", "2013-01-04"], + dtype=f"M8[{unit}]", freq="D", ) tm.assert_index_equal(result, expected) @@ -70,12 +72,13 @@ result = drange.shift(3, freq="2D") expected = DatetimeIndex( ["2013-01-07", "2013-01-08", "2013-01-09", "2013-01-10", "2013-01-11"], + dtype=f"M8[{unit}]", freq="D", ) tm.assert_index_equal(result, expected) - def test_dti_shift_int(self): - rng = date_range("1/1/2000", periods=20) + def test_dti_shift_int(self, unit): + rng = date_range("1/1/2000", periods=20, unit=unit) result = rng + 5 * rng.freq expected = rng.shift(5) @@ -85,25 +88,27 @@ expected = rng.shift(-5) tm.assert_index_equal(result, expected) - def test_dti_shift_no_freq(self): + def test_dti_shift_no_freq(self, unit): # GH#19147 - dti = DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) + dti = DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None).as_unit(unit) with pytest.raises(NullFrequencyError, match="Cannot shift with no freq"): dti.shift(2) @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_dti_shift_localized(self, tzstr): - dr = date_range("2011/1/1", "2012/1/1", 
freq="W-FRI") + def test_dti_shift_localized(self, tzstr, unit): + dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI", unit=unit) dr_tz = dr.tz_localize(tzstr) - result = dr_tz.shift(1, "10T") + result = dr_tz.shift(1, "10min") assert result.tz == dr_tz.tz - def test_dti_shift_across_dst(self): + def test_dti_shift_across_dst(self, unit): # GH 8616 - idx = date_range("2013-11-03", tz="America/Chicago", periods=7, freq="H") - s = Series(index=idx[:-1], dtype=object) - result = s.shift(freq="H") + idx = date_range( + "2013-11-03", tz="America/Chicago", periods=7, freq="h", unit=unit + ) + ser = Series(index=idx[:-1], dtype=object) + result = ser.shift(freq="h") expected = Series(index=idx[1:], dtype=object) tm.assert_series_equal(result, expected) @@ -115,24 +120,26 @@ [1, "2014-11-14 01:00:00"], ], ) - def test_dti_shift_near_midnight(self, shift, result_time): + def test_dti_shift_near_midnight(self, shift, result_time, unit): # GH 8616 dt = datetime(2014, 11, 14, 0) dt_est = pytz.timezone("EST").localize(dt) - s = Series(data=[1], index=[dt_est]) - result = s.shift(shift, freq="H") - expected = Series(1, index=DatetimeIndex([result_time], tz="EST")) + idx = DatetimeIndex([dt_est]).as_unit(unit) + ser = Series(data=[1], index=idx) + result = ser.shift(shift, freq="h") + exp_index = DatetimeIndex([result_time], tz="EST").as_unit(unit) + expected = Series(1, index=exp_index) tm.assert_series_equal(result, expected) - def test_shift_periods(self): + def test_shift_periods(self, unit): # GH#22458 : argument 'n' was deprecated in favor of 'periods' - idx = date_range(start=START, end=END, periods=3) + idx = date_range(start=START, end=END, periods=3, unit=unit) tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) @pytest.mark.parametrize("freq", ["B", "C"]) - def test_shift_bday(self, freq): - rng = date_range(START, END, freq=freq) + def test_shift_bday(self, freq, unit): + rng = date_range(START, END, freq=freq, unit=unit) shifted = rng.shift(5) assert shifted[0] == rng[5] assert shifted.freq == rng.freq @@ -145,18 +152,18 @@ assert shifted[0] == rng[0] assert shifted.freq == rng.freq - def test_shift_bmonth(self): - rng = date_range(START, END, freq=pd.offsets.BMonthEnd()) + def test_shift_bmonth(self, unit): + rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) shifted = rng.shift(1, freq=pd.offsets.BDay()) assert shifted[0] == rng[0] + pd.offsets.BDay() - rng = date_range(START, END, freq=pd.offsets.BMonthEnd()) + rng = date_range(START, END, freq=pd.offsets.BMonthEnd(), unit=unit) with tm.assert_produces_warning(pd.errors.PerformanceWarning): shifted = rng.shift(1, freq=pd.offsets.CDay()) assert shifted[0] == rng[0] + pd.offsets.CDay() - def test_shift_empty(self): + def test_shift_empty(self, unit): # GH#14811 - dti = date_range(start="2016-10-21", end="2016-10-21", freq="BM") + dti = date_range(start="2016-10-21", end="2016-10-21", freq="BME", unit=unit) result = dti.shift(1) tm.assert_index_equal(result, dti) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_to_julian_date.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_to_julian_date.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_to_julian_date.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_to_julian_date.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,45 @@ +import numpy as np + +from pandas import ( + Index, + Timestamp, + date_range, +) +import 
pandas._testing as tm + + +class TestDateTimeIndexToJulianDate: + def test_1700(self): + dr = date_range(start=Timestamp("1710-10-01"), periods=5, freq="D") + r1 = Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, Index) and r2.dtype == np.float64 + tm.assert_index_equal(r1, r2) + + def test_2000(self): + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="D") + r1 = Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, Index) and r2.dtype == np.float64 + tm.assert_index_equal(r1, r2) + + def test_hour(self): + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="h") + r1 = Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, Index) and r2.dtype == np.float64 + tm.assert_index_equal(r1, r2) + + def test_minute(self): + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="min") + r1 = Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, Index) and r2.dtype == np.float64 + tm.assert_index_equal(r1, r2) + + def test_second(self): + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="s") + r1 = Index([x.to_julian_date() for x in dr]) + r2 = dr.to_julian_date() + assert isinstance(r2, Index) and r2.dtype == np.float64 + tm.assert_index_equal(r1, r2) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_to_period.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_to_period.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_to_period.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_to_period.py 2024-04-10 17:42:52.000000000 +0000 @@ -20,7 +20,7 @@ class TestToPeriod: def test_dti_to_period(self): - dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") + dti = date_range(start="1/1/2005", end="12/1/2005", freq="ME") pi1 = dti.to_period() pi2 = dti.to_period(freq="D") pi3 = dti.to_period(freq="3D") @@ -50,38 +50,66 @@ result = stamps.to_period(freq) tm.assert_index_equal(rng, result) - @pytest.mark.parametrize("off", ["BQ", "QS", "BQS"]) + @pytest.mark.parametrize("off", ["BQE", "QS", "BQS"]) def test_to_period_quarterlyish(self, off): rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() - assert prng.freq == "Q-DEC" + assert prng.freq == "QE-DEC" - @pytest.mark.parametrize("off", ["BA", "AS", "BAS"]) + @pytest.mark.parametrize("off", ["BYE", "YS", "BYS"]) def test_to_period_annualish(self, off): rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() - assert prng.freq == "A-DEC" + assert prng.freq == "YE-DEC" def test_to_period_monthish(self): - offsets = ["MS", "BM"] + offsets = ["MS", "BME"] for off in offsets: rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() - assert prng.freq == "M" + assert prng.freqstr == "M" - rng = date_range("01-Jan-2012", periods=8, freq="M") + rng = date_range("01-Jan-2012", periods=8, freq="ME") prng = rng.to_period() - assert prng.freq == "M" + assert prng.freqstr == "M" with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): date_range("01-Jan-2012", periods=8, freq="EOM") - @pytest.mark.parametrize("freq", ["2M", MonthEnd(2)]) - def test_dti_to_period_2monthish(self, freq): - dti = date_range("2020-01-01", periods=3, freq=freq) + @pytest.mark.parametrize( + "freq_offset, freq_period", + [ + ("2ME", "2M"), + (MonthEnd(2), MonthEnd(2)), + ], + ) + def test_dti_to_period_2monthish(self, 
freq_offset, freq_period): + dti = date_range("2020-01-01", periods=3, freq=freq_offset) pi = dti.to_period() - tm.assert_index_equal(pi, period_range("2020-01", "2020-05", freq=freq)) + tm.assert_index_equal(pi, period_range("2020-01", "2020-05", freq=freq_period)) + + @pytest.mark.parametrize( + "freq, freq_depr", + [ + ("2ME", "2M"), + ("2QE", "2Q"), + ("2QE-SEP", "2Q-SEP"), + ("1YE", "1Y"), + ("2YE-MAR", "2Y-MAR"), + ("1YE", "1A"), + ("2YE-MAR", "2A-MAR"), + ], + ) + def test_to_period_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr): + # GH#9586 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + f"in a future version, please use '{freq[1:]}' instead." + + rng = date_range("01-Jan-2012", periods=8, freq=freq) + prng = rng.to_period() + with tm.assert_produces_warning(FutureWarning, match=msg): + assert prng.freq == freq_depr def test_to_period_infer(self): # https://github.com/pandas-dev/pandas/issues/33358 @@ -106,7 +134,7 @@ tm.assert_index_equal(pi.to_timestamp(), dti) dti = date_range("1/1/2000", "1/7/2002", freq="B") - pi = dti.to_period(freq="H") + pi = dti.to_period(freq="h") tm.assert_index_equal(pi.to_timestamp(), dti) def test_to_period_millisecond(self): @@ -119,10 +147,10 @@ with tm.assert_produces_warning(UserWarning): # warning that timezone info will be lost - period = index.to_period(freq="L") + period = index.to_period(freq="ms") assert 2 == len(period) - assert period[0] == Period("2007-01-01 10:11:12.123Z", "L") - assert period[1] == Period("2007-01-01 10:11:13.789Z", "L") + assert period[0] == Period("2007-01-01 10:11:12.123Z", "ms") + assert period[1] == Period("2007-01-01 10:11:13.789Z", "ms") def test_to_period_microsecond(self): index = DatetimeIndex( @@ -134,10 +162,10 @@ with tm.assert_produces_warning(UserWarning): # warning that timezone info will be lost - period = index.to_period(freq="U") + period = index.to_period(freq="us") assert 2 == len(period) - assert period[0] == Period("2007-01-01 10:11:12.123456Z", "U") - assert period[1] == Period("2007-01-01 10:11:13.789123Z", "U") + assert period[0] == Period("2007-01-01 10:11:12.123456Z", "us") + assert period[1] == Period("2007-01-01 10:11:13.789123Z", "us") @pytest.mark.parametrize( "tz", @@ -187,3 +215,11 @@ idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"]) assert idx.freqstr is None tm.assert_index_equal(idx.to_period(), expected) + + @pytest.mark.parametrize("freq", ["2BMS", "1SME-15"]) + def test_to_period_offsets_not_supported(self, freq): + # GH#56243 + msg = f"{freq[1:]} is not supported as period frequency" + ts = date_range("1/1/2012", periods=4, freq=freq) + with pytest.raises(ValueError, match=msg): + ts.to_period() diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_to_pydatetime.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_to_pydatetime.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_to_pydatetime.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_to_pydatetime.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,51 @@ +from datetime import ( + datetime, + timezone, +) + +import dateutil.parser +import dateutil.tz +from dateutil.tz import tzlocal +import numpy as np + +from pandas import ( + DatetimeIndex, + date_range, + to_datetime, +) +import pandas._testing as tm +from pandas.tests.indexes.datetimes.test_timezones import FixedOffset + +fixed_off = FixedOffset(-420, "-07:00") + + +class TestToPyDatetime: + def test_dti_to_pydatetime(self): + dt = 
dateutil.parser.parse("2012-06-13T01:39:00Z") + dt = dt.replace(tzinfo=tzlocal()) + + arr = np.array([dt], dtype=object) + + result = to_datetime(arr, utc=True) + assert result.tz is timezone.utc + + rng = date_range("2012-11-03 03:00", "2012-11-05 03:00", tz=tzlocal()) + arr = rng.to_pydatetime() + result = to_datetime(arr, utc=True) + assert result.tz is timezone.utc + + def test_dti_to_pydatetime_fizedtz(self): + dates = np.array( + [ + datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off), + ] + ) + dti = DatetimeIndex(dates) + + result = dti.to_pydatetime() + tm.assert_numpy_array_equal(dates, result) + + result = dti._mpl_repr() + tm.assert_numpy_array_equal(dates, result) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_tz_convert.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_tz_convert.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_tz_convert.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_tz_convert.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,283 @@ +from datetime import datetime + +import dateutil.tz +from dateutil.tz import gettz +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs import timezones + +from pandas import ( + DatetimeIndex, + Index, + NaT, + Timestamp, + date_range, + offsets, +) +import pandas._testing as tm + + +class TestTZConvert: + def test_tz_convert_nat(self): + # GH#5546 + dates = [NaT] + idx = DatetimeIndex(dates) + idx = idx.tz_localize("US/Pacific") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Pacific")) + idx = idx.tz_convert("US/Eastern") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Eastern")) + idx = idx.tz_convert("UTC") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="UTC")) + + dates = ["2010-12-01 00:00", "2010-12-02 00:00", NaT] + idx = DatetimeIndex(dates) + idx = idx.tz_localize("US/Pacific") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Pacific")) + idx = idx.tz_convert("US/Eastern") + expected = ["2010-12-01 03:00", "2010-12-02 03:00", NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) + + idx = idx + offsets.Hour(5) + expected = ["2010-12-01 08:00", "2010-12-02 08:00", NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) + idx = idx.tz_convert("US/Pacific") + expected = ["2010-12-01 05:00", "2010-12-02 05:00", NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Pacific")) + + idx = idx + np.timedelta64(3, "h") + expected = ["2010-12-01 08:00", "2010-12-02 08:00", NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Pacific")) + + idx = idx.tz_convert("US/Eastern") + expected = ["2010-12-01 11:00", "2010-12-02 11:00", NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) + + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) + def test_dti_tz_convert_compat_timestamp(self, prefix): + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + idx = DatetimeIndex(strdates, tz=prefix + "US/Eastern") + + conv = idx[0].tz_convert(prefix + "US/Pacific") + expected = idx.tz_convert(prefix + "US/Pacific")[0] + + assert conv == expected + + def test_dti_tz_convert_hour_overflow_dst(self): + # Regression test for GH#13306 + + # sorted case US/Eastern -> UTC + ts = ["2008-05-12 09:50:00", "2008-12-12 09:50:35", "2009-05-12 09:50:32"] + tt = DatetimeIndex(ts).tz_localize("US/Eastern") + ut = tt.tz_convert("UTC") 
+ expected = Index([13, 14, 13], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = ["2008-05-12 13:50:00", "2008-12-12 14:50:35", "2009-05-12 13:50:32"] + tt = DatetimeIndex(ts).tz_localize("UTC") + ut = tt.tz_convert("US/Eastern") + expected = Index([9, 9, 9], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = ["2008-05-12 09:50:00", "2008-12-12 09:50:35", "2008-05-12 09:50:32"] + tt = DatetimeIndex(ts).tz_localize("US/Eastern") + ut = tt.tz_convert("UTC") + expected = Index([13, 14, 13], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = ["2008-05-12 13:50:00", "2008-12-12 14:50:35", "2008-05-12 13:50:32"] + tt = DatetimeIndex(ts).tz_localize("UTC") + ut = tt.tz_convert("US/Eastern") + expected = Index([9, 9, 9], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz): + # Regression test for GH#13306 + + # sorted case US/Eastern -> UTC + ts = [ + Timestamp("2008-05-12 09:50:00", tz=tz), + Timestamp("2008-12-12 09:50:35", tz=tz), + Timestamp("2009-05-12 09:50:32", tz=tz), + ] + tt = DatetimeIndex(ts) + ut = tt.tz_convert("UTC") + expected = Index([13, 14, 13], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = [ + Timestamp("2008-05-12 13:50:00", tz="UTC"), + Timestamp("2008-12-12 14:50:35", tz="UTC"), + Timestamp("2009-05-12 13:50:32", tz="UTC"), + ] + tt = DatetimeIndex(ts) + ut = tt.tz_convert("US/Eastern") + expected = Index([9, 9, 9], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = [ + Timestamp("2008-05-12 09:50:00", tz=tz), + Timestamp("2008-12-12 09:50:35", tz=tz), + Timestamp("2008-05-12 09:50:32", tz=tz), + ] + tt = DatetimeIndex(ts) + ut = tt.tz_convert("UTC") + expected = Index([13, 14, 13], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = [ + Timestamp("2008-05-12 13:50:00", tz="UTC"), + Timestamp("2008-12-12 14:50:35", tz="UTC"), + Timestamp("2008-05-12 13:50:32", tz="UTC"), + ] + tt = DatetimeIndex(ts) + ut = tt.tz_convert("US/Eastern") + expected = Index([9, 9, 9], dtype=np.int32) + tm.assert_index_equal(ut.hour, expected) + + @pytest.mark.parametrize("freq, n", [("h", 1), ("min", 60), ("s", 3600)]) + def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): + # Regression test for tslib.tz_convert(vals, tz1, tz2). + # See GH#4496 for details. 
+ idx = date_range(datetime(2011, 3, 26, 23), datetime(2011, 3, 27, 1), freq=freq) + idx = idx.tz_localize("UTC") + idx = idx.tz_convert("Europe/Moscow") + + expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) + tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) + + def test_dti_tz_convert_dst(self): + for freq, n in [("h", 1), ("min", 60), ("s", 3600)]: + # Start DST + idx = date_range( + "2014-03-08 23:00", "2014-03-09 09:00", freq=freq, tz="UTC" + ) + idx = idx.tz_convert("US/Eastern") + expected = np.repeat( + np.array([18, 19, 20, 21, 22, 23, 0, 1, 3, 4, 5]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) + tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) + + idx = date_range( + "2014-03-08 18:00", "2014-03-09 05:00", freq=freq, tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + expected = np.repeat( + np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) + tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) + + # End DST + idx = date_range( + "2014-11-01 23:00", "2014-11-02 09:00", freq=freq, tz="UTC" + ) + idx = idx.tz_convert("US/Eastern") + expected = np.repeat( + np.array([19, 20, 21, 22, 23, 0, 1, 1, 2, 3, 4]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) + tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) + + idx = date_range( + "2014-11-01 18:00", "2014-11-02 05:00", freq=freq, tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + expected = np.repeat( + np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + np.array([n, n, n, n, n, n, n, n, n, n, n, n, 1]), + ) + tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) + + # daily + # Start DST + idx = date_range("2014-03-08 00:00", "2014-03-09 00:00", freq="D", tz="UTC") + idx = idx.tz_convert("US/Eastern") + tm.assert_index_equal(idx.hour, Index([19, 19], dtype=np.int32)) + + idx = date_range( + "2014-03-08 00:00", "2014-03-09 00:00", freq="D", tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + tm.assert_index_equal(idx.hour, Index([5, 5], dtype=np.int32)) + + # End DST + idx = date_range("2014-11-01 00:00", "2014-11-02 00:00", freq="D", tz="UTC") + idx = idx.tz_convert("US/Eastern") + tm.assert_index_equal(idx.hour, Index([20, 20], dtype=np.int32)) + + idx = date_range( + "2014-11-01 00:00", "2014-11-02 000:00", freq="D", tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + tm.assert_index_equal(idx.hour, Index([4, 4], dtype=np.int32)) + + def test_tz_convert_roundtrip(self, tz_aware_fixture): + tz = tz_aware_fixture + idx1 = date_range(start="2014-01-01", end="2014-12-31", freq="ME", tz="UTC") + exp1 = date_range(start="2014-01-01", end="2014-12-31", freq="ME") + + idx2 = date_range(start="2014-01-01", end="2014-12-31", freq="D", tz="UTC") + exp2 = date_range(start="2014-01-01", end="2014-12-31", freq="D") + + idx3 = date_range(start="2014-01-01", end="2014-03-01", freq="h", tz="UTC") + exp3 = date_range(start="2014-01-01", end="2014-03-01", freq="h") + + idx4 = date_range(start="2014-08-01", end="2014-10-31", freq="min", tz="UTC") + exp4 = date_range(start="2014-08-01", end="2014-10-31", freq="min") + + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), (idx4, exp4)]: + converted = idx.tz_convert(tz) + reset = converted.tz_convert(None) + tm.assert_index_equal(reset, expected) + assert reset.tzinfo is None + expected = converted.tz_convert("UTC").tz_localize(None) + expected = expected._with_freq("infer") + tm.assert_index_equal(reset, expected) + + def 
test_dti_tz_convert_tzlocal(self): + # GH#13583 + # tz_convert doesn't affect to internal + dti = date_range(start="2001-01-01", end="2001-03-01", tz="UTC") + dti2 = dti.tz_convert(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + + dti = date_range(start="2001-01-01", end="2001-03-01", tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_convert(None) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + + @pytest.mark.parametrize( + "tz", + [ + "US/Eastern", + "dateutil/US/Eastern", + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + ], + ) + def test_dti_tz_convert_utc_to_local_no_modify(self, tz): + rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") + rng_eastern = rng.tz_convert(tz) + + # Values are unmodified + tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) + + assert timezones.tz_compare(rng_eastern.tz, timezones.maybe_get_tz(tz)) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_tz_convert_unsorted(self, tzstr): + dr = date_range("2012-03-09", freq="h", periods=100, tz="utc") + dr = dr.tz_convert(tzstr) + + result = dr[::-1].hour + exp = dr.hour[::-1] + tm.assert_almost_equal(result, exp) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_tz_localize.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_tz_localize.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_tz_localize.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_tz_localize.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,402 @@ +from datetime import ( + datetime, + timedelta, +) + +import dateutil.tz +from dateutil.tz import gettz +import numpy as np +import pytest +import pytz + +from pandas import ( + DatetimeIndex, + Timestamp, + bdate_range, + date_range, + offsets, + to_datetime, +) +import pandas._testing as tm + +try: + from zoneinfo import ZoneInfo +except ImportError: + # Cannot assign to a type [misc] + ZoneInfo = None # type: ignore[misc, assignment] + + +easts = [pytz.timezone("US/Eastern"), gettz("US/Eastern")] +if ZoneInfo is not None: + try: + tz = ZoneInfo("US/Eastern") + except KeyError: + # no tzdata + pass + else: + easts.append(tz) + + +class TestTZLocalize: + def test_tz_localize_invalidates_freq(self): + # we only preserve freq in unambiguous cases + + # if localized to US/Eastern, this crosses a DST transition + dti = date_range("2014-03-08 23:00", "2014-03-09 09:00", freq="h") + assert dti.freq == "h" + + result = dti.tz_localize(None) # no-op + assert result.freq == "h" + + result = dti.tz_localize("UTC") # unambiguous freq preservation + assert result.freq == "h" + + result = dti.tz_localize("US/Eastern", nonexistent="shift_forward") + assert result.freq is None + assert result.inferred_freq is None # i.e. 
we are not _too_ strict here + + # Case where we _can_ keep freq because we're length==1 + dti2 = dti[:1] + result = dti2.tz_localize("US/Eastern") + assert result.freq == "h" + + def test_tz_localize_utc_copies(self, utc_fixture): + # GH#46460 + times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] + index = DatetimeIndex(times) + + res = index.tz_localize(utc_fixture) + assert not tm.shares_memory(res, index) + + res2 = index._data.tz_localize(utc_fixture) + assert not tm.shares_memory(index._data, res2) + + def test_dti_tz_localize_nonexistent_raise_coerce(self): + # GH#13057 + times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] + index = DatetimeIndex(times) + tz = "US/Eastern" + with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): + index.tz_localize(tz=tz) + + with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): + index.tz_localize(tz=tz, nonexistent="raise") + + result = index.tz_localize(tz=tz, nonexistent="NaT") + test_times = ["2015-03-08 01:00-05:00", "NaT", "2015-03-08 03:00-04:00"] + dti = to_datetime(test_times, utc=True) + expected = dti.tz_convert("US/Eastern") + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_infer(self, tz): + # November 6, 2011, fall back, repeat 2 AM hour + # With no repeated hours, we cannot infer the transition + dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour()) + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + dr.tz_localize(tz) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_infer2(self, tz, unit): + # With repeated hours, we can infer the transition + dr = date_range( + datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour(), tz=tz, unit=unit + ) + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + di = DatetimeIndex(times).as_unit(unit) + result = di.tz_localize(tz, ambiguous="infer") + expected = dr._with_freq(None) + tm.assert_index_equal(result, expected) + result2 = DatetimeIndex(times, tz=tz, ambiguous="infer").as_unit(unit) + tm.assert_index_equal(result2, expected) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_infer3(self, tz): + # When there is no dst transition, nothing special happens + dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) + localized = dr.tz_localize(tz) + localized_infer = dr.tz_localize(tz, ambiguous="infer") + tm.assert_index_equal(localized, localized_infer) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_times(self, tz): + # March 13, 2011, spring forward, skip from 2 AM to 3 AM + dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=offsets.Hour()) + with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:30:00"): + dr.tz_localize(tz) + + # after dst transition, it works + dr = date_range( + datetime(2011, 3, 13, 3, 30), periods=3, freq=offsets.Hour(), tz=tz + ) + + # November 6, 2011, fall back, repeat 2 AM hour + dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, freq=offsets.Hour()) + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + dr.tz_localize(tz) + + # UTC is OK + dr = date_range( + datetime(2011, 3, 13), periods=48, freq=offsets.Minute(30), tz=pytz.utc + ) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def 
test_dti_tz_localize_pass_dates_to_utc(self, tzstr): + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + + idx = DatetimeIndex(strdates) + conv = idx.tz_localize(tzstr) + + fromdates = DatetimeIndex(strdates, tz=tzstr) + + assert conv.tz == fromdates.tz + tm.assert_numpy_array_equal(conv.values, fromdates.values) + + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) + def test_dti_tz_localize(self, prefix): + tzstr = prefix + "US/Eastern" + dti = date_range(start="1/1/2005", end="1/1/2005 0:00:30.256", freq="ms") + dti2 = dti.tz_localize(tzstr) + + dti_utc = date_range( + start="1/1/2005 05:00", end="1/1/2005 5:00:30.256", freq="ms", tz="utc" + ) + + tm.assert_numpy_array_equal(dti2.values, dti_utc.values) + + dti3 = dti2.tz_convert(prefix + "US/Pacific") + tm.assert_numpy_array_equal(dti3.values, dti_utc.values) + + dti = date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="ms") + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + dti.tz_localize(tzstr) + + dti = date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="ms") + with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"): + dti.tz_localize(tzstr) + + @pytest.mark.parametrize( + "tz", + [ + "US/Eastern", + "dateutil/US/Eastern", + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + ], + ) + def test_dti_tz_localize_utc_conversion(self, tz): + # Localizing to time zone should: + # 1) check for DST ambiguities + # 2) convert to UTC + + rng = date_range("3/10/2012", "3/11/2012", freq="30min") + + converted = rng.tz_localize(tz) + expected_naive = rng + offsets.Hour(5) + tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) + + # DST ambiguity, this should fail + rng = date_range("3/11/2012", "3/12/2012", freq="30min") + # Is this really how it should fail?? + with pytest.raises(pytz.NonExistentTimeError, match="2012-03-11 02:00:00"): + rng.tz_localize(tz) + + def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): + # note: this tz tests that a tz-naive index can be localized + # and de-localized successfully, when there are no DST transitions + # in the range. 
+ idx = date_range(start="2014-06-01", end="2014-08-30", freq="15min") + tz = tz_aware_fixture + localized = idx.tz_localize(tz) + # can't localize a tz-aware object + with pytest.raises( + TypeError, match="Already tz-aware, use tz_convert to convert" + ): + localized.tz_localize(tz) + reset = localized.tz_localize(None) + assert reset.tzinfo is None + expected = idx._with_freq(None) + tm.assert_index_equal(reset, expected) + + def test_dti_tz_localize_naive(self): + rng = date_range("1/1/2011", periods=100, freq="h") + + conv = rng.tz_localize("US/Pacific") + exp = date_range("1/1/2011", periods=100, freq="h", tz="US/Pacific") + + tm.assert_index_equal(conv, exp._with_freq(None)) + + def test_dti_tz_localize_tzlocal(self): + # GH#13583 + offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) + offset = int(offset.total_seconds() * 1000000000) + + dti = date_range(start="2001-01-01", end="2001-03-01") + dti2 = dti.tz_localize(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8) + + dti = date_range(start="2001-01-01", end="2001-03-01", tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_localize(None) + tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_nat(self, tz): + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + di = DatetimeIndex(times) + localized = di.tz_localize(tz, ambiguous="NaT") + + times = [ + "11/06/2011 00:00", + np.nan, + np.nan, + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + di_test = DatetimeIndex(times, tz="US/Eastern") + + # left dtype is datetime64[ns, US/Eastern] + # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] + tm.assert_numpy_array_equal(di_test.values, localized.values) + + @pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_flags(self, tz, unit): + # November 6, 2011, fall back, repeat 2 AM hour + + # Pass in flags to determine right dst transition + dr = date_range( + datetime(2011, 11, 6, 0), periods=5, freq=offsets.Hour(), tz=tz, unit=unit + ) + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + + # Test tz_localize + di = DatetimeIndex(times).as_unit(unit) + is_dst = [1, 1, 0, 0, 0] + localized = di.tz_localize(tz, ambiguous=is_dst) + expected = dr._with_freq(None) + tm.assert_index_equal(expected, localized) + + result = DatetimeIndex(times, tz=tz, ambiguous=is_dst).as_unit(unit) + tm.assert_index_equal(result, expected) + + localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) + tm.assert_index_equal(dr, localized) + + localized = di.tz_localize(tz, ambiguous=np.array(is_dst).astype("bool")) + tm.assert_index_equal(dr, localized) + + # Test constructor + localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst).as_unit(unit) + tm.assert_index_equal(dr, localized) + + # Test duplicate times where inferring the dst fails + times += times + di = DatetimeIndex(times).as_unit(unit) + + # When the sizes are incompatible, make sure error is raised + msg = "Length of ambiguous bool-array must be the same size as vals" + with pytest.raises(Exception, match=msg): + di.tz_localize(tz, ambiguous=is_dst) + + # When sizes are compatible and there are repeats ('infer' won't work) + is_dst = np.hstack((is_dst, is_dst)) + localized = di.tz_localize(tz, ambiguous=is_dst) + dr = dr.append(dr) + tm.assert_index_equal(dr, localized) + + 
@pytest.mark.parametrize("tz", easts) + def test_dti_tz_localize_ambiguous_flags2(self, tz, unit): + # When there is no dst transition, nothing special happens + dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=offsets.Hour()) + is_dst = np.array([1] * 10) + localized = dr.tz_localize(tz) + localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) + tm.assert_index_equal(localized, localized_is_dst) + + def test_dti_tz_localize_bdate_range(self): + dr = bdate_range("1/1/2009", "1/1/2010") + dr_utc = bdate_range("1/1/2009", "1/1/2010", tz=pytz.utc) + localized = dr.tz_localize(pytz.utc) + tm.assert_index_equal(dr_utc, localized) + + @pytest.mark.parametrize( + "start_ts, tz, end_ts, shift", + [ + ["2015-03-29 02:20:00", "Europe/Warsaw", "2015-03-29 03:00:00", "forward"], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:59:59.999999999", + "backward", + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 03:20:00", + timedelta(hours=1), + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:20:00", + timedelta(hours=-1), + ], + ["2018-03-11 02:33:00", "US/Pacific", "2018-03-11 03:00:00", "forward"], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:59:59.999999999", + "backward", + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 03:33:00", + timedelta(hours=1), + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:33:00", + timedelta(hours=-1), + ], + ], + ) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + def test_dti_tz_localize_nonexistent_shift( + self, start_ts, tz, end_ts, shift, tz_type, unit + ): + # GH#8917 + tz = tz_type + tz + if isinstance(shift, str): + shift = "shift_" + shift + dti = DatetimeIndex([Timestamp(start_ts)]).as_unit(unit) + result = dti.tz_localize(tz, nonexistent=shift) + expected = DatetimeIndex([Timestamp(end_ts)]).tz_localize(tz).as_unit(unit) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("offset", [-1, 1]) + def test_dti_tz_localize_nonexistent_shift_invalid(self, offset, warsaw): + # GH#8917 + tz = warsaw + dti = DatetimeIndex([Timestamp("2015-03-29 02:20:00")]) + msg = "The provided timedelta will relocalize on a nonexistent time" + with pytest.raises(ValueError, match=msg): + dti.tz_localize(tz, nonexistent=timedelta(seconds=offset)) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_unique.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_unique.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/methods/test_unique.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/methods/test_unique.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,77 @@ +from datetime import ( + datetime, + timedelta, +) + +from pandas import ( + DatetimeIndex, + NaT, + Timestamp, +) +import pandas._testing as tm + + +def test_unique(tz_naive_fixture): + idx = DatetimeIndex(["2017"] * 2, tz=tz_naive_fixture) + expected = idx[:1] + + result = idx.unique() + tm.assert_index_equal(result, expected) + # GH#21737 + # Ensure the underlying data is consistent + assert result[0] == expected[0] + + +def test_index_unique(rand_series_with_duplicate_datetimeindex): + dups = rand_series_with_duplicate_datetimeindex + index = dups.index + + uniques = index.unique() + expected = DatetimeIndex( + [ + datetime(2000, 1, 2), + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + ], + dtype=index.dtype, + ) + assert uniques.dtype == index.dtype # sanity + 
tm.assert_index_equal(uniques, expected) + assert index.nunique() == 4 + + # GH#2563 + assert isinstance(uniques, DatetimeIndex) + + dups_local = index.tz_localize("US/Eastern") + dups_local.name = "foo" + result = dups_local.unique() + expected = DatetimeIndex(expected, name="foo") + expected = expected.tz_localize("US/Eastern") + assert result.tz is not None + assert result.name == "foo" + tm.assert_index_equal(result, expected) + + +def test_index_unique2(): + # NaT, note this is excluded + arr = [1370745748 + t for t in range(20)] + [NaT._value] + idx = DatetimeIndex(arr * 3) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) + assert idx.nunique() == 20 + assert idx.nunique(dropna=False) == 21 + + +def test_index_unique3(): + arr = [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20) + ] + [NaT] + idx = DatetimeIndex(arr * 3) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) + assert idx.nunique() == 20 + assert idx.nunique(dropna=False) == 21 + + +def test_is_unique_monotonic(rand_series_with_duplicate_datetimeindex): + index = rand_series_with_duplicate_datetimeindex.index + assert not index.is_unique diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_arithmetic.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_arithmetic.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_arithmetic.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_arithmetic.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,56 @@ +# Arithmetic tests specific to DatetimeIndex are generally about `freq` +# rentention or inference. Other arithmetic tests belong in +# tests/arithmetic/test_datetime64.py +import pytest + +from pandas import ( + Timedelta, + TimedeltaIndex, + Timestamp, + date_range, + timedelta_range, +) +import pandas._testing as tm + + +class TestDatetimeIndexArithmetic: + def test_add_timedelta_preserves_freq(self): + # GH#37295 should hold for any DTI with freq=None or Tick freq + tz = "Canada/Eastern" + dti = date_range( + start=Timestamp("2019-03-26 00:00:00-0400", tz=tz), + end=Timestamp("2020-10-17 00:00:00-0400", tz=tz), + freq="D", + ) + result = dti + Timedelta(days=1) + assert result.freq == dti.freq + + def test_sub_datetime_preserves_freq(self, tz_naive_fixture): + # GH#48818 + dti = date_range("2016-01-01", periods=12, tz=tz_naive_fixture) + + res = dti - dti[0] + expected = timedelta_range("0 Days", "11 Days") + tm.assert_index_equal(res, expected) + assert res.freq == expected.freq + + @pytest.mark.xfail( + reason="The inherited freq is incorrect bc dti.freq is incorrect " + "https://github.com/pandas-dev/pandas/pull/48818/files#r982793461" + ) + def test_sub_datetime_preserves_freq_across_dst(self): + # GH#48818 + ts = Timestamp("2016-03-11", tz="US/Pacific") + dti = date_range(ts, periods=4) + + res = dti - dti[0] + expected = TimedeltaIndex( + [ + Timedelta(days=0), + Timedelta(days=1), + Timedelta(days=2), + Timedelta(days=2, hours=23), + ] + ) + tm.assert_index_equal(res, expected) + assert res.freq == expected.freq diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_asof.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_asof.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_asof.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_asof.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,31 +0,0 @@ -from datetime import timedelta - -from pandas import ( - Index, - Timestamp, - date_range, - isna, -) 
-import pandas._testing as tm - - -class TestAsOf: - def test_asof_partial(self): - index = date_range("2010-01-01", periods=2, freq="m") - expected = Timestamp("2010-02-28") - result = index.asof("2010-02") - assert result == expected - assert not isinstance(result, Index) - - def test_asof(self): - index = tm.makeDateIndex(100) - - dt = index[0] - assert index.asof(dt) == dt - assert isna(index.asof(dt - timedelta(1))) - - dt = index[-1] - assert index.asof(dt + timedelta(1)) == dt - - dt = index[0].to_pydatetime() - assert isinstance(index.asof(dt), Timestamp) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,8 @@ from operator import attrgetter import dateutil +import dateutil.tz +from dateutil.tz import gettz import numpy as np import pytest import pytz @@ -16,6 +18,7 @@ from pandas._libs.tslibs import ( OutOfBoundsDatetime, astype_overflowsafe, + timezones, ) import pandas as pd @@ -28,10 +31,7 @@ to_datetime, ) import pandas._testing as tm -from pandas.core.arrays import ( - DatetimeArray, - period_array, -) +from pandas.core.arrays import period_array class TestDatetimeIndex: @@ -70,10 +70,7 @@ with pytest.raises(ValueError, match=msg): DatetimeIndex([], dtype="M8[ns, UTC]", tz=None) - @pytest.mark.parametrize( - "dt_cls", [DatetimeIndex, DatetimeArray._from_sequence_not_strict] - ) - def test_freq_validation_with_nat(self, dt_cls): + def test_freq_validation_with_nat(self): # GH#11587 make sure we get a useful error message when generate_range # raises msg = ( @@ -81,9 +78,9 @@ "to passed frequency D" ) with pytest.raises(ValueError, match=msg): - dt_cls([pd.NaT, Timestamp("2011-01-01")], freq="D") + DatetimeIndex([pd.NaT, Timestamp("2011-01-01")], freq="D") with pytest.raises(ValueError, match=msg): - dt_cls([pd.NaT, Timestamp("2011-01-01")._value], freq="D") + DatetimeIndex([pd.NaT, Timestamp("2011-01-01")._value], freq="D") # TODO: better place for tests shared by DTI/TDI? 
@pytest.mark.parametrize( @@ -185,7 +182,7 @@ ) def test_construction_with_alt(self, kwargs, tz_aware_fixture): tz = tz_aware_fixture - i = date_range("20130101", periods=5, freq="H", tz=tz) + i = date_range("20130101", periods=5, freq="h", tz=tz) kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} result = DatetimeIndex(i, **kwargs) tm.assert_index_equal(i, result) @@ -196,7 +193,7 @@ ) def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): tz = tz_aware_fixture - i = date_range("20130101", periods=5, freq="H", tz=tz) + i = date_range("20130101", periods=5, freq="h", tz=tz) i = i._with_freq(None) kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} @@ -567,6 +564,16 @@ # can't create DatetimeIndex DatetimeIndex(dates) + @pytest.mark.parametrize("data", [["1400-01-01"], [datetime(1400, 1, 1)]]) + def test_dti_date_out_of_range(self, data): + # GH#1475 + msg = ( + "^Out of bounds nanosecond timestamp: " + "1400-01-01( 00:00:00)?, at position 0$" + ) + with pytest.raises(OutOfBoundsDatetime, match=msg): + DatetimeIndex(data) + def test_construction_with_ndarray(self): # GH 5152 dates = [datetime(2013, 10, 7), datetime(2013, 10, 8), datetime(2013, 10, 9)] @@ -582,23 +589,16 @@ result = DatetimeIndex(values).tz_localize("US/Central") - expected = DatetimeIndex(["2000-01-01T00:00:00"], tz="US/Central") + expected = DatetimeIndex(["2000-01-01T00:00:00"], dtype="M8[ns, US/Central]") tm.assert_index_equal(result, expected) # but UTC is *not* deprecated. with tm.assert_produces_warning(None): result = DatetimeIndex(values, tz="UTC") - expected = DatetimeIndex(["2000-01-01T00:00:00"], tz="US/Central") + expected = DatetimeIndex(["2000-01-01T00:00:00"], dtype="M8[ns, UTC]") + tm.assert_index_equal(result, expected) def test_constructor_coverage(self): - rng = date_range("1/1/2000", periods=10.5) - exp = date_range("1/1/2000", periods=10) - tm.assert_index_equal(rng, exp) - - msg = "periods must be a number, got foo" - with pytest.raises(TypeError, match=msg): - date_range(start="1/1/2000", periods="foo", freq="D") - msg = r"DatetimeIndex\(\.\.\.\) must be called with a collection" with pytest.raises(TypeError, match=msg): DatetimeIndex("1/1/2000") @@ -637,18 +637,7 @@ with pytest.raises(ValueError, match=msg): DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"], freq="D") - msg = ( - "Of the four parameters: start, end, periods, and freq, exactly " - "three must be specified" - ) - with pytest.raises(ValueError, match=msg): - date_range(start="2011-01-01", freq="b") - with pytest.raises(ValueError, match=msg): - date_range(end="2011-01-01", freq="B") - with pytest.raises(ValueError, match=msg): - date_range(periods=10, freq="D") - - @pytest.mark.parametrize("freq", ["AS", "W-SUN"]) + @pytest.mark.parametrize("freq", ["YS", "W-SUN"]) def test_constructor_datetime64_tzformat(self, freq): # see GH#6572: ISO 8601 format results in stdlib timezone object idx = date_range( @@ -715,10 +704,14 @@ idx = DatetimeIndex( ["2013-01-01", "2013-01-02"], dtype="datetime64[ns, US/Eastern]" ) - expected = DatetimeIndex(["2013-01-01", "2013-01-02"]).tz_localize("US/Eastern") + expected = ( + DatetimeIndex(["2013-01-01", "2013-01-02"]) + .as_unit("ns") + .tz_localize("US/Eastern") + ) tm.assert_index_equal(idx, expected) - idx = DatetimeIndex(["2013-01-01", "2013-01-02"], tz="US/Eastern") + idx = DatetimeIndex(["2013-01-01", "2013-01-02"], tz="US/Eastern").as_unit("ns") tm.assert_index_equal(idx, expected) def test_constructor_dtype_tz_mismatch_raises(self): @@ 
-752,10 +745,6 @@ with pytest.raises(ValueError, match=msg): DatetimeIndex([1, 2], dtype=dtype) - def test_constructor_name(self): - idx = date_range(start="2000-01-01", periods=1, freq="A", name="TEST") - assert idx.name == "TEST" - def test_000constructor_resolution(self): # 2252 t1 = Timestamp((1352934390 * 1000000000) + 1000000 + 1000 + 1) @@ -786,7 +775,7 @@ result = date_range(freq="D", start=start, end=end, tz=tz) expected = DatetimeIndex( ["2013-01-01 06:00:00", "2013-01-02 06:00:00"], - tz="America/Los_Angeles", + dtype="M8[ns, America/Los_Angeles]", freq="D", ) tm.assert_index_equal(result, expected) @@ -834,14 +823,12 @@ def test_construction_from_replaced_timestamps_with_dst(self): # GH 18785 index = date_range( - Timestamp(2000, 1, 1), - Timestamp(2005, 1, 1), - freq="MS", + Timestamp(2000, 12, 31), + Timestamp(2005, 12, 31), + freq="YE-DEC", tz="Australia/Melbourne", ) - test = pd.DataFrame({"data": range(len(index))}, index=index) - test = test.resample("Y").mean() - result = DatetimeIndex([x.replace(month=6, day=1) for x in test.index]) + result = DatetimeIndex([x.replace(month=6, day=1) for x in index]) expected = DatetimeIndex( [ "2000-06-01 00:00:00", @@ -902,7 +889,7 @@ start = Timestamp("2015-03-29 02:30:00").tz_localize( timezone, nonexistent="shift_forward" ) - result = date_range(start=start, periods=2, freq="H") + result = date_range(start=start, periods=2, freq="h") expected = DatetimeIndex( [ Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), @@ -913,10 +900,8 @@ tm.assert_index_equal(result, expected) # nonexistent keyword in end - end = Timestamp("2015-03-29 02:30:00").tz_localize( - timezone, nonexistent="shift_forward" - ) - result = date_range(end=end, periods=2, freq="H") + end = start + result = date_range(end=end, periods=2, freq="h") expected = DatetimeIndex( [ Timestamp("2015-03-29 01:00:00+01:00", tz=timezone), @@ -948,6 +933,164 @@ expected = DatetimeIndex([Timestamp("2019", tz="UTC"), pd.NaT]) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) + def test_dti_from_tzaware_datetime(self, tz): + d = [datetime(2012, 8, 19, tzinfo=tz)] + + index = DatetimeIndex(d) + assert timezones.tz_compare(index.tz, tz) + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_tz_constructors(self, tzstr): + """Test different DatetimeIndex constructions with timezone + Follow-up of GH#4229 + """ + arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] + + idx1 = to_datetime(arr).tz_localize(tzstr) + idx2 = date_range(start="2005-11-10 08:00:00", freq="h", periods=2, tz=tzstr) + idx2 = idx2._with_freq(None) # the others all have freq=None + idx3 = DatetimeIndex(arr, tz=tzstr) + idx4 = DatetimeIndex(np.array(arr), tz=tzstr) + + for other in [idx2, idx3, idx4]: + tm.assert_index_equal(idx1, other) + + def test_dti_construction_idempotent(self, unit): + rng = date_range( + "03/12/2012 00:00", periods=10, freq="W-FRI", tz="US/Eastern", unit=unit + ) + rng2 = DatetimeIndex(data=rng, tz="US/Eastern") + tm.assert_index_equal(rng, rng2) + + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) + def test_dti_constructor_static_tzinfo(self, prefix): + # it works! 
+ index = DatetimeIndex([datetime(2012, 1, 1)], tz=prefix + "EST") + index.hour + index[0] + + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_convert_datetime_list(self, tzstr): + dr = date_range("2012-06-02", periods=10, tz=tzstr, name="foo") + dr2 = DatetimeIndex(list(dr), name="foo", freq="D") + tm.assert_index_equal(dr, dr2) + + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + ], + ) + @pytest.mark.parametrize("use_str", [True, False]) + @pytest.mark.parametrize("box_cls", [Timestamp, DatetimeIndex]) + def test_dti_ambiguous_matches_timestamp(self, tz, use_str, box_cls, request): + # GH#47471 check that we get the same raising behavior in the DTI + # constructor and Timestamp constructor + dtstr = "2013-11-03 01:59:59.999999" + item = dtstr + if not use_str: + item = Timestamp(dtstr).to_pydatetime() + if box_cls is not Timestamp: + item = [item] + + if not use_str and isinstance(tz, dateutil.tz.tzfile): + # FIXME: The Timestamp constructor here behaves differently than all + # the other cases bc with dateutil/zoneinfo tzinfos we implicitly + # get fold=0. Having this raise is not important, but having the + # behavior be consistent across cases is. + mark = pytest.mark.xfail(reason="We implicitly get fold=0.") + request.applymarker(mark) + + with pytest.raises(pytz.AmbiguousTimeError, match=dtstr): + box_cls(item, tz=tz) + + @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"]) + def test_dti_constructor_with_non_nano_dtype(self, tz): + # GH#55756, GH#54620 + ts = Timestamp("2999-01-01") + dtype = "M8[us]" + if tz is not None: + dtype = f"M8[us, {tz}]" + vals = [ts, "2999-01-02 03:04:05.678910", 2500] + result = DatetimeIndex(vals, dtype=dtype) + # The 2500 is interpreted as microseconds, consistent with what + # we would get if we created DatetimeIndexes from vals[:2] and vals[2:] + # and concated the results. + pointwise = [ + vals[0].tz_localize(tz), + Timestamp(vals[1], tz=tz), + to_datetime(vals[2], unit="us", utc=True).tz_convert(tz), + ] + exp_vals = [x.as_unit("us").asm8 for x in pointwise] + exp_arr = np.array(exp_vals, dtype="M8[us]") + expected = DatetimeIndex(exp_arr, dtype="M8[us]") + if tz is not None: + expected = expected.tz_localize("UTC").tz_convert(tz) + tm.assert_index_equal(result, expected) + + result2 = DatetimeIndex(np.array(vals, dtype=object), dtype=dtype) + tm.assert_index_equal(result2, expected) + + def test_dti_constructor_with_non_nano_now_today(self): + # GH#55756 + now = Timestamp.now() + today = Timestamp.today() + result = DatetimeIndex(["now", "today"], dtype="M8[s]") + assert result.dtype == "M8[s]" + + # result may not exactly match [now, today] so we'll test it up to a tolerance. 
+ # (it *may* match exactly due to rounding) + tolerance = pd.Timedelta(microseconds=1) + + diff0 = result[0] - now.as_unit("s") + assert diff0 >= pd.Timedelta(0) + assert diff0 < tolerance + + diff1 = result[1] - today.as_unit("s") + assert diff1 >= pd.Timedelta(0) + assert diff1 < tolerance + + def test_dti_constructor_object_float_matches_float_dtype(self): + # GH#55780 + arr = np.array([0, np.nan], dtype=np.float64) + arr2 = arr.astype(object) + + dti1 = DatetimeIndex(arr, tz="CET") + dti2 = DatetimeIndex(arr2, tz="CET") + tm.assert_index_equal(dti1, dti2) + + @pytest.mark.parametrize("dtype", ["M8[us]", "M8[us, US/Pacific]"]) + def test_dti_constructor_with_dtype_object_int_matches_int_dtype(self, dtype): + # Going through the object path should match the non-object path + + vals1 = np.arange(5, dtype="i8") * 1000 + vals1[0] = pd.NaT.value + + vals2 = vals1.astype(np.float64) + vals2[0] = np.nan + + vals3 = vals1.astype(object) + # change lib.infer_dtype(vals3) from "integer" so we go through + # array_to_datetime in _sequence_to_dt64 + vals3[0] = pd.NaT + + vals4 = vals2.astype(object) + + res1 = DatetimeIndex(vals1, dtype=dtype) + res2 = DatetimeIndex(vals2, dtype=dtype) + res3 = DatetimeIndex(vals3, dtype=dtype) + res4 = DatetimeIndex(vals4, dtype=dtype) + + expected = DatetimeIndex(vals1.view("M8[us]")) + if res1.tz is not None: + expected = expected.tz_localize("UTC").tz_convert(res1.tz) + tm.assert_index_equal(res1, expected) + tm.assert_index_equal(res2, expected) + tm.assert_index_equal(res3, expected) + tm.assert_index_equal(res4, expected) + class TestTimeSeries: def test_dti_constructor_preserve_dti_freq(self): @@ -966,32 +1109,6 @@ result = DatetimeIndex(rng._data, freq=None) assert result.freq is None - dta = DatetimeArray(rng, freq=None) - assert dta.freq is None - - def test_dti_constructor_years_only(self, tz_naive_fixture): - tz = tz_naive_fixture - # GH 6961 - rng1 = date_range("2014", "2015", freq="M", tz=tz) - expected1 = date_range("2014-01-31", "2014-12-31", freq="M", tz=tz) - - rng2 = date_range("2014", "2015", freq="MS", tz=tz) - expected2 = date_range("2014-01-01", "2015-01-01", freq="MS", tz=tz) - - rng3 = date_range("2014", "2020", freq="A", tz=tz) - expected3 = date_range("2014-12-31", "2019-12-31", freq="A", tz=tz) - - rng4 = date_range("2014", "2020", freq="AS", tz=tz) - expected4 = date_range("2014-01-01", "2020-01-01", freq="AS", tz=tz) - - for rng, expected in [ - (rng1, expected1), - (rng2, expected2), - (rng3, expected3), - (rng4, expected4), - ]: - tm.assert_index_equal(rng, expected) - def test_dti_constructor_small_int(self, any_int_numpy_dtype): # see gh-13721 exp = DatetimeIndex( @@ -1009,12 +1126,6 @@ rng = DatetimeIndex(["1-1-2000 00:00:01"]) assert rng[0].second == 1 - def test_is_(self): - dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") - assert dti.is_(dti) - assert dti.is_(dti.view()) - assert not dti.is_(dti.copy()) - def test_index_cast_datetime64_other_units(self): arr = np.arange(0, 100, 10, dtype=np.int64).view("M8[D]") idx = Index(arr) @@ -1036,7 +1147,8 @@ assert (index.asi8[50:100] != -1).all() @pytest.mark.parametrize( - "freq", ["M", "Q", "A", "D", "B", "BH", "T", "S", "L", "U", "H", "N", "C"] + "freq", + ["ME", "QE", "YE", "D", "B", "bh", "min", "s", "ms", "us", "h", "ns", "C"], ) def test_from_freq_recreate_from_data(self, freq): org = date_range(start="2001/02/01 09:00", freq=freq, periods=1) @@ -1076,43 +1188,17 @@ for other in [idx2, idx3, idx4]: assert (idx1.values == other.values).all() - sdate = 
datetime(1999, 12, 25) - edate = datetime(2000, 1, 1) - idx = date_range(start=sdate, freq="1B", periods=20) - assert len(idx) == 20 - assert idx[0] == sdate + 0 * offsets.BDay() - assert idx.freq == "B" - - idx1 = date_range(start=sdate, end=edate, freq="W-SUN") - idx2 = date_range(start=sdate, end=edate, freq=offsets.Week(weekday=6)) - assert len(idx1) == len(idx2) - assert idx1.freq == idx2.freq - - idx1 = date_range(start=sdate, end=edate, freq="QS") - idx2 = date_range( - start=sdate, end=edate, freq=offsets.QuarterBegin(startingMonth=1) - ) - assert len(idx1) == len(idx2) - assert idx1.freq == idx2.freq - - idx1 = date_range(start=sdate, end=edate, freq="BQ") - idx2 = date_range( - start=sdate, end=edate, freq=offsets.BQuarterEnd(startingMonth=12) - ) - assert len(idx1) == len(idx2) - assert idx1.freq == idx2.freq - - def test_pass_datetimeindex_to_index(self): - # Bugs in #1396 - rng = date_range("1/1/2000", "3/1/2000") - idx = Index(rng, dtype=object) - - expected = Index(rng.to_pydatetime(), dtype=object) - - tm.assert_numpy_array_equal(idx.values, expected.values) - - def test_date_range_tuple_freq_raises(self): - # GH#34703 - edate = datetime(2000, 1, 1) - with pytest.raises(TypeError, match="pass as a string instead"): - date_range(end=edate, freq=("D", 5), periods=20) + def test_dti_constructor_object_dtype_dayfirst_yearfirst_with_tz(self): + # GH#55813 + val = "5/10/16" + + dfirst = Timestamp(2016, 10, 5, tz="US/Pacific") + yfirst = Timestamp(2005, 10, 16, tz="US/Pacific") + + result1 = DatetimeIndex([val], tz="US/Pacific", dayfirst=True) + expected1 = DatetimeIndex([dfirst]) + tm.assert_index_equal(result1, expected1) + + result2 = DatetimeIndex([val], tz="US/Pacific", yearfirst=True) + expected2 = DatetimeIndex([yfirst]) + tm.assert_index_equal(result2, expected2) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_date_range.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_date_range.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_date_range.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_date_range.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,6 +7,7 @@ time, timedelta, ) +import re import numpy as np import pytest @@ -37,6 +38,12 @@ ) import pandas._testing as tm from pandas.core.arrays.datetimes import _generate_range as generate_range +from pandas.tests.indexes.datetimes.test_timezones import ( + FixedOffset, + fixed_off_no_name, +) + +from pandas.tseries.holiday import USFederalHolidayCalendar START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) @@ -123,7 +130,48 @@ class TestDateRanges: - @pytest.mark.parametrize("freq", ["N", "U", "L", "T", "S", "H", "D"]) + def test_date_range_name(self): + idx = date_range(start="2000-01-01", periods=1, freq="YE", name="TEST") + assert idx.name == "TEST" + + def test_date_range_invalid_periods(self): + msg = "periods must be a number, got foo" + with pytest.raises(TypeError, match=msg): + date_range(start="1/1/2000", periods="foo", freq="D") + + def test_date_range_fractional_period(self): + msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + rng = date_range("1/1/2000", periods=10.5) + exp = date_range("1/1/2000", periods=10) + tm.assert_index_equal(rng, exp) + + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2ME", "2M"), + ("2SME", "2SM"), + ("2BQE", "2BQ"), + ("2BYE", "2BY"), + ], + ) + def test_date_range_frequency_M_SM_BQ_BY_deprecated(self, freq, freq_depr): 
+ # GH#52064 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + f"in a future version, please use '{freq[1:]}' instead." + + expected = date_range("1/1/2000", periods=4, freq=freq) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = date_range("1/1/2000", periods=4, freq=freq_depr) + tm.assert_index_equal(result, expected) + + def test_date_range_tuple_freq_raises(self): + # GH#34703 + edate = datetime(2000, 1, 1) + with pytest.raises(TypeError, match="pass as a string instead"): + date_range(end=edate, freq=("D", 5), periods=20) + + @pytest.mark.parametrize("freq", ["ns", "us", "ms", "min", "s", "h", "D"]) def test_date_range_edges(self, freq): # GH#13672 td = Timedelta(f"1{freq}") @@ -136,6 +184,7 @@ ) exp = DatetimeIndex( [ts + n * td for n in range(1, 5)], + dtype="M8[ns]", freq=freq, ) tm.assert_index_equal(idx, exp) @@ -146,7 +195,7 @@ end=ts + td, freq=freq, ) - exp = DatetimeIndex([], freq=freq) + exp = DatetimeIndex([], dtype="M8[ns]", freq=freq) tm.assert_index_equal(idx, exp) # start matches end @@ -155,7 +204,7 @@ end=ts + td, freq=freq, ) - exp = DatetimeIndex([ts + td], freq=freq) + exp = DatetimeIndex([ts + td], dtype="M8[ns]", freq=freq) tm.assert_index_equal(idx, exp) def test_date_range_near_implementation_bound(self): @@ -205,11 +254,11 @@ # case with start later than 1970-01-01, overflow int64 but not uint64 msg = "Cannot generate range with" with pytest.raises(OutOfBoundsDatetime, match=msg): - date_range(start="1970-02-01", periods=106752 * 24, freq="H") + date_range(start="1970-02-01", periods=106752 * 24, freq="h") # case with end before 1970-01-01, overflow int64 but not uint64 with pytest.raises(OutOfBoundsDatetime, match=msg): - date_range(end="1969-11-14", periods=106752 * 24, freq="H") + date_range(end="1969-11-14", periods=106752 * 24, freq="h") @pytest.mark.slow @pytest.mark.parametrize( @@ -223,11 +272,11 @@ start = Timestamp(s_ts) end = Timestamp(e_ts) - expected = date_range(start=start, end=end, freq="-1H") + expected = date_range(start=start, end=end, freq="-1h") assert expected[0] == start assert expected[-1] == end - dti = date_range(end=end, periods=len(expected), freq="-1H") + dti = date_range(end=end, periods=len(expected), freq="-1h") tm.assert_index_equal(dti, expected) def test_date_range_out_of_bounds(self): @@ -242,53 +291,6 @@ rng = date_range("1/1/2000 00:00", "1/1/2000 00:18", freq="5min") assert len(rng) == 4 - @pytest.mark.parametrize("freq", ["AS", "YS"]) - def test_begin_year_alias(self, freq): - # see gh-9313 - rng = date_range("1/1/2013", "7/1/2017", freq=freq) - exp = DatetimeIndex( - ["2013-01-01", "2014-01-01", "2015-01-01", "2016-01-01", "2017-01-01"], - freq=freq, - ) - tm.assert_index_equal(rng, exp) - - @pytest.mark.parametrize("freq", ["A", "Y"]) - def test_end_year_alias(self, freq): - # see gh-9313 - rng = date_range("1/1/2013", "7/1/2017", freq=freq) - exp = DatetimeIndex( - ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-31"], freq=freq - ) - tm.assert_index_equal(rng, exp) - - @pytest.mark.parametrize("freq", ["BA", "BY"]) - def test_business_end_year_alias(self, freq): - # see gh-9313 - rng = date_range("1/1/2013", "7/1/2017", freq=freq) - exp = DatetimeIndex( - ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-30"], freq=freq - ) - tm.assert_index_equal(rng, exp) - - def test_date_range_negative_freq(self): - # GH 11018 - rng = date_range("2011-12-31", freq="-2A", periods=3) - exp = DatetimeIndex(["2011-12-31", "2009-12-31", "2007-12-31"], freq="-2A") - 
tm.assert_index_equal(rng, exp) - assert rng.freq == "-2A" - - rng = date_range("2011-01-31", freq="-2M", periods=3) - exp = DatetimeIndex(["2011-01-31", "2010-11-30", "2010-09-30"], freq="-2M") - tm.assert_index_equal(rng, exp) - assert rng.freq == "-2M" - - def test_date_range_bms_bug(self): - # #1645 - rng = date_range("1/1/2000", periods=10, freq="BMS") - - ex_first = Timestamp("2000-01-03") - assert rng[0] == ex_first - def test_date_range_normalize(self): snap = datetime.today() n = 50 @@ -296,24 +298,17 @@ rng = date_range(snap, periods=n, normalize=False, freq="2D") offset = timedelta(2) - values = DatetimeIndex([snap + i * offset for i in range(n)], freq=offset) + expected = DatetimeIndex( + [snap + i * offset for i in range(n)], dtype="M8[ns]", freq=offset + ) - tm.assert_index_equal(rng, values) + tm.assert_index_equal(rng, expected) rng = date_range("1/1/2000 08:15", periods=n, normalize=False, freq="B") the_time = time(8, 15) for val in rng: assert val.time() == the_time - def test_date_range_fy5252(self): - dr = date_range( - start="2013-01-01", - periods=2, - freq=offsets.FY5253(startingMonth=1, weekday=3, variation="nearest"), - ) - assert dr[0] == Timestamp("2013-01-31") - assert dr[1] == Timestamp("2014-01-30") - def test_date_range_ambiguous_arguments(self): # #2538 start = datetime(2011, 1, 1, 5, 3, 40) @@ -326,11 +321,12 @@ with pytest.raises(ValueError, match=msg): date_range(start, end, periods=10, freq="s") - def test_date_range_convenience_periods(self): + def test_date_range_convenience_periods(self, unit): # GH 20808 - result = date_range("2018-04-24", "2018-04-27", periods=3) + result = date_range("2018-04-24", "2018-04-27", periods=3, unit=unit) expected = DatetimeIndex( ["2018-04-24 00:00:00", "2018-04-25 12:00:00", "2018-04-27 00:00:00"], + dtype=f"M8[{unit}]", freq=None, ) @@ -342,6 +338,7 @@ "2018-04-01 04:00:00", tz="Australia/Sydney", periods=3, + unit=unit, ) expected = DatetimeIndex( [ @@ -349,7 +346,7 @@ Timestamp("2018-04-01 02:00:00+1000", tz="Australia/Sydney"), Timestamp("2018-04-01 04:00:00+1000", tz="Australia/Sydney"), ] - ) + ).as_unit(unit) tm.assert_index_equal(result, expected) def test_date_range_index_comparison(self): @@ -404,59 +401,6 @@ expected = date_range("20180101", periods=3, freq="D", tz="US/Eastern") tm.assert_index_equal(result, expected) - def test_date_range_businesshour(self): - idx = DatetimeIndex( - [ - "2014-07-04 09:00", - "2014-07-04 10:00", - "2014-07-04 11:00", - "2014-07-04 12:00", - "2014-07-04 13:00", - "2014-07-04 14:00", - "2014-07-04 15:00", - "2014-07-04 16:00", - ], - freq="BH", - ) - rng = date_range("2014-07-04 09:00", "2014-07-04 16:00", freq="BH") - tm.assert_index_equal(idx, rng) - - idx = DatetimeIndex(["2014-07-04 16:00", "2014-07-07 09:00"], freq="BH") - rng = date_range("2014-07-04 16:00", "2014-07-07 09:00", freq="BH") - tm.assert_index_equal(idx, rng) - - idx = DatetimeIndex( - [ - "2014-07-04 09:00", - "2014-07-04 10:00", - "2014-07-04 11:00", - "2014-07-04 12:00", - "2014-07-04 13:00", - "2014-07-04 14:00", - "2014-07-04 15:00", - "2014-07-04 16:00", - "2014-07-07 09:00", - "2014-07-07 10:00", - "2014-07-07 11:00", - "2014-07-07 12:00", - "2014-07-07 13:00", - "2014-07-07 14:00", - "2014-07-07 15:00", - "2014-07-07 16:00", - "2014-07-08 09:00", - "2014-07-08 10:00", - "2014-07-08 11:00", - "2014-07-08 12:00", - "2014-07-08 13:00", - "2014-07-08 14:00", - "2014-07-08 15:00", - "2014-07-08 16:00", - ], - freq="BH", - ) - rng = date_range("2014-07-04 09:00", "2014-07-08 16:00", freq="BH") - 
tm.assert_index_equal(idx, rng) - def test_date_range_timedelta(self): start = "2020-01-01" end = "2020-01-11" @@ -481,13 +425,13 @@ date_range(periods=10) with pytest.raises(ValueError, match=msg): - date_range(start="1/1/2000", freq="H") + date_range(start="1/1/2000", freq="h") with pytest.raises(ValueError, match=msg): - date_range(end="1/1/2000", freq="H") + date_range(end="1/1/2000", freq="h") with pytest.raises(ValueError, match=msg): - date_range(periods=10, freq="H") + date_range(periods=10, freq="h") with pytest.raises(ValueError, match=msg): date_range() @@ -505,13 +449,7 @@ with pytest.raises(ValueError, match=msg): date_range(datetime(2011, 11, 11), datetime(2011, 11, 12), freq=offset) - @pytest.mark.parametrize("periods", (1, 2)) - def test_wom_len(self, periods): - # https://github.com/pandas-dev/pandas/issues/20517 - res = date_range(start="20110101", periods=periods, freq="WOM-1MON") - assert len(res) == periods - - def test_construct_over_dst(self): + def test_construct_over_dst(self, unit): # GH 20854 pre_dst = Timestamp("2010-11-07 01:00:00").tz_localize( "US/Pacific", ambiguous=True @@ -524,14 +462,19 @@ pre_dst, pst_dst, ] - expected = DatetimeIndex(expect_data, freq="H") - result = date_range(start="2010-11-7", periods=3, freq="H", tz="US/Pacific") + expected = DatetimeIndex(expect_data, freq="h").as_unit(unit) + result = date_range( + start="2010-11-7", periods=3, freq="h", tz="US/Pacific", unit=unit + ) tm.assert_index_equal(result, expected) - def test_construct_with_different_start_end_string_format(self): + def test_construct_with_different_start_end_string_format(self, unit): # GH 12064 result = date_range( - "2013-01-01 00:00:00+09:00", "2013/01/01 02:00:00+09:00", freq="H" + "2013-01-01 00:00:00+09:00", + "2013/01/01 02:00:00+09:00", + freq="h", + unit=unit, ) expected = DatetimeIndex( [ @@ -539,8 +482,8 @@ Timestamp("2013-01-01 01:00:00+09:00"), Timestamp("2013-01-01 02:00:00+09:00"), ], - freq="H", - ) + freq="h", + ).as_unit(unit) tm.assert_index_equal(result, expected) def test_error_with_zero_monthends(self): @@ -548,13 +491,15 @@ with pytest.raises(ValueError, match=msg): date_range("1/1/2000", "1/1/2001", freq=MonthEnd(0)) - def test_range_bug(self): + def test_range_bug(self, unit): # GH #770 offset = DateOffset(months=3) - result = date_range("2011-1-1", "2012-1-31", freq=offset) + result = date_range("2011-1-1", "2012-1-31", freq=offset, unit=unit) start = datetime(2011, 1, 1) - expected = DatetimeIndex([start + i * offset for i in range(5)], freq=offset) + expected = DatetimeIndex( + [start + i * offset for i in range(5)], dtype=f"M8[{unit}]", freq=offset + ) tm.assert_index_equal(result, expected) def test_range_tz_pytz(self): @@ -638,43 +583,25 @@ assert dr[0] == start assert dr[2] == end - @pytest.mark.parametrize("freq", ["1D", "3D", "2M", "7W", "3H", "A"]) - def test_range_closed(self, freq, inclusive_endpoints_fixture): - begin = datetime(2011, 1, 1) - end = datetime(2014, 1, 1) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3h", "YE"]) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + def test_range_closed(self, freq, tz, inclusive_endpoints_fixture): + # GH#12409, GH#12684 - result_range = date_range( - begin, end, inclusive=inclusive_endpoints_fixture, freq=freq - ) - both_range = date_range(begin, end, inclusive="both", freq=freq) - expected_range = _get_expected_range( - begin, end, both_range, inclusive_endpoints_fixture - ) - - tm.assert_index_equal(expected_range, result_range) - - @pytest.mark.parametrize("freq", 
["1D", "3D", "2M", "7W", "3H", "A"]) - def test_range_closed_with_tz_aware_start_end( - self, freq, inclusive_endpoints_fixture - ): - # GH12409, GH12684 - begin = Timestamp("2011/1/1", tz="US/Eastern") - end = Timestamp("2014/1/1", tz="US/Eastern") + begin = Timestamp("2011/1/1", tz=tz) + end = Timestamp("2014/1/1", tz=tz) result_range = date_range( begin, end, inclusive=inclusive_endpoints_fixture, freq=freq ) both_range = date_range(begin, end, inclusive="both", freq=freq) expected_range = _get_expected_range( - begin, - end, - both_range, - inclusive_endpoints_fixture, + begin, end, both_range, inclusive_endpoints_fixture ) tm.assert_index_equal(expected_range, result_range) - @pytest.mark.parametrize("freq", ["1D", "3D", "2M", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2ME", "7W", "3h", "YE"]) def test_range_with_tz_closed_with_tz_aware_start_end( self, freq, inclusive_endpoints_fixture ): @@ -748,11 +675,24 @@ tm.assert_index_equal(both_boundary, expected_both) tm.assert_index_equal(neither_boundary, expected_neither) - def test_years_only(self): - # GH 6961 - dr = date_range("2014", "2015", freq="M") - assert dr[0] == datetime(2014, 1, 31) - assert dr[-1] == datetime(2014, 12, 31) + def test_date_range_years_only(self, tz_naive_fixture): + tz = tz_naive_fixture + # GH#6961 + rng1 = date_range("2014", "2015", freq="ME", tz=tz) + expected1 = date_range("2014-01-31", "2014-12-31", freq="ME", tz=tz) + tm.assert_index_equal(rng1, expected1) + + rng2 = date_range("2014", "2015", freq="MS", tz=tz) + expected2 = date_range("2014-01-01", "2015-01-01", freq="MS", tz=tz) + tm.assert_index_equal(rng2, expected2) + + rng3 = date_range("2014", "2020", freq="YE", tz=tz) + expected3 = date_range("2014-12-31", "2019-12-31", freq="YE", tz=tz) + tm.assert_index_equal(rng3, expected3) + + rng4 = date_range("2014", "2020", freq="YS", tz=tz) + expected4 = date_range("2014-01-01", "2020-01-01", freq="YS", tz=tz) + tm.assert_index_equal(rng4, expected4) def test_freq_divides_end_in_nanos(self): # GH 10885 @@ -761,13 +701,13 @@ expected_1 = DatetimeIndex( ["2005-01-12 10:00:00", "2005-01-12 15:45:00"], dtype="datetime64[ns]", - freq="345T", + freq="345min", tz=None, ) expected_2 = DatetimeIndex( ["2005-01-13 10:00:00", "2005-01-13 15:45:00"], dtype="datetime64[ns]", - freq="345T", + freq="345min", tz=None, ) tm.assert_index_equal(result_1, expected_1) @@ -836,6 +776,70 @@ ) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("h", "H"), + ("2min", "2T"), + ("1s", "1S"), + ("2ms", "2L"), + ("1us", "1U"), + ("2ns", "2N"), + ], + ) + def test_frequencies_H_T_S_L_U_N_deprecated(self, freq, freq_depr): + # GH#52536 + freq_msg = re.split("[0-9]*", freq, maxsplit=1)[1] + freq_depr_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] + msg = ( + f"'{freq_depr_msg}' is deprecated and will be removed in a future version, " + ) + f"please use '{freq_msg}' instead" + + expected = date_range("1/1/2000", periods=2, freq=freq) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = date_range("1/1/2000", periods=2, freq=freq_depr) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("200YE", "200A"), + ("YE", "Y"), + ("2YE-MAY", "2A-MAY"), + ("YE-MAY", "Y-MAY"), + ], + ) + def test_frequencies_A_deprecated_Y_renamed(self, freq, freq_depr): + # GH#9586, GH#54275 + freq_msg = re.split("[0-9]*", freq, maxsplit=1)[1] + freq_depr_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] + msg = 
f"'{freq_depr_msg}' is deprecated and will be removed " + f"in a future version, please use '{freq_msg}' instead." + + expected = date_range("1/1/2000", periods=2, freq=freq) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = date_range("1/1/2000", periods=2, freq=freq_depr) + tm.assert_index_equal(result, expected) + + def test_to_offset_with_lowercase_deprecated_freq(self) -> None: + # https://github.com/pandas-dev/pandas/issues/56847 + msg = ( + "'m' is deprecated and will be removed in a future version, please use " + "'ME' instead." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = date_range("2010-01-01", periods=2, freq="m") + expected = DatetimeIndex(["2010-01-31", "2010-02-28"], freq="ME") + tm.assert_index_equal(result, expected) + + def test_date_range_bday(self): + sdate = datetime(1999, 12, 25) + idx = date_range(start=sdate, freq="1B", periods=20) + assert len(idx) == 20 + assert idx[0] == sdate + 0 * offsets.BDay() + assert idx.freq == "B" + class TestDateRangeTZ: """Tests for date_range with timezones""" @@ -869,9 +873,20 @@ tm.assert_index_equal(result, expected) - def test_date_range_with_fixedoffset_noname(self): - from pandas.tests.indexes.datetimes.test_timezones import fixed_off_no_name + def test_date_range_with_fixed_tz(self): + off = FixedOffset(420, "+07:00") + start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) + end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) + rng = date_range(start=start, end=end) + assert off == rng.tz + rng2 = date_range(start, periods=len(rng), tz=off) + tm.assert_index_equal(rng, rng2) + + rng3 = date_range("3/11/2012 05:00:00+07:00", "6/11/2012 05:00:00+07:00") + assert (rng.values == rng3.values).all() + + def test_date_range_with_fixedoffset_noname(self): off = fixed_off_no_name start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) @@ -886,20 +901,61 @@ stamp = Timestamp("3/11/2012 05:00", tz=tzstr) assert stamp.hour == 5 - rng = date_range("3/11/2012 04:00", periods=10, freq="H", tz=tzstr) + rng = date_range("3/11/2012 04:00", periods=10, freq="h", tz=tzstr) assert stamp == rng[1] + @pytest.mark.parametrize("tz", ["Europe/London", "dateutil/Europe/London"]) + def test_date_range_ambiguous_endpoint(self, tz): + # construction with an ambiguous end-point + # GH#11626 + + with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): + date_range( + "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="h" + ) -class TestGenRangeGeneration: - def test_generate(self): - rng1 = list(generate_range(START, END, periods=None, offset=BDay(), unit="ns")) - rng2 = list(generate_range(START, END, periods=None, offset="B", unit="ns")) - assert rng1 == rng2 + times = date_range( + "2013-10-26 23:00", "2013-10-27 01:00", freq="h", tz=tz, ambiguous="infer" + ) + assert times[0] == Timestamp("2013-10-26 23:00", tz=tz) + assert times[-1] == Timestamp("2013-10-27 01:00:00+0000", tz=tz) - def test_generate_cday(self): - rng1 = list(generate_range(START, END, periods=None, offset=CDay(), unit="ns")) - rng2 = list(generate_range(START, END, periods=None, offset="C", unit="ns")) + @pytest.mark.parametrize( + "tz, option, expected", + [ + ["US/Pacific", "shift_forward", "2019-03-10 03:00"], + ["dateutil/US/Pacific", "shift_forward", "2019-03-10 03:00"], + ["US/Pacific", "shift_backward", "2019-03-10 01:00"], + ["dateutil/US/Pacific", "shift_backward", "2019-03-10 01:00"], + ["US/Pacific", timedelta(hours=1), "2019-03-10 03:00"], + ], + ) + def 
test_date_range_nonexistent_endpoint(self, tz, option, expected): + # construction with an nonexistent end-point + + with pytest.raises(pytz.NonExistentTimeError, match="2019-03-10 02:00:00"): + date_range( + "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="h" + ) + + times = date_range( + "2019-03-10 00:00", "2019-03-10 02:00", freq="h", tz=tz, nonexistent=option + ) + assert times[-1] == Timestamp(expected, tz=tz) + + +class TestGenRangeGeneration: + @pytest.mark.parametrize( + "freqstr,offset", + [ + ("B", BDay()), + ("C", CDay()), + ], + ) + def test_generate(self, freqstr, offset): + rng1 = list(generate_range(START, END, periods=None, offset=offset, unit="ns")) + rng2 = list(generate_range(START, END, periods=None, offset=freqstr, unit="ns")) assert rng1 == rng2 def test_1(self): @@ -944,7 +1000,7 @@ def test_precision_finer_than_offset(self): # GH#9907 result1 = date_range( - start="2015-04-15 00:00:03", end="2016-04-22 00:00:00", freq="Q" + start="2015-04-15 00:00:03", end="2016-04-22 00:00:00", freq="QE" ) result2 = date_range( start="2015-04-15 00:00:03", end="2015-06-22 00:00:04", freq="W" @@ -968,7 +1024,7 @@ "2015-06-21 00:00:03", ] expected1 = DatetimeIndex( - expected1_list, dtype="datetime64[ns]", freq="Q-DEC", tz=None + expected1_list, dtype="datetime64[ns]", freq="QE-DEC", tz=None ) expected2 = DatetimeIndex( expected2_list, dtype="datetime64[ns]", freq="W-SUN", tz=None @@ -1065,7 +1121,7 @@ # GH#24252 avoid doing unnecessary addition that _would_ overflow start = Timestamp.max.floor("D").to_pydatetime() rng = date_range(start, end=None, periods=1, freq="B") - expected = DatetimeIndex([start], freq="B") + expected = DatetimeIndex([start], freq="B").as_unit("ns") tm.assert_index_equal(rng, expected) def test_bday_overflow_error(self): @@ -1107,18 +1163,22 @@ result = rng1.union(rng2) assert isinstance(result, DatetimeIndex) - def test_cdaterange(self): - result = bdate_range("2013-05-01", periods=3, freq="C") - expected = DatetimeIndex(["2013-05-01", "2013-05-02", "2013-05-03"], freq="C") + def test_cdaterange(self, unit): + result = bdate_range("2013-05-01", periods=3, freq="C", unit=unit) + expected = DatetimeIndex( + ["2013-05-01", "2013-05-02", "2013-05-03"], dtype=f"M8[{unit}]", freq="C" + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - def test_cdaterange_weekmask(self): + def test_cdaterange_weekmask(self, unit): result = bdate_range( - "2013-05-01", periods=3, freq="C", weekmask="Sun Mon Tue Wed Thu" + "2013-05-01", periods=3, freq="C", weekmask="Sun Mon Tue Wed Thu", unit=unit ) expected = DatetimeIndex( - ["2013-05-01", "2013-05-02", "2013-05-05"], freq=result.freq + ["2013-05-01", "2013-05-02", "2013-05-05"], + dtype=f"M8[{unit}]", + freq=result.freq, ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq @@ -1131,10 +1191,14 @@ with pytest.raises(ValueError, match=msg): bdate_range("2013-05-01", periods=3, weekmask="Sun Mon Tue Wed Thu") - def test_cdaterange_holidays(self): - result = bdate_range("2013-05-01", periods=3, freq="C", holidays=["2013-05-01"]) + def test_cdaterange_holidays(self, unit): + result = bdate_range( + "2013-05-01", periods=3, freq="C", holidays=["2013-05-01"], unit=unit + ) expected = DatetimeIndex( - ["2013-05-02", "2013-05-03", "2013-05-06"], freq=result.freq + ["2013-05-02", "2013-05-03", "2013-05-06"], + dtype=f"M8[{unit}]", + freq=result.freq, ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq @@ -1147,20 +1211,24 @@ with 
pytest.raises(ValueError, match=msg): bdate_range("2013-05-01", periods=3, holidays=["2013-05-01"]) - def test_cdaterange_weekmask_and_holidays(self): + def test_cdaterange_weekmask_and_holidays(self, unit): result = bdate_range( "2013-05-01", periods=3, freq="C", weekmask="Sun Mon Tue Wed Thu", holidays=["2013-05-01"], + unit=unit, ) expected = DatetimeIndex( - ["2013-05-02", "2013-05-05", "2013-05-06"], freq=result.freq + ["2013-05-02", "2013-05-05", "2013-05-06"], + dtype=f"M8[{unit}]", + freq=result.freq, ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq + def test_cdaterange_holidays_weekmask_requires_freqstr(self): # raise with non-custom freq msg = ( "a custom frequency string is required when holidays or " @@ -1200,7 +1268,7 @@ # https://github.com/pandas-dev/pandas/issues/24110 start, end = start_end result = date_range(start=start, end=end, periods=2, inclusive="left") - expected = DatetimeIndex([start]) + expected = DatetimeIndex([start], dtype="M8[ns, UTC]") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -1218,26 +1286,10 @@ def test_range_with_timezone_and_custombusinessday(self, start, period, expected): # GH49441 result = date_range(start=start, periods=period, freq="C") - expected = DatetimeIndex(expected) + expected = DatetimeIndex(expected).as_unit("ns") tm.assert_index_equal(result, expected) -def test_date_range_with_custom_holidays(): - # GH 30593 - freq = offsets.CustomBusinessHour(start="15:00", holidays=["2020-11-26"]) - result = date_range(start="2020-11-25 15:00", periods=4, freq=freq) - expected = DatetimeIndex( - [ - "2020-11-25 15:00:00", - "2020-11-25 16:00:00", - "2020-11-27 15:00:00", - "2020-11-27 16:00:00", - ], - freq=freq, - ) - tm.assert_index_equal(result, expected) - - class TestDateRangeNonNano: def test_date_range_reso_validation(self): msg = "'unit' must be one of 's', 'ms', 'us', 'ns'" @@ -1302,3 +1354,368 @@ ).view("M8[s]") tm.assert_numpy_array_equal(dti.to_numpy(), exp) + + +class TestDateRangeNonTickFreq: + # Tests revolving around less-common (non-Tick) `freq` keywords. 
+ + def test_date_range_custom_business_month_begin(self, unit): + hcal = USFederalHolidayCalendar() + freq = offsets.CBMonthBegin(calendar=hcal) + dti = date_range(start="20120101", end="20130101", freq=freq, unit=unit) + assert all(freq.is_on_offset(x) for x in dti) + + expected = DatetimeIndex( + [ + "2012-01-03", + "2012-02-01", + "2012-03-01", + "2012-04-02", + "2012-05-01", + "2012-06-01", + "2012-07-02", + "2012-08-01", + "2012-09-04", + "2012-10-01", + "2012-11-01", + "2012-12-03", + ], + dtype=f"M8[{unit}]", + freq=freq, + ) + tm.assert_index_equal(dti, expected) + + def test_date_range_custom_business_month_end(self, unit): + hcal = USFederalHolidayCalendar() + freq = offsets.CBMonthEnd(calendar=hcal) + dti = date_range(start="20120101", end="20130101", freq=freq, unit=unit) + assert all(freq.is_on_offset(x) for x in dti) + + expected = DatetimeIndex( + [ + "2012-01-31", + "2012-02-29", + "2012-03-30", + "2012-04-30", + "2012-05-31", + "2012-06-29", + "2012-07-31", + "2012-08-31", + "2012-09-28", + "2012-10-31", + "2012-11-30", + "2012-12-31", + ], + dtype=f"M8[{unit}]", + freq=freq, + ) + tm.assert_index_equal(dti, expected) + + def test_date_range_with_custom_holidays(self, unit): + # GH#30593 + freq = offsets.CustomBusinessHour(start="15:00", holidays=["2020-11-26"]) + result = date_range(start="2020-11-25 15:00", periods=4, freq=freq, unit=unit) + expected = DatetimeIndex( + [ + "2020-11-25 15:00:00", + "2020-11-25 16:00:00", + "2020-11-27 15:00:00", + "2020-11-27 16:00:00", + ], + dtype=f"M8[{unit}]", + freq=freq, + ) + tm.assert_index_equal(result, expected) + + def test_date_range_businesshour(self, unit): + idx = DatetimeIndex( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + ], + dtype=f"M8[{unit}]", + freq="bh", + ) + rng = date_range("2014-07-04 09:00", "2014-07-04 16:00", freq="bh", unit=unit) + tm.assert_index_equal(idx, rng) + + idx = DatetimeIndex( + ["2014-07-04 16:00", "2014-07-07 09:00"], dtype=f"M8[{unit}]", freq="bh" + ) + rng = date_range("2014-07-04 16:00", "2014-07-07 09:00", freq="bh", unit=unit) + tm.assert_index_equal(idx, rng) + + idx = DatetimeIndex( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", + "2014-07-08 11:00", + "2014-07-08 12:00", + "2014-07-08 13:00", + "2014-07-08 14:00", + "2014-07-08 15:00", + "2014-07-08 16:00", + ], + dtype=f"M8[{unit}]", + freq="bh", + ) + rng = date_range("2014-07-04 09:00", "2014-07-08 16:00", freq="bh", unit=unit) + tm.assert_index_equal(idx, rng) + + def test_date_range_business_hour2(self, unit): + idx1 = date_range( + start="2014-07-04 15:00", end="2014-07-08 10:00", freq="bh", unit=unit + ) + idx2 = date_range(start="2014-07-04 15:00", periods=12, freq="bh", unit=unit) + idx3 = date_range(end="2014-07-08 10:00", periods=12, freq="bh", unit=unit) + expected = DatetimeIndex( + [ + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", 
+ ], + dtype=f"M8[{unit}]", + freq="bh", + ) + tm.assert_index_equal(idx1, expected) + tm.assert_index_equal(idx2, expected) + tm.assert_index_equal(idx3, expected) + + idx4 = date_range( + start="2014-07-04 15:45", end="2014-07-08 10:45", freq="bh", unit=unit + ) + idx5 = date_range(start="2014-07-04 15:45", periods=12, freq="bh", unit=unit) + idx6 = date_range(end="2014-07-08 10:45", periods=12, freq="bh", unit=unit) + + expected2 = expected + Timedelta(minutes=45).as_unit(unit) + expected2.freq = "bh" + tm.assert_index_equal(idx4, expected2) + tm.assert_index_equal(idx5, expected2) + tm.assert_index_equal(idx6, expected2) + + def test_date_range_business_hour_short(self, unit): + # GH#49835 + idx4 = date_range(start="2014-07-01 10:00", freq="bh", periods=1, unit=unit) + expected4 = DatetimeIndex(["2014-07-01 10:00"], dtype=f"M8[{unit}]", freq="bh") + tm.assert_index_equal(idx4, expected4) + + def test_date_range_year_start(self, unit): + # see GH#9313 + rng = date_range("1/1/2013", "7/1/2017", freq="YS", unit=unit) + exp = DatetimeIndex( + ["2013-01-01", "2014-01-01", "2015-01-01", "2016-01-01", "2017-01-01"], + dtype=f"M8[{unit}]", + freq="YS", + ) + tm.assert_index_equal(rng, exp) + + def test_date_range_year_end(self, unit): + # see GH#9313 + rng = date_range("1/1/2013", "7/1/2017", freq="YE", unit=unit) + exp = DatetimeIndex( + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-31"], + dtype=f"M8[{unit}]", + freq="YE", + ) + tm.assert_index_equal(rng, exp) + + def test_date_range_negative_freq_year_end(self, unit): + # GH#11018 + rng = date_range("2011-12-31", freq="-2YE", periods=3, unit=unit) + exp = DatetimeIndex( + ["2011-12-31", "2009-12-31", "2007-12-31"], dtype=f"M8[{unit}]", freq="-2YE" + ) + tm.assert_index_equal(rng, exp) + assert rng.freq == "-2YE" + + def test_date_range_business_year_end_year(self, unit): + # see GH#9313 + rng = date_range("1/1/2013", "7/1/2017", freq="BYE", unit=unit) + exp = DatetimeIndex( + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-30"], + dtype=f"M8[{unit}]", + freq="BYE", + ) + tm.assert_index_equal(rng, exp) + + def test_date_range_bms(self, unit): + # GH#1645 + result = date_range("1/1/2000", periods=10, freq="BMS", unit=unit) + + expected = DatetimeIndex( + [ + "2000-01-03", + "2000-02-01", + "2000-03-01", + "2000-04-03", + "2000-05-01", + "2000-06-01", + "2000-07-03", + "2000-08-01", + "2000-09-01", + "2000-10-02", + ], + dtype=f"M8[{unit}]", + freq="BMS", + ) + tm.assert_index_equal(result, expected) + + def test_date_range_semi_month_begin(self, unit): + dates = [ + datetime(2007, 12, 15), + datetime(2008, 1, 1), + datetime(2008, 1, 15), + datetime(2008, 2, 1), + datetime(2008, 2, 15), + datetime(2008, 3, 1), + datetime(2008, 3, 15), + datetime(2008, 4, 1), + datetime(2008, 4, 15), + datetime(2008, 5, 1), + datetime(2008, 5, 15), + datetime(2008, 6, 1), + datetime(2008, 6, 15), + datetime(2008, 7, 1), + datetime(2008, 7, 15), + datetime(2008, 8, 1), + datetime(2008, 8, 15), + datetime(2008, 9, 1), + datetime(2008, 9, 15), + datetime(2008, 10, 1), + datetime(2008, 10, 15), + datetime(2008, 11, 1), + datetime(2008, 11, 15), + datetime(2008, 12, 1), + datetime(2008, 12, 15), + ] + # ensure generating a range with DatetimeIndex gives same result + result = date_range(start=dates[0], end=dates[-1], freq="SMS", unit=unit) + exp = DatetimeIndex(dates, dtype=f"M8[{unit}]", freq="SMS") + tm.assert_index_equal(result, exp) + + def test_date_range_semi_month_end(self, unit): + dates = [ + datetime(2007, 12, 31), + datetime(2008, 1, 15), + 
datetime(2008, 1, 31), + datetime(2008, 2, 15), + datetime(2008, 2, 29), + datetime(2008, 3, 15), + datetime(2008, 3, 31), + datetime(2008, 4, 15), + datetime(2008, 4, 30), + datetime(2008, 5, 15), + datetime(2008, 5, 31), + datetime(2008, 6, 15), + datetime(2008, 6, 30), + datetime(2008, 7, 15), + datetime(2008, 7, 31), + datetime(2008, 8, 15), + datetime(2008, 8, 31), + datetime(2008, 9, 15), + datetime(2008, 9, 30), + datetime(2008, 10, 15), + datetime(2008, 10, 31), + datetime(2008, 11, 15), + datetime(2008, 11, 30), + datetime(2008, 12, 15), + datetime(2008, 12, 31), + ] + # ensure generating a range with DatetimeIndex gives same result + result = date_range(start=dates[0], end=dates[-1], freq="SME", unit=unit) + exp = DatetimeIndex(dates, dtype=f"M8[{unit}]", freq="SME") + tm.assert_index_equal(result, exp) + + def test_date_range_week_of_month(self, unit): + # GH#20517 + # Note the start here is not on_offset for this freq + result = date_range(start="20110101", periods=1, freq="WOM-1MON", unit=unit) + expected = DatetimeIndex(["2011-01-03"], dtype=f"M8[{unit}]", freq="WOM-1MON") + tm.assert_index_equal(result, expected) + + result2 = date_range(start="20110101", periods=2, freq="WOM-1MON", unit=unit) + expected2 = DatetimeIndex( + ["2011-01-03", "2011-02-07"], dtype=f"M8[{unit}]", freq="WOM-1MON" + ) + tm.assert_index_equal(result2, expected2) + + def test_date_range_week_of_month2(self, unit): + # GH#5115, GH#5348 + result = date_range("2013-1-1", periods=4, freq="WOM-1SAT", unit=unit) + expected = DatetimeIndex( + ["2013-01-05", "2013-02-02", "2013-03-02", "2013-04-06"], + dtype=f"M8[{unit}]", + freq="WOM-1SAT", + ) + tm.assert_index_equal(result, expected) + + def test_date_range_negative_freq_month_end(self, unit): + # GH#11018 + rng = date_range("2011-01-31", freq="-2ME", periods=3, unit=unit) + exp = DatetimeIndex( + ["2011-01-31", "2010-11-30", "2010-09-30"], dtype=f"M8[{unit}]", freq="-2ME" + ) + tm.assert_index_equal(rng, exp) + assert rng.freq == "-2ME" + + def test_date_range_fy5253(self, unit): + freq = offsets.FY5253(startingMonth=1, weekday=3, variation="nearest") + dti = date_range( + start="2013-01-01", + periods=2, + freq=freq, + unit=unit, + ) + expected = DatetimeIndex( + ["2013-01-31", "2014-01-30"], dtype=f"M8[{unit}]", freq=freq + ) + + tm.assert_index_equal(dti, expected) + + @pytest.mark.parametrize( + "freqstr,offset", + [ + ("QS", offsets.QuarterBegin(startingMonth=1)), + ("BQE", offsets.BQuarterEnd(startingMonth=12)), + ("W-SUN", offsets.Week(weekday=6)), + ], + ) + def test_date_range_freqstr_matches_offset(self, freqstr, offset): + sdate = datetime(1999, 12, 25) + edate = datetime(2000, 1, 1) + + idx1 = date_range(start=sdate, end=edate, freq=freqstr) + idx2 = date_range(start=sdate, end=edate, freq=offset) + assert len(idx1) == len(idx2) + assert idx1.freq == idx2.freq diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_datetime.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_datetime.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_datetime.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_datetime.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,7 @@ +import datetime as dt from datetime import date +import re -import dateutil import numpy as np import pytest @@ -19,35 +20,11 @@ class TestDatetimeIndex: - def test_sub_datetime_preserves_freq(self, tz_naive_fixture): - # GH#48818 - dti = date_range("2016-01-01", periods=12, tz=tz_naive_fixture) - - res = dti - dti[0] - 
expected = pd.timedelta_range("0 Days", "11 Days") - tm.assert_index_equal(res, expected) - assert res.freq == expected.freq - - @pytest.mark.xfail( - reason="The inherited freq is incorrect bc dti.freq is incorrect " - "https://github.com/pandas-dev/pandas/pull/48818/files#r982793461" - ) - def test_sub_datetime_preserves_freq_across_dst(self): - # GH#48818 - ts = Timestamp("2016-03-11", tz="US/Pacific") - dti = date_range(ts, periods=4) - - res = dti - dti[0] - expected = pd.TimedeltaIndex( - [ - pd.Timedelta(days=0), - pd.Timedelta(days=1), - pd.Timedelta(days=2), - pd.Timedelta(days=2, hours=23), - ] - ) - tm.assert_index_equal(res, expected) - assert res.freq == expected.freq + def test_is_(self): + dti = date_range(start="1/1/2005", end="12/1/2005", freq="ME") + assert dti.is_(dti) + assert dti.is_(dti.view()) + assert not dti.is_(dti.copy()) def test_time_overflow_for_32bit_machines(self): # GH8943. On some machines NumPy defaults to np.int32 (for example, @@ -58,10 +35,10 @@ # overflow. periods = np_long(1000) - idx1 = date_range(start="2000", periods=periods, freq="S") + idx1 = date_range(start="2000", periods=periods, freq="s") assert len(idx1) == periods - idx2 = date_range(end="2000", periods=periods, freq="S") + idx2 = date_range(end="2000", periods=periods, freq="s") assert len(idx2) == periods def test_nat(self): @@ -81,12 +58,6 @@ expected = DatetimeIndex([d1, d3, d2]) tm.assert_index_equal(result_union, expected) - # GH 5115 - result = date_range("2013-1-1", periods=4, freq="WOM-1SAT") - dates = ["2013-01-05", "2013-02-02", "2013-03-02", "2013-04-06"] - expected = DatetimeIndex(dates, freq="WOM-1SAT") - tm.assert_index_equal(result, expected) - def test_append_nondatetimeindex(self): rng = date_range("1/1/2000", periods=10) idx = Index(["a", "b", "c", "d"]) @@ -94,51 +65,12 @@ result = rng.append(idx) assert isinstance(result[0], Timestamp) - def test_iteration_preserves_tz(self): - # see gh-8890 - index = date_range("2012-01-01", periods=3, freq="H", tz="US/Eastern") - - for i, ts in enumerate(index): - result = ts - expected = index[i] # pylint: disable=unnecessary-list-index-lookup - assert result == expected - - index = date_range( - "2012-01-01", periods=3, freq="H", tz=dateutil.tz.tzoffset(None, -28800) - ) - - for i, ts in enumerate(index): - result = ts - expected = index[i] # pylint: disable=unnecessary-list-index-lookup - assert result._repr_base == expected._repr_base - assert result == expected - - # 9100 - index = DatetimeIndex( - ["2014-12-01 03:32:39.987000-08:00", "2014-12-01 04:12:34.987000-08:00"] - ) - for i, ts in enumerate(index): - result = ts - expected = index[i] # pylint: disable=unnecessary-list-index-lookup - assert result._repr_base == expected._repr_base - assert result == expected - - @pytest.mark.parametrize("periods", [0, 9999, 10000, 10001]) - def test_iteration_over_chunksize(self, periods): - # GH21012 - - index = date_range("2000-01-01 00:00:00", periods=periods, freq="min") - num = 0 - for stamp in index: - assert index[num] == stamp - num += 1 - assert num == len(index) - def test_misc_coverage(self): rng = date_range("1/1/2000", periods=5) result = rng.groupby(rng.day) assert isinstance(next(iter(result.values()))[0], Timestamp) + # TODO: belongs in frame groupby tests? 
def test_groupby_function_tuple_1677(self): df = DataFrame( np.random.default_rng(2).random(100), @@ -150,8 +82,8 @@ assert isinstance(result.index[0], tuple) def assert_index_parameters(self, index): - assert index.freq == "40960N" - assert index.inferred_freq == "40960N" + assert index.freq == "40960ns" + assert index.inferred_freq == "40960ns" def test_ns_index(self): nsamples = 400 @@ -201,3 +133,84 @@ result = np.asarray(idx, dtype=object) tm.assert_numpy_array_equal(result, expected) + + def test_CBH_deprecated(self): + msg = "'CBH' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range( + dt.datetime(2022, 12, 11), dt.datetime(2022, 12, 13), freq="CBH" + ) + result = DatetimeIndex( + [ + "2022-12-12 09:00:00", + "2022-12-12 10:00:00", + "2022-12-12 11:00:00", + "2022-12-12 12:00:00", + "2022-12-12 13:00:00", + "2022-12-12 14:00:00", + "2022-12-12 15:00:00", + "2022-12-12 16:00:00", + ], + dtype="datetime64[ns]", + freq="cbh", + ) + + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq_depr, expected_values, expected_freq", + [ + ( + "AS-AUG", + ["2021-08-01", "2022-08-01", "2023-08-01"], + "YS-AUG", + ), + ( + "1BAS-MAY", + ["2021-05-03", "2022-05-02", "2023-05-01"], + "1BYS-MAY", + ), + ], + ) + def test_AS_BAS_deprecated(self, freq_depr, expected_values, expected_freq): + # GH#55479 + freq_msg = re.split("[0-9]*", freq_depr, maxsplit=1)[1] + msg = f"'{freq_msg}' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range( + dt.datetime(2020, 12, 1), dt.datetime(2023, 12, 1), freq=freq_depr + ) + result = DatetimeIndex( + expected_values, + dtype="datetime64[ns]", + freq=expected_freq, + ) + + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "freq, expected_values, freq_depr", + [ + ("2BYE-MAR", ["2016-03-31"], "2BA-MAR"), + ("2BYE-JUN", ["2016-06-30"], "2BY-JUN"), + ("2BME", ["2016-02-29", "2016-04-29", "2016-06-30"], "2BM"), + ("2BQE", ["2016-03-31"], "2BQ"), + ("1BQE-MAR", ["2016-03-31", "2016-06-30"], "1BQ-MAR"), + ], + ) + def test_BM_BQ_BY_deprecated(self, freq, expected_values, freq_depr): + # GH#52064 + msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + f"in a future version, please use '{freq[1:]}' instead." 
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = date_range(start="2016-02-21", end="2016-08-21", freq=freq_depr) + result = DatetimeIndex( + data=expected_values, + dtype="datetime64[ns]", + freq=freq, + ) + + tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_datetimelike.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_datetimelike.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_datetimelike.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_datetimelike.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,10 +0,0 @@ -""" generic tests from the Datetimelike class """ -from pandas import date_range - - -class TestDatetimeIndex: - def test_format(self): - # GH35439 - idx = date_range("20130101", periods=5) - expected = [f"{x:%Y-%m-%d}" for x in idx] - assert idx.format() == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_delete.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_delete.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_delete.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_delete.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,138 +0,0 @@ -import pytest - -from pandas import ( - DatetimeIndex, - Series, - date_range, -) -import pandas._testing as tm - - -class TestDelete: - def test_delete(self): - idx = date_range(start="2000-01-01", periods=5, freq="M", name="idx") - - # preserve freq - expected_0 = date_range(start="2000-02-01", periods=4, freq="M", name="idx") - expected_4 = date_range(start="2000-01-01", periods=4, freq="M", name="idx") - - # reset freq to None - expected_1 = DatetimeIndex( - ["2000-01-31", "2000-03-31", "2000-04-30", "2000-05-31"], - freq=None, - name="idx", - ) - - cases = { - 0: expected_0, - -5: expected_0, - -1: expected_4, - 4: expected_4, - 1: expected_1, - } - for n, expected in cases.items(): - result = idx.delete(n) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - with pytest.raises((IndexError, ValueError), match="out of bounds"): - # either depending on numpy version - idx.delete(5) - - for tz in [None, "Asia/Tokyo", "US/Pacific"]: - idx = date_range( - start="2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz - ) - - expected = date_range( - start="2000-01-01 10:00", periods=9, freq="H", name="idx", tz=tz - ) - result = idx.delete(0) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freqstr == "H" - assert result.tz == expected.tz - - expected = date_range( - start="2000-01-01 09:00", periods=9, freq="H", name="idx", tz=tz - ) - result = idx.delete(-1) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freqstr == "H" - assert result.tz == expected.tz - - def test_delete_slice(self): - idx = date_range(start="2000-01-01", periods=10, freq="D", name="idx") - - # preserve freq - expected_0_2 = date_range(start="2000-01-04", periods=7, freq="D", name="idx") - expected_7_9 = date_range(start="2000-01-01", periods=7, freq="D", name="idx") - - # reset freq to None - expected_3_5 = DatetimeIndex( - [ - "2000-01-01", - "2000-01-02", - "2000-01-03", - "2000-01-07", - "2000-01-08", - "2000-01-09", - "2000-01-10", - ], - freq=None, - name="idx", - ) - - cases = { - (0, 1, 2): expected_0_2, - (7, 8, 9): expected_7_9, - (3, 4, 5): expected_3_5, - } - for n, 
expected in cases.items(): - result = idx.delete(n) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - result = idx.delete(slice(n[0], n[-1] + 1)) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - - for tz in [None, "Asia/Tokyo", "US/Pacific"]: - ts = Series( - 1, - index=date_range( - "2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz - ), - ) - # preserve freq - result = ts.drop(ts.index[:5]).index - expected = date_range( - "2000-01-01 14:00", periods=5, freq="H", name="idx", tz=tz - ) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - assert result.tz == expected.tz - - # reset freq to None - result = ts.drop(ts.index[[1, 3, 5, 7, 9]]).index - expected = DatetimeIndex( - [ - "2000-01-01 09:00", - "2000-01-01 11:00", - "2000-01-01 13:00", - "2000-01-01 15:00", - "2000-01-01 17:00", - ], - freq=None, - name="idx", - tz=tz, - ) - tm.assert_index_equal(result, expected) - assert result.name == expected.name - assert result.freq == expected.freq - assert result.tz == expected.tz diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_formats.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_formats.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_formats.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_formats.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,43 +8,49 @@ import pandas as pd from pandas import ( DatetimeIndex, + NaT, Series, ) import pandas._testing as tm -def test_format_native_types(): +@pytest.fixture(params=["s", "ms", "us", "ns"]) +def unit(request): + return request.param + + +def test_get_values_for_csv(): index = pd.date_range(freq="1D", periods=3, start="2017-01-01") # First, with no arguments. 
expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype=object) - result = index._format_native_types() + result = index._get_values_for_csv() tm.assert_numpy_array_equal(result, expected) # No NaN values, so na_rep has no effect - result = index._format_native_types(na_rep="pandas") + result = index._get_values_for_csv(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) # Make sure date formatting works expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype=object) - result = index._format_native_types(date_format="%m-%Y-%d") + result = index._get_values_for_csv(date_format="%m-%Y-%d") tm.assert_numpy_array_equal(result, expected) # NULL object handling should work - index = DatetimeIndex(["2017-01-01", pd.NaT, "2017-01-03"]) + index = DatetimeIndex(["2017-01-01", NaT, "2017-01-03"]) expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object) - result = index._format_native_types() + result = index._get_values_for_csv(na_rep="NaT") tm.assert_numpy_array_equal(result, expected) expected = np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object) - result = index._format_native_types(na_rep="pandas") + result = index._get_values_for_csv(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) - result = index._format_native_types(date_format="%Y-%m-%d %H:%M:%S.%f") + result = index._get_values_for_csv(na_rep="NaT", date_format="%Y-%m-%d %H:%M:%S.%f") expected = np.array( ["2017-01-01 00:00:00.000000", "NaT", "2017-01-03 00:00:00.000000"], dtype=object, @@ -52,12 +58,35 @@ tm.assert_numpy_array_equal(result, expected) # invalid format - result = index._format_native_types(date_format="foo") + result = index._get_values_for_csv(na_rep="NaT", date_format="foo") expected = np.array(["foo", "NaT", "foo"], dtype=object) tm.assert_numpy_array_equal(result, expected) class TestDatetimeIndexRendering: + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_with_timezone_repr(self, tzstr): + rng = pd.date_range("4/13/2010", "5/6/2010") + + rng_eastern = rng.tz_localize(tzstr) + + rng_repr = repr(rng_eastern) + assert "2010-04-13 00:00:00" in rng_repr + + def test_dti_repr_dates(self): + text = str(pd.to_datetime([datetime(2013, 1, 1), datetime(2014, 1, 1)])) + assert "['2013-01-01'," in text + assert ", '2014-01-01']" in text + + def test_dti_repr_mixed(self): + text = str( + pd.to_datetime( + [datetime(2013, 1, 1), datetime(2014, 1, 1, 12), datetime(2014, 1, 1)] + ) + ) + assert "'2013-01-01 00:00:00'," in text + assert "'2014-01-01 00:00:00']" in text + def test_dti_repr_short(self): dr = pd.date_range(start="1/1/2012", periods=1) repr(dr) @@ -73,33 +102,32 @@ [ ( ["2012-01-01 00:00:00"], - "60T", + "60min", ( "DatetimeIndex(['2012-01-01 00:00:00'], " - "dtype='datetime64[ns]', freq='60T')" + "dtype='datetime64[ns]', freq='60min')" ), ), ( ["2012-01-01 00:00:00", "2012-01-01 01:00:00"], - "60T", + "60min", "DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 01:00:00'], " - "dtype='datetime64[ns]', freq='60T')", + "dtype='datetime64[ns]', freq='60min')", ), ( ["2012-01-01"], - "24H", - "DatetimeIndex(['2012-01-01'], dtype='datetime64[ns]', freq='24H')", + "24h", + "DatetimeIndex(['2012-01-01'], dtype='datetime64[ns]', freq='24h')", ), ], ) - def test_dti_repr_time_midnight(self, dates, freq, expected_repr): + def test_dti_repr_time_midnight(self, dates, freq, expected_repr, unit): # GH53634 - dti = DatetimeIndex(dates, freq) + dti = DatetimeIndex(dates, freq).as_unit(unit) actual_repr = repr(dti) - assert 
actual_repr == expected_repr + assert actual_repr == expected_repr.replace("[ns]", f"[{unit}]") - @pytest.mark.parametrize("method", ["__repr__", "__str__"]) - def test_dti_representation(self, method): + def test_dti_representation(self, unit): idxs = [] idxs.append(DatetimeIndex([], freq="D")) idxs.append(DatetimeIndex(["2011-01-01"], freq="D")) @@ -108,17 +136,17 @@ idxs.append( DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", + freq="h", tz="Asia/Tokyo", ) ) idxs.append( DatetimeIndex( - ["2011-01-01 09:00", "2011-01-01 10:00", pd.NaT], tz="US/Eastern" + ["2011-01-01 09:00", "2011-01-01 10:00", NaT], tz="US/Eastern" ) ) idxs.append( - DatetimeIndex(["2011-01-01 09:00", "2011-01-01 10:00", pd.NaT], tz="UTC") + DatetimeIndex(["2011-01-01 09:00", "2011-01-01 10:00", NaT], tz="UTC") ) exp = [] @@ -135,7 +163,7 @@ exp.append( "DatetimeIndex(['2011-01-01 09:00:00+09:00', " "'2011-01-01 10:00:00+09:00', '2011-01-01 11:00:00+09:00']" - ", dtype='datetime64[ns, Asia/Tokyo]', freq='H')" + ", dtype='datetime64[ns, Asia/Tokyo]', freq='h')" ) exp.append( "DatetimeIndex(['2011-01-01 09:00:00-05:00', " @@ -150,22 +178,27 @@ ) with pd.option_context("display.width", 300): - for indx, expected in zip(idxs, exp): - result = getattr(indx, method)() + for index, expected in zip(idxs, exp): + index = index.as_unit(unit) + expected = expected.replace("[ns", f"[{unit}") + result = repr(index) + assert result == expected + result = str(index) assert result == expected - def test_dti_representation_to_series(self): + # TODO: this is a Series.__repr__ test + def test_dti_representation_to_series(self, unit): idx1 = DatetimeIndex([], freq="D") idx2 = DatetimeIndex(["2011-01-01"], freq="D") idx3 = DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") idx4 = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") idx5 = DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", + freq="h", tz="Asia/Tokyo", ) idx6 = DatetimeIndex( - ["2011-01-01 09:00", "2011-01-01 10:00", pd.NaT], tz="US/Eastern" + ["2011-01-01 09:00", "2011-01-01 10:00", NaT], tz="US/Eastern" ) idx7 = DatetimeIndex(["2011-01-01 09:00", "2011-01-02 10:15"]) @@ -207,8 +240,9 @@ [idx1, idx2, idx3, idx4, idx5, idx6, idx7], [exp1, exp2, exp3, exp4, exp5, exp6, exp7], ): - result = repr(Series(idx)) - assert result == expected + ser = Series(idx.as_unit(unit)) + result = repr(ser) + assert result == expected.replace("[ns", f"[{unit}") def test_dti_summary(self): # GH#9116 @@ -218,11 +252,11 @@ idx4 = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") idx5 = DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], - freq="H", + freq="h", tz="Asia/Tokyo", ) idx6 = DatetimeIndex( - ["2011-01-01 09:00", "2011-01-01 10:00", pd.NaT], tz="US/Eastern" + ["2011-01-01 09:00", "2011-01-01 10:00", NaT], tz="US/Eastern" ) exp1 = "DatetimeIndex: 0 entries\nFreq: D" @@ -236,7 +270,7 @@ exp5 = ( "DatetimeIndex: 3 entries, 2011-01-01 09:00:00+09:00 " "to 2011-01-01 11:00:00+09:00\n" - "Freq: H" + "Freq: h" ) exp6 = """DatetimeIndex: 3 entries, 2011-01-01 09:00:00-05:00 to NaT""" @@ -247,51 +281,76 @@ result = idx._summary() assert result == expected - def test_dti_business_repr(self): - # only really care that it works - repr(pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1))) - - def test_dti_business_summary(self): - rng = pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1)) - rng._summary() - rng[2:2]._summary() - - def 
test_dti_business_summary_pytz(self): - pd.bdate_range("1/1/2005", "1/1/2009", tz=pytz.utc)._summary() - - def test_dti_business_summary_dateutil(self): - pd.bdate_range("1/1/2005", "1/1/2009", tz=dateutil.tz.tzutc())._summary() - - def test_dti_custom_business_repr(self): + @pytest.mark.parametrize("tz", [None, pytz.utc, dateutil.tz.tzutc()]) + @pytest.mark.parametrize("freq", ["B", "C"]) + def test_dti_business_repr_etc_smoke(self, tz, freq): # only really care that it works - repr(pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1), freq="C")) - - def test_dti_custom_business_summary(self): - rng = pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1), freq="C") - rng._summary() - rng[2:2]._summary() - - def test_dti_custom_business_summary_pytz(self): - pd.bdate_range("1/1/2005", "1/1/2009", freq="C", tz=pytz.utc)._summary() - - def test_dti_custom_business_summary_dateutil(self): - pd.bdate_range( - "1/1/2005", "1/1/2009", freq="C", tz=dateutil.tz.tzutc() - )._summary() + dti = pd.bdate_range( + datetime(2009, 1, 1), datetime(2010, 1, 1), tz=tz, freq=freq + ) + repr(dti) + dti._summary() + dti[2:2]._summary() class TestFormat: + def test_format(self): + # GH#35439 + idx = pd.date_range("20130101", periods=5) + expected = [f"{x:%Y-%m-%d}" for x in idx] + msg = r"DatetimeIndex\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert idx.format() == expected + def test_format_with_name_time_info(self): # bug I fixed 12/20/2011 dates = pd.date_range("2011-01-01 04:00:00", periods=10, name="something") - formatted = dates.format(name=True) + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dates.format(name=True) assert formatted[0] == "something" def test_format_datetime_with_time(self): dti = DatetimeIndex([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)]) - result = dti.format() + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = dti.format() expected = ["2012-02-07 00:00:00", "2012-02-07 23:00:00"] assert len(result) == 2 assert result == expected + + def test_format_datetime(self): + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = pd.to_datetime([datetime(2003, 1, 1, 12), NaT]).format() + assert formatted[0] == "2003-01-01 12:00:00" + assert formatted[1] == "NaT" + + def test_format_date(self): + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() + assert formatted[0] == "2003-01-01" + assert formatted[1] == "NaT" + + def test_format_date_tz(self): + dti = pd.to_datetime([datetime(2013, 1, 1)], utc=True) + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dti.format() + assert formatted[0] == "2013-01-01 00:00:00+00:00" + + dti = pd.to_datetime([datetime(2013, 1, 1), NaT], utc=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dti.format() + assert formatted[0] == "2013-01-01 00:00:00+00:00" + + def test_format_date_explicit_date_format(self): + dti = pd.to_datetime([datetime(2003, 2, 1), NaT]) + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + formatted = dti.format(date_format="%m-%d-%Y", na_rep="UT") + assert formatted[0] == "02-01-2003" + assert formatted[1] == "UT" 
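
The hunks on either side of this point track the pandas 2.2 renaming of frequency aliases: the uppercase single-letter forms "H", "T", "S", "L", "U", "N" give way to "h", "min", "s", "ms", "us", "ns", and the period-end aliases "M", "Q", "A"/"Y" (plus their business variants "BM", "BQ", "BA"/"BY") become "ME", "QE", "YE", "BME", "BQE", "BYE"; the old spellings are still accepted but raise FutureWarning, which is exactly what the updated tests assert. A minimal illustrative sketch of that behavior, assuming pandas 2.2 is installed (this snippet is an editorial aid, not part of the upstream patch):

import warnings

import pandas as pd

# New lowercase alias: accepted silently in pandas 2.2.
new = pd.date_range("2000-01-01", periods=3, freq="h")

# Deprecated uppercase alias: still accepted, but emits FutureWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    old = pd.date_range("2000-01-01", periods=3, freq="H")

assert any(issubclass(w.category, FutureWarning) for w in caught)
# Both spellings still produce the same hourly DatetimeIndex.
pd.testing.assert_index_equal(old, new)
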
diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_freq_attr.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_freq_attr.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_freq_attr.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_freq_attr.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,7 +31,7 @@ idx._data.freq = "foo" @pytest.mark.parametrize("values", [["20180101", "20180103", "20180105"], []]) - @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48H", Hour(48)]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48h", Hour(48)]) @pytest.mark.parametrize("tz", [None, "US/Eastern"]) def test_freq_setter(self, values, freq, tz): # GH#20678 diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_indexing.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_indexing.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,7 @@ import numpy as np import pytest +from pandas._libs import index as libindex from pandas.compat.numpy import np_long import pandas as pd @@ -31,49 +32,46 @@ # GH4226 st = Timestamp("2013-07-01 00:00:00", tz="America/Los_Angeles") et = Timestamp("2013-07-02 00:00:00", tz="America/Los_Angeles") - dr = date_range(st, et, freq="H", name="timebucket") + dr = date_range(st, et, freq="h", name="timebucket") assert dr[1:].name == dr.name - def test_getitem(self): - idx1 = date_range("2011-01-01", "2011-01-31", freq="D", name="idx") - idx2 = date_range( - "2011-01-01", "2011-01-31", freq="D", tz="Asia/Tokyo", name="idx" - ) + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) + def test_getitem(self, tz): + idx = date_range("2011-01-01", "2011-01-31", freq="D", tz=tz, name="idx") - for idx in [idx1, idx2]: - result = idx[0] - assert result == Timestamp("2011-01-01", tz=idx.tz) - - result = idx[0:5] - expected = date_range( - "2011-01-01", "2011-01-05", freq="D", tz=idx.tz, name="idx" - ) - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq + result = idx[0] + assert result == Timestamp("2011-01-01", tz=idx.tz) - result = idx[0:10:2] - expected = date_range( - "2011-01-01", "2011-01-09", freq="2D", tz=idx.tz, name="idx" - ) - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq + result = idx[0:5] + expected = date_range( + "2011-01-01", "2011-01-05", freq="D", tz=idx.tz, name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq - result = idx[-20:-5:3] - expected = date_range( - "2011-01-12", "2011-01-24", freq="3D", tz=idx.tz, name="idx" - ) - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq + result = idx[0:10:2] + expected = date_range( + "2011-01-01", "2011-01-09", freq="2D", tz=idx.tz, name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq - result = idx[4::-1] - expected = DatetimeIndex( - ["2011-01-05", "2011-01-04", "2011-01-03", "2011-01-02", "2011-01-01"], - freq="-1D", - tz=idx.tz, - name="idx", - ) - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq + result = idx[-20:-5:3] + expected = date_range( + "2011-01-12", "2011-01-24", freq="3D", tz=idx.tz, name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq + + result = idx[4::-1] + expected = DatetimeIndex( + 
["2011-01-05", "2011-01-04", "2011-01-03", "2011-01-02", "2011-01-01"], + dtype=idx.dtype, + freq="-1D", + name="idx", + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq @pytest.mark.parametrize("freq", ["B", "C"]) def test_dti_business_getitem(self, freq): @@ -103,7 +101,7 @@ rng[:, None] def test_getitem_int_list(self): - dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") + dti = date_range(start="1/1/2005", end="12/1/2005", freq="ME") dti2 = dti[[1, 3, 5]] v1 = dti2[0] @@ -214,63 +212,68 @@ class TestTake: + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_dti_take_dont_lose_meta(self, tzstr): + rng = date_range("1/1/2000", periods=20, tz=tzstr) + + result = rng.take(range(5)) + assert result.tz == rng.tz + assert result.freq == rng.freq + def test_take_nan_first_datetime(self): index = DatetimeIndex([pd.NaT, Timestamp("20130101"), Timestamp("20130102")]) result = index.take([-1, 0, 1]) expected = DatetimeIndex([index[-1], index[0], index[1]]) tm.assert_index_equal(result, expected) - def test_take(self): + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) + def test_take(self, tz): # GH#10295 - idx1 = date_range("2011-01-01", "2011-01-31", freq="D", name="idx") - idx2 = date_range( - "2011-01-01", "2011-01-31", freq="D", tz="Asia/Tokyo", name="idx" - ) + idx = date_range("2011-01-01", "2011-01-31", freq="D", name="idx", tz=tz) - for idx in [idx1, idx2]: - result = idx.take([0]) - assert result == Timestamp("2011-01-01", tz=idx.tz) - - result = idx.take([0, 1, 2]) - expected = date_range( - "2011-01-01", "2011-01-03", freq="D", tz=idx.tz, name="idx" - ) - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq + result = idx.take([0]) + assert result == Timestamp("2011-01-01", tz=idx.tz) - result = idx.take([0, 2, 4]) - expected = date_range( - "2011-01-01", "2011-01-05", freq="2D", tz=idx.tz, name="idx" - ) - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq + result = idx.take([0, 1, 2]) + expected = date_range( + "2011-01-01", "2011-01-03", freq="D", tz=idx.tz, name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq - result = idx.take([7, 4, 1]) - expected = date_range( - "2011-01-08", "2011-01-02", freq="-3D", tz=idx.tz, name="idx" - ) - tm.assert_index_equal(result, expected) - assert result.freq == expected.freq + result = idx.take([0, 2, 4]) + expected = date_range( + "2011-01-01", "2011-01-05", freq="2D", tz=idx.tz, name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq - result = idx.take([3, 2, 5]) - expected = DatetimeIndex( - ["2011-01-04", "2011-01-03", "2011-01-06"], - freq=None, - tz=idx.tz, - name="idx", - ) - tm.assert_index_equal(result, expected) - assert result.freq is None + result = idx.take([7, 4, 1]) + expected = date_range( + "2011-01-08", "2011-01-02", freq="-3D", tz=idx.tz, name="idx" + ) + tm.assert_index_equal(result, expected) + assert result.freq == expected.freq - result = idx.take([-3, 2, 5]) - expected = DatetimeIndex( - ["2011-01-29", "2011-01-03", "2011-01-06"], - freq=None, - tz=idx.tz, - name="idx", - ) - tm.assert_index_equal(result, expected) - assert result.freq is None + result = idx.take([3, 2, 5]) + expected = DatetimeIndex( + ["2011-01-04", "2011-01-03", "2011-01-06"], + dtype=idx.dtype, + freq=None, + name="idx", + ) + tm.assert_index_equal(result, expected) + assert result.freq is None + + result = idx.take([-3, 2, 5]) + expected = 
DatetimeIndex( + ["2011-01-29", "2011-01-03", "2011-01-06"], + dtype=idx.dtype, + freq=None, + name="idx", + ) + tm.assert_index_equal(result, expected) + assert result.freq is None def test_take_invalid_kwargs(self): idx = date_range("2011-01-01", "2011-01-31", freq="D", name="idx") @@ -301,11 +304,11 @@ idx = date_range( start="2010-01-01 09:00", end="2010-02-01 09:00", - freq="H", + freq="h", tz=tz, name="idx", ) - expected = DatetimeIndex(dates, freq=None, name="idx", tz=tz) + expected = DatetimeIndex(dates, freq=None, name="idx", dtype=idx.dtype) taken1 = idx.take([5, 6, 8, 12]) taken2 = idx[[5, 6, 8, 12]] @@ -407,7 +410,7 @@ def test_get_loc_time_obj(self): # time indexing - idx = date_range("2000-01-01", periods=24, freq="H") + idx = date_range("2000-01-01", periods=24, freq="h") result = idx.get_loc(time(12)) expected = np.array([12]) @@ -417,18 +420,18 @@ expected = np.array([]) tm.assert_numpy_array_equal(result, expected, check_dtype=False) - def test_get_loc_time_obj2(self): + @pytest.mark.parametrize("offset", [-10, 10]) + def test_get_loc_time_obj2(self, monkeypatch, offset): # GH#8667 - - from pandas._libs.index import _SIZE_CUTOFF - - ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64) + size_cutoff = 50 + n = size_cutoff + offset key = time(15, 11, 30) start = key.hour * 3600 + key.minute * 60 + key.second step = 24 * 3600 - for n in ns: - idx = date_range("2014-11-26", periods=n, freq="S") + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + idx = date_range("2014-11-26", periods=n, freq="s") ts = pd.Series(np.random.default_rng(2).standard_normal(n), index=idx) locs = np.arange(start, n, step, dtype=np.intp) @@ -603,7 +606,7 @@ class TestMaybeCastSliceBound: def test_maybe_cast_slice_bounds_empty(self): # GH#14354 - empty_idx = date_range(freq="1H", periods=0, end="2015") + empty_idx = date_range(freq="1h", periods=0, end="2015") right = empty_idx._maybe_cast_slice_bound("2015-01-02", "right") exp = Timestamp("2015-01-02 23:59:59.999999999") diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_iter.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_iter.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_iter.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_iter.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,76 @@ +import dateutil.tz +import numpy as np +import pytest + +from pandas import ( + DatetimeIndex, + date_range, + to_datetime, +) +from pandas.core.arrays import datetimes + + +class TestDatetimeIndexIteration: + @pytest.mark.parametrize( + "tz", [None, "UTC", "US/Central", dateutil.tz.tzoffset(None, -28800)] + ) + def test_iteration_preserves_nanoseconds(self, tz): + # GH#19603 + index = DatetimeIndex( + ["2018-02-08 15:00:00.168456358", "2018-02-08 15:00:00.168456359"], tz=tz + ) + for i, ts in enumerate(index): + assert ts == index[i] # pylint: disable=unnecessary-list-index-lookup + + def test_iter_readonly(self): + # GH#28055 ints_to_pydatetime with readonly array + arr = np.array([np.datetime64("2012-02-15T12:00:00.000000000")]) + arr.setflags(write=False) + dti = to_datetime(arr) + list(dti) + + def test_iteration_preserves_tz(self): + # see GH#8890 + index = date_range("2012-01-01", periods=3, freq="h", tz="US/Eastern") + + for i, ts in enumerate(index): + result = ts + expected = index[i] # pylint: disable=unnecessary-list-index-lookup + assert result == expected + + def test_iteration_preserves_tz2(self): + index = date_range( + 
"2012-01-01", periods=3, freq="h", tz=dateutil.tz.tzoffset(None, -28800) + ) + + for i, ts in enumerate(index): + result = ts + expected = index[i] # pylint: disable=unnecessary-list-index-lookup + assert result._repr_base == expected._repr_base + assert result == expected + + def test_iteration_preserves_tz3(self): + # GH#9100 + index = DatetimeIndex( + ["2014-12-01 03:32:39.987000-08:00", "2014-12-01 04:12:34.987000-08:00"] + ) + for i, ts in enumerate(index): + result = ts + expected = index[i] # pylint: disable=unnecessary-list-index-lookup + assert result._repr_base == expected._repr_base + assert result == expected + + @pytest.mark.parametrize("offset", [-5, -1, 0, 1]) + def test_iteration_over_chunksize(self, offset, monkeypatch): + # GH#21012 + chunksize = 5 + index = date_range( + "2000-01-01 00:00:00", periods=chunksize - offset, freq="min" + ) + num = 0 + with monkeypatch.context() as m: + m.setattr(datetimes, "_ITER_CHUNKSIZE", chunksize) + for stamp in index: + assert index[num] == stamp + num += 1 + assert num == len(index) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_join.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_join.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_join.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_join.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,10 +7,12 @@ import pytest from pandas import ( + DataFrame, DatetimeIndex, Index, Timestamp, date_range, + period_range, to_datetime, ) import pandas._testing as tm @@ -23,15 +25,7 @@ class TestJoin: def test_does_not_convert_mixed_integer(self): - df = tm.makeCustomDataframe( - 10, - 10, - data_gen_f=lambda *args, **kwargs: np.random.default_rng( - 2 - ).standard_normal(), - r_idx_type="i", - c_idx_type="dt", - ) + df = DataFrame(np.ones((3, 2)), columns=date_range("2020-01-01", periods=2)) cols = df.columns.join(df.index, how="outer") joined = cols.join(df.columns) assert cols.dtype == np.dtype("O") @@ -44,12 +38,10 @@ assert index is joined def test_join_with_period_index(self, join_type): - df = tm.makeCustomDataframe( - 10, - 10, - data_gen_f=lambda *args: np.random.default_rng(2).integers(2), - c_idx_type="p", - r_idx_type="dt", + df = DataFrame( + np.ones((10, 2)), + index=date_range("2020-01-01", periods=10), + columns=period_range("2020-01-01", periods=2), ) s = df.iloc[:5, 0] @@ -65,7 +57,7 @@ assert isinstance(result[0], Timestamp) def test_join_utc_convert(self, join_type): - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") left = rng.tz_convert("US/Eastern") right = rng.tz_convert("Europe/Berlin") diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_map.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_map.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_map.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_map.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,47 +0,0 @@ -import pytest - -from pandas import ( - DatetimeIndex, - Index, - MultiIndex, - Period, - date_range, -) -import pandas._testing as tm - - -class TestMap: - def test_map(self): - rng = date_range("1/1/2000", periods=10) - - f = lambda x: x.strftime("%Y%m%d") - result = rng.map(f) - exp = Index([f(x) for x in rng], dtype=" return Index - for accessor in DatetimeArray._field_ops: - res = getattr(dti, accessor) - assert len(res) == 365 - assert isinstance(res, Index) - assert res.name == "name" 
- - # boolean accessors -> return array - for accessor in DatetimeArray._bool_ops: - res = getattr(dti, accessor) - assert len(res) == 365 - assert isinstance(res, np.ndarray) - - # test boolean indexing - res = dti[dti.is_quarter_start] - exp = dti[[0, 90, 181, 273]] - tm.assert_index_equal(res, exp) - res = dti[dti.is_leap_year] - exp = DatetimeIndex([], freq="D", tz=dti.tz, name="name") - tm.assert_index_equal(res, exp) - - def test_datetimeindex_accessors2(self): - dti = date_range(freq="BQ-FEB", start=datetime(1998, 1, 1), periods=4) - - assert sum(dti.is_quarter_start) == 0 - assert sum(dti.is_quarter_end) == 4 - assert sum(dti.is_year_start) == 0 - assert sum(dti.is_year_end) == 1 - - def test_datetimeindex_accessors3(self): - # Ensure is_start/end accessors throw ValueError for CustomBusinessDay, - bday_egypt = offsets.CustomBusinessDay(weekmask="Sun Mon Tue Wed Thu") - dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt) - msg = "Custom business days is not supported by is_month_start" - with pytest.raises(ValueError, match=msg): - dti.is_month_start - - def test_datetimeindex_accessors4(self): - dti = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"]) - - assert dti.is_month_start[0] == 1 - - def test_datetimeindex_accessors5(self): - freq_m = to_offset("M") - bm = to_offset("BM") - qfeb = to_offset("Q-FEB") - qsfeb = to_offset("QS-FEB") - bq = to_offset("BQ") - bqs_apr = to_offset("BQS-APR") - as_nov = to_offset("AS-NOV") - - tests = [ - (freq_m.is_month_start(Timestamp("2013-06-01")), 1), - (bm.is_month_start(Timestamp("2013-06-01")), 0), - (freq_m.is_month_start(Timestamp("2013-06-03")), 0), - (bm.is_month_start(Timestamp("2013-06-03")), 1), - (qfeb.is_month_end(Timestamp("2013-02-28")), 1), - (qfeb.is_quarter_end(Timestamp("2013-02-28")), 1), - (qfeb.is_year_end(Timestamp("2013-02-28")), 1), - (qfeb.is_month_start(Timestamp("2013-03-01")), 1), - (qfeb.is_quarter_start(Timestamp("2013-03-01")), 1), - (qfeb.is_year_start(Timestamp("2013-03-01")), 1), - (qsfeb.is_month_end(Timestamp("2013-03-31")), 1), - (qsfeb.is_quarter_end(Timestamp("2013-03-31")), 0), - (qsfeb.is_year_end(Timestamp("2013-03-31")), 0), - (qsfeb.is_month_start(Timestamp("2013-02-01")), 1), - (qsfeb.is_quarter_start(Timestamp("2013-02-01")), 1), - (qsfeb.is_year_start(Timestamp("2013-02-01")), 1), - (bq.is_month_end(Timestamp("2013-06-30")), 0), - (bq.is_quarter_end(Timestamp("2013-06-30")), 0), - (bq.is_year_end(Timestamp("2013-06-30")), 0), - (bq.is_month_end(Timestamp("2013-06-28")), 1), - (bq.is_quarter_end(Timestamp("2013-06-28")), 1), - (bq.is_year_end(Timestamp("2013-06-28")), 0), - (bqs_apr.is_month_end(Timestamp("2013-06-30")), 0), - (bqs_apr.is_quarter_end(Timestamp("2013-06-30")), 0), - (bqs_apr.is_year_end(Timestamp("2013-06-30")), 0), - (bqs_apr.is_month_end(Timestamp("2013-06-28")), 1), - (bqs_apr.is_quarter_end(Timestamp("2013-06-28")), 1), - (bqs_apr.is_year_end(Timestamp("2013-03-29")), 1), - (as_nov.is_year_start(Timestamp("2013-11-01")), 1), - (as_nov.is_year_end(Timestamp("2013-10-31")), 1), - (Timestamp("2012-02-01").days_in_month, 29), - (Timestamp("2013-02-01").days_in_month, 28), - ] - - for ts, value in tests: - assert ts == value - - def test_datetimeindex_accessors6(self): - # GH 6538: Check that DatetimeIndex and its TimeStamp elements - # return the same weekofyear accessor close to new year w/ tz - dates = ["2013/12/29", "2013/12/30", "2013/12/31"] - dates = DatetimeIndex(dates, tz="Europe/Brussels") - expected = [52, 1, 1] - assert 
dates.isocalendar().week.tolist() == expected - assert [d.weekofyear for d in dates] == expected - - # GH 12806 - # error: Unsupported operand types for + ("List[None]" and "List[str]") - @pytest.mark.parametrize( - "time_locale", [None] + tm.get_locales() # type: ignore[operator] - ) - def test_datetime_name_accessors(self, time_locale): - # Test Monday -> Sunday and January -> December, in that sequence - if time_locale is None: - # If the time_locale is None, day-name and month_name should - # return the english attributes - expected_days = [ - "Monday", - "Tuesday", - "Wednesday", - "Thursday", - "Friday", - "Saturday", - "Sunday", - ] - expected_months = [ - "January", - "February", - "March", - "April", - "May", - "June", - "July", - "August", - "September", - "October", - "November", - "December", - ] - else: - with tm.set_locale(time_locale, locale.LC_TIME): - expected_days = calendar.day_name[:] - expected_months = calendar.month_name[1:] - - # GH#11128 - dti = date_range(freq="D", start=datetime(1998, 1, 1), periods=365) - english_days = [ - "Monday", - "Tuesday", - "Wednesday", - "Thursday", - "Friday", - "Saturday", - "Sunday", - ] - for day, name, eng_name in zip(range(4, 11), expected_days, english_days): - name = name.capitalize() - assert dti.day_name(locale=time_locale)[day] == name - assert dti.day_name(locale=None)[day] == eng_name - ts = Timestamp(datetime(2016, 4, day)) - assert ts.day_name(locale=time_locale) == name - dti = dti.append(DatetimeIndex([pd.NaT])) - assert np.isnan(dti.day_name(locale=time_locale)[-1]) - ts = Timestamp(pd.NaT) - assert np.isnan(ts.day_name(locale=time_locale)) - - # GH#12805 - dti = date_range(freq="M", start="2012", end="2013") - result = dti.month_name(locale=time_locale) - expected = Index([month.capitalize() for month in expected_months]) - - # work around different normalization schemes - # https://github.com/pandas-dev/pandas/issues/22342 - result = result.str.normalize("NFD") - expected = expected.str.normalize("NFD") - - tm.assert_index_equal(result, expected) - - for date, expected in zip(dti, expected_months): - result = date.month_name(locale=time_locale) - expected = expected.capitalize() - - result = unicodedata.normalize("NFD", result) - expected = unicodedata.normalize("NFD", result) - - assert result == expected - dti = dti.append(DatetimeIndex([pd.NaT])) - assert np.isnan(dti.month_name(locale=time_locale)[-1]) - - def test_nanosecond_field(self): - dti = DatetimeIndex(np.arange(10)) - expected = Index(np.arange(10, dtype=np.int32)) - - tm.assert_index_equal(dti.nanosecond, expected) - - -def test_iter_readonly(): - # GH#28055 ints_to_pydatetime with readonly array - arr = np.array([np.datetime64("2012-02-15T12:00:00.000000000")]) - arr.setflags(write=False) - dti = pd.to_datetime(arr) - list(dti) - - -def test_add_timedelta_preserves_freq(): - # GH#37295 should hold for any DTI with freq=None or Tick freq - tz = "Canada/Eastern" - dti = date_range( - start=Timestamp("2019-03-26 00:00:00-0400", tz=tz), - end=Timestamp("2020-10-17 00:00:00-0400", tz=tz), - freq="D", - ) - result = dti + Timedelta(days=1) - assert result.freq == dti.freq diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_npfuncs.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_npfuncs.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_npfuncs.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_npfuncs.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,7 +7,7 @@ class TestSplit: def 
test_split_non_utc(self): # GH#14042 - indices = date_range("2016-01-01 00:00:00+0200", freq="S", periods=10) + indices = date_range("2016-01-01 00:00:00+0200", freq="s", periods=10) result = np.split(indices, indices_or_sections=[])[0] expected = indices._with_freq(None) tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_ops.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_ops.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_ops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_ops.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,10 +1,7 @@ from datetime import datetime -from dateutil.tz import tzlocal import pytest -from pandas.compat import IS64 - from pandas import ( DatetimeIndex, Index, @@ -13,34 +10,8 @@ ) import pandas._testing as tm -START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) - class TestDatetimeIndexOps: - @pytest.mark.parametrize( - "freq,expected", - [ - ("A", "day"), - ("Q", "day"), - ("M", "day"), - ("D", "day"), - ("H", "hour"), - ("T", "minute"), - ("S", "second"), - ("L", "millisecond"), - ("U", "microsecond"), - ], - ) - def test_resolution(self, request, tz_naive_fixture, freq, expected): - tz = tz_naive_fixture - if freq == "A" and not IS64 and isinstance(tz, tzlocal): - request.node.add_marker( - pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038") - ) - - idx = date_range(start="2013-04-01", periods=30, freq=freq, tz=tz) - assert idx.resolution == expected - def test_infer_freq(self, freq_sample): # GH 11018 idx = date_range("2011-01-01 09:00:00", freq=freq_sample, periods=10) @@ -53,6 +24,7 @@ class TestBusinessDatetimeIndex: @pytest.fixture def rng(self, freq): + START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) return bdate_range(START, END, freq=freq) def test_comparison(self, rng): @@ -64,7 +36,6 @@ def test_copy(self, rng): cp = rng.copy() - repr(cp) tm.assert_index_equal(cp, rng) def test_identical(self, rng): diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_partial_slicing.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_partial_slicing.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_partial_slicing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_partial_slicing.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,7 @@ DataFrame, DatetimeIndex, Index, + MultiIndex, Series, Timedelta, Timestamp, @@ -194,7 +195,7 @@ s["2004-12-31"] def test_partial_slice_daily(self): - rng = date_range(freq="H", start=datetime(2005, 1, 31), periods=500) + rng = date_range(freq="h", start=datetime(2005, 1, 31), periods=500) s = Series(np.arange(len(rng)), index=rng) result = s["2005-1-31"] @@ -204,7 +205,7 @@ s["2004-12-31 00"] def test_partial_slice_hourly(self): - rng = date_range(freq="T", start=datetime(2005, 1, 1, 20, 0, 0), periods=500) + rng = date_range(freq="min", start=datetime(2005, 1, 1, 20, 0, 0), periods=500) s = Series(np.arange(len(rng)), index=rng) result = s["2005-1-1"] @@ -218,7 +219,7 @@ s["2004-12-31 00:15"] def test_partial_slice_minutely(self): - rng = date_range(freq="S", start=datetime(2005, 1, 1, 23, 59, 0), periods=500) + rng = date_range(freq="s", start=datetime(2005, 1, 1, 23, 59, 0), periods=500) s = Series(np.arange(len(rng)), index=rng) result = s["2005-1-1 23:59"] @@ -235,7 +236,7 @@ rng = date_range( start=datetime(2005, 1, 1, 0, 0, 59, microsecond=999990), periods=20, - freq="US", + freq="us", ) s = 
Series(np.arange(20), rng) @@ -336,7 +337,7 @@ "TICKER": ["ABC", "MNP", "XYZ", "XYZ"], "val": [1, 2, 3, 4], }, - index=date_range("2013-06-19 09:30:00", periods=4, freq="5T"), + index=date_range("2013-06-19 09:30:00", periods=4, freq="5min"), ) df_multi = df.set_index(["ACCOUNT", "TICKER"], append=True) @@ -360,10 +361,12 @@ def test_partial_slicing_with_multiindex_series(self): # GH 4294 # partial slice on a series mi - ser = DataFrame( - np.random.default_rng(2).random((1000, 1000)), - index=date_range("2000-1-1", periods=1000), - ).stack(future_stack=True) + ser = Series( + range(250), + index=MultiIndex.from_product( + [date_range("2000-1-1", periods=50), range(5)] + ), + ) s2 = ser[:-1].copy() expected = s2["2000-1-4"] @@ -453,9 +456,11 @@ def test_slice_reduce_to_series(self): # GH 27516 - df = DataFrame({"A": range(24)}, index=date_range("2000", periods=24, freq="M")) + df = DataFrame( + {"A": range(24)}, index=date_range("2000", periods=24, freq="ME") + ) expected = Series( - range(12), index=date_range("2000", periods=12, freq="M"), name="A" + range(12), index=date_range("2000", periods=12, freq="ME"), name="A" ) result = df.loc["2000", "A"] tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_scalar_compat.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_scalar_compat.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_scalar_compat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_scalar_compat.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,48 +1,91 @@ """ Tests for DatetimeIndex methods behaving like their Timestamp counterparts """ -from datetime import datetime + +import calendar +from datetime import ( + date, + datetime, + time, +) +import locale +import unicodedata import numpy as np import pytest -from pandas._libs.tslibs import ( - OutOfBoundsDatetime, - to_offset, -) -from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG +from pandas._libs.tslibs import timezones -import pandas as pd from pandas import ( DatetimeIndex, + Index, + NaT, Timestamp, date_range, + offsets, ) import pandas._testing as tm +from pandas.core.arrays import DatetimeArray class TestDatetimeIndexOps: + def test_dti_no_millisecond_field(self): + msg = "type object 'DatetimeIndex' has no attribute 'millisecond'" + with pytest.raises(AttributeError, match=msg): + DatetimeIndex.millisecond + + msg = "'DatetimeIndex' object has no attribute 'millisecond'" + with pytest.raises(AttributeError, match=msg): + DatetimeIndex([]).millisecond + def test_dti_time(self): rng = date_range("1/1/2000", freq="12min", periods=10) - result = pd.Index(rng).time + result = Index(rng).time expected = [t.time() for t in rng] assert (result == expected).all() def test_dti_date(self): - rng = date_range("1/1/2000", freq="12H", periods=10) - result = pd.Index(rng).date + rng = date_range("1/1/2000", freq="12h", periods=10) + result = Index(rng).date expected = [t.date() for t in rng] assert (result == expected).all() - @pytest.mark.parametrize("data", [["1400-01-01"], [datetime(1400, 1, 1)]]) - def test_dti_date_out_of_range(self, data): - # GH#1475 - msg = ( - "^Out of bounds nanosecond timestamp: " - "1400-01-01( 00:00:00)?, at position 0$" - ) - with pytest.raises(OutOfBoundsDatetime, match=msg): - DatetimeIndex(data) + @pytest.mark.parametrize( + "dtype", + [None, "datetime64[ns, CET]", "datetime64[ns, EST]", "datetime64[ns, UTC]"], + ) + def test_dti_date2(self, dtype): + # Regression test for GH#21230 
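The frequency strings rewritten in the hunks above ("H" -> "h", "T" -> "min", "S" -> "s", "M" -> "ME", "Q" -> "QE") track the pandas 2.2 alias changes; a minimal sketch of the new spellings, with illustrative dates that are not taken from the patch:

    import pandas as pd

    hourly    = pd.date_range("2011-01-01", periods=3, freq="h")    # formerly "H"
    minutely  = pd.date_range("2011-01-01", periods=3, freq="min")  # formerly "T"
    month_end = pd.date_range("2000-01-31", periods=3, freq="ME")   # formerly "M"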
+ expected = np.array([date(2018, 6, 4), NaT]) + + index = DatetimeIndex(["2018-06-04 10:00:00", NaT], dtype=dtype) + result = index.date + + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + [None, "datetime64[ns, CET]", "datetime64[ns, EST]", "datetime64[ns, UTC]"], + ) + def test_dti_time2(self, dtype): + # Regression test for GH#21267 + expected = np.array([time(10, 20, 30), NaT]) + + index = DatetimeIndex(["2018-06-04 10:20:30", NaT], dtype=dtype) + result = index.time + + tm.assert_numpy_array_equal(result, expected) + + def test_dti_timetz(self, tz_naive_fixture): + # GH#21358 + tz = timezones.maybe_get_tz(tz_naive_fixture) + + expected = np.array([time(10, 20, 30, tzinfo=tz), NaT]) + + index = DatetimeIndex(["2018-06-04 10:20:30", NaT], tz=tz) + result = index.timetz + + tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( "field", @@ -63,285 +106,224 @@ ) def test_dti_timestamp_fields(self, field): # extra fields from DatetimeIndex like quarter and week - idx = tm.makeDateIndex(100) + idx = date_range("2020-01-01", periods=10) expected = getattr(idx, field)[-1] result = getattr(Timestamp(idx[-1]), field) assert result == expected - def test_dti_timestamp_isocalendar_fields(self): - idx = tm.makeDateIndex(100) - expected = tuple(idx.isocalendar().iloc[-1].to_list()) - result = idx[-1].isocalendar() - assert result == expected - - # ---------------------------------------------------------------- - # DatetimeIndex.round - - def test_round_daily(self): - dti = date_range("20130101 09:10:11", periods=5) - result = dti.round("D") - expected = date_range("20130101", periods=5) - tm.assert_index_equal(result, expected) - - dti = dti.tz_localize("UTC").tz_convert("US/Eastern") - result = dti.round("D") - expected = date_range("20130101", periods=5).tz_localize("US/Eastern") - tm.assert_index_equal(result, expected) + def test_dti_nanosecond(self): + dti = DatetimeIndex(np.arange(10)) + expected = Index(np.arange(10, dtype=np.int32)) + + tm.assert_index_equal(dti.nanosecond, expected) + + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) + def test_dti_hour_tzaware(self, prefix): + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + rng = DatetimeIndex(strdates, tz=prefix + "US/Eastern") + assert (rng.hour == 0).all() + + # a more unusual time zone, GH#1946 + dr = date_range( + "2011-10-02 00:00", freq="h", periods=10, tz=prefix + "America/Atikokan" + ) - result = dti.round("s") - tm.assert_index_equal(result, dti) + expected = Index(np.arange(10, dtype=np.int32)) + tm.assert_index_equal(dr.hour, expected) + # GH#12806 + # error: Unsupported operand types for + ("List[None]" and "List[str]") @pytest.mark.parametrize( - "freq, error_msg", - [ - ("Y", " is a non-fixed frequency"), - ("M", " is a non-fixed frequency"), - ("foobar", "Invalid frequency: foobar"), - ], + "time_locale", [None] + tm.get_locales() # type: ignore[operator] ) - def test_round_invalid(self, freq, error_msg): - dti = date_range("20130101 09:10:11", periods=5) - dti = dti.tz_localize("UTC").tz_convert("US/Eastern") - with pytest.raises(ValueError, match=error_msg): - dti.round(freq) - - def test_round(self, tz_naive_fixture): - tz = tz_naive_fixture - rng = date_range(start="2016-01-01", periods=5, freq="30Min", tz=tz) - elt = rng[1] - - expected_rng = DatetimeIndex( - [ - Timestamp("2016-01-01 00:00:00", tz=tz), - Timestamp("2016-01-01 00:00:00", tz=tz), - Timestamp("2016-01-01 01:00:00", tz=tz), - Timestamp("2016-01-01 02:00:00", tz=tz), - 
Timestamp("2016-01-01 02:00:00", tz=tz), + def test_day_name_month_name(self, time_locale): + # Test Monday -> Sunday and January -> December, in that sequence + if time_locale is None: + # If the time_locale is None, day-name and month_name should + # return the english attributes + expected_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", ] - ) - expected_elt = expected_rng[1] - - tm.assert_index_equal(rng.round(freq="H"), expected_rng) - assert elt.round(freq="H") == expected_elt - - msg = INVALID_FREQ_ERR_MSG - with pytest.raises(ValueError, match=msg): - rng.round(freq="foo") - with pytest.raises(ValueError, match=msg): - elt.round(freq="foo") - - msg = " is a non-fixed frequency" - with pytest.raises(ValueError, match=msg): - rng.round(freq="M") - with pytest.raises(ValueError, match=msg): - elt.round(freq="M") - - # GH#14440 & GH#15578 - index = DatetimeIndex(["2016-10-17 12:00:00.0015"], tz=tz) - result = index.round("ms") - expected = DatetimeIndex(["2016-10-17 12:00:00.002000"], tz=tz) - tm.assert_index_equal(result, expected) - - for freq in ["us", "ns"]: - tm.assert_index_equal(index, index.round(freq)) - - index = DatetimeIndex(["2016-10-17 12:00:00.00149"], tz=tz) - result = index.round("ms") - expected = DatetimeIndex(["2016-10-17 12:00:00.001000"], tz=tz) - tm.assert_index_equal(result, expected) - - index = DatetimeIndex(["2016-10-17 12:00:00.001501031"]) - result = index.round("10ns") - expected = DatetimeIndex(["2016-10-17 12:00:00.001501030"]) - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(False): - ts = "2016-10-17 12:00:00.001501031" - DatetimeIndex([ts]).round("1010ns") - - def test_no_rounding_occurs(self, tz_naive_fixture): - # GH 21262 - tz = tz_naive_fixture - rng = date_range(start="2016-01-01", periods=5, freq="2Min", tz=tz) - - expected_rng = DatetimeIndex( - [ - Timestamp("2016-01-01 00:00:00", tz=tz), - Timestamp("2016-01-01 00:02:00", tz=tz), - Timestamp("2016-01-01 00:04:00", tz=tz), - Timestamp("2016-01-01 00:06:00", tz=tz), - Timestamp("2016-01-01 00:08:00", tz=tz), + expected_months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", ] - ) - - tm.assert_index_equal(rng.round(freq="2T"), expected_rng) - - @pytest.mark.parametrize( - "test_input, rounder, freq, expected", - [ - (["2117-01-01 00:00:45"], "floor", "15s", ["2117-01-01 00:00:45"]), - (["2117-01-01 00:00:45"], "ceil", "15s", ["2117-01-01 00:00:45"]), - ( - ["2117-01-01 00:00:45.000000012"], - "floor", - "10ns", - ["2117-01-01 00:00:45.000000010"], - ), - ( - ["1823-01-01 00:00:01.000000012"], - "ceil", - "10ns", - ["1823-01-01 00:00:01.000000020"], - ), - (["1823-01-01 00:00:01"], "floor", "1s", ["1823-01-01 00:00:01"]), - (["1823-01-01 00:00:01"], "ceil", "1s", ["1823-01-01 00:00:01"]), - (["2018-01-01 00:15:00"], "ceil", "15T", ["2018-01-01 00:15:00"]), - (["2018-01-01 00:15:00"], "floor", "15T", ["2018-01-01 00:15:00"]), - (["1823-01-01 03:00:00"], "ceil", "3H", ["1823-01-01 03:00:00"]), - (["1823-01-01 03:00:00"], "floor", "3H", ["1823-01-01 03:00:00"]), - ( - ("NaT", "1823-01-01 00:00:01"), - "floor", - "1s", - ("NaT", "1823-01-01 00:00:01"), - ), - ( - ("NaT", "1823-01-01 00:00:01"), - "ceil", - "1s", - ("NaT", "1823-01-01 00:00:01"), - ), - ], - ) - def test_ceil_floor_edge(self, test_input, rounder, freq, expected): - dt = DatetimeIndex(list(test_input)) - func = getattr(dt, rounder) - result = func(freq) - 
expected = DatetimeIndex(list(expected)) - assert expected.equals(result) - - @pytest.mark.parametrize( - "start, index_freq, periods", - [("2018-01-01", "12H", 25), ("2018-01-01 0:0:0.124999", "1ns", 1000)], - ) - @pytest.mark.parametrize( - "round_freq", - [ - "2ns", - "3ns", - "4ns", - "5ns", - "6ns", - "7ns", - "250ns", - "500ns", - "750ns", - "1us", - "19us", - "250us", - "500us", - "750us", - "1s", - "2s", - "3s", - "12H", - "1D", - ], - ) - def test_round_int64(self, start, index_freq, periods, round_freq): - dt = date_range(start=start, freq=index_freq, periods=periods) - unit = to_offset(round_freq).nanos - - # test floor - result = dt.floor(round_freq) - diff = dt.asi8 - result.asi8 - mod = result.asi8 % unit - assert (mod == 0).all(), f"floor not a {round_freq} multiple" - assert (0 <= diff).all() and (diff < unit).all(), "floor error" - - # test ceil - result = dt.ceil(round_freq) - diff = result.asi8 - dt.asi8 - mod = result.asi8 % unit - assert (mod == 0).all(), f"ceil not a {round_freq} multiple" - assert (0 <= diff).all() and (diff < unit).all(), "ceil error" - - # test round - result = dt.round(round_freq) - diff = abs(result.asi8 - dt.asi8) - mod = result.asi8 % unit - assert (mod == 0).all(), f"round not a {round_freq} multiple" - assert (diff <= unit // 2).all(), "round error" - if unit % 2 == 0: - assert ( - result.asi8[diff == unit // 2] % 2 == 0 - ).all(), "round half to even error" - - # ---------------------------------------------------------------- - # DatetimeIndex.normalize + else: + with tm.set_locale(time_locale, locale.LC_TIME): + expected_days = calendar.day_name[:] + expected_months = calendar.month_name[1:] + + # GH#11128 + dti = date_range(freq="D", start=datetime(1998, 1, 1), periods=365) + english_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + for day, name, eng_name in zip(range(4, 11), expected_days, english_days): + name = name.capitalize() + assert dti.day_name(locale=time_locale)[day] == name + assert dti.day_name(locale=None)[day] == eng_name + ts = Timestamp(datetime(2016, 4, day)) + assert ts.day_name(locale=time_locale) == name + dti = dti.append(DatetimeIndex([NaT])) + assert np.isnan(dti.day_name(locale=time_locale)[-1]) + ts = Timestamp(NaT) + assert np.isnan(ts.day_name(locale=time_locale)) + + # GH#12805 + dti = date_range(freq="ME", start="2012", end="2013") + result = dti.month_name(locale=time_locale) + expected = Index([month.capitalize() for month in expected_months]) + + # work around different normalization schemes GH#22342 + result = result.str.normalize("NFD") + expected = expected.str.normalize("NFD") - def test_normalize(self): - rng = date_range("1/1/2000 9:30", periods=10, freq="D") - - result = rng.normalize() - expected = date_range("1/1/2000", periods=10, freq="D") tm.assert_index_equal(result, expected) - arr_ns = np.array([1380585623454345752, 1380585612343234312]).astype( - "datetime64[ns]" - ) - rng_ns = DatetimeIndex(arr_ns) - rng_ns_normalized = rng_ns.normalize() - - arr_ns = np.array([1380585600000000000, 1380585600000000000]).astype( - "datetime64[ns]" - ) - expected = DatetimeIndex(arr_ns) - tm.assert_index_equal(rng_ns_normalized, expected) - - assert result.is_normalized - assert not rng.is_normalized - - def test_normalize_nat(self): - dti = DatetimeIndex([pd.NaT, Timestamp("2018-01-01 01:00:00")]) - result = dti.normalize() - expected = DatetimeIndex([pd.NaT, Timestamp("2018-01-01")]) - tm.assert_index_equal(result, expected) - - -class 
TestDateTimeIndexToJulianDate: - def test_1700(self): - dr = date_range(start=Timestamp("1710-10-01"), periods=5, freq="D") - r1 = pd.Index([x.to_julian_date() for x in dr]) - r2 = dr.to_julian_date() - assert isinstance(r2, pd.Index) and r2.dtype == np.float64 - tm.assert_index_equal(r1, r2) - - def test_2000(self): - dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="D") - r1 = pd.Index([x.to_julian_date() for x in dr]) - r2 = dr.to_julian_date() - assert isinstance(r2, pd.Index) and r2.dtype == np.float64 - tm.assert_index_equal(r1, r2) - - def test_hour(self): - dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="H") - r1 = pd.Index([x.to_julian_date() for x in dr]) - r2 = dr.to_julian_date() - assert isinstance(r2, pd.Index) and r2.dtype == np.float64 - tm.assert_index_equal(r1, r2) - - def test_minute(self): - dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="T") - r1 = pd.Index([x.to_julian_date() for x in dr]) - r2 = dr.to_julian_date() - assert isinstance(r2, pd.Index) and r2.dtype == np.float64 - tm.assert_index_equal(r1, r2) - - def test_second(self): - dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="S") - r1 = pd.Index([x.to_julian_date() for x in dr]) - r2 = dr.to_julian_date() - assert isinstance(r2, pd.Index) and r2.dtype == np.float64 - tm.assert_index_equal(r1, r2) + for item, expected in zip(dti, expected_months): + result = item.month_name(locale=time_locale) + expected = expected.capitalize() + + result = unicodedata.normalize("NFD", result) + expected = unicodedata.normalize("NFD", result) + + assert result == expected + dti = dti.append(DatetimeIndex([NaT])) + assert np.isnan(dti.month_name(locale=time_locale)[-1]) + + def test_dti_week(self): + # GH#6538: Check that DatetimeIndex and its TimeStamp elements + # return the same weekofyear accessor close to new year w/ tz + dates = ["2013/12/29", "2013/12/30", "2013/12/31"] + dates = DatetimeIndex(dates, tz="Europe/Brussels") + expected = [52, 1, 1] + assert dates.isocalendar().week.tolist() == expected + assert [d.weekofyear for d in dates] == expected + + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + def test_dti_fields(self, tz): + # GH#13303 + dti = date_range(freq="D", start=datetime(1998, 1, 1), periods=365, tz=tz) + assert dti.year[0] == 1998 + assert dti.month[0] == 1 + assert dti.day[0] == 1 + assert dti.hour[0] == 0 + assert dti.minute[0] == 0 + assert dti.second[0] == 0 + assert dti.microsecond[0] == 0 + assert dti.dayofweek[0] == 3 + + assert dti.dayofyear[0] == 1 + assert dti.dayofyear[120] == 121 + + assert dti.isocalendar().week.iloc[0] == 1 + assert dti.isocalendar().week.iloc[120] == 18 + + assert dti.quarter[0] == 1 + assert dti.quarter[120] == 2 + + assert dti.days_in_month[0] == 31 + assert dti.days_in_month[90] == 30 + + assert dti.is_month_start[0] + assert not dti.is_month_start[1] + assert dti.is_month_start[31] + assert dti.is_quarter_start[0] + assert dti.is_quarter_start[90] + assert dti.is_year_start[0] + assert not dti.is_year_start[364] + assert not dti.is_month_end[0] + assert dti.is_month_end[30] + assert not dti.is_month_end[31] + assert dti.is_month_end[364] + assert not dti.is_quarter_end[0] + assert not dti.is_quarter_end[30] + assert dti.is_quarter_end[89] + assert dti.is_quarter_end[364] + assert not dti.is_year_end[0] + assert dti.is_year_end[364] + + assert len(dti.year) == 365 + assert len(dti.month) == 365 + assert len(dti.day) == 365 + assert len(dti.hour) == 365 + assert len(dti.minute) == 365 + assert 
len(dti.second) == 365 + assert len(dti.microsecond) == 365 + assert len(dti.dayofweek) == 365 + assert len(dti.dayofyear) == 365 + assert len(dti.isocalendar()) == 365 + assert len(dti.quarter) == 365 + assert len(dti.is_month_start) == 365 + assert len(dti.is_month_end) == 365 + assert len(dti.is_quarter_start) == 365 + assert len(dti.is_quarter_end) == 365 + assert len(dti.is_year_start) == 365 + assert len(dti.is_year_end) == 365 + + dti.name = "name" + + # non boolean accessors -> return Index + for accessor in DatetimeArray._field_ops: + res = getattr(dti, accessor) + assert len(res) == 365 + assert isinstance(res, Index) + assert res.name == "name" + + # boolean accessors -> return array + for accessor in DatetimeArray._bool_ops: + res = getattr(dti, accessor) + assert len(res) == 365 + assert isinstance(res, np.ndarray) + + # test boolean indexing + res = dti[dti.is_quarter_start] + exp = dti[[0, 90, 181, 273]] + tm.assert_index_equal(res, exp) + res = dti[dti.is_leap_year] + exp = DatetimeIndex([], freq="D", tz=dti.tz, name="name").as_unit("ns") + tm.assert_index_equal(res, exp) + + def test_dti_is_year_quarter_start(self): + dti = date_range(freq="BQE-FEB", start=datetime(1998, 1, 1), periods=4) + + assert sum(dti.is_quarter_start) == 0 + assert sum(dti.is_quarter_end) == 4 + assert sum(dti.is_year_start) == 0 + assert sum(dti.is_year_end) == 1 + + def test_dti_is_month_start(self): + dti = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"]) + + assert dti.is_month_start[0] == 1 + + def test_dti_is_month_start_custom(self): + # Ensure is_start/end accessors throw ValueError for CustomBusinessDay, + bday_egypt = offsets.CustomBusinessDay(weekmask="Sun Mon Tue Wed Thu") + dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt) + msg = "Custom business days is not supported by is_month_start" + with pytest.raises(ValueError, match=msg): + dti.is_month_start diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_setops.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_setops.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_setops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_setops.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,7 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest @@ -12,6 +15,7 @@ DatetimeIndex, Index, Series, + Timestamp, bdate_range, date_range, ) @@ -38,7 +42,7 @@ # TODO: moved from test_datetimelike; dedup with version below def test_union2(self, sort): - everything = tm.makeDateIndex(10) + everything = date_range("2020-01-01", periods=10) first = everything[:5] second = everything[5:] union = first.union(second, sort=sort) @@ -46,7 +50,7 @@ @pytest.mark.parametrize("box", [np.array, Series, list]) def test_union3(self, sort, box): - everything = tm.makeDateIndex(10) + everything = date_range("2020-01-01", periods=10) first = everything[:5] second = everything[5:] @@ -69,7 +73,7 @@ expected2_notsorted = DatetimeIndex(list(other2) + list(rng2[:3])) rng3 = date_range("1/1/2000", freq="D", periods=5, tz=tz) - other3 = DatetimeIndex([], tz=tz) + other3 = DatetimeIndex([], tz=tz).as_unit("ns") expected3 = date_range("1/1/2000", freq="D", periods=5, tz=tz) expected3_notsorted = rng3 @@ -98,8 +102,8 @@ assert result.freq == ordered.freq def test_union_bug_1730(self, sort): - rng_a = date_range("1/1/2012", periods=4, freq="3H") - rng_b = date_range("1/1/2012", periods=4, freq="4H") + rng_a = 
date_range("1/1/2012", periods=4, freq="3h") + rng_b = date_range("1/1/2012", periods=4, freq="4h") result = rng_a.union(rng_b, sort=sort) exp = list(rng_a) + list(rng_b[1:]) @@ -189,18 +193,26 @@ # Fails with "AttributeError: can't set attribute" i2.union(i1, sort=sort) + def test_union_same_timezone_different_units(self): + # GH 55238 + idx1 = date_range("2000-01-01", periods=3, tz="UTC").as_unit("ms") + idx2 = date_range("2000-01-01", periods=3, tz="UTC").as_unit("us") + result = idx1.union(idx2) + expected = date_range("2000-01-01", periods=3, tz="UTC").as_unit("us") + tm.assert_index_equal(result, expected) + # TODO: moved from test_datetimelike; de-duplicate with version below def test_intersection2(self): - first = tm.makeDateIndex(10) + first = date_range("2020-01-01", periods=10) second = first[5:] intersect = first.intersection(second) - assert tm.equalContents(intersect, second) + tm.assert_index_equal(intersect, second) # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: result = first.intersection(case) - assert tm.equalContents(result, second) + tm.assert_index_equal(result, second) third = Index(["a", "b", "c"]) result = first.intersection(third) @@ -223,7 +235,7 @@ expected3 = date_range("6/1/2000", "6/20/2000", freq="D", name=None) rng4 = date_range("7/1/2000", "7/31/2000", freq="D", name="idx") - expected4 = DatetimeIndex([], freq="D", name="idx") + expected4 = DatetimeIndex([], freq="D", name="idx", dtype="M8[ns]") for rng, expected in [ (rng2, expected2), @@ -237,23 +249,27 @@ # non-monotonic base = DatetimeIndex( ["2011-01-05", "2011-01-04", "2011-01-02", "2011-01-03"], tz=tz, name="idx" - ) + ).as_unit("ns") rng2 = DatetimeIndex( ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], tz=tz, name="idx" - ) - expected2 = DatetimeIndex(["2011-01-04", "2011-01-02"], tz=tz, name="idx") + ).as_unit("ns") + expected2 = DatetimeIndex( + ["2011-01-04", "2011-01-02"], tz=tz, name="idx" + ).as_unit("ns") rng3 = DatetimeIndex( ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], tz=tz, name="other", - ) - expected3 = DatetimeIndex(["2011-01-04", "2011-01-02"], tz=tz, name=None) + ).as_unit("ns") + expected3 = DatetimeIndex( + ["2011-01-04", "2011-01-02"], tz=tz, name=None + ).as_unit("ns") # GH 7880 rng4 = date_range("7/1/2000", "7/31/2000", freq="D", tz=tz, name="idx") - expected4 = DatetimeIndex([], tz=tz, name="idx") + expected4 = DatetimeIndex([], tz=tz, name="idx").as_unit("ns") assert expected4.freq is None for rng, expected in [ @@ -269,7 +285,7 @@ # parametrize over both anchored and non-anchored freqs, as they # have different code paths - @pytest.mark.parametrize("freq", ["T", "B"]) + @pytest.mark.parametrize("freq", ["min", "B"]) def test_intersection_empty(self, tz_aware_fixture, freq): # empty same freq GH2129 tz = tz_aware_fixture @@ -283,7 +299,7 @@ assert result.freq == rng.freq # no overlap GH#33604 - check_freq = freq != "T" # We don't preserve freq on non-anchored offsets + check_freq = freq != "min" # We don't preserve freq on non-anchored offsets result = rng[:3].intersection(rng[-3:]) tm.assert_index_equal(result, rng[:0]) if check_freq: @@ -300,7 +316,7 @@ def test_intersection_bug_1708(self): from pandas import DateOffset - index_1 = date_range("1/1/2012", periods=4, freq="12H") + index_1 = date_range("1/1/2012", periods=4, freq="12h") index_2 = index_1 + DateOffset(hours=1) result = index_1.intersection(index_2) @@ -338,20 +354,22 @@ index = date_range("20160920", "20160925", freq="D") other = 
date_range("20160921", "20160924", freq="D") - expected = DatetimeIndex(["20160920", "20160925"], freq=None) + expected = DatetimeIndex(["20160920", "20160925"], dtype="M8[ns]", freq=None) idx_diff = index.difference(other, sort) tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) + # preserve frequency when the difference is a contiguous + # subset of the original range other = date_range("20160922", "20160925", freq="D") idx_diff = index.difference(other, sort) - expected = DatetimeIndex(["20160920", "20160921"], freq=None) + expected = DatetimeIndex(["20160920", "20160921"], dtype="M8[ns]", freq="D") tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) def test_datetimeindex_diff(self, sort): - dti1 = date_range(freq="Q-JAN", start=datetime(1997, 12, 31), periods=100) - dti2 = date_range(freq="Q-JAN", start=datetime(1997, 12, 31), periods=98) + dti1 = date_range(freq="QE-JAN", start=datetime(1997, 12, 31), periods=100) + dti2 = date_range(freq="QE-JAN", start=datetime(1997, 12, 31), periods=98) assert len(dti1.difference(dti2, sort)) == 2 @pytest.mark.parametrize("tz", [None, "Asia/Tokyo", "US/Eastern"]) @@ -400,12 +418,58 @@ "2019-12-31", "2020-03-31", ], - freq="Q-DEC", + freq="QE-DEC", ) result = dti[::2].intersection(dti[1::2]) expected = dti[:0] tm.assert_index_equal(result, expected) + def test_dti_intersection(self): + rng = date_range("1/1/2011", periods=100, freq="h", tz="utc") + + left = rng[10:90][::-1] + right = rng[20:80][::-1] + + assert left.tz == rng.tz + result = left.intersection(right) + assert result.tz == left.tz + + # Note: not difference, as there is no symmetry requirement there + @pytest.mark.parametrize("setop", ["union", "intersection", "symmetric_difference"]) + def test_dti_setop_aware(self, setop): + # non-overlapping + # GH#39328 as of 2.0 we cast these to UTC instead of object + rng = date_range("2012-11-15 00:00:00", periods=6, freq="h", tz="US/Central") + + rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="h", tz="US/Eastern") + + result = getattr(rng, setop)(rng2) + + left = rng.tz_convert("UTC") + right = rng2.tz_convert("UTC") + expected = getattr(left, setop)(right) + tm.assert_index_equal(result, expected) + assert result.tz == left.tz + if len(result): + assert result[0].tz is timezone.utc + assert result[-1].tz is timezone.utc + + def test_dti_union_mixed(self): + # GH#21671 + rng = DatetimeIndex([Timestamp("2011-01-01"), pd.NaT]) + rng2 = DatetimeIndex(["2012-01-01", "2012-01-02"], tz="Asia/Tokyo") + result = rng.union(rng2) + expected = Index( + [ + Timestamp("2011-01-01"), + pd.NaT, + Timestamp("2012-01-01", tz="Asia/Tokyo"), + Timestamp("2012-01-02", tz="Asia/Tokyo"), + ], + dtype=object, + ) + tm.assert_index_equal(result, expected) + class TestBusinessDatetimeIndex: def test_union(self, sort): @@ -471,12 +535,12 @@ assert isinstance(the_int, DatetimeIndex) assert the_int.freq == rng.freq - the_int = rng1.intersection(rng2.view(DatetimeIndex)) + the_int = rng1.intersection(rng2) tm.assert_index_equal(the_int, expected) # non-overlapping the_int = rng[:10].intersection(rng[10:]) - expected = DatetimeIndex([]) + expected = DatetimeIndex([]).as_unit("ns") tm.assert_index_equal(the_int, expected) def test_intersection_bug(self): @@ -490,15 +554,13 @@ def test_intersection_list(self): # GH#35876 # values is not an Index -> no name -> retain "a" - values = [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")] + values = [Timestamp("2020-01-01"), 
Timestamp("2020-02-01")] idx = DatetimeIndex(values, name="a") res = idx.intersection(values) tm.assert_index_equal(res, idx) def test_month_range_union_tz_pytz(self, sort): - from pytz import timezone - - tz = timezone("US/Eastern") + tz = pytz.timezone("US/Eastern") early_start = datetime(2011, 1, 1) early_end = datetime(2011, 3, 1) @@ -533,13 +595,13 @@ # GH#38196 idx1 = Index( [ - pd.Timestamp("2019-12-13"), - pd.Timestamp("2019-12-12"), - pd.Timestamp("2019-12-12"), + Timestamp("2019-12-13"), + Timestamp("2019-12-12"), + Timestamp("2019-12-12"), ] ) result = idx1.intersection(idx1, sort=sort) - expected = Index([pd.Timestamp("2019-12-13"), pd.Timestamp("2019-12-12")]) + expected = Index([Timestamp("2019-12-13"), Timestamp("2019-12-12")]) tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_timezones.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_timezones.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_timezones.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_timezones.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,39 +2,25 @@ Tests for DatetimeIndex timezone-related methods """ from datetime import ( - date, datetime, - time, timedelta, timezone, tzinfo, ) -import dateutil -from dateutil.tz import ( - gettz, - tzlocal, -) +from dateutil.tz import gettz import numpy as np import pytest import pytz -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type [misc] - ZoneInfo = None # type: ignore[misc, assignment] - from pandas._libs.tslibs import ( conversion, timezones, ) -import pandas.util._test_decorators as td import pandas as pd from pandas import ( DatetimeIndex, - Index, Timestamp, bdate_range, date_range, @@ -61,813 +47,13 @@ return timedelta(0) -fixed_off = FixedOffset(-420, "-07:00") fixed_off_no_name = FixedOffset(-330, None) class TestDatetimeIndexTimezones: # ------------------------------------------------------------- - # DatetimeIndex.tz_convert - def test_tz_convert_nat(self): - # GH#5546 - dates = [pd.NaT] - idx = DatetimeIndex(dates) - idx = idx.tz_localize("US/Pacific") - tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Pacific")) - idx = idx.tz_convert("US/Eastern") - tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Eastern")) - idx = idx.tz_convert("UTC") - tm.assert_index_equal(idx, DatetimeIndex(dates, tz="UTC")) - - dates = ["2010-12-01 00:00", "2010-12-02 00:00", pd.NaT] - idx = DatetimeIndex(dates) - idx = idx.tz_localize("US/Pacific") - tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Pacific")) - idx = idx.tz_convert("US/Eastern") - expected = ["2010-12-01 03:00", "2010-12-02 03:00", pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) - - idx = idx + pd.offsets.Hour(5) - expected = ["2010-12-01 08:00", "2010-12-02 08:00", pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) - idx = idx.tz_convert("US/Pacific") - expected = ["2010-12-01 05:00", "2010-12-02 05:00", pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Pacific")) - - idx = idx + np.timedelta64(3, "h") - expected = ["2010-12-01 08:00", "2010-12-02 08:00", pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Pacific")) - - idx = idx.tz_convert("US/Eastern") - expected = ["2010-12-01 11:00", "2010-12-02 11:00", pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) - - @pytest.mark.parametrize("prefix", ["", "dateutil/"]) - def 
test_dti_tz_convert_compat_timestamp(self, prefix): - strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] - idx = DatetimeIndex(strdates, tz=prefix + "US/Eastern") - - conv = idx[0].tz_convert(prefix + "US/Pacific") - expected = idx.tz_convert(prefix + "US/Pacific")[0] - - assert conv == expected - - def test_dti_tz_convert_hour_overflow_dst(self): - # Regression test for: - # https://github.com/pandas-dev/pandas/issues/13306 - - # sorted case US/Eastern -> UTC - ts = ["2008-05-12 09:50:00", "2008-12-12 09:50:35", "2009-05-12 09:50:32"] - tt = DatetimeIndex(ts).tz_localize("US/Eastern") - ut = tt.tz_convert("UTC") - expected = Index([13, 14, 13], dtype=np.int32) - tm.assert_index_equal(ut.hour, expected) - - # sorted case UTC -> US/Eastern - ts = ["2008-05-12 13:50:00", "2008-12-12 14:50:35", "2009-05-12 13:50:32"] - tt = DatetimeIndex(ts).tz_localize("UTC") - ut = tt.tz_convert("US/Eastern") - expected = Index([9, 9, 9], dtype=np.int32) - tm.assert_index_equal(ut.hour, expected) - - # unsorted case US/Eastern -> UTC - ts = ["2008-05-12 09:50:00", "2008-12-12 09:50:35", "2008-05-12 09:50:32"] - tt = DatetimeIndex(ts).tz_localize("US/Eastern") - ut = tt.tz_convert("UTC") - expected = Index([13, 14, 13], dtype=np.int32) - tm.assert_index_equal(ut.hour, expected) - - # unsorted case UTC -> US/Eastern - ts = ["2008-05-12 13:50:00", "2008-12-12 14:50:35", "2008-05-12 13:50:32"] - tt = DatetimeIndex(ts).tz_localize("UTC") - ut = tt.tz_convert("US/Eastern") - expected = Index([9, 9, 9], dtype=np.int32) - tm.assert_index_equal(ut.hour, expected) - - @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) - def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz): - # Regression test for GH#13306 - - # sorted case US/Eastern -> UTC - ts = [ - Timestamp("2008-05-12 09:50:00", tz=tz), - Timestamp("2008-12-12 09:50:35", tz=tz), - Timestamp("2009-05-12 09:50:32", tz=tz), - ] - tt = DatetimeIndex(ts) - ut = tt.tz_convert("UTC") - expected = Index([13, 14, 13], dtype=np.int32) - tm.assert_index_equal(ut.hour, expected) - - # sorted case UTC -> US/Eastern - ts = [ - Timestamp("2008-05-12 13:50:00", tz="UTC"), - Timestamp("2008-12-12 14:50:35", tz="UTC"), - Timestamp("2009-05-12 13:50:32", tz="UTC"), - ] - tt = DatetimeIndex(ts) - ut = tt.tz_convert("US/Eastern") - expected = Index([9, 9, 9], dtype=np.int32) - tm.assert_index_equal(ut.hour, expected) - - # unsorted case US/Eastern -> UTC - ts = [ - Timestamp("2008-05-12 09:50:00", tz=tz), - Timestamp("2008-12-12 09:50:35", tz=tz), - Timestamp("2008-05-12 09:50:32", tz=tz), - ] - tt = DatetimeIndex(ts) - ut = tt.tz_convert("UTC") - expected = Index([13, 14, 13], dtype=np.int32) - tm.assert_index_equal(ut.hour, expected) - - # unsorted case UTC -> US/Eastern - ts = [ - Timestamp("2008-05-12 13:50:00", tz="UTC"), - Timestamp("2008-12-12 14:50:35", tz="UTC"), - Timestamp("2008-05-12 13:50:32", tz="UTC"), - ] - tt = DatetimeIndex(ts) - ut = tt.tz_convert("US/Eastern") - expected = Index([9, 9, 9], dtype=np.int32) - tm.assert_index_equal(ut.hour, expected) - - @pytest.mark.parametrize("freq, n", [("H", 1), ("T", 60), ("S", 3600)]) - def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): - # Regression test for tslib.tz_convert(vals, tz1, tz2). - # See https://github.com/pandas-dev/pandas/issues/4496 for details. 
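For reference, DatetimeIndex.tz_convert only relabels the wall time; the removed tests in this file rely on the underlying int64 values staying unchanged. A minimal sketch of that property, with dates chosen arbitrarily rather than from the patch:

    import pandas as pd

    rng = pd.date_range("2012-03-11", "2012-03-12", freq="h", tz="utc")
    eastern = rng.tz_convert("US/Eastern")
    assert (rng.asi8 == eastern.asi8).all()   # same instants, different display tz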
- idx = date_range(datetime(2011, 3, 26, 23), datetime(2011, 3, 27, 1), freq=freq) - idx = idx.tz_localize("UTC") - idx = idx.tz_convert("Europe/Moscow") - - expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) - tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) - - def test_dti_tz_convert_dst(self): - for freq, n in [("H", 1), ("T", 60), ("S", 3600)]: - # Start DST - idx = date_range( - "2014-03-08 23:00", "2014-03-09 09:00", freq=freq, tz="UTC" - ) - idx = idx.tz_convert("US/Eastern") - expected = np.repeat( - np.array([18, 19, 20, 21, 22, 23, 0, 1, 3, 4, 5]), - np.array([n, n, n, n, n, n, n, n, n, n, 1]), - ) - tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) - - idx = date_range( - "2014-03-08 18:00", "2014-03-09 05:00", freq=freq, tz="US/Eastern" - ) - idx = idx.tz_convert("UTC") - expected = np.repeat( - np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - np.array([n, n, n, n, n, n, n, n, n, n, 1]), - ) - tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) - - # End DST - idx = date_range( - "2014-11-01 23:00", "2014-11-02 09:00", freq=freq, tz="UTC" - ) - idx = idx.tz_convert("US/Eastern") - expected = np.repeat( - np.array([19, 20, 21, 22, 23, 0, 1, 1, 2, 3, 4]), - np.array([n, n, n, n, n, n, n, n, n, n, 1]), - ) - tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) - - idx = date_range( - "2014-11-01 18:00", "2014-11-02 05:00", freq=freq, tz="US/Eastern" - ) - idx = idx.tz_convert("UTC") - expected = np.repeat( - np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - np.array([n, n, n, n, n, n, n, n, n, n, n, n, 1]), - ) - tm.assert_index_equal(idx.hour, Index(expected, dtype=np.int32)) - - # daily - # Start DST - idx = date_range("2014-03-08 00:00", "2014-03-09 00:00", freq="D", tz="UTC") - idx = idx.tz_convert("US/Eastern") - tm.assert_index_equal(idx.hour, Index([19, 19], dtype=np.int32)) - - idx = date_range( - "2014-03-08 00:00", "2014-03-09 00:00", freq="D", tz="US/Eastern" - ) - idx = idx.tz_convert("UTC") - tm.assert_index_equal(idx.hour, Index([5, 5], dtype=np.int32)) - - # End DST - idx = date_range("2014-11-01 00:00", "2014-11-02 00:00", freq="D", tz="UTC") - idx = idx.tz_convert("US/Eastern") - tm.assert_index_equal(idx.hour, Index([20, 20], dtype=np.int32)) - - idx = date_range( - "2014-11-01 00:00", "2014-11-02 000:00", freq="D", tz="US/Eastern" - ) - idx = idx.tz_convert("UTC") - tm.assert_index_equal(idx.hour, Index([4, 4], dtype=np.int32)) - - def test_tz_convert_roundtrip(self, tz_aware_fixture): - tz = tz_aware_fixture - idx1 = date_range(start="2014-01-01", end="2014-12-31", freq="M", tz="UTC") - exp1 = date_range(start="2014-01-01", end="2014-12-31", freq="M") - - idx2 = date_range(start="2014-01-01", end="2014-12-31", freq="D", tz="UTC") - exp2 = date_range(start="2014-01-01", end="2014-12-31", freq="D") - - idx3 = date_range(start="2014-01-01", end="2014-03-01", freq="H", tz="UTC") - exp3 = date_range(start="2014-01-01", end="2014-03-01", freq="H") - - idx4 = date_range(start="2014-08-01", end="2014-10-31", freq="T", tz="UTC") - exp4 = date_range(start="2014-08-01", end="2014-10-31", freq="T") - - for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), (idx4, exp4)]: - converted = idx.tz_convert(tz) - reset = converted.tz_convert(None) - tm.assert_index_equal(reset, expected) - assert reset.tzinfo is None - expected = converted.tz_convert("UTC").tz_localize(None) - expected = expected._with_freq("infer") - tm.assert_index_equal(reset, expected) - - def test_dti_tz_convert_tzlocal(self): 
- # GH#13583 - # tz_convert doesn't affect to internal - dti = date_range(start="2001-01-01", end="2001-03-01", tz="UTC") - dti2 = dti.tz_convert(dateutil.tz.tzlocal()) - tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) - - dti = date_range(start="2001-01-01", end="2001-03-01", tz=dateutil.tz.tzlocal()) - dti2 = dti.tz_convert(None) - tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) - - @pytest.mark.parametrize( - "tz", - [ - "US/Eastern", - "dateutil/US/Eastern", - pytz.timezone("US/Eastern"), - gettz("US/Eastern"), - ], - ) - def test_dti_tz_convert_utc_to_local_no_modify(self, tz): - rng = date_range("3/11/2012", "3/12/2012", freq="H", tz="utc") - rng_eastern = rng.tz_convert(tz) - - # Values are unmodified - tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) - - assert timezones.tz_compare(rng_eastern.tz, timezones.maybe_get_tz(tz)) - - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_tz_convert_unsorted(self, tzstr): - dr = date_range("2012-03-09", freq="H", periods=100, tz="utc") - dr = dr.tz_convert(tzstr) - - result = dr[::-1].hour - exp = dr.hour[::-1] - tm.assert_almost_equal(result, exp) - - # ------------------------------------------------------------- - # DatetimeIndex.tz_localize - - def test_tz_localize_utc_copies(self, utc_fixture): - # GH#46460 - times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] - index = DatetimeIndex(times) - - res = index.tz_localize(utc_fixture) - assert not tm.shares_memory(res, index) - - res2 = index._data.tz_localize(utc_fixture) - assert not tm.shares_memory(index._data, res2) - - def test_dti_tz_localize_nonexistent_raise_coerce(self): - # GH#13057 - times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] - index = DatetimeIndex(times) - tz = "US/Eastern" - with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): - index.tz_localize(tz=tz) - - with pytest.raises(pytz.NonExistentTimeError, match="|".join(times)): - index.tz_localize(tz=tz, nonexistent="raise") - - result = index.tz_localize(tz=tz, nonexistent="NaT") - test_times = ["2015-03-08 01:00-05:00", "NaT", "2015-03-08 03:00-04:00"] - dti = to_datetime(test_times, utc=True) - expected = dti.tz_convert("US/Eastern") - tm.assert_index_equal(result, expected) - - easts = [pytz.timezone("US/Eastern"), gettz("US/Eastern")] - if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Eastern") - except KeyError: - # no tzdata - pass - else: - easts.append(tz) - - @pytest.mark.parametrize("tz", easts) - def test_dti_tz_localize_ambiguous_infer(self, tz): - # November 6, 2011, fall back, repeat 2 AM hour - # With no repeated hours, we cannot infer the transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour()) - with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): - dr.tz_localize(tz) - - # With repeated hours, we can infer the transition - dr = date_range( - datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour(), tz=tz - ) - times = [ - "11/06/2011 00:00", - "11/06/2011 01:00", - "11/06/2011 01:00", - "11/06/2011 02:00", - "11/06/2011 03:00", - ] - di = DatetimeIndex(times) - localized = di.tz_localize(tz, ambiguous="infer") - expected = dr._with_freq(None) - tm.assert_index_equal(expected, localized) - tm.assert_index_equal(expected, DatetimeIndex(times, tz=tz, ambiguous="infer")) - - # When there is no dst transition, nothing special happens - dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=pd.offsets.Hour()) - localized = dr.tz_localize(tz) - 
localized_infer = dr.tz_localize(tz, ambiguous="infer") - tm.assert_index_equal(localized, localized_infer) - - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) - def test_dti_tz_localize_ambiguous_times(self, tz): - # March 13, 2011, spring forward, skip from 2 AM to 3 AM - dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=pd.offsets.Hour()) - with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:30:00"): - dr.tz_localize(tz) - - # after dst transition, it works - dr = date_range( - datetime(2011, 3, 13, 3, 30), periods=3, freq=pd.offsets.Hour(), tz=tz - ) - - # November 6, 2011, fall back, repeat 2 AM hour - dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, freq=pd.offsets.Hour()) - with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): - dr.tz_localize(tz) - - # UTC is OK - dr = date_range( - datetime(2011, 3, 13), periods=48, freq=pd.offsets.Minute(30), tz=pytz.utc - ) - - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_dti_tz_localize_pass_dates_to_utc(self, tzstr): - strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] - - idx = DatetimeIndex(strdates) - conv = idx.tz_localize(tzstr) - - fromdates = DatetimeIndex(strdates, tz=tzstr) - - assert conv.tz == fromdates.tz - tm.assert_numpy_array_equal(conv.values, fromdates.values) - - @pytest.mark.parametrize("prefix", ["", "dateutil/"]) - def test_dti_tz_localize(self, prefix): - tzstr = prefix + "US/Eastern" - dti = date_range(start="1/1/2005", end="1/1/2005 0:00:30.256", freq="L") - dti2 = dti.tz_localize(tzstr) - - dti_utc = date_range( - start="1/1/2005 05:00", end="1/1/2005 5:00:30.256", freq="L", tz="utc" - ) - - tm.assert_numpy_array_equal(dti2.values, dti_utc.values) - - dti3 = dti2.tz_convert(prefix + "US/Pacific") - tm.assert_numpy_array_equal(dti3.values, dti_utc.values) - - dti = date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="L") - with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): - dti.tz_localize(tzstr) - - dti = date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="L") - with pytest.raises(pytz.NonExistentTimeError, match="2011-03-13 02:00:00"): - dti.tz_localize(tzstr) - - @pytest.mark.parametrize( - "tz", - [ - "US/Eastern", - "dateutil/US/Eastern", - pytz.timezone("US/Eastern"), - gettz("US/Eastern"), - ], - ) - def test_dti_tz_localize_utc_conversion(self, tz): - # Localizing to time zone should: - # 1) check for DST ambiguities - # 2) convert to UTC - - rng = date_range("3/10/2012", "3/11/2012", freq="30T") - - converted = rng.tz_localize(tz) - expected_naive = rng + pd.offsets.Hour(5) - tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) - - # DST ambiguity, this should fail - rng = date_range("3/11/2012", "3/12/2012", freq="30T") - # Is this really how it should fail?? - with pytest.raises(pytz.NonExistentTimeError, match="2012-03-11 02:00:00"): - rng.tz_localize(tz) - - def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): - # note: this tz tests that a tz-naive index can be localized - # and de-localized successfully, when there are no DST transitions - # in the range. 
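The comment above describes a localize/de-localize round trip; a minimal sketch of that property, assuming a range with no DST transition and values that are illustrative rather than from the patch:

    import pandas as pd

    idx = pd.date_range("2014-06-01", "2014-06-02", freq="15min")
    localized = idx.tz_localize("US/Eastern")
    assert localized.tz_localize(None).equals(idx)   # wall times survive the round trip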
- idx = date_range(start="2014-06-01", end="2014-08-30", freq="15T") - tz = tz_aware_fixture - localized = idx.tz_localize(tz) - # can't localize a tz-aware object - with pytest.raises( - TypeError, match="Already tz-aware, use tz_convert to convert" - ): - localized.tz_localize(tz) - reset = localized.tz_localize(None) - assert reset.tzinfo is None - expected = idx._with_freq(None) - tm.assert_index_equal(reset, expected) - - def test_dti_tz_localize_naive(self): - rng = date_range("1/1/2011", periods=100, freq="H") - - conv = rng.tz_localize("US/Pacific") - exp = date_range("1/1/2011", periods=100, freq="H", tz="US/Pacific") - - tm.assert_index_equal(conv, exp._with_freq(None)) - - def test_dti_tz_localize_tzlocal(self): - # GH#13583 - offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) - offset = int(offset.total_seconds() * 1000000000) - - dti = date_range(start="2001-01-01", end="2001-03-01") - dti2 = dti.tz_localize(dateutil.tz.tzlocal()) - tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8) - - dti = date_range(start="2001-01-01", end="2001-03-01", tz=dateutil.tz.tzlocal()) - dti2 = dti.tz_localize(None) - tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) - - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) - def test_dti_tz_localize_ambiguous_nat(self, tz): - times = [ - "11/06/2011 00:00", - "11/06/2011 01:00", - "11/06/2011 01:00", - "11/06/2011 02:00", - "11/06/2011 03:00", - ] - di = DatetimeIndex(times) - localized = di.tz_localize(tz, ambiguous="NaT") - - times = [ - "11/06/2011 00:00", - np.nan, - np.nan, - "11/06/2011 02:00", - "11/06/2011 03:00", - ] - di_test = DatetimeIndex(times, tz="US/Eastern") - - # left dtype is datetime64[ns, US/Eastern] - # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] - tm.assert_numpy_array_equal(di_test.values, localized.values) - - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) - def test_dti_tz_localize_ambiguous_flags(self, tz): - # November 6, 2011, fall back, repeat 2 AM hour - - # Pass in flags to determine right dst transition - dr = date_range( - datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour(), tz=tz - ) - times = [ - "11/06/2011 00:00", - "11/06/2011 01:00", - "11/06/2011 01:00", - "11/06/2011 02:00", - "11/06/2011 03:00", - ] - - # Test tz_localize - di = DatetimeIndex(times) - is_dst = [1, 1, 0, 0, 0] - localized = di.tz_localize(tz, ambiguous=is_dst) - expected = dr._with_freq(None) - tm.assert_index_equal(expected, localized) - tm.assert_index_equal(expected, DatetimeIndex(times, tz=tz, ambiguous=is_dst)) - - localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) - tm.assert_index_equal(dr, localized) - - localized = di.tz_localize(tz, ambiguous=np.array(is_dst).astype("bool")) - tm.assert_index_equal(dr, localized) - - # Test constructor - localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst) - tm.assert_index_equal(dr, localized) - - # Test duplicate times where inferring the dst fails - times += times - di = DatetimeIndex(times) - - # When the sizes are incompatible, make sure error is raised - msg = "Length of ambiguous bool-array must be the same size as vals" - with pytest.raises(Exception, match=msg): - di.tz_localize(tz, ambiguous=is_dst) - - # When sizes are compatible and there are repeats ('infer' won't work) - is_dst = np.hstack((is_dst, is_dst)) - localized = di.tz_localize(tz, ambiguous=is_dst) - dr = dr.append(dr) - tm.assert_index_equal(dr, localized) - - # When there is no dst 
transition, nothing special happens - dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=pd.offsets.Hour()) - is_dst = np.array([1] * 10) - localized = dr.tz_localize(tz) - localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) - tm.assert_index_equal(localized, localized_is_dst) - - # TODO: belongs outside tz_localize tests? - @pytest.mark.parametrize("tz", ["Europe/London", "dateutil/Europe/London"]) - def test_dti_construction_ambiguous_endpoint(self, tz): - # construction with an ambiguous end-point - # GH#11626 - - with pytest.raises(pytz.AmbiguousTimeError, match="Cannot infer dst time"): - date_range( - "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="H" - ) - - times = date_range( - "2013-10-26 23:00", "2013-10-27 01:00", freq="H", tz=tz, ambiguous="infer" - ) - assert times[0] == Timestamp("2013-10-26 23:00", tz=tz) - assert times[-1] == Timestamp("2013-10-27 01:00:00+0000", tz=tz) - - @pytest.mark.parametrize( - "tz, option, expected", - [ - ["US/Pacific", "shift_forward", "2019-03-10 03:00"], - ["dateutil/US/Pacific", "shift_forward", "2019-03-10 03:00"], - ["US/Pacific", "shift_backward", "2019-03-10 01:00"], - ["dateutil/US/Pacific", "shift_backward", "2019-03-10 01:00"], - ["US/Pacific", timedelta(hours=1), "2019-03-10 03:00"], - ], - ) - def test_dti_construction_nonexistent_endpoint(self, tz, option, expected): - # construction with an nonexistent end-point - - with pytest.raises(pytz.NonExistentTimeError, match="2019-03-10 02:00:00"): - date_range( - "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="H" - ) - - times = date_range( - "2019-03-10 00:00", "2019-03-10 02:00", freq="H", tz=tz, nonexistent=option - ) - assert times[-1] == Timestamp(expected, tz=tz) - - def test_dti_tz_localize_bdate_range(self): - dr = bdate_range("1/1/2009", "1/1/2010") - dr_utc = bdate_range("1/1/2009", "1/1/2010", tz=pytz.utc) - localized = dr.tz_localize(pytz.utc) - tm.assert_index_equal(dr_utc, localized) - - @pytest.mark.parametrize( - "start_ts, tz, end_ts, shift", - [ - ["2015-03-29 02:20:00", "Europe/Warsaw", "2015-03-29 03:00:00", "forward"], - [ - "2015-03-29 02:20:00", - "Europe/Warsaw", - "2015-03-29 01:59:59.999999999", - "backward", - ], - [ - "2015-03-29 02:20:00", - "Europe/Warsaw", - "2015-03-29 03:20:00", - timedelta(hours=1), - ], - [ - "2015-03-29 02:20:00", - "Europe/Warsaw", - "2015-03-29 01:20:00", - timedelta(hours=-1), - ], - ["2018-03-11 02:33:00", "US/Pacific", "2018-03-11 03:00:00", "forward"], - [ - "2018-03-11 02:33:00", - "US/Pacific", - "2018-03-11 01:59:59.999999999", - "backward", - ], - [ - "2018-03-11 02:33:00", - "US/Pacific", - "2018-03-11 03:33:00", - timedelta(hours=1), - ], - [ - "2018-03-11 02:33:00", - "US/Pacific", - "2018-03-11 01:33:00", - timedelta(hours=-1), - ], - ], - ) - @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) - def test_dti_tz_localize_nonexistent_shift( - self, start_ts, tz, end_ts, shift, tz_type - ): - # GH 8917 - tz = tz_type + tz - if isinstance(shift, str): - shift = "shift_" + shift - dti = DatetimeIndex([Timestamp(start_ts)]) - result = dti.tz_localize(tz, nonexistent=shift) - expected = DatetimeIndex([Timestamp(end_ts)]).tz_localize(tz) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("offset", [-1, 1]) - def test_dti_tz_localize_nonexistent_shift_invalid(self, offset, warsaw): - # GH 8917 - tz = warsaw - dti = DatetimeIndex([Timestamp("2015-03-29 02:20:00")]) - msg = "The provided timedelta will relocalize on a nonexistent time" - with pytest.raises(ValueError, 
match=msg): - dti.tz_localize(tz, nonexistent=timedelta(seconds=offset)) - - # ------------------------------------------------------------- - # DatetimeIndex.normalize - - def test_normalize_tz(self): - rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz="US/Eastern") - - result = rng.normalize() # does not preserve freq - expected = date_range("1/1/2000", periods=10, freq="D", tz="US/Eastern") - tm.assert_index_equal(result, expected._with_freq(None)) - - assert result.is_normalized - assert not rng.is_normalized - - rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz="UTC") - - result = rng.normalize() - expected = date_range("1/1/2000", periods=10, freq="D", tz="UTC") - tm.assert_index_equal(result, expected) - - assert result.is_normalized - assert not rng.is_normalized - - rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz=tzlocal()) - result = rng.normalize() # does not preserve freq - expected = date_range("1/1/2000", periods=10, freq="D", tz=tzlocal()) - tm.assert_index_equal(result, expected._with_freq(None)) - - assert result.is_normalized - assert not rng.is_normalized - - @td.skip_if_windows - @pytest.mark.parametrize( - "timezone", - [ - "US/Pacific", - "US/Eastern", - "UTC", - "Asia/Kolkata", - "Asia/Shanghai", - "Australia/Canberra", - ], - ) - def test_normalize_tz_local(self, timezone): - # GH#13459 - with tm.set_timezone(timezone): - rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz=tzlocal()) - - result = rng.normalize() - expected = date_range("1/1/2000", periods=10, freq="D", tz=tzlocal()) - expected = expected._with_freq(None) - tm.assert_index_equal(result, expected) - - assert result.is_normalized - assert not rng.is_normalized - - # ------------------------------------------------------------ - # DatetimeIndex.__new__ - - @pytest.mark.parametrize("prefix", ["", "dateutil/"]) - def test_dti_constructor_static_tzinfo(self, prefix): - # it works! 
- index = DatetimeIndex([datetime(2012, 1, 1)], tz=prefix + "EST") - index.hour - index[0] - - def test_dti_constructor_with_fixed_tz(self): - off = FixedOffset(420, "+07:00") - start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) - end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) - rng = date_range(start=start, end=end) - assert off == rng.tz - - rng2 = date_range(start, periods=len(rng), tz=off) - tm.assert_index_equal(rng, rng2) - - rng3 = date_range("3/11/2012 05:00:00+07:00", "6/11/2012 05:00:00+07:00") - assert (rng.values == rng3.values).all() - - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_dti_convert_datetime_list(self, tzstr): - dr = date_range("2012-06-02", periods=10, tz=tzstr, name="foo") - dr2 = DatetimeIndex(list(dr), name="foo", freq="D") - tm.assert_index_equal(dr, dr2) - - def test_dti_construction_univalent(self): - rng = date_range("03/12/2012 00:00", periods=10, freq="W-FRI", tz="US/Eastern") - rng2 = DatetimeIndex(data=rng, tz="US/Eastern") - tm.assert_index_equal(rng, rng2) - - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) - def test_dti_from_tzaware_datetime(self, tz): - d = [datetime(2012, 8, 19, tzinfo=tz)] - - index = DatetimeIndex(d) - assert timezones.tz_compare(index.tz, tz) - - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_dti_tz_constructors(self, tzstr): - """Test different DatetimeIndex constructions with timezone - Follow-up of GH#4229 - """ - arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] - - idx1 = to_datetime(arr).tz_localize(tzstr) - idx2 = date_range(start="2005-11-10 08:00:00", freq="H", periods=2, tz=tzstr) - idx2 = idx2._with_freq(None) # the others all have freq=None - idx3 = DatetimeIndex(arr, tz=tzstr) - idx4 = DatetimeIndex(np.array(arr), tz=tzstr) - - for other in [idx2, idx3, idx4]: - tm.assert_index_equal(idx1, other) - - # ------------------------------------------------------------- # Unsorted - @pytest.mark.parametrize( - "dtype", - [None, "datetime64[ns, CET]", "datetime64[ns, EST]", "datetime64[ns, UTC]"], - ) - def test_date_accessor(self, dtype): - # Regression test for GH#21230 - expected = np.array([date(2018, 6, 4), pd.NaT]) - - index = DatetimeIndex(["2018-06-04 10:00:00", pd.NaT], dtype=dtype) - result = index.date - - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize( - "dtype", - [None, "datetime64[ns, CET]", "datetime64[ns, EST]", "datetime64[ns, UTC]"], - ) - def test_time_accessor(self, dtype): - # Regression test for GH#21267 - expected = np.array([time(10, 20, 30), pd.NaT]) - - index = DatetimeIndex(["2018-06-04 10:20:30", pd.NaT], dtype=dtype) - result = index.time - - tm.assert_numpy_array_equal(result, expected) - - def test_timetz_accessor(self, tz_naive_fixture): - # GH21358 - tz = timezones.maybe_get_tz(tz_naive_fixture) - - expected = np.array([time(10, 20, 30, tzinfo=tz), pd.NaT]) - - index = DatetimeIndex(["2018-06-04 10:20:30", pd.NaT], tz=tz) - result = index.timetz - - tm.assert_numpy_array_equal(result, expected) - def test_dti_drop_dont_lose_tz(self): # GH#2621 ind = date_range("2012-12-01", periods=10, tz="utc") @@ -877,9 +63,9 @@ def test_dti_tz_conversion_freq(self, tz_naive_fixture): # GH25241 - t3 = DatetimeIndex(["2019-01-01 10:00"], freq="H") + t3 = DatetimeIndex(["2019-01-01 10:00"], freq="h") assert t3.tz_localize(tz=tz_naive_fixture).freq == t3.freq - t4 = DatetimeIndex(["2019-01-02 12:00"], tz="UTC", freq="T") + t4 = DatetimeIndex(["2019-01-02 12:00"], 
tz="UTC", freq="min") assert t4.tz_convert(tz="UTC").freq == t4.freq def test_drop_dst_boundary(self): @@ -906,7 +92,7 @@ "201710290245", "201710290300", ], - tz=tz, + dtype="M8[ns, Europe/Brussels]", freq=freq, ambiguous=[ True, @@ -926,10 +112,14 @@ result = index.drop(index[0]) tm.assert_index_equal(result, expected) - def test_date_range_localize(self): - rng = date_range("3/11/2012 03:00", periods=15, freq="H", tz="US/Eastern") - rng2 = DatetimeIndex(["3/11/2012 03:00", "3/11/2012 04:00"], tz="US/Eastern") - rng3 = date_range("3/11/2012 03:00", periods=15, freq="H") + def test_date_range_localize(self, unit): + rng = date_range( + "3/11/2012 03:00", periods=15, freq="h", tz="US/Eastern", unit=unit + ) + rng2 = DatetimeIndex( + ["3/11/2012 03:00", "3/11/2012 04:00"], dtype=f"M8[{unit}, US/Eastern]" + ) + rng3 = date_range("3/11/2012 03:00", periods=15, freq="h", unit=unit) rng3 = rng3.tz_localize("US/Eastern") tm.assert_index_equal(rng._with_freq(None), rng3) @@ -943,10 +133,15 @@ assert val == exp # same UTC value tm.assert_index_equal(rng[:2], rng2) + def test_date_range_localize2(self, unit): # Right before the DST transition - rng = date_range("3/11/2012 00:00", periods=2, freq="H", tz="US/Eastern") + rng = date_range( + "3/11/2012 00:00", periods=2, freq="h", tz="US/Eastern", unit=unit + ) rng2 = DatetimeIndex( - ["3/11/2012 00:00", "3/11/2012 01:00"], tz="US/Eastern", freq="H" + ["3/11/2012 00:00", "3/11/2012 01:00"], + dtype=f"M8[{unit}, US/Eastern]", + freq="h", ) tm.assert_index_equal(rng, rng2) exp = Timestamp("3/11/2012 00:00", tz="US/Eastern") @@ -956,7 +151,9 @@ assert exp.hour == 1 assert rng[1] == exp - rng = date_range("3/11/2012 00:00", periods=10, freq="H", tz="US/Eastern") + rng = date_range( + "3/11/2012 00:00", periods=10, freq="h", tz="US/Eastern", unit=unit + ) assert rng[2].hour == 3 def test_timestamp_equality_different_timezones(self): @@ -973,19 +170,9 @@ assert (utc_range == berlin_range).all() assert (berlin_range == eastern_range).all() - def test_dti_intersection(self): - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") - - left = rng[10:90][::-1] - right = rng[20:80][::-1] - - assert left.tz == rng.tz - result = left.intersection(right) - assert result.tz == left.tz - def test_dti_equals_with_tz(self): - left = date_range("1/1/2011", periods=100, freq="H", tz="utc") - right = date_range("1/1/2011", periods=100, freq="H", tz="US/Eastern") + left = date_range("1/1/2011", periods=100, freq="h", tz="utc") + right = date_range("1/1/2011", periods=100, freq="h", tz="US/Eastern") assert not left.equals(right) @@ -997,46 +184,10 @@ assert idx[0].tzinfo is not None @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_dti_astype_asobject_tzinfos(self, tzstr): - # GH#1345 - - # dates around a dst transition - rng = date_range("2/13/2010", "5/6/2010", tz=tzstr) - - objs = rng.astype(object) - for i, x in enumerate(objs): - exval = rng[i] - assert x == exval - assert x.tzinfo == exval.tzinfo - - objs = rng.astype(object) - for i, x in enumerate(objs): - exval = rng[i] - assert x == exval - assert x.tzinfo == exval.tzinfo - - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_dti_with_timezone_repr(self, tzstr): - rng = date_range("4/13/2010", "5/6/2010") - - rng_eastern = rng.tz_localize(tzstr) - - rng_repr = repr(rng_eastern) - assert "2010-04-13 00:00:00" in rng_repr - - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_dti_take_dont_lose_meta(self, 
tzstr): - rng = date_range("1/1/2000", periods=20, tz=tzstr) - - result = rng.take(range(5)) - assert result.tz == rng.tz - assert result.freq == rng.freq - - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_utc_box_timestamp_and_localize(self, tzstr): tz = timezones.maybe_get_tz(tzstr) - rng = date_range("3/11/2012", "3/12/2012", freq="H", tz="utc") + rng = date_range("3/11/2012", "3/12/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tzstr) expected = rng[-1].astimezone(tz) @@ -1046,7 +197,7 @@ assert stamp.tzinfo == expected.tzinfo # right tzinfo - rng = date_range("3/13/2012", "3/14/2012", freq="H", tz="utc") + rng = date_range("3/13/2012", "3/14/2012", freq="h", tz="utc") rng_eastern = rng.tz_convert(tzstr) # test not valid for dateutil timezones. # assert 'EDT' in repr(rng_eastern[0].tzinfo) @@ -1054,36 +205,6 @@ rng_eastern[0].tzinfo ) - def test_dti_to_pydatetime(self): - dt = dateutil.parser.parse("2012-06-13T01:39:00Z") - dt = dt.replace(tzinfo=tzlocal()) - - arr = np.array([dt], dtype=object) - - result = to_datetime(arr, utc=True) - assert result.tz is timezone.utc - - rng = date_range("2012-11-03 03:00", "2012-11-05 03:00", tz=tzlocal()) - arr = rng.to_pydatetime() - result = to_datetime(arr, utc=True) - assert result.tz is timezone.utc - - def test_dti_to_pydatetime_fizedtz(self): - dates = np.array( - [ - datetime(2000, 1, 1, tzinfo=fixed_off), - datetime(2000, 1, 2, tzinfo=fixed_off), - datetime(2000, 1, 3, tzinfo=fixed_off), - ] - ) - dti = DatetimeIndex(dates) - - result = dti.to_pydatetime() - tm.assert_numpy_array_equal(dates, result) - - result = dti._mpl_repr() - tm.assert_numpy_array_equal(dates, result) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Central"), gettz("US/Central")]) def test_with_tz(self, tz): # just want it to work @@ -1115,100 +236,16 @@ with pytest.raises(Exception, match=msg): bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), "1/1/2009", tz=tz) - @pytest.mark.parametrize("prefix", ["", "dateutil/"]) - def test_field_access_localize(self, prefix): - strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] - rng = DatetimeIndex(strdates, tz=prefix + "US/Eastern") - assert (rng.hour == 0).all() - - # a more unusual time zone, #1946 - dr = date_range( - "2011-10-02 00:00", freq="h", periods=10, tz=prefix + "America/Atikokan" - ) - - expected = Index(np.arange(10, dtype=np.int32)) - tm.assert_index_equal(dr.hour, expected) - @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_convert_tz_aware_datetime_datetime(self, tz): # GH#1581 dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)] dates_aware = [conversion.localize_pydatetime(x, tz) for x in dates] - result = DatetimeIndex(dates_aware) + result = DatetimeIndex(dates_aware).as_unit("ns") assert timezones.tz_compare(result.tz, tz) - converted = to_datetime(dates_aware, utc=True) + converted = to_datetime(dates_aware, utc=True).as_unit("ns") ex_vals = np.array([Timestamp(x).as_unit("ns")._value for x in dates_aware]) tm.assert_numpy_array_equal(converted.asi8, ex_vals) assert converted.tz is timezone.utc - - # Note: not difference, as there is no symmetry requirement there - @pytest.mark.parametrize("setop", ["union", "intersection", "symmetric_difference"]) - def test_dti_setop_aware(self, setop): - # non-overlapping - # GH#39328 as of 2.0 we cast these to UTC instead of object - rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", tz="US/Central") - - rng2 = date_range("2012-11-15 12:00:00", 
periods=6, freq="H", tz="US/Eastern") - - result = getattr(rng, setop)(rng2) - - left = rng.tz_convert("UTC") - right = rng2.tz_convert("UTC") - expected = getattr(left, setop)(right) - tm.assert_index_equal(result, expected) - assert result.tz == left.tz - if len(result): - assert result[0].tz is timezone.utc - assert result[-1].tz is timezone.utc - - def test_dti_union_mixed(self): - # GH 21671 - rng = DatetimeIndex([Timestamp("2011-01-01"), pd.NaT]) - rng2 = DatetimeIndex(["2012-01-01", "2012-01-02"], tz="Asia/Tokyo") - result = rng.union(rng2) - expected = Index( - [ - Timestamp("2011-01-01"), - pd.NaT, - Timestamp("2012-01-01", tz="Asia/Tokyo"), - Timestamp("2012-01-02", tz="Asia/Tokyo"), - ], - dtype=object, - ) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize( - "tz", [None, "UTC", "US/Central", dateutil.tz.tzoffset(None, -28800)] - ) - def test_iteration_preserves_nanoseconds(self, tz): - # GH 19603 - index = DatetimeIndex( - ["2018-02-08 15:00:00.168456358", "2018-02-08 15:00:00.168456359"], tz=tz - ) - for i, ts in enumerate(index): - assert ts == index[i] # pylint: disable=unnecessary-list-index-lookup - - -def test_tz_localize_invalidates_freq(): - # we only preserve freq in unambiguous cases - - # if localized to US/Eastern, this crosses a DST transition - dti = date_range("2014-03-08 23:00", "2014-03-09 09:00", freq="H") - assert dti.freq == "H" - - result = dti.tz_localize(None) # no-op - assert result.freq == "H" - - result = dti.tz_localize("UTC") # unambiguous freq preservation - assert result.freq == "H" - - result = dti.tz_localize("US/Eastern", nonexistent="shift_forward") - assert result.freq is None - assert result.inferred_freq is None # i.e. we are not _too_ strict here - - # Case where we _can_ keep freq because we're length==1 - dti2 = dti[:1] - result = dti2.tz_localize("US/Eastern") - assert result.freq == "H" diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_unique.py pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_unique.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/datetimes/test_unique.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/datetimes/test_unique.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,76 +0,0 @@ -from datetime import ( - datetime, - timedelta, -) - -from pandas import ( - DatetimeIndex, - NaT, - Timestamp, -) -import pandas._testing as tm - - -def test_unique(tz_naive_fixture): - idx = DatetimeIndex(["2017"] * 2, tz=tz_naive_fixture) - expected = idx[:1] - - result = idx.unique() - tm.assert_index_equal(result, expected) - # GH#21737 - # Ensure the underlying data is consistent - assert result[0] == expected[0] - - -def test_index_unique(rand_series_with_duplicate_datetimeindex): - dups = rand_series_with_duplicate_datetimeindex - index = dups.index - - uniques = index.unique() - expected = DatetimeIndex( - [ - datetime(2000, 1, 2), - datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - ] - ) - assert uniques.dtype == "M8[ns]" # sanity - tm.assert_index_equal(uniques, expected) - assert index.nunique() == 4 - - # GH#2563 - assert isinstance(uniques, DatetimeIndex) - - dups_local = index.tz_localize("US/Eastern") - dups_local.name = "foo" - result = dups_local.unique() - expected = DatetimeIndex(expected, name="foo") - expected = expected.tz_localize("US/Eastern") - assert result.tz is not None - assert result.name == "foo" - tm.assert_index_equal(result, expected) - - -def test_index_unique2(): - # NaT, note this is excluded - arr = [1370745748 + t 
for t in range(20)] + [NaT._value] - idx = DatetimeIndex(arr * 3) - tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) - assert idx.nunique() == 20 - assert idx.nunique(dropna=False) == 21 - - -def test_index_unique3(): - arr = [ - Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20) - ] + [NaT] - idx = DatetimeIndex(arr * 3) - tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) - assert idx.nunique() == 20 - assert idx.nunique(dropna=False) == 21 - - -def test_is_unique_monotonic(rand_series_with_duplicate_datetimeindex): - index = rand_series_with_duplicate_datetimeindex.index - assert not index.is_unique diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_base.py pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_base.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_base.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_base.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,56 +0,0 @@ -import numpy as np -import pytest - -from pandas import IntervalIndex -import pandas._testing as tm - - -class TestInterval: - """ - Tests specific to the shared common index tests; unrelated tests should be placed - in test_interval.py or the specific test file (e.g. test_astype.py) - """ - - @pytest.fixture - def simple_index(self) -> IntervalIndex: - return IntervalIndex.from_breaks(range(11), closed="right") - - @pytest.fixture - def index(self): - return tm.makeIntervalIndex(10) - - def test_take(self, closed): - index = IntervalIndex.from_breaks(range(11), closed=closed) - - result = index.take(range(10)) - tm.assert_index_equal(result, index) - - result = index.take([0, 0, 1]) - expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2], closed=closed) - tm.assert_index_equal(result, expected) - - def test_where(self, simple_index, listlike_box): - klass = listlike_box - - idx = simple_index - cond = [True] * len(idx) - expected = idx - result = expected.where(klass(cond)) - tm.assert_index_equal(result, expected) - - cond = [False] + [True] * len(idx[1:]) - expected = IntervalIndex([np.nan] + idx[1:].tolist()) - result = idx.where(klass(cond)) - tm.assert_index_equal(result, expected) - - def test_getitem_2d_deprecated(self, simple_index): - # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable - idx = simple_index - with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): - idx[:, None] - with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): - # GH#44051 - idx[True] - with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): - # GH#44051 - idx[False] diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,9 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + +from pandas.core.dtypes.common import is_unsigned_integer_dtype from pandas.core.dtypes.dtypes import IntervalDtype from pandas import ( @@ -256,6 +259,29 @@ tm.assert_index_equal(result.right, expected_right) assert result.dtype.subtype == expected_subtype + @pytest.mark.parametrize("interval_cls", [IntervalArray, IntervalIndex]) + def test_from_arrays_mismatched_datetimelike_resos(self, interval_cls): + # 
GH#55714 + left = date_range("2016-01-01", periods=3, unit="s") + right = date_range("2017-01-01", periods=3, unit="ms") + result = interval_cls.from_arrays(left, right) + expected = interval_cls.from_arrays(left.as_unit("ms"), right) + tm.assert_equal(result, expected) + + # td64 + left2 = left - left[0] + right2 = right - left[0] + result2 = interval_cls.from_arrays(left2, right2) + expected2 = interval_cls.from_arrays(left2.as_unit("ms"), right2) + tm.assert_equal(result2, expected2) + + # dt64tz + left3 = left.tz_localize("UTC") + right3 = right.tz_localize("UTC") + result3 = interval_cls.from_arrays(left3, right3) + expected3 = interval_cls.from_arrays(left3.as_unit("ms"), right3) + tm.assert_equal(result3, expected3) + class TestFromBreaks(ConstructorTests): """Tests specific to IntervalIndex.from_breaks""" @@ -307,7 +333,7 @@ converts intervals in breaks format to a dictionary of kwargs to specific to the format expected by IntervalIndex.from_tuples """ - if tm.is_unsigned_integer_dtype(breaks): + if is_unsigned_integer_dtype(breaks): pytest.skip(f"{breaks.dtype} not relevant IntervalIndex.from_tuples tests") if len(breaks) == 0: @@ -365,7 +391,7 @@ converts intervals in breaks format to a dictionary of kwargs to specific to the format expected by the IntervalIndex/Index constructors """ - if tm.is_unsigned_integer_dtype(breaks): + if is_unsigned_integer_dtype(breaks): pytest.skip(f"{breaks.dtype} not relevant for class constructor tests") if len(breaks) == 0: @@ -465,6 +491,23 @@ tm.assert_index_equal(result, expected) +@pytest.mark.parametrize("timezone", ["UTC", "US/Pacific", "GMT"]) +def test_interval_index_subtype(timezone, inclusive_endpoints_fixture): + # GH#46999 + dates = date_range("2022", periods=3, tz=timezone) + dtype = f"interval[datetime64[ns, {timezone}], {inclusive_endpoints_fixture}]" + result = IntervalIndex.from_arrays( + ["2022-01-01", "2022-01-02"], + ["2022-01-02", "2022-01-03"], + closed=inclusive_endpoints_fixture, + dtype=dtype, + ) + expected = IntervalIndex.from_arrays( + dates[:-1], dates[1:], closed=inclusive_endpoints_fixture + ) + tm.assert_index_equal(result, expected) + + def test_dtype_closed_mismatch(): # GH#38394 closed specified in both dtype and IntervalIndex constructor @@ -476,3 +519,17 @@ with pytest.raises(ValueError, match=msg): IntervalArray([], dtype=dtype, closed="neither") + + +@pytest.mark.parametrize( + "dtype", + ["Float64", pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow"))], +) +def test_ea_dtype(dtype): + # GH#56765 + bins = [(0.0, 0.4), (0.4, 0.6)] + interval_dtype = IntervalDtype(subtype=dtype, closed="left") + result = IntervalIndex.from_tuples(bins, closed="left", dtype=interval_dtype) + assert result.dtype == interval_dtype + expected = IntervalIndex.from_tuples(bins, closed="left").astype(interval_dtype) + tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_formats.py pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_formats.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_formats.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_formats.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,8 +1,11 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas import ( DataFrame, + DatetimeIndex, Index, Interval, IntervalIndex, @@ -14,15 +17,7 @@ class TestIntervalIndexRendering: - def test_frame_repr(self): - # https://github.com/pandas-dev/pandas/pull/24134/files - 
df = DataFrame( - {"A": [1, 2, 3, 4]}, index=IntervalIndex.from_breaks([0, 1, 2, 3, 4]) - ) - result = repr(df) - expected = " A\n(0, 1] 1\n(1, 2] 2\n(2, 3] 3\n(3, 4] 4" - assert result == expected - + # TODO: this is a test for DataFrame/Series, not IntervalIndex @pytest.mark.parametrize( "constructor,expected", [ @@ -38,13 +33,16 @@ (DataFrame, (" 0\n(0.0, 1.0] a\nNaN b\n(2.0, 3.0] c")), ], ) - def test_repr_missing(self, constructor, expected): + def test_repr_missing(self, constructor, expected, using_infer_string, request): # GH 25984 + if using_infer_string and constructor is Series: + request.applymarker(pytest.mark.xfail(reason="repr different")) index = IntervalIndex.from_tuples([(0, 1), np.nan, (2, 3)]) obj = constructor(list("abc"), index=index) result = repr(obj) assert result == expected + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") def test_repr_floats(self): # GH 32553 @@ -80,7 +78,11 @@ ((Timestamp("20180102"), Timestamp("20180103"))), ], "both", - ["[2018-01-01, 2018-01-02]", "NaN", "[2018-01-02, 2018-01-03]"], + [ + "[2018-01-01 00:00:00, 2018-01-02 00:00:00]", + "NaN", + "[2018-01-02 00:00:00, 2018-01-03 00:00:00]", + ], ), ( [ @@ -97,9 +99,21 @@ ), ], ) - def test_to_native_types(self, tuples, closed, expected_data): + def test_get_values_for_csv(self, tuples, closed, expected_data): # GH 28210 index = IntervalIndex.from_tuples(tuples, closed=closed) - result = index._format_native_types() + result = index._get_values_for_csv(na_rep="NaN") expected = np.array(expected_data) tm.assert_numpy_array_equal(result, expected) + + def test_timestamp_with_timezone(self, unit): + # GH 55035 + left = DatetimeIndex(["2020-01-01"], dtype=f"M8[{unit}, UTC]") + right = DatetimeIndex(["2020-01-02"], dtype=f"M8[{unit}, UTC]") + index = IntervalIndex.from_arrays(left, right) + result = repr(index) + expected = ( + "IntervalIndex([(2020-01-01 00:00:00+00:00, 2020-01-02 00:00:00+00:00]], " + f"dtype='interval[datetime64[{unit}, UTC], right]')" + ) + assert result == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_indexing.py pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_indexing.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -19,12 +19,75 @@ array, date_range, interval_range, + isna, period_range, timedelta_range, ) import pandas._testing as tm +class TestGetItem: + def test_getitem(self, closed): + idx = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) + assert idx[0] == Interval(0.0, 1.0, closed=closed) + assert idx[1] == Interval(1.0, 2.0, closed=closed) + assert isna(idx[2]) + + result = idx[0:1] + expected = IntervalIndex.from_arrays((0.0,), (1.0,), closed=closed) + tm.assert_index_equal(result, expected) + + result = idx[0:2] + expected = IntervalIndex.from_arrays((0.0, 1), (1.0, 2.0), closed=closed) + tm.assert_index_equal(result, expected) + + result = idx[1:3] + expected = IntervalIndex.from_arrays( + (1.0, np.nan), (2.0, np.nan), closed=closed + ) + tm.assert_index_equal(result, expected) + + def test_getitem_2d_deprecated(self): + # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable + idx = IntervalIndex.from_breaks(range(11), closed="right") + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + idx[:, None] + with pytest.raises(ValueError, match="multi-dimensional indexing not 
allowed"): + # GH#44051 + idx[True] + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + # GH#44051 + idx[False] + + +class TestWhere: + def test_where(self, listlike_box): + klass = listlike_box + + idx = IntervalIndex.from_breaks(range(11), closed="right") + cond = [True] * len(idx) + expected = idx + result = expected.where(klass(cond)) + tm.assert_index_equal(result, expected) + + cond = [False] + [True] * len(idx[1:]) + expected = IntervalIndex([np.nan] + idx[1:].tolist()) + result = idx.where(klass(cond)) + tm.assert_index_equal(result, expected) + + +class TestTake: + def test_take(self, closed): + index = IntervalIndex.from_breaks(range(11), closed=closed) + + result = index.take(range(10)) + tm.assert_index_equal(result, index) + + result = index.take([0, 0, 1]) + expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2], closed=closed) + tm.assert_index_equal(result, expected) + + class TestGetLoc: @pytest.mark.parametrize("side", ["right", "left", "both", "neither"]) def test_get_loc_interval(self, closed, side): @@ -300,16 +363,19 @@ def test_get_indexer_datetime(self): ii = IntervalIndex.from_breaks(date_range("2018-01-01", periods=4)) - result = ii.get_indexer(DatetimeIndex(["2018-01-02"])) + # TODO: with mismatched resolution get_indexer currently raises; + # this should probably coerce? + target = DatetimeIndex(["2018-01-02"], dtype="M8[ns]") + result = ii.get_indexer(target) expected = np.array([0], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).astype(str)) + result = ii.get_indexer(target.astype(str)) tm.assert_numpy_array_equal(result, expected) - # TODO this should probably be deprecated? # https://github.com/pandas-dev/pandas/issues/47772 - result = ii.get_indexer(DatetimeIndex(["2018-01-02"]).asi8) + result = ii.get_indexer(target.asi8) + expected = np.array([-1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_interval.py pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_interval.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_interval.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_interval.py 2024-04-10 17:42:52.000000000 +0000 @@ -86,8 +86,12 @@ [ [1, 1, 2, 5, 15, 53, 217, 1014, 5335, 31240, 201608], [-np.inf, -100, -10, 0.5, 1, 1.5, 3.8, 101, 202, np.inf], - pd.to_datetime(["20170101", "20170202", "20170303", "20170404"]), - pd.to_timedelta(["1ns", "2ms", "3s", "4min", "5H", "6D"]), + date_range("2017-01-01", "2017-01-04"), + pytest.param( + date_range("2017-01-01", "2017-01-04", unit="s"), + marks=pytest.mark.xfail(reason="mismatched result unit"), + ), + pd.to_timedelta(["1ns", "2ms", "3s", "4min", "5h", "6D"]), ], ) def test_length(self, closed, breaks): @@ -337,26 +341,6 @@ assert not index._is_strictly_monotonic_decreasing assert not index.is_monotonic_decreasing - def test_get_item(self, closed): - i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) - assert i[0] == Interval(0.0, 1.0, closed=closed) - assert i[1] == Interval(1.0, 2.0, closed=closed) - assert isna(i[2]) - - result = i[0:1] - expected = IntervalIndex.from_arrays((0.0,), (1.0,), closed=closed) - tm.assert_index_equal(result, expected) - - result = i[0:2] - expected = IntervalIndex.from_arrays((0.0, 1), (1.0, 2.0), closed=closed) - tm.assert_index_equal(result, expected) - - result = i[1:3] - expected = 
IntervalIndex.from_arrays( - (1.0, np.nan), (2.0, np.nan), closed=closed - ) - tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( "breaks", [ @@ -404,7 +388,7 @@ # GH 20636 index = IntervalIndex.from_breaks(breaks) - to_convert = breaks._constructor([pd.NaT] * 3) + to_convert = breaks._constructor([pd.NaT] * 3).as_unit("ns") expected = Index([np.nan] * 3, dtype=np.float64) result = index._maybe_convert_i8(to_convert) tm.assert_index_equal(result, expected) @@ -689,13 +673,13 @@ # test get_indexer start = Timestamp("1999-12-31T12:00", tz=tz) - target = date_range(start=start, periods=7, freq="12H") + target = date_range(start=start, periods=7, freq="12h") actual = index.get_indexer(target) expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype="intp") tm.assert_numpy_array_equal(actual, expected) start = Timestamp("2000-01-08T18:00", tz=tz) - target = date_range(start=start, periods=7, freq="6H") + target = date_range(start=start, periods=7, freq="6h") actual = index.get_indexer(target) expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype="intp") tm.assert_numpy_array_equal(actual, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_interval_range.py pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_interval_range.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_interval_range.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_interval_range.py 2024-04-10 17:42:52.000000000 +0000 @@ -58,7 +58,7 @@ @pytest.mark.parametrize("tz", [None, "US/Eastern"]) @pytest.mark.parametrize( - "freq, periods", [("D", 364), ("2D", 182), ("22D18H", 16), ("M", 11)] + "freq, periods", [("D", 364), ("2D", 182), ("22D18h", 16), ("ME", 11)] ) def test_constructor_timestamp(self, closed, name, freq, periods, tz): start, end = Timestamp("20180101", tz=tz), Timestamp("20181231", tz=tz) @@ -84,16 +84,14 @@ tm.assert_index_equal(result, expected) # GH 20976: linspace behavior defined from start/end/periods - if not breaks.freq.is_anchored() and tz is None: - # matches expected only for non-anchored offsets and tz naive - # (anchored/DST transitions cause unequal spacing in expected) + if not breaks.freq.n == 1 and tz is None: result = interval_range( start=start, end=end, periods=periods, name=name, closed=closed ) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "freq, periods", [("D", 100), ("2D12H", 40), ("5D", 20), ("25D", 4)] + "freq, periods", [("D", 100), ("2D12h", 40), ("5D", 20), ("25D", 4)] ) def test_constructor_timedelta(self, closed, name, freq, periods): start, end = Timedelta("0 days"), Timedelta("100 days") @@ -130,7 +128,7 @@ (0, 10, 3, 9), (0, 10, 1.5, 9), (0.5, 10, 3, 9.5), - (Timedelta("0D"), Timedelta("10D"), "2D4H", Timedelta("8D16H")), + (Timedelta("0D"), Timedelta("10D"), "2D4h", Timedelta("8D16h")), ( Timestamp("2018-01-01"), Timestamp("2018-02-09"), @@ -140,7 +138,7 @@ ( Timestamp("2018-01-01", tz="US/Eastern"), Timestamp("2018-01-20", tz="US/Eastern"), - "5D12H", + "5D12h", Timestamp("2018-01-17 12:00:00", tz="US/Eastern"), ), ], @@ -184,6 +182,9 @@ def test_linspace_dst_transition(self, start, mid, end): # GH 20976: linspace behavior defined from start/end/periods # accounts for the hour gained/lost during DST transition + start = start.as_unit("ns") + mid = mid.as_unit("ns") + end = end.as_unit("ns") result = interval_range(start=start, end=end, periods=2) expected = IntervalIndex.from_breaks([start, mid, end]) tm.assert_index_equal(result, expected) @@ -219,12 +220,15 @@ expected = 
"int64" if is_integer(start + end) else "float64" assert result == expected - def test_constructor_coverage(self): + def test_interval_range_fractional_period(self): # float value for periods expected = interval_range(start=0, periods=10) - result = interval_range(start=0, periods=10.5) + msg = "Non-integer 'periods' in pd.date_range, .* pd.interval_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = interval_range(start=0, periods=10.5) tm.assert_index_equal(result, expected) + def test_constructor_coverage(self): # equivalent timestamp-like start/end start, end = Timestamp("2017-01-01"), Timestamp("2017-01-15") expected = interval_range(start=start, end=end) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_setops.py pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_setops.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/interval/test_setops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/interval/test_setops.py 2024-04-10 17:42:52.000000000 +0000 @@ -25,14 +25,16 @@ expected = monotonic_index(0, 13, closed=closed) result = index[::-1].union(other, sort=sort) - if sort is None: + if sort in (None, True): tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) result = other[::-1].union(index, sort=sort) - if sort is None: + if sort in (None, True): tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) tm.assert_index_equal(index.union(index, sort=sort), index) tm.assert_index_equal(index.union(index[:1], sort=sort), index) @@ -65,14 +67,16 @@ expected = monotonic_index(5, 11, closed=closed) result = index[::-1].intersection(other, sort=sort) - if sort is None: + if sort in (None, True): tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) result = other[::-1].intersection(index, sort=sort) - if sort is None: + if sort in (None, True): tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) tm.assert_index_equal(index.intersection(index, sort=sort), index) @@ -148,16 +152,18 @@ index = monotonic_index(0, 11, closed=closed) result = index[1:].symmetric_difference(index[:-1], sort=sort) expected = IntervalIndex([index[0], index[-1]]) - if sort is None: + if sort in (None, True): tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) # GH 19101: empty result, same dtype result = index.symmetric_difference(index, sort=sort) expected = empty_index(dtype="int64", closed=closed) - if sort is None: + if sort in (None, True): tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) # GH 19101: empty result, different dtypes other = IntervalIndex.from_arrays( diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/conftest.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,6 @@ import numpy as np import pytest -import pandas as pd from pandas import ( Index, MultiIndex, 
@@ -26,52 +25,3 @@ verify_integrity=False, ) return mi - - -@pytest.fixture -def idx_dup(): - # compare tests/indexes/multi/conftest.py - major_axis = Index(["foo", "bar", "baz", "qux"]) - minor_axis = Index(["one", "two"]) - - major_codes = np.array([0, 0, 1, 0, 1, 1]) - minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index_names = ["first", "second"] - mi = MultiIndex( - levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=index_names, - verify_integrity=False, - ) - return mi - - -@pytest.fixture -def index_names(): - # names that match those in the idx fixture for testing equality of - # names assigned to the idx - return ["first", "second"] - - -@pytest.fixture -def narrow_multi_index(): - """ - Return a MultiIndex that is narrower than the display (<80 characters). - """ - n = 1000 - ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) - dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) - return MultiIndex.from_arrays([ci, ci.codes + 9, dti], names=["a", "b", "dti"]) - - -@pytest.fixture -def wide_multi_index(): - """ - Return a MultiIndex that is wider than the display (>80 characters). - """ - n = 1000 - ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) - dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) - levels = [ci, ci.codes + 9, dti, dti, dti] - names = ["a", "b", "dti_1", "dti_2", "dti_3"] - return MultiIndex.from_arrays(levels, names=names) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_analytics.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_analytics.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_analytics.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_analytics.py 2024-04-10 17:42:52.000000000 +0000 @@ -98,8 +98,8 @@ def test_append_mixed_dtypes(): # GH 13660 - dti = date_range("2011-01-01", freq="M", periods=3) - dti_tz = date_range("2011-01-01", freq="M", periods=3, tz="US/Eastern") + dti = date_range("2011-01-01", freq="ME", periods=3) + dti_tz = date_range("2011-01-01", freq="ME", periods=3, tz="US/Eastern") pi = period_range("2011-01", freq="M", periods=3) mi = MultiIndex.from_arrays( diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_compat.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_compat.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_compat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_compat.py 2024-04-10 17:42:52.000000000 +0000 @@ -87,7 +87,7 @@ def test_boxable_categorical_values(): - cat = pd.Categorical(pd.date_range("2012-01-01", periods=3, freq="H")) + cat = pd.Categorical(pd.date_range("2012-01-01", periods=3, freq="h")) result = MultiIndex.from_product([["a", "b", "c"], cat]).values expected = pd.Series( [ diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas.compat import pa_version_under7p0 - from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike import pandas as pd @@ -203,15 +201,15 @@ [ ( pd.period_range("2011-01-01", freq="D", periods=3), - pd.period_range("2015-01-01", freq="H", periods=3), + pd.period_range("2015-01-01", freq="h", periods=3), ), ( date_range("2015-01-01 
10:00", freq="D", periods=3, tz="US/Eastern"), - date_range("2015-01-01 10:00", freq="H", periods=3, tz="Asia/Tokyo"), + date_range("2015-01-01 10:00", freq="h", periods=3, tz="Asia/Tokyo"), ), ( pd.timedelta_range("1 days", freq="D", periods=3), - pd.timedelta_range("2 hours", freq="H", periods=3), + pd.timedelta_range("2 hours", freq="h", periods=3), ), ], ) @@ -229,7 +227,7 @@ def test_from_arrays_index_datetimelike_mixed(): idx1 = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern") - idx2 = date_range("2015-01-01 10:00", freq="H", periods=3) + idx2 = date_range("2015-01-01 10:00", freq="h", periods=3) idx3 = pd.timedelta_range("1 days", freq="D", periods=3) idx4 = pd.period_range("2011-01-01", freq="D", periods=3) @@ -648,10 +646,9 @@ tm.assert_index_equal(expected, result) -@pytest.mark.skipif(pa_version_under7p0, reason="minimum pyarrow not installed") def test_from_frame_missing_values_multiIndex(): # GH 39984 - import pyarrow as pa + pa = pytest.importorskip("pyarrow") df = pd.DataFrame( { @@ -778,7 +775,7 @@ idx1 = pd.DatetimeIndex( ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo" ) - idx2 = date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern") + idx2 = date_range("2010/01/01", periods=6, freq="ME", tz="US/Eastern") idx = MultiIndex.from_arrays([idx1, idx2]) expected1 = pd.DatetimeIndex( @@ -850,11 +847,14 @@ assert lev.dtype == object -def test_dtype_representation(): +def test_dtype_representation(using_infer_string): # GH#46900 pmidx = MultiIndex.from_arrays([[1], ["a"]], names=[("a", "b"), ("c", "d")]) result = pmidx.dtypes + exp = "object" if not using_infer_string else "string" expected = Series( - ["int64", "object"], index=MultiIndex.from_tuples([("a", "b"), ("c", "d")]) + ["int64", exp], + index=MultiIndex.from_tuples([("a", "b"), ("c", "d")]), + dtype=object, ) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_duplicates.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_duplicates.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_duplicates.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_duplicates.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,12 +11,31 @@ from pandas import ( NA, DatetimeIndex, + Index, MultiIndex, Series, ) import pandas._testing as tm +@pytest.fixture +def idx_dup(): + # compare tests/indexes/multi/conftest.py + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) + + major_codes = np.array([0, 0, 1, 0, 1, 1]) + minor_codes = np.array([0, 1, 0, 1, 0, 1]) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) + return mi + + @pytest.mark.parametrize("names", [None, ["first", "second"]]) def test_unique(names): mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names) @@ -238,7 +257,7 @@ def test_duplicated_hashtable_impl(keep, monkeypatch): # GH 9125 n, k = 6, 10 - levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)] + levels = [np.arange(n), [str(i) for i in range(n)], 1000 + np.arange(n)] codes = [np.random.default_rng(2).choice(n, k * n) for _ in levels] with monkeypatch.context() as m: m.setattr(libindex, "_SIZE_CUTOFF", 50) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_formats.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_formats.py --- 
pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_formats.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_formats.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,24 +6,31 @@ Index, MultiIndex, ) +import pandas._testing as tm def test_format(idx): - idx.format() - idx[:0].format() + msg = "MultiIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx.format() + idx[:0].format() def test_format_integer_names(): index = MultiIndex( levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1] ) - index.format(names=True) + msg = "MultiIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + index.format(names=True) def test_format_sparse_config(idx): # GH1538 + msg = "MultiIndex.format is deprecated" with pd.option_context("display.multi_sparse", False): - result = idx.format() + with tm.assert_produces_warning(FutureWarning, match=msg): + result = idx.format() assert result[1] == "foo two" @@ -37,8 +44,9 @@ [0, 0, 0, 0, 0, 0], ], ) - - result = index.format() + msg = "MultiIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = index.format() assert result[3] == "1 0 0 0" @@ -131,8 +139,11 @@ names=['first', ...], length=6)""" assert result == expected - def test_rjust(self, narrow_multi_index): - mi = narrow_multi_index + def test_rjust(self): + n = 1000 + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) + mi = MultiIndex.from_arrays([ci, ci.codes + 9, dti], names=["a", "b", "dti"]) result = mi[:1].__repr__() expected = """\ MultiIndex([('a', 9, '2000-01-01 00:00:00')], @@ -174,8 +185,13 @@ names=['a', 'b', 'dti'], length=2000)""" assert result == expected - def test_tuple_width(self, wide_multi_index): - mi = wide_multi_index + def test_tuple_width(self): + n = 1000 + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) + levels = [ci, ci.codes + 9, dti, dti, dti] + names = ["a", "b", "dti_1", "dti_2", "dti_3"] + mi = MultiIndex.from_arrays(levels, names=names) result = mi[:1].__repr__() expected = """MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...)], names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])""" # noqa: E501 @@ -221,3 +237,13 @@ ('abc', 10, '2000-01-01 00:33:19', '2000-01-01 00:33:19', ...)], names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'], length=2000)""" assert result == expected + + def test_multiindex_long_element(self): + # Non-regression test towards GH#52960 + data = MultiIndex.from_tuples([("c" * 62,)]) + + expected = ( + "MultiIndex([('cccccccccccccccccccccccccccccccccccccccc" + "cccccccccccccccccccccc',)],\n )" + ) + assert str(data) == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_get_level_values.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_get_level_values.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_get_level_values.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_get_level_values.py 2024-04-10 17:42:52.000000000 +0000 @@ -115,7 +115,7 @@ def test_values_loses_freq_of_underlying_index(): # GH#49054 - idx = pd.DatetimeIndex(date_range("20200101", periods=3, freq="BM")) + idx = pd.DatetimeIndex(date_range("20200101", periods=3, freq="BME")) expected = idx.copy(deep=True) idx2 = Index([1, 2, 3]) midx = MultiIndex(levels=[idx, idx2], codes=[[0, 1, 2], [0, 1, 2]]) diff -Nru 
pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_get_set.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_get_set.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_get_set.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_get_set.py 2024-04-10 17:42:52.000000000 +0000 @@ -34,23 +34,25 @@ idx._get_level_number("fourth") -def test_get_dtypes(): +def test_get_dtypes(using_infer_string): # Test MultiIndex.dtypes (# Gh37062) idx_multitype = MultiIndex.from_product( [[1, 2, 3], ["a", "b", "c"], pd.date_range("20200101", periods=2, tz="UTC")], names=["int", "string", "dt"], ) + + exp = "object" if not using_infer_string else "string" expected = pd.Series( { "int": np.dtype("int64"), - "string": np.dtype("O"), + "string": exp, "dt": DatetimeTZDtype(tz="utc"), } ) tm.assert_series_equal(expected, idx_multitype.dtypes) -def test_get_dtypes_no_level_name(): +def test_get_dtypes_no_level_name(using_infer_string): # Test MultiIndex.dtypes (# GH38580 ) idx_multitype = MultiIndex.from_product( [ @@ -59,17 +61,18 @@ pd.date_range("20200101", periods=2, tz="UTC"), ], ) + exp = "object" if not using_infer_string else "string" expected = pd.Series( { "level_0": np.dtype("int64"), - "level_1": np.dtype("O"), + "level_1": exp, "level_2": DatetimeTZDtype(tz="utc"), } ) tm.assert_series_equal(expected, idx_multitype.dtypes) -def test_get_dtypes_duplicate_level_names(): +def test_get_dtypes_duplicate_level_names(using_infer_string): # Test MultiIndex.dtypes with non-unique level names (# GH45174) result = MultiIndex.from_product( [ @@ -79,8 +82,9 @@ ], names=["A", "A", "A"], ).dtypes + exp = "object" if not using_infer_string else "string" expected = pd.Series( - [np.dtype("int64"), np.dtype("O"), DatetimeTZDtype(tz="utc")], + [np.dtype("int64"), exp, DatetimeTZDtype(tz="utc")], index=["A", "A", "A"], ) tm.assert_series_equal(result, expected) @@ -95,8 +99,9 @@ frame.index._get_level_number(-3) -def test_set_name_methods(idx, index_names): +def test_set_name_methods(idx): # so long as these are synonyms, we don't need to test set_names + index_names = ["first", "second"] assert idx.rename == idx.set_names new_names = [name + "SUFFIX" for name in index_names] ind = idx.set_names(new_names) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_indexing.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_indexing.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas._libs import index as libindex from pandas.errors import ( InvalidIndexError, PerformanceWarning, @@ -12,6 +13,7 @@ import pandas as pd from pandas import ( Categorical, + DataFrame, Index, MultiIndex, date_range, @@ -36,7 +38,11 @@ assert result == (2, 4) def test_slice_locs(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((50, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=50, freq="B"), + ) stacked = df.stack(future_stack=True) idx = stacked.index @@ -56,14 +62,22 @@ tm.assert_almost_equal(sliced.values, expected.values) def test_slice_locs_with_type_mismatch(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) stacked = 
df.stack(future_stack=True) idx = stacked.index with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs((1, 3)) with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs(df.index[5] + timedelta(seconds=30), (5, 2)) - df = tm.makeCustomDataframe(5, 5) + df = DataFrame( + np.ones((5, 5)), + index=Index([f"i-{i}" for i in range(5)], name="a"), + columns=Index([f"i-{i}" for i in range(5)], name="a"), + ) stacked = df.stack(future_stack=True) idx = stacked.index with pytest.raises(TypeError, match="^Level type mismatch"): @@ -261,7 +275,7 @@ midx = MultiIndex.from_product( [ Categorical(["a", "b", "c"]), - Categorical(date_range("2012-01-01", periods=3, freq="H")), + Categorical(date_range("2012-01-01", periods=3, freq="h")), ] ) result = midx.get_indexer(midx) @@ -342,6 +356,19 @@ expected = np.array([4, 6, 7], dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) + @pytest.mark.parametrize("method", ["pad", "ffill", "backfill", "bfill", "nearest"]) + def test_get_indexer_methods_raise_for_non_monotonic(self, method): + # 53452 + mi = MultiIndex.from_arrays([[0, 4, 2], [0, 4, 2]]) + if method == "nearest": + err = NotImplementedError + msg = "not implemented yet for MultiIndex" + else: + err = ValueError + msg = "index must be monotonic increasing or decreasing" + with pytest.raises(err, match=msg): + mi.get_indexer([(1, 1)], method=method) + def test_get_indexer_three_or_more_levels(self): # https://github.com/pandas-dev/pandas/issues/29896 # tests get_indexer() on MultiIndexes with 3+ levels @@ -830,30 +857,31 @@ assert "element_not_exit" not in idx assert "0 day 09:30:00" in idx - @pytest.mark.slow - def test_large_mi_contains(self): + def test_large_mi_contains(self, monkeypatch): # GH#10645 - result = MultiIndex.from_arrays([range(10**6), range(10**6)]) - assert (10**6, 0) not in result + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", 10) + result = MultiIndex.from_arrays([range(10), range(10)]) + assert (10, 0) not in result def test_timestamp_multiindex_indexer(): # https://github.com/pandas-dev/pandas/issues/26944 idx = MultiIndex.from_product( [ - date_range("2019-01-01T00:15:33", periods=100, freq="H", name="date"), + date_range("2019-01-01T00:15:33", periods=100, freq="h", name="date"), ["x"], [3], ] ) - df = pd.DataFrame({"foo": np.arange(len(idx))}, idx) + df = DataFrame({"foo": np.arange(len(idx))}, idx) result = df.loc[pd.IndexSlice["2019-1-2":, "x", :], "foo"] qidx = MultiIndex.from_product( [ date_range( start="2019-01-02T00:15:33", end="2019-01-05T03:15:33", - freq="H", + freq="h", name="date", ), ["x"], diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_integrity.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_integrity.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_integrity.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_integrity.py 2024-04-10 17:42:52.000000000 +0000 @@ -125,18 +125,20 @@ @pytest.mark.slow -def test_hash_collisions(): +def test_hash_collisions(monkeypatch): # non-smoke test that we don't get hash collisions - - index = MultiIndex.from_product( - [np.arange(1000), np.arange(1000)], names=["one", "two"] - ) - result = index.get_indexer(index.values) - tm.assert_numpy_array_equal(result, np.arange(len(index), dtype="intp")) - - for i in [0, 1, len(index) - 2, len(index) - 1]: - result = index.get_loc(index[i]) - assert result == i + size_cutoff = 50 + with monkeypatch.context() as m: + m.setattr(libindex, 
"_SIZE_CUTOFF", size_cutoff) + index = MultiIndex.from_product( + [np.arange(8), np.arange(8)], names=["one", "two"] + ) + result = index.get_indexer(index.values) + tm.assert_numpy_array_equal(result, np.arange(len(index), dtype="intp")) + + for i in [0, 1, len(index) - 2, len(index) - 1]: + result = index.get_loc(index[i]) + assert result == i def test_dims(): @@ -170,22 +172,29 @@ pd.isna(idx) -def test_large_multiindex_error(): +def test_large_multiindex_error(monkeypatch): # GH12527 - df_below_1000000 = pd.DataFrame( - 1, index=MultiIndex.from_product([[1, 2], range(499999)]), columns=["dest"] - ) - with pytest.raises(KeyError, match=r"^\(-1, 0\)$"): - df_below_1000000.loc[(-1, 0), "dest"] - with pytest.raises(KeyError, match=r"^\(3, 0\)$"): - df_below_1000000.loc[(3, 0), "dest"] - df_above_1000000 = pd.DataFrame( - 1, index=MultiIndex.from_product([[1, 2], range(500001)]), columns=["dest"] - ) - with pytest.raises(KeyError, match=r"^\(-1, 0\)$"): - df_above_1000000.loc[(-1, 0), "dest"] - with pytest.raises(KeyError, match=r"^\(3, 0\)$"): - df_above_1000000.loc[(3, 0), "dest"] + size_cutoff = 50 + with monkeypatch.context() as m: + m.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + df_below_cutoff = pd.DataFrame( + 1, + index=MultiIndex.from_product([[1, 2], range(size_cutoff - 1)]), + columns=["dest"], + ) + with pytest.raises(KeyError, match=r"^\(-1, 0\)$"): + df_below_cutoff.loc[(-1, 0), "dest"] + with pytest.raises(KeyError, match=r"^\(3, 0\)$"): + df_below_cutoff.loc[(3, 0), "dest"] + df_above_cutoff = pd.DataFrame( + 1, + index=MultiIndex.from_product([[1, 2], range(size_cutoff + 1)]), + columns=["dest"], + ) + with pytest.raises(KeyError, match=r"^\(-1, 0\)$"): + df_above_cutoff.loc[(-1, 0), "dest"] + with pytest.raises(KeyError, match=r"^\(3, 0\)$"): + df_above_cutoff.loc[(3, 0), "dest"] def test_mi_hashtable_populated_attribute_error(monkeypatch): @@ -241,7 +250,6 @@ ) df.index.names = ["fizz", "buzz"] - str(df) expected = pd.DataFrame( {"df2": np.arange(100), "df1": np.arange(100)}, index=MultiIndex.from_product([range(10), range(10)], names=["fizz", "buzz"]), diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_join.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_join.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_join.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_join.py 2024-04-10 17:42:52.000000000 +0000 @@ -51,8 +51,11 @@ def test_join_self(idx, join_type): - joined = idx.join(idx, how=join_type) - tm.assert_index_equal(joined, idx) + result = idx.join(idx, how=join_type) + expected = idx + if join_type == "outer": + expected = expected.sort_values() + tm.assert_index_equal(result, expected) def test_join_multi(): @@ -89,12 +92,6 @@ tm.assert_numpy_array_equal(ridx, exp_ridx) -def test_join_self_unique(idx, join_type): - if idx.is_unique: - joined = idx.join(idx, how=join_type) - assert (idx == joined).all() - - def test_join_multi_wrong_order(): # GH 25760 # GH 28956 diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_names.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_names.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_names.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_names.py 2024-04-10 17:42:52.000000000 +0000 @@ -83,11 +83,11 @@ multi_idx.copy(names=[["mario"], ["luigi"]]) -def test_names(idx, index_names): +def test_names(idx): # names are assigned in setup - assert index_names == ["first", "second"] + assert idx.names == 
["first", "second"] level_names = [level.name for level in idx.levels] - assert level_names == index_names + assert level_names == idx.names # setting bad names on existing index = idx diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_partial_indexing.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_partial_indexing.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_partial_indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_partial_indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -28,10 +28,10 @@ # 2016-01-03 00:00:00 a 12 # b 13 # c 14 - dr = date_range("2016-01-01", "2016-01-03", freq="12H") + dr = date_range("2016-01-01", "2016-01-03", freq="12h") abc = ["a", "b", "c"] mi = MultiIndex.from_product([dr, abc]) - frame = DataFrame({"c1": range(0, 15)}, index=mi) + frame = DataFrame({"c1": range(15)}, index=mi) return frame diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_reindex.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_reindex.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_reindex.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_reindex.py 2024-04-10 17:42:52.000000000 +0000 @@ -75,11 +75,14 @@ assert idx.reindex([], level=1)[0].names == ["foo", "bar"] -def test_reindex_lvl_preserves_type_if_target_is_empty_list_or_array(): +def test_reindex_lvl_preserves_type_if_target_is_empty_list_or_array( + using_infer_string, +): # GH7774 idx = MultiIndex.from_product([[0, 1], ["a", "b"]]) assert idx.reindex([], level=0)[0].levels[0].dtype.type == np.int64 - assert idx.reindex([], level=1)[0].levels[1].dtype.type == np.object_ + exp = np.object_ if not using_infer_string else str + assert idx.reindex([], level=1)[0].levels[1].dtype.type == exp # case with EA levels cat = pd.Categorical(["foo", "bar"]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_setops.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_setops.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_setops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_setops.py 2024-04-10 17:42:52.000000000 +0000 @@ -204,7 +204,6 @@ def test_difference_sort_special_true(): - # TODO(GH#25151): decide on True behaviour idx = MultiIndex.from_product([[1, 0], ["a", "b"]]) result = idx.difference([], sort=True) expected = MultiIndex.from_product([[0, 1], ["a", "b"]]) @@ -244,10 +243,10 @@ the_union = piece1.union(piece2, sort=sort) - if sort is None: - tm.assert_index_equal(the_union, idx.sort_values()) - - assert tm.equalContents(the_union, idx) + if sort in (None, False): + tm.assert_index_equal(the_union.sort_values(), idx.sort_values()) + else: + tm.assert_index_equal(the_union, idx) # corner case, pass self or empty thing: the_union = idx.union(idx, sort=sort) @@ -259,24 +258,28 @@ tuples = idx.values result = idx[:4].union(tuples[4:], sort=sort) if sort is None: - tm.equalContents(result, idx) + tm.assert_index_equal(result.sort_values(), idx.sort_values()) else: assert result.equals(idx) -def test_union_with_regular_index(idx): +def test_union_with_regular_index(idx, using_infer_string): other = Index(["A", "B", "C"]) result = other.union(idx) assert ("foo", "one") in result assert "B" in result - msg = "The values in the array are unorderable" - with tm.assert_produces_warning(RuntimeWarning, match=msg): - result2 = idx.union(other) - # This is more consistent now, if sorting fails then we don't sort at all - # in the MultiIndex case. 
- assert not result.equals(result2) + if using_infer_string: + with pytest.raises(NotImplementedError, match="Can only union"): + idx.union(other) + else: + msg = "The values in the array are unorderable" + with tm.assert_produces_warning(RuntimeWarning, match=msg): + result2 = idx.union(other) + # This is more consistent now, if sorting fails then we don't sort at all + # in the MultiIndex case. + assert not result.equals(result2) def test_intersection(idx, sort): @@ -285,9 +288,10 @@ the_int = piece1.intersection(piece2, sort=sort) - if sort is None: + if sort in (None, True): tm.assert_index_equal(the_int, idx[3:5]) - assert tm.equalContents(the_int, idx[3:5]) + else: + tm.assert_index_equal(the_int.sort_values(), idx[3:5]) # corner case, pass self the_int = idx.intersection(idx, sort=sort) @@ -366,8 +370,6 @@ def test_union_sort_other_empty_sort(): - # TODO(GH#25151): decide on True behaviour - # # sort=True idx = MultiIndex.from_product([[1, 0], ["a", "b"]]) other = idx[:0] result = idx.union(other, sort=True) @@ -758,7 +760,12 @@ def test_union_with_na_when_constructing_dataframe(): # GH43222 - series1 = Series((1,), index=MultiIndex.from_tuples(((None, None),))) + series1 = Series( + (1,), + index=MultiIndex.from_arrays( + [Series([None], dtype="string"), Series([None], dtype="string")] + ), + ) series2 = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b")))) result = DataFrame([series1, series2]) expected = DataFrame({(np.nan, np.nan): [1.0, 10.0], ("a", "b"): [np.nan, 20.0]}) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_sorting.py pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_sorting.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/multi/test_sorting.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/multi/test_sorting.py 2024-04-10 17:42:52.000000000 +0000 @@ -338,3 +338,12 @@ ] expected = MultiIndex.from_arrays(arrays) tm.assert_index_equal(result, expected) + + +def test_sort_unnecessary_warning(): + # GH#55386 + midx = MultiIndex.from_tuples([(1.5, 2), (3.5, 3), (0, 1)]) + midx = midx.set_levels([2.5, np.nan, 1], level=0) + result = midx.sort_values() + expected = MultiIndex.from_tuples([(1, 3), (2.5, 1), (np.nan, 2)]) + tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/numeric/test_join.py pandas-2.2.2+dfsg/pandas/tests/indexes/numeric/test_join.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/numeric/test_join.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/numeric/test_join.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,13 +11,13 @@ joined, lidx, ridx = left.join(left, return_indexers=True) - exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4]) + exp_joined = Index([4, 4, 4, 4, 3, 3, 3, 3]) tm.assert_index_equal(joined, exp_joined) - exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.intp) + exp_lidx = np.array([0, 0, 1, 1, 2, 2, 3, 3], dtype=np.intp) tm.assert_numpy_array_equal(lidx, exp_lidx) - exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp) + exp_ridx = np.array([0, 1, 0, 1, 2, 3, 2, 3], dtype=np.intp) tm.assert_numpy_array_equal(ridx, exp_ridx) def test_join_inner(self): diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/numeric/test_numeric.py pandas-2.2.2+dfsg/pandas/tests/indexes/numeric/test_numeric.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/numeric/test_numeric.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/numeric/test_numeric.py 2024-04-10 17:42:52.000000000 +0000 @@ -318,7 +318,9 @@ 
def test_view_index(self, simple_index): index = simple_index - index.view(Index) + msg = "Passing a type in .*Index.view is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + index.view(Index) def test_prevent_casting(self, simple_index): index = simple_index @@ -352,11 +354,13 @@ arr = index.values.copy() new_index = index_cls(arr, copy=True) tm.assert_index_equal(new_index, index, exact=True) - val = arr[0] + 3000 + val = int(arr[0]) + 3000 # this should not change index - arr[0] = val - assert new_index[0] != val + if dtype != np.int8: + # NEP 50 won't allow assignment that would overflow + arr[0] = val + assert new_index[0] != val if dtype == np.int64: # pass list, coerce fine @@ -405,8 +409,12 @@ any_unsigned_int_numpy_dtype, ): # see gh-15832 - msg = "Trying to coerce negative values to unsigned integers" - + msg = "|".join( + [ + "Trying to coerce negative values to unsigned integers", + "The elements provided in the data cannot all be casted", + ] + ) with pytest.raises(OverflowError, match=msg): Index([-1], dtype=any_unsigned_int_numpy_dtype) @@ -527,3 +535,19 @@ # TODO: we could plausibly try to infer down to int16 here expected = Index([1000, 2000, 3000], dtype=np.int64) tm.assert_index_equal(result, expected) + + +def test_view_to_datetimelike(): + # GH#55710 + idx = Index([1, 2, 3]) + res = idx.view("m8[s]") + expected = pd.TimedeltaIndex(idx.values.view("m8[s]")) + tm.assert_index_equal(res, expected) + + res2 = idx.view("m8[D]") + expected2 = idx.values.view("m8[D]") + tm.assert_numpy_array_equal(res2, expected2) + + res3 = idx.view("M8[h]") + expected3 = idx.values.view("M8[h]") + tm.assert_numpy_array_equal(res3, expected3) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/numeric/test_setops.py pandas-2.2.2+dfsg/pandas/tests/indexes/numeric/test_setops.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/numeric/test_setops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/numeric/test_setops.py 2024-04-10 17:42:52.000000000 +0000 @@ -133,7 +133,10 @@ index2 = Index([2, 3, 4, 1]) result = index1.symmetric_difference(index2, sort=sort) expected = Index([5, 1]) - assert tm.equalContents(result, expected) + if sort is not None: + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result, expected.sort_values()) assert result.name is None if sort is None: expected = expected.sort_values() diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/object/test_astype.py pandas-2.2.2+dfsg/pandas/tests/indexes/object/test_astype.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/object/test_astype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/object/test_astype.py 2024-04-10 17:42:52.000000000 +0000 @@ -20,7 +20,7 @@ # while we're here, check that Series.astype behaves the same result = Series(idx).astype(str) - expected = Series(expected) + expected = Series(expected, dtype=object) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/object/test_indexing.py pandas-2.2.2+dfsg/pandas/tests/indexes/object/test_indexing.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/object/test_indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/object/test_indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,7 +3,11 @@ import numpy as np import pytest -from pandas._libs.missing import is_matching_na +from pandas._libs.missing import ( + NA, + is_matching_na, +) +from pandas.compat import pa_version_under16p0 import 
pandas.util._test_decorators as td import pandas as pd @@ -25,20 +29,36 @@ tm.assert_numpy_array_equal(actual, expected) - def test_get_indexer_strings_raises(self): + def test_get_indexer_strings_raises(self, using_infer_string): index = Index(["b", "c"]) - msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" - with pytest.raises(TypeError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="nearest") - - with pytest.raises(TypeError, match=msg): - index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) - - with pytest.raises(TypeError, match=msg): - index.get_indexer( - ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] - ) + if using_infer_string: + import pyarrow as pa + + msg = "has no kernel" + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") + + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) + + else: + msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="nearest") + + with pytest.raises(TypeError, match=msg): + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) + + with pytest.raises(TypeError, match=msg): + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) def test_get_indexer_with_NA_values( self, unique_nulls_fixture, unique_nulls_fixture2 @@ -51,15 +71,21 @@ arr = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=object) index = Index(arr, dtype=object) result = index.get_indexer( - [unique_nulls_fixture, unique_nulls_fixture2, "Unknown"] + Index( + [unique_nulls_fixture, unique_nulls_fixture2, "Unknown"], dtype=object + ) ) expected = np.array([0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) class TestGetIndexerNonUnique: - def test_get_indexer_non_unique_nas(self, nulls_fixture): + def test_get_indexer_non_unique_nas( + self, nulls_fixture, request, using_infer_string + ): # even though this isn't non-unique, this should still work + if using_infer_string and (nulls_fixture is None or nulls_fixture is NA): + request.applymarker(pytest.mark.xfail(reason="NAs are cast to NaN")) index = Index(["a", "b", nulls_fixture]) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) @@ -175,7 +201,16 @@ (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype): + def test_slice_locs_negative_step(self, in_slice, expected, dtype, request): + if ( + not pa_version_under16p0 + and dtype == "string[pyarrow_numpy]" + and in_slice == slice("a", "a", -1) + ): + request.applymarker( + pytest.mark.xfail(reason="https://github.com/apache/arrow/issues/40642") + ) + index = Index(list("bcdxy"), dtype=dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_asfreq.py pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_asfreq.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_asfreq.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_asfreq.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,3 +1,5 @@ +import re + import pytest 
from pandas import ( @@ -7,66 +9,68 @@ ) import pandas._testing as tm +from pandas.tseries import offsets + class TestPeriodIndex: def test_asfreq(self): - pi1 = period_range(freq="A", start="1/1/2001", end="1/1/2001") + pi1 = period_range(freq="Y", start="1/1/2001", end="1/1/2001") pi2 = period_range(freq="Q", start="1/1/2001", end="1/1/2001") pi3 = period_range(freq="M", start="1/1/2001", end="1/1/2001") pi4 = period_range(freq="D", start="1/1/2001", end="1/1/2001") - pi5 = period_range(freq="H", start="1/1/2001", end="1/1/2001 00:00") + pi5 = period_range(freq="h", start="1/1/2001", end="1/1/2001 00:00") pi6 = period_range(freq="Min", start="1/1/2001", end="1/1/2001 00:00") - pi7 = period_range(freq="S", start="1/1/2001", end="1/1/2001 00:00:00") + pi7 = period_range(freq="s", start="1/1/2001", end="1/1/2001 00:00:00") - assert pi1.asfreq("Q", "S") == pi2 + assert pi1.asfreq("Q", "s") == pi2 assert pi1.asfreq("Q", "s") == pi2 assert pi1.asfreq("M", "start") == pi3 assert pi1.asfreq("D", "StarT") == pi4 - assert pi1.asfreq("H", "beGIN") == pi5 - assert pi1.asfreq("Min", "S") == pi6 - assert pi1.asfreq("S", "S") == pi7 - - assert pi2.asfreq("A", "S") == pi1 - assert pi2.asfreq("M", "S") == pi3 - assert pi2.asfreq("D", "S") == pi4 - assert pi2.asfreq("H", "S") == pi5 - assert pi2.asfreq("Min", "S") == pi6 - assert pi2.asfreq("S", "S") == pi7 - - assert pi3.asfreq("A", "S") == pi1 - assert pi3.asfreq("Q", "S") == pi2 - assert pi3.asfreq("D", "S") == pi4 - assert pi3.asfreq("H", "S") == pi5 - assert pi3.asfreq("Min", "S") == pi6 - assert pi3.asfreq("S", "S") == pi7 - - assert pi4.asfreq("A", "S") == pi1 - assert pi4.asfreq("Q", "S") == pi2 - assert pi4.asfreq("M", "S") == pi3 - assert pi4.asfreq("H", "S") == pi5 - assert pi4.asfreq("Min", "S") == pi6 - assert pi4.asfreq("S", "S") == pi7 - - assert pi5.asfreq("A", "S") == pi1 - assert pi5.asfreq("Q", "S") == pi2 - assert pi5.asfreq("M", "S") == pi3 - assert pi5.asfreq("D", "S") == pi4 - assert pi5.asfreq("Min", "S") == pi6 - assert pi5.asfreq("S", "S") == pi7 - - assert pi6.asfreq("A", "S") == pi1 - assert pi6.asfreq("Q", "S") == pi2 - assert pi6.asfreq("M", "S") == pi3 - assert pi6.asfreq("D", "S") == pi4 - assert pi6.asfreq("H", "S") == pi5 - assert pi6.asfreq("S", "S") == pi7 - - assert pi7.asfreq("A", "S") == pi1 - assert pi7.asfreq("Q", "S") == pi2 - assert pi7.asfreq("M", "S") == pi3 - assert pi7.asfreq("D", "S") == pi4 - assert pi7.asfreq("H", "S") == pi5 - assert pi7.asfreq("Min", "S") == pi6 + assert pi1.asfreq("h", "beGIN") == pi5 + assert pi1.asfreq("Min", "s") == pi6 + assert pi1.asfreq("s", "s") == pi7 + + assert pi2.asfreq("Y", "s") == pi1 + assert pi2.asfreq("M", "s") == pi3 + assert pi2.asfreq("D", "s") == pi4 + assert pi2.asfreq("h", "s") == pi5 + assert pi2.asfreq("Min", "s") == pi6 + assert pi2.asfreq("s", "s") == pi7 + + assert pi3.asfreq("Y", "s") == pi1 + assert pi3.asfreq("Q", "s") == pi2 + assert pi3.asfreq("D", "s") == pi4 + assert pi3.asfreq("h", "s") == pi5 + assert pi3.asfreq("Min", "s") == pi6 + assert pi3.asfreq("s", "s") == pi7 + + assert pi4.asfreq("Y", "s") == pi1 + assert pi4.asfreq("Q", "s") == pi2 + assert pi4.asfreq("M", "s") == pi3 + assert pi4.asfreq("h", "s") == pi5 + assert pi4.asfreq("Min", "s") == pi6 + assert pi4.asfreq("s", "s") == pi7 + + assert pi5.asfreq("Y", "s") == pi1 + assert pi5.asfreq("Q", "s") == pi2 + assert pi5.asfreq("M", "s") == pi3 + assert pi5.asfreq("D", "s") == pi4 + assert pi5.asfreq("Min", "s") == pi6 + assert pi5.asfreq("s", "s") == pi7 + + assert pi6.asfreq("Y", "s") == pi1 + 
assert pi6.asfreq("Q", "s") == pi2 + assert pi6.asfreq("M", "s") == pi3 + assert pi6.asfreq("D", "s") == pi4 + assert pi6.asfreq("h", "s") == pi5 + assert pi6.asfreq("s", "s") == pi7 + + assert pi7.asfreq("Y", "s") == pi1 + assert pi7.asfreq("Q", "s") == pi2 + assert pi7.asfreq("M", "s") == pi3 + assert pi7.asfreq("D", "s") == pi4 + assert pi7.asfreq("h", "s") == pi5 + assert pi7.asfreq("Min", "s") == pi6 msg = "How must be one of S or E" with pytest.raises(ValueError, match=msg): @@ -100,23 +104,23 @@ assert result.freq == exp.freq def test_asfreq_combined_pi(self): - pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="H") - exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="25H") - for freq, how in zip(["1D1H", "1H1D"], ["S", "E"]): + pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="h") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="25h") + for freq, how in zip(["1D1h", "1h1D"], ["S", "E"]): result = pi.asfreq(freq, how=how) tm.assert_index_equal(result, exp) assert result.freq == exp.freq - for freq in ["1D1H", "1H1D"]: + for freq in ["1D1h", "1h1D"]: pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq) - result = pi.asfreq("H") - exp = PeriodIndex(["2001-01-02 00:00", "2001-01-03 02:00", "NaT"], freq="H") + result = pi.asfreq("h") + exp = PeriodIndex(["2001-01-02 00:00", "2001-01-03 02:00", "NaT"], freq="h") tm.assert_index_equal(result, exp) assert result.freq == exp.freq pi = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq) - result = pi.asfreq("H", how="S") - exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="H") + result = pi.asfreq("h", how="S") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="h") tm.assert_index_equal(result, exp) assert result.freq == exp.freq @@ -136,3 +140,50 @@ excepted = Series([1, 2], index=PeriodIndex(["2020-02", "2020-04"], freq="M")) tm.assert_series_equal(result, excepted) + + @pytest.mark.parametrize( + "freq", + [ + "2BMS", + "2YS-MAR", + "2bh", + ], + ) + def test_pi_asfreq_not_supported_frequency(self, freq): + # GH#55785 + msg = f"{freq[1:]} is not supported as period frequency" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + "2BME", + "2YE-MAR", + "2QE", + ], + ) + def test_pi_asfreq_invalid_frequency(self, freq): + # GH#55785 + msg = f"Invalid frequency: {freq}" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + offsets.MonthBegin(2), + offsets.BusinessMonthEnd(2), + ], + ) + def test_pi_asfreq_invalid_baseoffset(self, freq): + # GH#56945 + msg = re.escape(f"{freq} is not supported as period frequency") + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_astype.py pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_astype.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_astype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_astype.py 2024-04-10 17:42:52.000000000 +0000 @@ -41,10 +41,10 @@ tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], 
name="idx") + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) - idx = period_range("1990", "2009", freq="A", name="idx") + idx = period_range("1990", "2009", freq="Y", name="idx") result = idx.astype("i8") tm.assert_index_equal(result, Index(idx.asi8, name="idx")) tm.assert_numpy_array_equal(result.values, idx.asi8) @@ -139,10 +139,13 @@ expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) - def test_period_astype_to_timestamp(self): + def test_period_astype_to_timestamp(self, unit): + # GH#55958 pi = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="M") - exp = DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], tz="US/Eastern") - res = pi.astype("datetime64[ns, US/Eastern]") + exp = DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], tz="US/Eastern" + ).as_unit(unit) + res = pi.astype(f"datetime64[{unit}, US/Eastern]") tm.assert_index_equal(res, exp) assert res.freq == exp.freq diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_factorize.py pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_factorize.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_factorize.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_factorize.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,16 +1,14 @@ import numpy as np -from pandas import ( - PeriodIndex, - factorize, -) +from pandas import PeriodIndex import pandas._testing as tm class TestFactorize: - def test_factorize(self): + def test_factorize_period(self): idx1 = PeriodIndex( - ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M" + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], + freq="M", ) exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) @@ -24,9 +22,12 @@ tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + def test_factorize_period_nonmonotonic(self): idx2 = PeriodIndex( - ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M" + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], + freq="M", ) + exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) arr, idx = idx2.factorize(sort=True) @@ -38,17 +39,3 @@ arr, idx = idx2.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - - def test_factorize_complex(self): # TODO: WTF is this test doing here?s - # GH 17927 - array = [1, 2, 2 + 1j] - msg = "factorize with argument that is not not a Series" - with tm.assert_produces_warning(FutureWarning, match=msg): - labels, uniques = factorize(array) - - expected_labels = np.array([0, 1, 2], dtype=np.intp) - tm.assert_numpy_array_equal(labels, expected_labels) - - # Should return a complex dtype in the future - expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=object) - tm.assert_numpy_array_equal(uniques, expected_uniques) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_fillna.py pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_fillna.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_fillna.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_fillna.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,19 +10,19 @@ class TestFillNA: def test_fillna_period(self): # GH#11343 - idx = PeriodIndex(["2011-01-01 09:00", NaT, "2011-01-01 11:00"], freq="H") + idx = 
PeriodIndex(["2011-01-01 09:00", NaT, "2011-01-01 11:00"], freq="h") exp = PeriodIndex( - ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="H" + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="h" ) - result = idx.fillna(Period("2011-01-01 10:00", freq="H")) + result = idx.fillna(Period("2011-01-01 10:00", freq="h")) tm.assert_index_equal(result, exp) exp = Index( [ - Period("2011-01-01 09:00", freq="H"), + Period("2011-01-01 09:00", freq="h"), "x", - Period("2011-01-01 11:00", freq="H"), + Period("2011-01-01 11:00", freq="h"), ], dtype=object, ) @@ -31,9 +31,9 @@ exp = Index( [ - Period("2011-01-01 09:00", freq="H"), + Period("2011-01-01 09:00", freq="h"), Period("2011-01-01", freq="D"), - Period("2011-01-01 11:00", freq="H"), + Period("2011-01-01 11:00", freq="h"), ], dtype=object, ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_is_full.py pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_is_full.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_is_full.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_is_full.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,19 +4,19 @@ def test_is_full(): - index = PeriodIndex([2005, 2007, 2009], freq="A") + index = PeriodIndex([2005, 2007, 2009], freq="Y") assert not index.is_full - index = PeriodIndex([2005, 2006, 2007], freq="A") + index = PeriodIndex([2005, 2006, 2007], freq="Y") assert index.is_full - index = PeriodIndex([2005, 2005, 2007], freq="A") + index = PeriodIndex([2005, 2005, 2007], freq="Y") assert not index.is_full - index = PeriodIndex([2005, 2005, 2006], freq="A") + index = PeriodIndex([2005, 2005, 2006], freq="Y") assert index.is_full - index = PeriodIndex([2006, 2005, 2005], freq="A") + index = PeriodIndex([2006, 2005, 2005], freq="Y") with pytest.raises(ValueError, match="Index is not monotonic"): index.is_full diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_shift.py pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_shift.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_shift.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_shift.py 2024-04-10 17:42:52.000000000 +0000 @@ -29,16 +29,16 @@ tm.assert_index_equal(result, expected) def test_shift(self): - pi1 = period_range(freq="A", start="1/1/2001", end="12/1/2009") - pi2 = period_range(freq="A", start="1/1/2002", end="12/1/2010") + pi1 = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="Y", start="1/1/2002", end="12/1/2010") tm.assert_index_equal(pi1.shift(0), pi1) assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(1), pi2) - pi1 = period_range(freq="A", start="1/1/2001", end="12/1/2009") - pi2 = period_range(freq="A", start="1/1/2000", end="12/1/2008") + pi1 = period_range(freq="Y", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="Y", start="1/1/2000", end="12/1/2008") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(-1), pi2) @@ -64,12 +64,12 @@ def test_shift_corner_cases(self): # GH#9903 - idx = PeriodIndex([], name="xxx", freq="H") + idx = PeriodIndex([], name="xxx", freq="h") msg = "`freq` argument is not supported for PeriodIndex.shift" with pytest.raises(TypeError, match=msg): # period shift doesn't accept freq - idx.shift(1, freq="H") + idx.shift(1, freq="h") tm.assert_index_equal(idx.shift(0), idx) tm.assert_index_equal(idx.shift(3), idx) @@ -77,19 +77,19 @@ idx = PeriodIndex( 
["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], name="xxx", - freq="H", + freq="h", ) tm.assert_index_equal(idx.shift(0), idx) exp = PeriodIndex( ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], name="xxx", - freq="H", + freq="h", ) tm.assert_index_equal(idx.shift(3), exp) exp = PeriodIndex( ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], name="xxx", - freq="H", + freq="h", ) tm.assert_index_equal(idx.shift(-3), exp) @@ -117,6 +117,6 @@ def test_shift_periods(self): # GH #22458 : argument 'n' was deprecated in favor of 'periods' - idx = period_range(freq="A", start="1/1/2001", end="12/1/2009") + idx = period_range(freq="Y", start="1/1/2001", end="12/1/2009") tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_to_timestamp.py pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_to_timestamp.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/period/methods/test_to_timestamp.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/period/methods/test_to_timestamp.py 2024-04-10 17:42:52.000000000 +0000 @@ -47,9 +47,9 @@ tm.assert_datetime_array_equal(result, expected, check_freq=False) def test_to_timestamp_freq(self): - idx = period_range("2017", periods=12, freq="A-DEC") + idx = period_range("2017", periods=12, freq="Y-DEC") result = idx.to_timestamp() - expected = date_range("2017", periods=12, freq="AS-JAN") + expected = date_range("2017", periods=12, freq="YS-JAN") tm.assert_index_equal(result, expected) def test_to_timestamp_pi_nat(self): @@ -58,7 +58,9 @@ result = index.to_timestamp("D") expected = DatetimeIndex( - [NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], name="idx" + [NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], + dtype="M8[ns]", + name="idx", ) tm.assert_index_equal(result, expected) assert result.name == "idx" @@ -72,12 +74,12 @@ tm.assert_index_equal(result3, exp) assert result3.freqstr == "3M" - msg = "Frequency must be positive, because it represents span: -2A" + msg = "Frequency must be positive, because it represents span: -2Y" with pytest.raises(ValueError, match=msg): - result.to_period(freq="-2A") + result.to_period(freq="-2Y") def test_to_timestamp_preserve_name(self): - index = period_range(freq="A", start="1/1/2001", end="12/1/2009", name="foo") + index = period_range(freq="Y", start="1/1/2001", end="12/1/2009", name="foo") assert index.name == "foo" conv = index.to_timestamp("D") @@ -87,7 +89,7 @@ years = np.arange(1960, 2000).repeat(4) quarters = np.tile(list(range(1, 5)), 40) - pindex = PeriodIndex(year=years, quarter=quarters) + pindex = PeriodIndex.from_fields(year=years, quarter=quarters) stamps = pindex.to_timestamp("D", "end") expected = DatetimeIndex([x.to_timestamp("D", "end") for x in pindex]) @@ -98,30 +100,38 @@ idx = PeriodIndex(["2011-01", "NaT", "2011-02"], freq="2M", name="idx") result = idx.to_timestamp() - expected = DatetimeIndex(["2011-01-01", "NaT", "2011-02-01"], name="idx") + expected = DatetimeIndex( + ["2011-01-01", "NaT", "2011-02-01"], dtype="M8[ns]", name="idx" + ) tm.assert_index_equal(result, expected) result = idx.to_timestamp(how="E") - expected = DatetimeIndex(["2011-02-28", "NaT", "2011-03-31"], name="idx") + expected = DatetimeIndex( + ["2011-02-28", "NaT", "2011-03-31"], dtype="M8[ns]", name="idx" + ) expected = expected + Timedelta(1, "D") - Timedelta(1, "ns") tm.assert_index_equal(result, expected) def test_to_timestamp_pi_combined(self): - idx = 
period_range(start="2011", periods=2, freq="1D1H", name="idx") + idx = period_range(start="2011", periods=2, freq="1D1h", name="idx") result = idx.to_timestamp() - expected = DatetimeIndex(["2011-01-01 00:00", "2011-01-02 01:00"], name="idx") + expected = DatetimeIndex( + ["2011-01-01 00:00", "2011-01-02 01:00"], dtype="M8[ns]", name="idx" + ) tm.assert_index_equal(result, expected) result = idx.to_timestamp(how="E") expected = DatetimeIndex( - ["2011-01-02 00:59:59", "2011-01-03 01:59:59"], name="idx" + ["2011-01-02 00:59:59", "2011-01-03 01:59:59"], name="idx", dtype="M8[ns]" ) expected = expected + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result, expected) - result = idx.to_timestamp(how="E", freq="H") - expected = DatetimeIndex(["2011-01-02 00:00", "2011-01-03 01:00"], name="idx") + result = idx.to_timestamp(how="E", freq="h") + expected = DatetimeIndex( + ["2011-01-02 00:00", "2011-01-03 01:00"], dtype="M8[ns]", name="idx" + ) expected = expected + Timedelta(1, "h") - Timedelta(1, "ns") tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/period/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/indexes/period/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/period/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/period/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -19,7 +19,87 @@ from pandas.core.arrays import PeriodArray +class TestPeriodIndexDisallowedFreqs: + @pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2M", "2ME"), + ("2Q-MAR", "2QE-MAR"), + ("2Y-FEB", "2YE-FEB"), + ("2M", "2me"), + ("2Q-MAR", "2qe-MAR"), + ("2Y-FEB", "2yE-feb"), + ], + ) + def test_period_index_offsets_frequency_error_message(self, freq, freq_depr): + # GH#52064 + msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2020-01-01", "2020-01-02"], freq=freq_depr) + + with pytest.raises(ValueError, match=msg): + period_range(start="2020-01-01", end="2020-01-02", freq=freq_depr) + + @pytest.mark.parametrize("freq_depr", ["2SME", "2sme", "2CBME", "2BYE", "2Bye"]) + def test_period_index_frequency_invalid_freq(self, freq_depr): + # GH#9586 + msg = f"Invalid frequency: {freq_depr[1:]}" + + with pytest.raises(ValueError, match=msg): + period_range("2020-01", "2020-05", freq=freq_depr) + with pytest.raises(ValueError, match=msg): + PeriodIndex(["2020-01", "2020-05"], freq=freq_depr) + + @pytest.mark.parametrize("freq", ["2BQE-SEP", "2BYE-MAR", "2BME"]) + def test_period_index_from_datetime_index_invalid_freq(self, freq): + # GH#56899 + msg = f"Invalid frequency: {freq[1:]}" + + rng = date_range("01-Jan-2012", periods=8, freq=freq) + with pytest.raises(ValueError, match=msg): + rng.to_period() + + class TestPeriodIndex: + def test_from_ordinals(self): + Period(ordinal=-1000, freq="Y") + Period(ordinal=0, freq="Y") + + msg = "The 'ordinal' keyword in PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="Y") + with tm.assert_produces_warning(FutureWarning, match=msg): + idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="Y") + tm.assert_index_equal(idx1, idx2) + + alt1 = PeriodIndex.from_ordinals([-1, 0, 1], freq="Y") + tm.assert_index_equal(alt1, idx1) + + alt2 = PeriodIndex.from_ordinals(np.array([-1, 0, 1]), freq="Y") + tm.assert_index_equal(alt2, idx2) + + def test_keyword_mismatch(self): + # GH#55961 we should 
get exactly one of data/ordinals/**fields + per = Period("2016-01-01", "D") + depr_msg1 = "The 'ordinal' keyword in PeriodIndex is deprecated" + depr_msg2 = "Constructing PeriodIndex from fields is deprecated" + + err_msg1 = "Cannot pass both data and ordinal" + with pytest.raises(ValueError, match=err_msg1): + with tm.assert_produces_warning(FutureWarning, match=depr_msg1): + PeriodIndex(data=[per], ordinal=[per.ordinal], freq=per.freq) + + err_msg2 = "Cannot pass both data and fields" + with pytest.raises(ValueError, match=err_msg2): + with tm.assert_produces_warning(FutureWarning, match=depr_msg2): + PeriodIndex(data=[per], year=[per.year], freq=per.freq) + + err_msg3 = "Cannot pass both ordinal and fields" + with pytest.raises(ValueError, match=err_msg3): + with tm.assert_produces_warning(FutureWarning, match=depr_msg2): + PeriodIndex(ordinal=[per.ordinal], year=[per.year], freq=per.freq) + def test_construction_base_constructor(self): # GH 13664 arr = [Period("2011-01", freq="M"), NaT, Period("2011-03", freq="M")] @@ -78,14 +158,18 @@ years = np.arange(1990, 2010).repeat(4)[2:-2] quarters = np.tile(np.arange(1, 5), 20)[2:-2] - index = PeriodIndex(year=years, quarter=quarters, freq="Q-DEC") + depr_msg = "Constructing PeriodIndex from fields is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + index = PeriodIndex(year=years, quarter=quarters, freq="Q-DEC") expected = period_range("1990Q3", "2009Q2", freq="Q-DEC") tm.assert_index_equal(index, expected) - index2 = PeriodIndex(year=years, quarter=quarters, freq="2Q-DEC") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + index2 = PeriodIndex(year=years, quarter=quarters, freq="2Q-DEC") tm.assert_numpy_array_equal(index.asi8, index2.asi8) - index = PeriodIndex(year=years, quarter=quarters) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + index = PeriodIndex(year=years, quarter=quarters) tm.assert_index_equal(index, expected) years = [2007, 2007, 2007] @@ -93,33 +177,33 @@ msg = "Mismatched Period array lengths" with pytest.raises(ValueError, match=msg): - PeriodIndex(year=years, month=months, freq="M") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + PeriodIndex(year=years, month=months, freq="M") with pytest.raises(ValueError, match=msg): - PeriodIndex(year=years, month=months, freq="2M") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + PeriodIndex(year=years, month=months, freq="2M") years = [2007, 2007, 2007] months = [1, 2, 3] - idx = PeriodIndex(year=years, month=months, freq="M") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + idx = PeriodIndex(year=years, month=months, freq="M") exp = period_range("2007-01", periods=3, freq="M") tm.assert_index_equal(idx, exp) - def test_constructor_U(self): - # U was used as undefined period - with pytest.raises(ValueError, match="Invalid frequency: X"): - period_range("2007-1-1", periods=500, freq="X") - def test_constructor_nano(self): idx = period_range( - start=Period(ordinal=1, freq="N"), end=Period(ordinal=4, freq="N"), freq="N" + start=Period(ordinal=1, freq="ns"), + end=Period(ordinal=4, freq="ns"), + freq="ns", ) exp = PeriodIndex( [ - Period(ordinal=1, freq="N"), - Period(ordinal=2, freq="N"), - Period(ordinal=3, freq="N"), - Period(ordinal=4, freq="N"), + Period(ordinal=1, freq="ns"), + Period(ordinal=2, freq="ns"), + Period(ordinal=3, freq="ns"), + Period(ordinal=4, freq="ns"), ], - freq="N", + freq="ns", ) tm.assert_index_equal(idx, exp) @@ -127,24 +211,35 @@ years 
= np.arange(1960, 2000, dtype=np.int64).repeat(4) quarters = np.tile(np.array([1, 2, 3, 4], dtype=np.int64), 40) - pindex = PeriodIndex(year=years, quarter=quarters) + msg = "Constructing PeriodIndex from fields is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + pindex = PeriodIndex(year=years, quarter=quarters) tm.assert_index_equal(pindex.year, Index(years)) tm.assert_index_equal(pindex.quarter, Index(quarters)) + alt = PeriodIndex.from_fields(year=years, quarter=quarters) + tm.assert_index_equal(alt, pindex) + def test_constructor_invalid_quarters(self): + depr_msg = "Constructing PeriodIndex from fields is deprecated" msg = "Quarter must be 1 <= q <= 4" with pytest.raises(ValueError, match=msg): - PeriodIndex(year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + PeriodIndex( + year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC" + ) - def test_constructor_corner(self): - result = period_range("2007-01", periods=10.5, freq="M") + def test_period_range_fractional_period(self): + msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = period_range("2007-01", periods=10.5, freq="M") exp = period_range("2007-01", periods=10, freq="M") tm.assert_index_equal(result, exp) def test_constructor_with_without_freq(self): # GH53687 - start = Period("2002-01-01 00:00", freq="30T") + start = Period("2002-01-01 00:00", freq="30min") exp = period_range(start=start, periods=5, freq=start.freq) result = period_range(start=start, periods=5) tm.assert_index_equal(exp, result) @@ -164,7 +259,7 @@ msg = "'Period' object is not iterable" with pytest.raises(TypeError, match=msg): - PeriodIndex(data=Period("2007", freq="A")) + PeriodIndex(data=Period("2007", freq="Y")) result = PeriodIndex(iter(idx)) tm.assert_index_equal(result, idx) @@ -177,15 +272,15 @@ result = PeriodIndex(idx, freq=offsets.MonthEnd()) tm.assert_index_equal(result, idx) - assert result.freq == "M" + assert result.freq == "ME" result = PeriodIndex(idx, freq="2M") tm.assert_index_equal(result, idx.asfreq("2M")) - assert result.freq == "2M" + assert result.freq == "2ME" result = PeriodIndex(idx, freq=offsets.MonthEnd(2)) tm.assert_index_equal(result, idx.asfreq("2M")) - assert result.freq == "2M" + assert result.freq == "2ME" result = PeriodIndex(idx, freq="D") exp = idx.asfreq("D", "e") @@ -203,7 +298,7 @@ @pytest.mark.parametrize("box", [None, "series", "index"]) def test_constructor_datetime64arr_ok(self, box): # https://github.com/pandas-dev/pandas/issues/23438 - data = date_range("2017", periods=4, freq="M") + data = date_range("2017", periods=4, freq="ME") if box is None: data = data._values elif box == "series": @@ -248,7 +343,7 @@ idx = PeriodIndex([], freq="M") assert isinstance(idx, PeriodIndex) assert len(idx) == 0 - assert idx.freq == "M" + assert idx.freq == "ME" with pytest.raises(ValueError, match="freq not specified"): PeriodIndex([]) @@ -330,53 +425,18 @@ exp = PeriodIndex(["2011-01-01", "NaT", "2012-01-01"], freq="D") tm.assert_index_equal(idx, exp) - def test_constructor_simple_new(self): - idx = period_range("2007-01", name="p", periods=2, freq="M") - - with pytest.raises(AssertionError, match=""): - idx._simple_new(idx, name="p") - - result = idx._simple_new(idx._data, name="p") - tm.assert_index_equal(result, idx) - - msg = "Should be numpy array of type i8" - with pytest.raises(AssertionError, match=msg): - # Need ndarray, not 
int64 Index - type(idx._data)._simple_new(Index(idx.asi8), dtype=idx.dtype) - - arr = type(idx._data)._simple_new(idx.asi8, dtype=idx.dtype) - result = idx._simple_new(arr, name="p") - tm.assert_index_equal(result, idx) - - def test_constructor_simple_new_empty(self): - # GH13079 - idx = PeriodIndex([], freq="M", name="p") - with pytest.raises(AssertionError, match=""): - idx._simple_new(idx, name="p") - - result = idx._simple_new(idx._data, name="p") - tm.assert_index_equal(result, idx) - @pytest.mark.parametrize("floats", [[1.1, 2.1], np.array([1.1, 2.1])]) def test_constructor_floats(self, floats): - with pytest.raises(AssertionError, match=" 69 + assert result.dtype == bool diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/test_datetimelike.py pandas-2.2.2+dfsg/pandas/tests/indexes/test_datetimelike.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/test_datetimelike.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/test_datetimelike.py 2024-04-10 17:42:52.000000000 +0000 @@ -89,7 +89,9 @@ result = type(simple_index)(idx) tm.assert_index_equal(result, idx) - idx_view = idx.view(type(simple_index)) + msg = "Passing a type in .*Index.view is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx_view = idx.view(type(simple_index)) result = type(simple_index)(idx) tm.assert_index_equal(result, idx_view) @@ -165,5 +167,5 @@ # GH 55080 dti = pd.to_datetime([10, 20, 30], unit=unit).as_unit(unit) result = dti.diff(1) - expected = pd.TimedeltaIndex([pd.NaT, 10, 10], unit=unit).as_unit(unit) + expected = pd.to_timedelta([pd.NaT, 10, 10], unit=unit).as_unit(unit) tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/test_index_new.py pandas-2.2.2+dfsg/pandas/tests/indexes/test_index_new.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/test_index_new.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/test_index_new.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,12 +4,15 @@ from datetime import ( datetime, timedelta, + timezone, ) from decimal import Decimal import numpy as np import pytest +from pandas._libs.tslibs.timezones import maybe_get_tz + from pandas import ( NA, Categorical, @@ -146,7 +149,7 @@ if nulls_fixture is NA: expected = Index([NA, NaT]) mark = pytest.mark.xfail(reason="Broken with np.NaT ctor; see GH 31884") - request.node.add_marker(mark) + request.applymarker(mark) # GH#35942 numpy will emit a DeprecationWarning within the # assert_index_equal calls. Since we can't do anything # about it until GH#31884 is fixed, we suppress that warning. 
@@ -183,6 +186,15 @@ tm.assert_index_equal(Index(data), expected) tm.assert_index_equal(Index(np.array(data, dtype=object)), expected) + def test_constructor_datetimes_mixed_tzs(self): + # https://github.com/pandas-dev/pandas/pull/55793/files#r1383719998 + tz = maybe_get_tz("US/Central") + dt1 = datetime(2020, 1, 1, tzinfo=tz) + dt2 = datetime(2020, 1, 1, tzinfo=timezone.utc) + result = Index([dt1, dt2]) + expected = Index([dt1, dt2], dtype=object) + tm.assert_index_equal(result, expected) + class TestDtypeEnforced: # check we don't silently ignore the dtype keyword @@ -350,6 +362,23 @@ index = Index(vals) assert isinstance(index, TimedeltaIndex) + def test_pass_timedeltaindex_to_index(self): + rng = timedelta_range("1 days", "10 days") + idx = Index(rng, dtype=object) + + expected = Index(rng.to_pytimedelta(), dtype=object) + + tm.assert_numpy_array_equal(idx.values, expected.values) + + def test_pass_datetimeindex_to_index(self): + # GH#1396 + rng = date_range("1/1/2000", "3/1/2000") + idx = Index(rng, dtype=object) + + expected = Index(rng.to_pydatetime(), dtype=object) + + tm.assert_numpy_array_equal(idx.values, expected.values) + class TestIndexConstructorUnwrapping: # Test passing different arraylike values to pd.Index @@ -384,7 +413,7 @@ def __init__(self, array) -> None: self.array = array - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=None) -> np.ndarray: return self.array expected = Index(array) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/test_old_base.py pandas-2.2.2+dfsg/pandas/tests/indexes/test_old_base.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/test_old_base.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/test_old_base.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,11 +1,13 @@ from __future__ import annotations from datetime import datetime -import gc +import weakref import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas._libs.tslibs import Timestamp from pandas.core.dtypes.common import ( @@ -30,6 +32,7 @@ period_range, ) import pandas._testing as tm +import pandas.core.algorithms as algos from pandas.core.arrays import BaseMaskedArray @@ -209,7 +212,7 @@ 1 // idx def test_logical_compat(self, simple_index): - if simple_index.dtype == object: + if simple_index.dtype in (object, "string"): pytest.skip("Tested elsewhere.") idx = simple_index if idx.dtype.kind in "iufcbm": @@ -272,7 +275,9 @@ if isinstance(index, PeriodIndex): # .values an object array of Period, thus copied - result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) + depr_msg = "The 'ordinal' keyword in PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index.asi8, result.asi8, check_same="same") elif isinstance(index, IntervalIndex): # checked in test_interval.py @@ -295,7 +300,7 @@ tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" ) - elif index.dtype == "string[pyarrow]": + elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"): assert tm.shares_memory(result._values, index._values) else: raise NotImplementedError(index.dtype) @@ -405,14 +410,25 @@ tm.assert_index_equal(result, expected) def test_insert_base(self, index): - result = index[1:4] + trimmed = index[1:4] if not len(index): pytest.skip("Not applicable for empty index") # test 0th element - assert 
index[0:4].equals(result.insert(0, index[0])) - + warn = None + if index.dtype == object and index.inferred_type == "boolean": + # GH#51363 + warn = FutureWarning + msg = "The behavior of Index.insert with object-dtype is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = trimmed.insert(0, index[0]) + assert index[0:4].equals(result) + + @pytest.mark.skipif( + using_pyarrow_string_dtype(), + reason="completely different behavior, tested elsewher", + ) def test_insert_out_of_bounds(self, index): # TypeError/IndexError matches what np.insert raises in these cases @@ -559,15 +575,20 @@ pytest.skip("Tested elsewhere.") idx = simple_index expected = [str(x) for x in idx] - assert idx.format() == expected + msg = r"Index\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert idx.format() == expected def test_format_empty(self, simple_index): # GH35712 if isinstance(simple_index, (PeriodIndex, RangeIndex)): pytest.skip("Tested elsewhere") empty_idx = type(simple_index)([]) - assert empty_idx.format() == [] - assert empty_idx.format(name=True) == [""] + msg = r"Index\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert empty_idx.format() == [] + with tm.assert_produces_warning(FutureWarning, match=msg): + assert empty_idx.format(name=True) == [""] def test_fillna(self, index): # GH 11343 @@ -639,7 +660,10 @@ idx = simple_index if idx.is_unique: joined = idx.join(idx, how=join_type) - assert (idx == joined).all() + expected = simple_index + if join_type == "outer": + expected = algos.safe_sort(expected) + tm.assert_index_equal(joined, expected) def test_map(self, simple_index): # callable @@ -685,7 +709,7 @@ pytest.skip("See test_map.py") idx = simple_index result = idx.map(str) - expected = Index([str(x) for x in idx], dtype=object) + expected = Index([str(x) for x in idx]) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("copy", [True, False]) @@ -739,25 +763,26 @@ @pytest.mark.arm_slow def test_engine_reference_cycle(self, simple_index): # GH27585 - index = simple_index - nrefs_pre = len(gc.get_referrers(index)) + index = simple_index.copy() + ref = weakref.ref(index) index._engine - assert len(gc.get_referrers(index)) == nrefs_pre + del index + assert ref() is None def test_getitem_2d_deprecated(self, simple_index): # GH#30588, GH#31479 if isinstance(simple_index, IntervalIndex): pytest.skip("Tested elsewhere") idx = simple_index - msg = "Multi-dimensional indexing" - with pytest.raises(ValueError, match=msg): + msg = "Multi-dimensional indexing|too many|only" + with pytest.raises((ValueError, IndexError), match=msg): idx[:, None] if not isinstance(idx, RangeIndex): # GH#44051 RangeIndex already raised pre-2.0 with a different message - with pytest.raises(ValueError, match=msg): + with pytest.raises((ValueError, IndexError), match=msg): idx[True] - with pytest.raises(ValueError, match=msg): + with pytest.raises((ValueError, IndexError), match=msg): idx[False] else: msg = "only integers, slices" @@ -821,7 +846,7 @@ alt = index.take(list(range(N)) * 2) tm.assert_index_equal(result, alt, check_exact=True) - def test_inv(self, simple_index): + def test_inv(self, simple_index, using_infer_string): idx = simple_index if idx.dtype.kind in ["i", "u"]: @@ -834,14 +859,21 @@ tm.assert_series_equal(res2, Series(expected)) else: if idx.dtype.kind == "f": + err = TypeError msg = "ufunc 'invert' not supported for the input types" + elif using_infer_string and idx.dtype == "string": + import 
pyarrow as pa + + err = pa.lib.ArrowNotImplementedError + msg = "has no kernel" else: + err = TypeError msg = "bad operand" - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): ~idx # check that we get the same behavior with Series - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): ~Series(idx) def test_is_boolean_is_deprecated(self, simple_index): @@ -945,7 +977,9 @@ idx_view = idx.view(dtype) tm.assert_index_equal(idx, index_cls(idx_view, name="Foo"), exact=True) - idx_view = idx.view(index_cls) + msg = "Passing a type in .*Index.view is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + idx_view = idx.view(index_cls) tm.assert_index_equal(idx, index_cls(idx_view, name="Foo"), exact=True) def test_format(self, simple_index): @@ -955,7 +989,9 @@ idx = simple_index max_width = max(len(str(x)) for x in idx) expected = [str(x).ljust(max_width) for x in idx] - assert idx.format() == expected + msg = r"Index\.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + assert idx.format() == expected def test_insert_non_na(self, simple_index): # GH#43921 inserting an element that we know we can hold should diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/test_setops.py pandas-2.2.2+dfsg/pandas/tests/indexes/test_setops.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/test_setops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/test_setops.py 2024-04-10 17:42:52.000000000 +0000 @@ -30,6 +30,32 @@ ) +def equal_contents(arr1, arr2) -> bool: + """ + Checks if the set of unique elements of arr1 and arr2 are equivalent. + """ + return frozenset(arr1) == frozenset(arr2) + + +@pytest.fixture( + params=tm.ALL_REAL_NUMPY_DTYPES + + [ + "object", + "category", + "datetime64[ns]", + "timedelta64[ns]", + ] +) +def any_dtype_for_small_pos_integer_indexes(request): + """ + Dtypes that can be given to an Index with small positive integers. + + This means that for any dtype `x` in the params list, `Index([1, 2, 3], dtype=x)` is + valid and gives the correct Index (sub-)class. 
+    """
+ """ + return request.param + + def test_union_same_types(index): # Union with a non-unique, non-monotonic index raises error # Only needed for bool index factory @@ -64,7 +90,7 @@ mark = pytest.mark.xfail( reason="GH#44000 True==1", raises=ValueError, strict=False ) - request.node.add_marker(mark) + request.applymarker(mark) common_dtype = find_common_type([idx1.dtype, idx2.dtype]) @@ -89,7 +115,7 @@ raises=AssertionError, strict=False, ) - request.node.add_marker(mark) + request.applymarker(mark) any_uint64 = np.uint64 in (idx1.dtype, idx2.dtype) idx1_signed = is_signed_integer_dtype(idx1.dtype) @@ -113,19 +139,16 @@ @pytest.mark.parametrize( - "idx_fact1,idx_fact2", + "idx1,idx2", [ - (tm.makeIntIndex, tm.makeRangeIndex), - (tm.makeFloatIndex, tm.makeIntIndex), - (tm.makeFloatIndex, tm.makeRangeIndex), - (tm.makeFloatIndex, tm.makeUIntIndex), + (Index(np.arange(5), dtype=np.int64), RangeIndex(5)), + (Index(np.arange(5), dtype=np.float64), Index(np.arange(5), dtype=np.int64)), + (Index(np.arange(5), dtype=np.float64), RangeIndex(5)), + (Index(np.arange(5), dtype=np.float64), Index(np.arange(5), dtype=np.uint64)), ], ) -def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): +def test_compatible_inconsistent_pairs(idx1, idx2): # GH 23525 - idx1 = idx_fact1(10) - idx2 = idx_fact2(20) - res1 = idx1.union(idx2) res2 = idx2.union(idx1) @@ -196,10 +219,10 @@ if isinstance(index, CategoricalIndex): pytest.skip(f"Not relevant for {type(index).__name__}") - first = index[:5] - second = index[:3] + first = index[:5].unique() + second = index[:3].unique() intersect = first.intersection(second) - assert tm.equalContents(intersect, second) + tm.assert_index_equal(intersect, second) if isinstance(index.dtype, DatetimeTZDtype): # The second.values below will drop tz, so the rest of this test @@ -210,7 +233,7 @@ cases = [second.to_numpy(), second.to_series(), second.to_list()] for case in cases: result = first.intersection(case) - assert tm.equalContents(result, second) + assert equal_contents(result, second) if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" @@ -222,12 +245,13 @@ ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_union_base(self, index): + index = index.unique() first = index[3:] second = index[:5] everything = index union = first.union(second) - assert tm.equalContents(union, everything) + tm.assert_index_equal(union.sort_values(), everything.sort_values()) if isinstance(index.dtype, DatetimeTZDtype): # The second.values below will drop tz, so the rest of this test @@ -238,7 +262,7 @@ cases = [second.to_numpy(), second.to_series(), second.to_list()] for case in cases: result = first.union(case) - assert tm.equalContents(result, everything) + assert equal_contents(result, everything) if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" @@ -261,13 +285,13 @@ else: answer = index[4:] result = first.difference(second, sort) - assert tm.equalContents(result, answer) + assert equal_contents(result, answer) # GH#10149 cases = [second.to_numpy(), second.to_series(), second.to_list()] for case in cases: result = first.difference(case, sort) - assert tm.equalContents(result, answer) + assert equal_contents(result, answer) if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" @@ -292,13 +316,13 @@ second = index[:-1] answer = index[[0, -1]] result = first.symmetric_difference(second) - assert tm.equalContents(result, answer) + 
tm.assert_index_equal(result.sort_values(), answer.sort_values()) # GH#10149 cases = [second.to_numpy(), second.to_series(), second.to_list()] for case in cases: result = first.symmetric_difference(case) - assert tm.equalContents(result, answer) + assert equal_contents(result, answer) if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" @@ -320,8 +344,9 @@ # Test unions with various name combinations # Do not test MultiIndex or repeats if not index_flat.is_unique: - pytest.skip("Randomly generated index_flat was not unique.") - index = index_flat + index = index_flat.unique() + else: + index = index_flat # Test copy.union(copy) first = index.copy().set_names(fname) @@ -363,8 +388,9 @@ ) def test_union_unequal(self, index_flat, fname, sname, expected_name): if not index_flat.is_unique: - pytest.skip("Randomly generated index_flat was not unique.") - index = index_flat + index = index_flat.unique() + else: + index = index_flat # test copy.union(subset) - need sort for unicode and string first = index.copy().set_names(fname) @@ -387,8 +413,9 @@ # GH#35847 # Test intersections with various name combinations if not index_flat.is_unique: - pytest.skip("Randomly generated index_flat was not unique.") - index = index_flat + index = index_flat.unique() + else: + index = index_flat # Test copy.intersection(copy) first = index.copy().set_names(fname) @@ -430,8 +457,9 @@ ) def test_intersect_unequal(self, index_flat, fname, sname, expected_name): if not index_flat.is_unique: - pytest.skip("Randomly generated index_flat was not unique.") - index = index_flat + index = index_flat.unique() + else: + index = index_flat # test copy.intersection(subset) - need sort for unicode and string first = index.copy().set_names(fname) @@ -678,9 +706,10 @@ first = index[:20] second = index[:10] intersect = first.intersection(second, sort=sort) - if sort is None: - tm.assert_index_equal(intersect, second.sort_values()) - assert tm.equalContents(intersect, second) + if sort in (None, False): + tm.assert_index_equal(intersect.sort_values(), second.sort_values()) + else: + tm.assert_index_equal(intersect, second) # Corner cases inter = first.intersection(first, sort=sort) @@ -743,9 +772,10 @@ everything = index[:20] union = first.union(second, sort=sort) - if sort is None: - tm.assert_index_equal(union, everything.sort_values()) - assert tm.equalContents(union, everything) + if sort in (None, False): + tm.assert_index_equal(union.sort_values(), everything.sort_values()) + else: + tm.assert_index_equal(union, everything) @pytest.mark.parametrize("klass", [np.array, Series, list]) @pytest.mark.parametrize("index", ["string"], indirect=True) @@ -757,9 +787,10 @@ case = klass(second.values) result = first.union(case, sort=sort) - if sort is None: - tm.assert_index_equal(result, everything.sort_values()) - assert tm.equalContents(result, everything) + if sort in (None, False): + tm.assert_index_equal(result.sort_values(), everything.sort_values()) + else: + tm.assert_index_equal(result, everything) @pytest.mark.parametrize("index", ["string"], indirect=True) def test_union_identity(self, index, sort): @@ -771,10 +802,10 @@ # This should no longer be the same object, since [] is not consistent, # both objects will be recast to dtype('O') - union = first.union([], sort=sort) + union = first.union(Index([], dtype=first.dtype), sort=sort) assert (union is first) is (not sort) - union = Index([]).union(first, sort=sort) + union = Index([], dtype=first.dtype).union(first, sort=sort) assert (union 
is first) is (not sort) @pytest.mark.parametrize("index", ["string"], indirect=True) @@ -788,7 +819,11 @@ second.name = second_name result = first.difference(second, sort=sort) - assert tm.equalContents(result, answer) + if sort is True: + tm.assert_index_equal(result, answer) + else: + answer.name = second_name + tm.assert_index_equal(result.sort_values(), answer.sort_values()) if expected is None: assert result.name is None @@ -796,11 +831,21 @@ assert result.name == expected def test_difference_empty_arg(self, index, sort): - first = index[5:20] + first = index.copy() + first = first[5:20] first.name = "name" result = first.difference([], sort) + expected = index[5:20].unique() + expected.name = "name" + tm.assert_index_equal(result, expected) - tm.assert_index_equal(result, first) + def test_difference_should_not_compare(self): + # GH 55113 + left = Index([1, 1]) + right = Index([True]) + result = left.difference(right) + expected = Index([1]) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("index", ["string"], indirect=True) def test_difference_identity(self, index, sort): @@ -861,7 +906,6 @@ if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) - assert tm.equalContents(result, expected) @pytest.mark.parametrize( "index2,expected", @@ -883,13 +927,20 @@ def test_symmetric_difference_non_index(self, sort): index1 = Index([1, 2, 3, 4], name="index1") index2 = np.array([2, 3, 4, 5]) - expected = Index([1, 5]) + expected = Index([1, 5], name="index1") result = index1.symmetric_difference(index2, sort=sort) - assert tm.equalContents(result, expected) + if sort in (None, True): + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) assert result.name == "index1" result = index1.symmetric_difference(index2, result_name="new_name", sort=sort) - assert tm.equalContents(result, expected) + expected.name = "new_name" + if sort in (None, True): + tm.assert_index_equal(result, expected) + else: + tm.assert_index_equal(result.sort_values(), expected) assert result.name == "new_name" def test_union_ea_dtypes(self, any_numeric_ea_and_arrow_dtype): diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/methods/test_astype.py pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/methods/test_astype.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/methods/test_astype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/methods/test_astype.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,6 +12,7 @@ timedelta_range, ) import pandas._testing as tm +from pandas.core.arrays import TimedeltaArray class TestTimedeltaIndex: @@ -60,7 +61,7 @@ tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx") + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) @@ -69,7 +70,7 @@ tm.assert_numpy_array_equal(rng.asi8, result.values) def test_astype_uint(self): - arr = timedelta_range("1H", periods=2) + arr = timedelta_range("1h", periods=2) with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"): arr.astype("uint64") @@ -95,6 +96,56 @@ tm.assert_index_equal(result, idx) assert result is idx + def test_astype_to_td64d_raises(self, index_or_series): + # We don't support "D" reso + scalar = Timedelta(days=31) + td = index_or_series( + [scalar, scalar, scalar + timedelta(minutes=5, seconds=3), NaT], + 
dtype="m8[ns]", + ) + msg = ( + r"Cannot convert from timedelta64\[ns\] to timedelta64\[D\]. " + "Supported resolutions are 's', 'ms', 'us', 'ns'" + ) + with pytest.raises(ValueError, match=msg): + td.astype("timedelta64[D]") + + def test_astype_ms_to_s(self, index_or_series): + scalar = Timedelta(days=31) + td = index_or_series( + [scalar, scalar, scalar + timedelta(minutes=5, seconds=3), NaT], + dtype="m8[ns]", + ) + + exp_values = np.asarray(td).astype("m8[s]") + exp_tda = TimedeltaArray._simple_new(exp_values, dtype=exp_values.dtype) + expected = index_or_series(exp_tda) + assert expected.dtype == "m8[s]" + result = td.astype("timedelta64[s]") + tm.assert_equal(result, expected) + + def test_astype_freq_conversion(self): + # pre-2.0 td64 astype converted to float64. now for supported units + # (s, ms, us, ns) this converts to the requested dtype. + # This matches TDA and Series + tdi = timedelta_range("1 Day", periods=30) + + res = tdi.astype("m8[s]") + exp_values = np.asarray(tdi).astype("m8[s]") + exp_tda = TimedeltaArray._simple_new( + exp_values, dtype=exp_values.dtype, freq=tdi.freq + ) + expected = Index(exp_tda) + assert expected.dtype == "m8[s]" + tm.assert_index_equal(res, expected) + + # check this matches Series and TimedeltaArray + res = tdi._data.astype("m8[s]") + tm.assert_equal(res, expected._values) + + res = tdi.to_series().astype("m8[s]") + tm.assert_equal(res._values, expected._values._with_freq(None)) + @pytest.mark.parametrize("dtype", [float, "datetime64", "datetime64[ns]"]) def test_astype_raises(self, dtype): # GH 13149, GH 13209 @@ -104,10 +155,10 @@ idx.astype(dtype) def test_astype_category(self): - obj = timedelta_range("1H", periods=2, freq="H") + obj = timedelta_range("1h", periods=2, freq="h") result = obj.astype("category") - expected = pd.CategoricalIndex([Timedelta("1H"), Timedelta("2H")]) + expected = pd.CategoricalIndex([Timedelta("1h"), Timedelta("2h")]) tm.assert_index_equal(result, expected) result = obj._data.astype("category") @@ -115,7 +166,7 @@ tm.assert_categorical_equal(result, expected) def test_astype_array_fallback(self): - obj = timedelta_range("1H", periods=2) + obj = timedelta_range("1h", periods=2) result = obj.astype(bool) expected = Index(np.array([True, True])) tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/methods/test_shift.py pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/methods/test_shift.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/methods/test_shift.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/methods/test_shift.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,26 +14,26 @@ def test_tdi_shift_empty(self): # GH#9903 idx = TimedeltaIndex([], name="xxx") - tm.assert_index_equal(idx.shift(0, freq="H"), idx) - tm.assert_index_equal(idx.shift(3, freq="H"), idx) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) + tm.assert_index_equal(idx.shift(3, freq="h"), idx) def test_tdi_shift_hours(self): # GH#9903 idx = TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") - tm.assert_index_equal(idx.shift(0, freq="H"), idx) + tm.assert_index_equal(idx.shift(0, freq="h"), idx) exp = TimedeltaIndex(["8 hours", "9 hours", "12 hours"], name="xxx") - tm.assert_index_equal(idx.shift(3, freq="H"), exp) + tm.assert_index_equal(idx.shift(3, freq="h"), exp) exp = TimedeltaIndex(["2 hours", "3 hours", "6 hours"], name="xxx") - tm.assert_index_equal(idx.shift(-3, freq="H"), exp) + tm.assert_index_equal(idx.shift(-3, freq="h"), exp) 
def test_tdi_shift_minutes(self): # GH#9903 idx = TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") - tm.assert_index_equal(idx.shift(0, freq="T"), idx) + tm.assert_index_equal(idx.shift(0, freq="min"), idx) exp = TimedeltaIndex(["05:03:00", "06:03:00", "9:03:00"], name="xxx") - tm.assert_index_equal(idx.shift(3, freq="T"), exp) + tm.assert_index_equal(idx.shift(3, freq="min"), exp) exp = TimedeltaIndex(["04:57:00", "05:57:00", "8:57:00"], name="xxx") - tm.assert_index_equal(idx.shift(-3, freq="T"), exp) + tm.assert_index_equal(idx.shift(-3, freq="min"), exp) def test_tdi_shift_int(self): # GH#8083 diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_arithmetic.py pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_arithmetic.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_arithmetic.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_arithmetic.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,51 @@ +# Arithmetic tests for TimedeltaIndex are generally about the result's `freq` attribute. +# Other cases can be shared in tests.arithmetic.test_timedelta64 +import numpy as np + +from pandas import ( + NaT, + Timedelta, + timedelta_range, +) +import pandas._testing as tm + + +class TestTimedeltaIndexArithmetic: + def test_arithmetic_zero_freq(self): + # GH#51575 don't get a .freq with freq.n = 0 + tdi = timedelta_range(0, periods=100, freq="ns") + result = tdi / 2 + assert result.freq is None + expected = tdi[:50].repeat(2) + tm.assert_index_equal(result, expected) + + result2 = tdi // 2 + assert result2.freq is None + expected2 = expected + tm.assert_index_equal(result2, expected2) + + result3 = tdi * 0 + assert result3.freq is None + expected3 = tdi[:1].repeat(100) + tm.assert_index_equal(result3, expected3) + + def test_tdi_division(self, index_or_series): + # doc example + + scalar = Timedelta(days=31) + td = index_or_series( + [scalar, scalar, scalar + Timedelta(minutes=5, seconds=3), NaT], + dtype="m8[ns]", + ) + + result = td / np.timedelta64(1, "D") + expected = index_or_series( + [31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan] + ) + tm.assert_equal(result, expected) + + result = td / np.timedelta64(1, "s") + expected = index_or_series( + [31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, np.nan] + ) + tm.assert_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,10 +11,7 @@ to_timedelta, ) import pandas._testing as tm -from pandas.core.arrays.timedeltas import ( - TimedeltaArray, - sequence_to_td64ns, -) +from pandas.core.arrays.timedeltas import TimedeltaArray class TestTimedeltaIndex: @@ -34,10 +31,7 @@ TimedeltaIndex(arr) with pytest.raises(TypeError, match=msg): - TimedeltaArray._from_sequence(arr) - - with pytest.raises(TypeError, match=msg): - sequence_to_td64ns(arr) + TimedeltaArray._from_sequence(arr, dtype="m8[ns]") with pytest.raises(TypeError, match=msg): to_timedelta(arr) @@ -45,8 +39,10 @@ @pytest.mark.parametrize("unit", ["Y", "y", "M"]) def test_unit_m_y_raises(self, unit): msg = "Units 'M', 'Y', and 'y' are no longer supported" + depr_msg = "The 'unit' keyword in TimedeltaIndex construction is deprecated" with 
pytest.raises(ValueError, match=msg): - TimedeltaIndex([1, 3, 7], unit) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + TimedeltaIndex([1, 3, 7], unit) def test_int64_nocopy(self): # GH#23539 check that a copy isn't made when we pass int64 data @@ -74,6 +70,7 @@ # has one and it does not match the `freq` input tdi = timedelta_range("1 second", periods=100, freq="1s") + depr_msg = "TimedeltaArray.__init__ is deprecated" msg = ( "Inferred frequency .* from passed values does " "not conform to passed frequency" @@ -83,13 +80,15 @@ with pytest.raises(ValueError, match=msg): # GH#23789 - TimedeltaArray(tdi, freq="D") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + TimedeltaArray(tdi, freq="D") with pytest.raises(ValueError, match=msg): TimedeltaIndex(tdi._data, freq="D") with pytest.raises(ValueError, match=msg): - TimedeltaArray(tdi._data, freq="D") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + TimedeltaArray(tdi._data, freq="D") def test_dt64_data_invalid(self): # GH#23539 @@ -126,7 +125,7 @@ def test_float64_unit_conversion(self): # GH#23539 - tdi = TimedeltaIndex([1.5, 2.25], unit="D") + tdi = to_timedelta([1.5, 2.25], unit="D") expected = TimedeltaIndex([Timedelta(days=1.5), Timedelta(days=2.25)]) tm.assert_index_equal(tdi, expected) @@ -139,6 +138,9 @@ tm.assert_index_equal(pd.Index(arr), TimedeltaIndex(arr)) tm.assert_index_equal(pd.Index(np.array(arr)), TimedeltaIndex(np.array(arr))) + @pytest.mark.filterwarnings( + "ignore:The 'unit' keyword in TimedeltaIndex construction:FutureWarning" + ) def test_constructor(self): expected = TimedeltaIndex( [ @@ -163,15 +165,18 @@ expected = TimedeltaIndex( ["0 days 00:00:00", "0 days 00:00:01", "0 days 00:00:02"] ) - tm.assert_index_equal(TimedeltaIndex(range(3), unit="s"), expected) + result = TimedeltaIndex(range(3), unit="s") + tm.assert_index_equal(result, expected) expected = TimedeltaIndex( ["0 days 00:00:00", "0 days 00:00:05", "0 days 00:00:09"] ) - tm.assert_index_equal(TimedeltaIndex([0, 5, 9], unit="s"), expected) + result = TimedeltaIndex([0, 5, 9], unit="s") + tm.assert_index_equal(result, expected) expected = TimedeltaIndex( ["0 days 00:00:00.400", "0 days 00:00:00.450", "0 days 00:00:01.200"] ) - tm.assert_index_equal(TimedeltaIndex([400, 450, 1200], unit="ms"), expected) + result = TimedeltaIndex([400, 450, 1200], unit="ms") + tm.assert_index_equal(result, expected) def test_constructor_iso(self): # GH #21877 @@ -180,11 +185,14 @@ result = to_timedelta(durations) tm.assert_index_equal(result, expected) - def test_constructor_coverage(self): - rng = timedelta_range("1 days", periods=10.5) + def test_timedelta_range_fractional_period(self): + msg = "Non-integer 'periods' in pd.date_range, pd.timedelta_range" + with tm.assert_produces_warning(FutureWarning, match=msg): + rng = timedelta_range("1 days", periods=10.5) exp = timedelta_range("1 days", periods=10) tm.assert_index_equal(rng, exp) + def test_constructor_coverage(self): msg = "periods must be a number, got foo" with pytest.raises(TypeError, match=msg): timedelta_range(start="1 days", periods="foo", freq="D") @@ -246,7 +254,7 @@ pd.Index(["2000"], dtype="timedelta64") def test_constructor_wrong_precision_raises(self): - msg = r"dtype timedelta64\[D\] cannot be converted to timedelta64\[ns\]" + msg = "Supported timedelta64 resolutions are 's', 'ms', 'us', 'ns'" with pytest.raises(ValueError, match=msg): TimedeltaIndex(["2000"], dtype="timedelta64[D]") @@ -265,7 +273,9 @@ result = TimedeltaIndex(tdi._data, freq=None) 
assert result.freq is None - tda = TimedeltaArray(tdi, freq=None) + msg = "TimedeltaArray.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + tda = TimedeltaArray(tdi, freq=None) assert tda.freq is None def test_from_categorical(self): diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_formats.py pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_formats.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_formats.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_formats.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,18 @@ class TestTimedeltaIndexRendering: + def test_repr_round_days_non_nano(self): + # GH#55405 + # we should get "1 days", not "1 days 00:00:00" with non-nano + tdi = TimedeltaIndex(["1 days"], freq="D").as_unit("s") + result = repr(tdi) + expected = "TimedeltaIndex(['1 days'], dtype='timedelta64[s]', freq='D')" + assert result == expected + + result2 = repr(Series(tdi)) + expected2 = "0 1 days\ndtype: timedelta64[s]" + assert result2 == expected2 + @pytest.mark.parametrize("method", ["__repr__", "__str__"]) def test_representation(self, method): idx1 = TimedeltaIndex([], freq="D") @@ -39,6 +51,7 @@ result = getattr(idx, method)() assert result == expected + # TODO: this is a Series.__repr__ test def test_representation_to_series(self): idx1 = TimedeltaIndex([], freq="D") idx2 = TimedeltaIndex(["1 days"], freq="D") diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_freq_attr.py pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_freq_attr.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_freq_attr.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_freq_attr.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,7 +12,7 @@ class TestFreq: @pytest.mark.parametrize("values", [["0 days", "2 days", "4 days"], []]) - @pytest.mark.parametrize("freq", ["2D", Day(2), "48H", Hour(48)]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "48h", Hour(48)]) def test_freq_setter(self, values, freq): # GH#20678 idx = TimedeltaIndex(values) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_indexing.py pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_indexing.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -21,7 +21,7 @@ class TestGetItem: def test_getitem_slice_keeps_name(self): # GH#4226 - tdi = timedelta_range("1d", "5d", freq="H", name="timebucket") + tdi = timedelta_range("1d", "5d", freq="h", name="timebucket") assert tdi[1:].name == tdi.name def test_getitem(self): @@ -230,7 +230,7 @@ def test_take_equiv_getitem(self): tds = ["1day 02:00:00", "1 day 04:00:00", "1 day 10:00:00"] - idx = timedelta_range(start="1d", end="2d", freq="H", name="idx") + idx = timedelta_range(start="1d", end="2d", freq="h", name="idx") expected = TimedeltaIndex(tds, freq=None, name="idx") taken1 = idx.take([2, 4, 10]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_join.py pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_join.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_join.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_join.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,7 @@ import numpy as np from pandas import ( + DataFrame, Index, Timedelta, 
timedelta_range, @@ -25,16 +26,7 @@ tm.assert_index_equal(index, joined) def test_does_not_convert_mixed_integer(self): - df = tm.makeCustomDataframe( - 10, - 10, - data_gen_f=lambda *args, **kwargs: np.random.default_rng( - 2 - ).standard_normal(), - r_idx_type="i", - c_idx_type="td", - ) - str(df) + df = DataFrame(np.ones((5, 5)), columns=timedelta_range("1 day", periods=5)) cols = df.columns.join(df.index, how="outer") joined = cols.join(df.columns) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_scalar_compat.py pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_scalar_compat.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_scalar_compat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_scalar_compat.py 2024-04-10 17:42:52.000000000 +0000 @@ -63,8 +63,8 @@ ) expected_elt = expected_rng[1] - tm.assert_index_equal(td.round(freq="H"), expected_rng) - assert elt.round(freq="H") == expected_elt + tm.assert_index_equal(td.round(freq="h"), expected_rng) + assert elt.round(freq="h") == expected_elt msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): @@ -74,15 +74,15 @@ msg = " is a non-fixed frequency" with pytest.raises(ValueError, match=msg): - td.round(freq="M") + td.round(freq="ME") with pytest.raises(ValueError, match=msg): - elt.round(freq="M") + elt.round(freq="ME") @pytest.mark.parametrize( "freq,msg", [ - ("Y", " is a non-fixed frequency"), - ("M", " is a non-fixed frequency"), + ("YE", " is a non-fixed frequency"), + ("ME", " is a non-fixed frequency"), ("foobar", "Invalid frequency: foobar"), ], ) @@ -100,29 +100,29 @@ t1 = timedelta_range("1 days", periods=3, freq="1 min 2 s 3 us") t2 = -1 * t1 t1a = timedelta_range("1 days", periods=3, freq="1 min 2 s") - t1c = TimedeltaIndex([1, 1, 1], unit="D") + t1c = TimedeltaIndex(np.array([1, 1, 1], "m8[D]")).as_unit("ns") # note that negative times round DOWN! 
so don't give whole numbers for freq, s1, s2 in [ - ("N", t1, t2), - ("U", t1, t2), + ("ns", t1, t2), + ("us", t1, t2), ( - "L", + "ms", t1a, TimedeltaIndex( ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] ), ), ( - "S", + "s", t1a, TimedeltaIndex( ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] ), ), - ("12T", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), - ("H", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), - ("d", t1c, TimedeltaIndex([-1, -1, -1], unit="D")), + ("12min", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("h", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("d", t1c, -1 * t1c), ]: r1 = t1.round(freq) tm.assert_index_equal(r1, s1) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_setops.py pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_setops.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_setops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_setops.py 2024-04-10 17:42:52.000000000 +0000 @@ -52,8 +52,8 @@ assert result.freq == ordered.freq def test_union_bug_1730(self): - rng_a = timedelta_range("1 day", periods=4, freq="3H") - rng_b = timedelta_range("1 day", periods=4, freq="4H") + rng_a = timedelta_range("1 day", periods=4, freq="3h") + rng_b = timedelta_range("1 day", periods=4, freq="4h") result = rng_a.union(rng_b) exp = TimedeltaIndex(sorted(set(rng_a) | set(rng_b))) @@ -115,7 +115,7 @@ intersect = first.intersection(second, sort=sort) if sort is None: tm.assert_index_equal(intersect, second.sort_values()) - assert tm.equalContents(intersect, second) + tm.assert_index_equal(intersect, second) # Corner cases inter = first.intersection(first, sort=sort) @@ -219,9 +219,11 @@ tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) + # preserve frequency when the difference is a contiguous + # subset of the original range other = timedelta_range("2 days", "5 days", freq="D") idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + expected = TimedeltaIndex(["0 days", "1 days"], freq="D") tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal("freq", idx_diff, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_timedelta.py pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_timedelta.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_timedelta.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_timedelta.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,24 +1,16 @@ -from datetime import timedelta - import numpy as np import pytest from pandas import ( Index, - NaT, Series, Timedelta, timedelta_range, ) import pandas._testing as tm -from pandas.core.arrays import TimedeltaArray class TestTimedeltaIndex: - @pytest.fixture - def index(self): - return tm.makeTimedeltaIndex(10) - def test_misc_coverage(self): rng = timedelta_range("1 day", periods=5) result = rng.groupby(rng.days) @@ -34,14 +26,6 @@ exp = Index([f(x) for x in rng], dtype=np.int64) tm.assert_index_equal(result, exp) - def test_pass_TimedeltaIndex_to_index(self): - rng = timedelta_range("1 days", "10 days") - idx = Index(rng, dtype=object) - - expected = Index(rng.to_pytimedelta(), dtype=object) - - tm.assert_numpy_array_equal(idx.values, expected.values) - def test_fields(self): rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") tm.assert_index_equal(rng.days, 
Index([1, 1], dtype=np.int64)) @@ -75,80 +59,3 @@ # preserve name (GH15589) rng.name = "name" assert rng.days.name == "name" - - def test_freq_conversion_always_floating(self): - # pre-2.0 td64 astype converted to float64. now for supported units - # (s, ms, us, ns) this converts to the requested dtype. - # This matches TDA and Series - tdi = timedelta_range("1 Day", periods=30) - - res = tdi.astype("m8[s]") - exp_values = np.asarray(tdi).astype("m8[s]") - exp_tda = TimedeltaArray._simple_new( - exp_values, dtype=exp_values.dtype, freq=tdi.freq - ) - expected = Index(exp_tda) - assert expected.dtype == "m8[s]" - tm.assert_index_equal(res, expected) - - # check this matches Series and TimedeltaArray - res = tdi._data.astype("m8[s]") - tm.assert_equal(res, expected._values) - - res = tdi.to_series().astype("m8[s]") - tm.assert_equal(res._values, expected._values._with_freq(None)) - - def test_freq_conversion(self, index_or_series): - # doc example - - scalar = Timedelta(days=31) - td = index_or_series( - [scalar, scalar, scalar + timedelta(minutes=5, seconds=3), NaT], - dtype="m8[ns]", - ) - - result = td / np.timedelta64(1, "D") - expected = index_or_series( - [31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan] - ) - tm.assert_equal(result, expected) - - # We don't support "D" reso, so we use the pre-2.0 behavior - # casting to float64 - msg = ( - r"Cannot convert from timedelta64\[ns\] to timedelta64\[D\]. " - "Supported resolutions are 's', 'ms', 'us', 'ns'" - ) - with pytest.raises(ValueError, match=msg): - td.astype("timedelta64[D]") - - result = td / np.timedelta64(1, "s") - expected = index_or_series( - [31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, np.nan] - ) - tm.assert_equal(result, expected) - - exp_values = np.asarray(td).astype("m8[s]") - exp_tda = TimedeltaArray._simple_new(exp_values, dtype=exp_values.dtype) - expected = index_or_series(exp_tda) - assert expected.dtype == "m8[s]" - result = td.astype("timedelta64[s]") - tm.assert_equal(result, expected) - - def test_arithmetic_zero_freq(self): - # GH#51575 don't get a .freq with freq.n = 0 - tdi = timedelta_range(0, periods=100, freq="ns") - result = tdi / 2 - assert result.freq is None - expected = tdi[:50].repeat(2) - tm.assert_index_equal(result, expected) - - result2 = tdi // 2 - assert result2.freq is None - expected2 = expected - tm.assert_index_equal(result2, expected2) - - result3 = tdi * 0 - assert result3.freq is None - expected3 = tdi[:1].repeat(100) - tm.assert_index_equal(result3, expected3) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_timedelta_range.py pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_timedelta_range.py --- pandas-2.1.4+dfsg/pandas/tests/indexes/timedeltas/test_timedelta_range.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexes/timedeltas/test_timedelta_range.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,7 @@ from pandas import ( Timedelta, + TimedeltaIndex, timedelta_range, to_timedelta, ) @@ -45,14 +46,23 @@ @pytest.mark.parametrize( "depr_unit, unit", [ + ("H", "hour"), ("T", "minute"), ("t", "minute"), + ("S", "second"), ("L", "millisecond"), ("l", "millisecond"), + ("U", "microsecond"), + ("u", "microsecond"), + ("N", "nanosecond"), + ("n", "nanosecond"), ], ) - def test_timedelta_units_T_L_deprecated(self, depr_unit, unit): - depr_msg = f"Unit '{depr_unit}' is deprecated." 
+ def test_timedelta_units_H_T_S_L_U_N_deprecated(self, depr_unit, unit): + # GH#52536 + depr_msg = ( + f"'{depr_unit}' is deprecated and will be removed in a future version." + ) expected = to_timedelta(np.arange(5), unit=unit) with tm.assert_produces_warning(FutureWarning, match=depr_msg): @@ -60,7 +70,7 @@ tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "periods, freq", [(3, "2D"), (5, "D"), (6, "19H12T"), (7, "16H"), (9, "12H")] + "periods, freq", [(3, "2D"), (5, "D"), (6, "19h12min"), (7, "16h"), (9, "12h")] ) def test_linspace_behavior(self, periods, freq): # GH 20976 @@ -68,6 +78,16 @@ expected = timedelta_range(start="0 days", end="4 days", freq=freq) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("msg_freq, freq", [("H", "19H12min"), ("T", "19h12T")]) + def test_timedelta_range_H_T_deprecated(self, freq, msg_freq): + # GH#52536 + msg = f"'{msg_freq}' is deprecated and will be removed in a future version." + + result = timedelta_range(start="0 days", end="4 days", periods=6) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = timedelta_range(start="0 days", end="4 days", freq=freq) + tm.assert_index_equal(result, expected) + def test_errors(self): # not enough params msg = ( @@ -88,7 +108,7 @@ # too many params with pytest.raises(ValueError, match=msg): - timedelta_range(start="0 days", end="5 days", periods=10, freq="H") + timedelta_range(start="0 days", end="5 days", periods=10, freq="h") @pytest.mark.parametrize( "start, end, freq, expected_periods", @@ -112,3 +132,42 @@ # https://github.com/pandas-dev/pandas/issues/35897 result = timedelta_range("0s", "1s", periods=31) assert result.freq is None + + @pytest.mark.parametrize( + "freq_depr, start, end, expected_values, expected_freq", + [ + ( + "3.5S", + "05:03:01", + "05:03:10", + ["0 days 05:03:01", "0 days 05:03:04.500000", "0 days 05:03:08"], + "3500ms", + ), + ( + "2.5T", + "5 hours", + "5 hours 8 minutes", + [ + "0 days 05:00:00", + "0 days 05:02:30", + "0 days 05:05:00", + "0 days 05:07:30", + ], + "150s", + ), + ], + ) + def test_timedelta_range_deprecated_freq( + self, freq_depr, start, end, expected_values, expected_freq + ): + # GH#52536 + msg = ( + f"'{freq_depr[-1]}' is deprecated and will be removed in a future version." 
+ ) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = timedelta_range(start=start, end=end, freq=freq_depr) + expected = TimedeltaIndex( + expected_values, dtype="timedelta64[ns]", freq=expected_freq + ) + tm.assert_index_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/interval/test_interval.py pandas-2.2.2+dfsg/pandas/tests/indexing/interval/test_interval.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/interval/test_interval.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/interval/test_interval.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,9 @@ import numpy as np import pytest +from pandas._libs import index as libindex +from pandas.compat import IS64 + import pandas as pd from pandas import ( DataFrame, @@ -70,15 +73,18 @@ with pytest.raises(KeyError, match=r"\[-1\] not in index"): indexer_sl(ser)[[-1, 3]] - @pytest.mark.slow - def test_loc_getitem_large_series(self): - ser = Series( - np.arange(1000000), index=IntervalIndex.from_breaks(np.arange(1000001)) - ) - - result1 = ser.loc[:80000] - result2 = ser.loc[0:80000] - result3 = ser.loc[0:80000:1] + def test_loc_getitem_large_series(self, monkeypatch): + size_cutoff = 20 + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + ser = Series( + np.arange(size_cutoff), + index=IntervalIndex.from_breaks(np.arange(size_cutoff + 1)), + ) + + result1 = ser.loc[:8] + result2 = ser.loc[0:8] + result3 = ser.loc[0:8:1] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) @@ -106,7 +112,11 @@ expected = df.take([4, 5, 4, 5]) tm.assert_frame_equal(result, expected) - with pytest.raises(KeyError, match=r"None of \[\[10\]\] are"): + msg = ( + r"None of \[Index\(\[10\], dtype='object', name='B'\)\] " + r"are in the \[index\]" + ) + with pytest.raises(KeyError, match=msg): df.loc[[10]] # partial missing @@ -128,6 +138,33 @@ tm.assert_equal(result, expected) + def test_setitem_interval_with_slice(self): + # GH#54722 + ii = IntervalIndex.from_breaks(range(4, 15)) + ser = Series(range(10), index=ii) + + orig = ser.copy() + + # This should be a no-op (used to raise) + ser.loc[1:3] = 20 + tm.assert_series_equal(ser, orig) + + ser.loc[6:8] = 19 + orig.iloc[1:4] = 19 + tm.assert_series_equal(ser, orig) + + ser2 = Series(range(5), index=ii[::2]) + orig2 = ser2.copy() + + # this used to raise + ser2.loc[6:8] = 22 # <- raises on main, sets on branch + orig2.iloc[1] = 22 + tm.assert_series_equal(ser2, orig2) + + ser2.loc[5:7] = 21 + orig2.iloc[:2] = 21 + tm.assert_series_equal(ser2, orig2) + class TestIntervalIndexInsideMultiIndex: def test_mi_intervalindex_slicing_with_scalar(self): @@ -172,3 +209,19 @@ ) expected = Series([1, 6, 2, 8, 7], index=expected_index, name="value") tm.assert_series_equal(result, expected) + + @pytest.mark.xfail(not IS64, reason="GH 23440") + @pytest.mark.parametrize( + "base", + [101, 1010], + ) + def test_reindex_behavior_with_interval_index(self, base): + # GH 51826 + + ser = Series( + range(base), + index=IntervalIndex.from_arrays(range(base), range(1, base + 1)), + ) + expected_result = Series([np.nan, 0], index=[np.nan, 1.0], dtype=float) + result = ser.reindex(index=[np.nan, 1.0]) + tm.assert_series_equal(result, expected_result) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/interval/test_interval_new.py pandas-2.2.2+dfsg/pandas/tests/indexing/interval/test_interval_new.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/interval/test_interval_new.py 2023-12-08 14:17:35.000000000 
+0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/interval/test_interval_new.py 2024-04-10 17:42:52.000000000 +0000 @@ -140,7 +140,7 @@ # interval expected = 0 result = indexer_sl(ser)[Interval(1, 5)] - result == expected + assert expected == result expected = ser result = indexer_sl(ser)[[Interval(1, 5), Interval(3, 7)]] @@ -149,7 +149,10 @@ with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): indexer_sl(ser)[Interval(3, 5)] - msg = r"None of \[\[Interval\(3, 5, closed='right'\)\]\]" + msg = ( + r"None of \[IntervalIndex\(\[\(3, 5\]\], " + r"dtype='interval\[int64, right\]'\)\] are in the \[index\]" + ) with pytest.raises(KeyError, match=msg): indexer_sl(ser)[[Interval(3, 5)]] diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_chaining_and_caching.py pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_chaining_and_caching.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_chaining_and_caching.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_chaining_and_caching.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas._libs import index as libindex from pandas.errors import SettingWithCopyError import pandas.util._test_decorators as td @@ -12,7 +13,7 @@ import pandas._testing as tm -def test_detect_chained_assignment(using_copy_on_write): +def test_detect_chained_assignment(using_copy_on_write, warn_copy_on_write): # Inplace ops, originally from: # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug a = [12, 23] @@ -32,14 +33,18 @@ if using_copy_on_write: with tm.raises_chained_assignment_error(): zed["eyes"]["right"].fillna(value=555, inplace=True) + elif warn_copy_on_write: + with tm.assert_produces_warning(None): + zed["eyes"]["right"].fillna(value=555, inplace=True) else: msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(SettingWithCopyError, match=msg): - zed["eyes"]["right"].fillna(value=555, inplace=True) + with tm.assert_produces_warning(None): + zed["eyes"]["right"].fillna(value=555, inplace=True) @td.skip_array_manager_invalid_test # with ArrayManager df.loc[0] is not a view -def test_cache_updating(using_copy_on_write): +def test_cache_updating(using_copy_on_write, warn_copy_on_write): # 5216 # make sure that we don't try to set a dead cache a = np.random.default_rng(2).random((10, 3)) @@ -51,12 +56,13 @@ # setting via chained assignment # but actually works, since everything is a view + + with tm.raises_chained_assignment_error(): + df.loc[0]["z"].iloc[0] = 1.0 + if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df.loc[0]["z"].iloc[0] = 1.0 assert df.loc[(0, 0), "z"] == df_original.loc[0, "z"] else: - df.loc[0]["z"].iloc[0] = 1.0 result = df.loc[(0, 0), "z"] assert result == 1 @@ -66,16 +72,16 @@ assert result == 2 -@pytest.mark.slow -def test_indexer_caching(): +def test_indexer_caching(monkeypatch): # GH5727 # make sure that indexers are in the _internal_names_set - n = 1000001 - index = MultiIndex.from_arrays([np.arange(n), np.arange(n)]) - s = Series(np.zeros(n), index=index) - str(s) - - # setitem - expected = Series(np.ones(n), index=index) - s[s == 0] = 1 + size_cutoff = 20 + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + index = MultiIndex.from_arrays([np.arange(size_cutoff), np.arange(size_cutoff)]) + s = Series(np.zeros(size_cutoff), index=index) + + # 
setitem + s[s == 0] = 1 + expected = Series(np.ones(size_cutoff), index=index) tm.assert_series_equal(s, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_getitem.py pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_getitem.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_getitem.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_getitem.py 2024-04-10 17:42:52.000000000 +0000 @@ -145,6 +145,23 @@ indexer(df) +def test_tuple_string_column_names(): + # GH#50372 + mi = MultiIndex.from_tuples([("a", "aa"), ("a", "ab"), ("b", "ba"), ("b", "bb")]) + df = DataFrame([range(4), range(1, 5), range(2, 6)], columns=mi) + df["single_index"] = 0 + + df_flat = df.copy() + df_flat.columns = df_flat.columns.to_flat_index() + df_flat["new_single_index"] = 0 + + result = df_flat[[("a", "aa"), "new_single_index"]] + expected = DataFrame( + [[0, 0], [1, 0], [2, 0]], columns=Index([("a", "aa"), "new_single_index"]) + ) + tm.assert_frame_equal(result, expected) + + def test_frame_getitem_multicolumn_empty_level(): df = DataFrame({"a": ["1", "2", "3"], "b": ["2", "3", "4"]}) df.columns = [ diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_loc.py pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_loc.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_loc.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_loc.py 2024-04-10 17:42:52.000000000 +0000 @@ -566,7 +566,7 @@ tm.assert_frame_equal(df, expected) -def test_loc_nan_multiindex(): +def test_loc_nan_multiindex(using_infer_string): # GH 5286 tups = [ ("Good Things", "C", np.nan), @@ -586,8 +586,12 @@ result = df.loc["Good Things"].loc["C"] expected = DataFrame( np.ones((1, 4)), - index=Index([np.nan], dtype="object", name="u3"), - columns=Index(["d1", "d2", "d3", "d4"], dtype="object"), + index=Index( + [np.nan], + dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + name="u3", + ), + columns=Index(["d1", "d2", "d3", "d4"]), ) tm.assert_frame_equal(result, expected) @@ -698,10 +702,19 @@ tm.assert_series_equal(result, expected) -def test_getitem_str_slice(datapath): +def test_getitem_str_slice(): # GH#15928 - path = datapath("reshape", "merge", "data", "quotes2.csv") - df = pd.read_csv(path, parse_dates=["time"]) + df = DataFrame( + [ + ["20160525 13:30:00.023", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.131", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.135", "MSFT", "51.92", "51.95"], + ["20160525 13:30:00.135", "AAPL", "98.61", "98.62"], + ], + columns="time,ticker,bid,ask".split(","), + ) df2 = df.set_index(["ticker", "time"]).sort_index() res = df2.loc[("AAPL", slice("2016-05-25 13:30:00")), :].droplevel(0) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_multiindex.py pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_multiindex.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_multiindex.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_multiindex.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,7 @@ import numpy as np import pytest -import pandas._libs.index as _index +import pandas._libs.index as libindex from pandas.errors import PerformanceWarning import pandas as pd @@ -33,20 +33,19 @@ with tm.assert_produces_warning(PerformanceWarning): df.loc[(0,)] - def 
test_indexing_over_hashtable_size_cutoff(self): - n = 10000 - - old_cutoff = _index._SIZE_CUTOFF - _index._SIZE_CUTOFF = 20000 - - s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n)))) - - # hai it works! - assert s[("a", 5)] == 5 - assert s[("a", 6)] == 6 - assert s[("a", 7)] == 7 - - _index._SIZE_CUTOFF = old_cutoff + @pytest.mark.parametrize("offset", [-5, 5]) + def test_indexing_over_hashtable_size_cutoff(self, monkeypatch, offset): + size_cutoff = 20 + n = size_cutoff + offset + + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n)))) + + # hai it works! + assert s[("a", 5)] == 5 + assert s[("a", 6)] == 6 + assert s[("a", 7)] == 7 def test_multi_nan_indexing(self): # GH 3588 diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_partial.py pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_partial.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_partial.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_partial.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,9 +5,9 @@ from pandas import ( DataFrame, + DatetimeIndex, MultiIndex, date_range, - to_datetime, ) import pandas._testing as tm @@ -122,7 +122,10 @@ # exp.loc[2000, 4].values[:] select multiple columns -> .values is not a view @td.skip_array_manager_invalid_test def test_partial_set( - self, multiindex_year_month_day_dataframe_random_data, using_copy_on_write + self, + multiindex_year_month_day_dataframe_random_data, + using_copy_on_write, + warn_copy_on_write, ): # GH #397 ymd = multiindex_year_month_day_dataframe_random_data @@ -137,7 +140,8 @@ df["A"].loc[2000, 4] = 1 df.loc[(2000, 4), "A"] = 1 else: - df["A"].loc[2000, 4] = 1 + with tm.raises_chained_assignment_error(): + df["A"].loc[2000, 4] = 1 exp.iloc[65:85, 0] = 1 tm.assert_frame_equal(df, exp) @@ -146,12 +150,11 @@ tm.assert_frame_equal(df, exp) # this works...for now + with tm.raises_chained_assignment_error(): + df["A"].iloc[14] = 5 if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["A"].iloc[14] = 5 - df["A"].iloc[14] == exp["A"].iloc[14] + assert df["A"].iloc[14] == exp["A"].iloc[14] else: - df["A"].iloc[14] = 5 assert df["A"].iloc[14] == 5 @pytest.mark.parametrize("dtype", [int, float]) @@ -212,7 +215,11 @@ @pytest.mark.parametrize( "indexer, exp_idx, exp_values", [ - (slice("2019-2", None), [to_datetime("2019-02-01")], [2, 3]), + ( + slice("2019-2", None), + DatetimeIndex(["2019-02-01"], dtype="M8[ns]"), + [2, 3], + ), ( slice(None, "2019-2"), date_range("2019", periods=2, freq="MS"), diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_setitem.py pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_setitem.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_setitem.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_setitem.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,7 +9,6 @@ DataFrame, MultiIndex, Series, - Timestamp, date_range, isna, notna, @@ -91,11 +90,11 @@ np.random.default_rng(2).random((12, 4)), index=idx, columns=cols ) - subidx = MultiIndex.from_tuples( - [("A", Timestamp("2015-01-01")), ("A", Timestamp("2015-02-01"))] + subidx = MultiIndex.from_arrays( + [["A", "A"], date_range("2015-01-01", "2015-02-01", freq="MS")] ) - subcols = MultiIndex.from_tuples( - [("foo", Timestamp("2016-01-01")), ("foo", Timestamp("2016-02-01"))] + subcols = 
MultiIndex.from_arrays( + [["foo", "foo"], date_range("2016-01-01", "2016-02-01", freq="MS")] ) vals = DataFrame( @@ -175,7 +174,7 @@ ) expected = df_orig.copy() - expected.iloc[[0, 2, 3]] *= 2 + expected.iloc[[0, 1, 3]] *= 2 idx = pd.IndexSlice df = df_orig.copy() @@ -201,7 +200,9 @@ df.loc[4, "d"] = arr tm.assert_series_equal(df.loc[4, "d"], Series(arr, index=[8, 10], name="d")) - def test_multiindex_assignment_single_dtype(self, using_copy_on_write): + def test_multiindex_assignment_single_dtype( + self, using_copy_on_write, warn_copy_on_write + ): # GH3777 part 2b # single dtype arr = np.array([0.0, 1.0]) @@ -215,6 +216,8 @@ view = df["c"].iloc[:2].values # arr can be losslessly cast to int, so this setitem is inplace + # INFO(CoW-warn) this does not warn because we directly took .values + # above, so no reference to a pandas object is alive for `view` df.loc[4, "c"] = arr exp = Series(arr, index=[8, 10], name="c", dtype="int64") result = df.loc[4, "c"] @@ -234,7 +237,8 @@ tm.assert_series_equal(result, exp) # scalar ok - df.loc[4, "c"] = 10 + with tm.assert_cow_warning(warn_copy_on_write): + df.loc[4, "c"] = 10 exp = Series(10, index=[8, 10], name="c", dtype="float64") tm.assert_series_equal(df.loc[4, "c"], exp) @@ -248,7 +252,8 @@ # But with a length-1 listlike column indexer this behaves like # `df.loc[4, "c"] = 0 - df.loc[4, ["c"]] = [0] + with tm.assert_cow_warning(warn_copy_on_write): + df.loc[4, ["c"]] = [0] assert (df.loc[4, "c"] == 0).all() def test_groupby_example(self): @@ -273,16 +278,20 @@ new_vals = np.arange(df2.shape[0]) df.loc[name, "new_col"] = new_vals - def test_series_setitem(self, multiindex_year_month_day_dataframe_random_data): + def test_series_setitem( + self, multiindex_year_month_day_dataframe_random_data, warn_copy_on_write + ): ymd = multiindex_year_month_day_dataframe_random_data s = ymd["A"] - s[2000, 3] = np.nan + with tm.assert_cow_warning(warn_copy_on_write): + s[2000, 3] = np.nan assert isna(s.values[42:65]).all() assert notna(s.values[:42]).all() assert notna(s.values[65:]).all() - s[2000, 3, 10] = np.nan + with tm.assert_cow_warning(warn_copy_on_write): + s[2000, 3, 10] = np.nan assert isna(s.iloc[49]) with pytest.raises(KeyError, match="49"): @@ -386,6 +395,7 @@ expected = df.loc[2000, 1, 6][["A", "B", "C"]] tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_loc_getitem_setitem_slice_integers(self, frame_or_series): index = MultiIndex( levels=[[0, 1, 2], [0, 2]], codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]] @@ -417,7 +427,7 @@ tm.assert_series_equal(reindexed["foo", "two"], s > s.median()) def test_set_column_scalar_with_loc( - self, multiindex_dataframe_random_data, using_copy_on_write + self, multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write ): frame = multiindex_dataframe_random_data subset = frame.index[[1, 4, 5]] @@ -427,7 +437,8 @@ frame_original = frame.copy() col = frame["B"] - col[subset] = 97 + with tm.assert_cow_warning(warn_copy_on_write): + col[subset] = 97 if using_copy_on_write: # chained setitem doesn't work with CoW tm.assert_frame_equal(frame, frame_original) @@ -527,32 +538,34 @@ def test_frame_setitem_copy_raises( - multiindex_dataframe_random_data, using_copy_on_write + multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write ): # will raise/warn as its chained assignment df = multiindex_dataframe_random_data.T - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: with 
tm.raises_chained_assignment_error(): df["foo"]["one"] = 2 else: msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(SettingWithCopyError, match=msg): - df["foo"]["one"] = 2 + with tm.raises_chained_assignment_error(): + df["foo"]["one"] = 2 def test_frame_setitem_copy_no_write( - multiindex_dataframe_random_data, using_copy_on_write + multiindex_dataframe_random_data, using_copy_on_write, warn_copy_on_write ): frame = multiindex_dataframe_random_data.T expected = frame df = frame.copy() - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: with tm.raises_chained_assignment_error(): df["foo"]["one"] = 2 else: msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(SettingWithCopyError, match=msg): - df["foo"]["one"] = 2 + with tm.raises_chained_assignment_error(): + df["foo"]["one"] = 2 result = df tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_slice.py pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_slice.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/multiindex/test_slice.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/multiindex/test_slice.py 2024-04-10 17:42:52.000000000 +0000 @@ -739,6 +739,7 @@ expected = s.reindex(s.index[5:]) tm.assert_series_equal(result, expected) + s = ymd["A"].copy() exp = ymd["A"].copy() s[5:] = 0 exp.iloc[5:] = 0 diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/test_at.py pandas-2.2.2+dfsg/pandas/tests/indexing/test_at.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/test_at.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/test_at.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,6 +13,7 @@ CategoricalIndex, DataFrame, DatetimeIndex, + Index, MultiIndex, Series, Timestamp, @@ -70,7 +71,11 @@ df.at[0, "x"] = 4 df.at[0, "cost"] = 789 - expected = DataFrame({"x": [4], "cost": 789}, index=[0]) + expected = DataFrame( + {"x": [4], "cost": 789}, + index=[0], + columns=Index(["x", "cost"], dtype=object), + ) tm.assert_frame_equal(df, expected) # And in particular, check that the _item_cache has updated correctly. 
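Several of the updated expectations in the surrounding hunks (the columns=Index(["x", "cost"], dtype=object) pin in test_at.py just above, the "string[pyarrow_numpy]" branch in test_loc.py, and the option_context("future.infer_string", ...) parametrization in test_categorical.py below) all track pandas' optional pyarrow-backed string inference. A minimal sketch of that behaviour, assuming pandas 2.2 with pyarrow installed (these are illustrative lines, not part of the patch):

    import pandas as pd

    # Default behaviour: string labels keep object dtype, which is why the
    # expected frames above pin columns=Index([...], dtype=object).
    idx = pd.Index(["x", "cost"])
    assert idx.dtype == object

    # With the opt-in option (what the using_infer_string fixture toggles),
    # a freshly built string Index is inferred as the pyarrow-backed dtype.
    with pd.option_context("future.infer_string", True):
        inferred = pd.Index(["d1", "d2", "d3", "d4"])
        print(inferred.dtype)  # expected: string[pyarrow_numpy]

With the option left at its default, the object-dtype expectations in these tests continue to hold; the fixture-driven branches only take effect once the option is enabled.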
diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/test_categorical.py pandas-2.2.2+dfsg/pandas/tests/indexing/test_categorical.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/test_categorical.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/test_categorical.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -14,9 +16,9 @@ Series, Timedelta, Timestamp, + option_context, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT @pytest.fixture @@ -25,7 +27,9 @@ { "A": np.arange(6, dtype="int64"), }, - index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cab")), name="B"), + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cab")), name="B" + ), ) @@ -35,13 +39,15 @@ { "A": np.arange(6, dtype="int64"), }, - index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), + index=CategoricalIndex( + list("aabbca"), dtype=CategoricalDtype(list("cabe")), name="B" + ), ) class TestCategoricalIndex: def test_loc_scalar(self, df): - dtype = CDT(list("cab")) + dtype = CategoricalDtype(list("cab")) result = df.loc["a"] bidx = Series(list("aaa"), name="B").astype(dtype) assert bidx.dtype == dtype @@ -88,7 +94,7 @@ ) tm.assert_frame_equal(df3, expected3) - # Settig a new row _and_ new column + # Setting a new row _and_ new column df4 = df.copy() df4.loc["d", "C"] = 10 expected3 = DataFrame( @@ -270,7 +276,7 @@ tm.assert_frame_equal(result, expected) result = df.iloc[2:4, :].dtypes - expected = Series(["category", "int64"], ["cats", "values"]) + expected = Series(["category", "int64"], ["cats", "values"], dtype=object) tm.assert_series_equal(result, expected) result = df.loc["h":"j", "cats"] @@ -425,38 +431,42 @@ expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns) tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) - def test_ix_categorical_index_non_unique(self): + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_ix_categorical_index_non_unique(self, infer_string): # non-unique - df = DataFrame( - np.random.default_rng(2).standard_normal((3, 3)), - index=list("ABA"), - columns=list("XYX"), - ) - cdf = df.copy() - cdf.index = CategoricalIndex(df.index) - cdf.columns = CategoricalIndex(df.columns) - - exp_index = CategoricalIndex(list("AA"), categories=["A", "B"]) - expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index) - tm.assert_frame_equal(cdf.loc["A", :], expect) - - exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"]) - expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns) - tm.assert_frame_equal(cdf.loc[:, "X"], expect) - - expect = DataFrame( - df.loc[["A", "B"], :], - columns=cdf.columns, - index=CategoricalIndex(list("AAB")), - ) - tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect) - - expect = DataFrame( - df.loc[:, ["X", "Y"]], - index=cdf.index, - columns=CategoricalIndex(list("XXY")), - ) - tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) + with option_context("future.infer_string", infer_string): + df = DataFrame( + np.random.default_rng(2).standard_normal((3, 3)), + index=list("ABA"), + columns=list("XYX"), + ) + cdf = df.copy() + cdf.index = CategoricalIndex(df.index) + cdf.columns = CategoricalIndex(df.columns) + + exp_index = CategoricalIndex(list("AA"), categories=["A", "B"]) + expect = 
DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index) + tm.assert_frame_equal(cdf.loc["A", :], expect) + + exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"]) + expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns) + tm.assert_frame_equal(cdf.loc[:, "X"], expect) + + expect = DataFrame( + df.loc[["A", "B"], :], + columns=cdf.columns, + index=CategoricalIndex(list("AAB")), + ) + tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect) + + expect = DataFrame( + df.loc[:, ["X", "Y"]], + index=cdf.index, + columns=CategoricalIndex(list("XXY")), + ) + tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) def test_loc_slice(self, df): # GH9748 diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/test_chaining_and_caching.py pandas-2.2.2+dfsg/pandas/tests/indexing/test_chaining_and_caching.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/test_chaining_and_caching.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/test_chaining_and_caching.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,4 @@ -from string import ascii_letters as letters +from string import ascii_letters import numpy as np import pytest @@ -12,6 +12,7 @@ import pandas as pd from pandas import ( DataFrame, + Index, Series, Timestamp, date_range, @@ -24,9 +25,9 @@ def random_text(nobs=100): # Construct a DataFrame where each row is a random slice from 'letters' - idxs = np.random.default_rng(2).integers(len(letters), size=(nobs, 2)) + idxs = np.random.default_rng(2).integers(len(ascii_letters), size=(nobs, 2)) idxs.sort(axis=1) - strings = [letters[x[0] : x[1]] for x in idxs] + strings = [ascii_letters[x[0] : x[1]] for x in idxs] return DataFrame(strings, columns=["letters"]) @@ -44,14 +45,8 @@ # caches a reference to the 'bb' series df["bb"] - # repr machinery triggers consolidation - repr(df) - # Assignment to wrong series - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["bb"].iloc[0] = 0.17 - else: + with tm.raises_chained_assignment_error(): df["bb"].iloc[0] = 0.17 df._clear_item_cache() if not using_copy_on_write: @@ -77,7 +72,9 @@ assert df.loc[0, "c"] == 0.0 assert df.loc[7, "c"] == 1.0 - def test_setitem_cache_updating_slices(self, using_copy_on_write): + def test_setitem_cache_updating_slices( + self, using_copy_on_write, warn_copy_on_write + ): # GH 7084 # not updating cache on series setting with slices expected = DataFrame( @@ -101,10 +98,9 @@ out_original = out.copy() for ix, row in df.iterrows(): v = out[row["C"]][six:eix] + row["D"] - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - out[row["C"]][six:eix] = v - else: + with tm.raises_chained_assignment_error( + (ix == 0) or warn_copy_on_write or using_copy_on_write + ): out[row["C"]][six:eix] = v if not using_copy_on_write: @@ -121,12 +117,14 @@ tm.assert_frame_equal(out, expected) tm.assert_series_equal(out["A"], expected["A"]) - def test_altering_series_clears_parent_cache(self, using_copy_on_write): + def test_altering_series_clears_parent_cache( + self, using_copy_on_write, warn_copy_on_write + ): # GH #33675 df = DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"]) ser = df["A"] - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: assert "A" not in df._item_cache else: assert "A" in df._item_cache @@ -148,54 +146,47 @@ df = DataFrame({"response": np.array(data)}) mask = df.response == "timeout" + with tm.raises_chained_assignment_error(): + df.response[mask] = "none" if using_copy_on_write: - with 
tm.raises_chained_assignment_error(): - df.response[mask] = "none" tm.assert_frame_equal(df, DataFrame({"response": data})) else: - df.response[mask] = "none" tm.assert_frame_equal(df, DataFrame({"response": mdata})) recarray = np.rec.fromarrays([data], names=["response"]) df = DataFrame(recarray) mask = df.response == "timeout" + with tm.raises_chained_assignment_error(): + df.response[mask] = "none" if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df.response[mask] = "none" tm.assert_frame_equal(df, DataFrame({"response": data})) else: - df.response[mask] = "none" tm.assert_frame_equal(df, DataFrame({"response": mdata})) df = DataFrame({"response": data, "response1": data}) df_original = df.copy() mask = df.response == "timeout" + with tm.raises_chained_assignment_error(): + df.response[mask] = "none" if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df.response[mask] = "none" tm.assert_frame_equal(df, df_original) else: - df.response[mask] = "none" tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": data})) # GH 6056 expected = DataFrame({"A": [np.nan, "bar", "bah", "foo", "bar"]}) df = DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) + with tm.raises_chained_assignment_error(): + df["A"].iloc[0] = np.nan if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["A"].iloc[0] = np.nan expected = DataFrame({"A": ["foo", "bar", "bah", "foo", "bar"]}) else: - df["A"].iloc[0] = np.nan expected = DataFrame({"A": [np.nan, "bar", "bah", "foo", "bar"]}) result = df.head() tm.assert_frame_equal(result, expected) df = DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df.A.iloc[0] = np.nan - else: + with tm.raises_chained_assignment_error(): df.A.iloc[0] = np.nan result = df.head() tm.assert_frame_equal(result, expected) @@ -211,20 +202,18 @@ df_original = df.copy() assert df._is_copy is None + with tm.raises_chained_assignment_error(): + df["A"][0] = -5 + with tm.raises_chained_assignment_error(): + df["A"][1] = -6 if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["A"][0] = -5 - with tm.raises_chained_assignment_error(): - df["A"][1] = -6 tm.assert_frame_equal(df, df_original) else: - df["A"][0] = -5 - df["A"][1] = -6 tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow def test_detect_chained_assignment_raises( - self, using_array_manager, using_copy_on_write + self, using_array_manager, using_copy_on_write, warn_copy_on_write ): # test with the chaining df = DataFrame( @@ -242,12 +231,19 @@ with tm.raises_chained_assignment_error(): df["A"][1] = -6 tm.assert_frame_equal(df, df_original) + elif warn_copy_on_write: + with tm.raises_chained_assignment_error(): + df["A"][0] = -5 + with tm.raises_chained_assignment_error(): + df["A"][1] = np.nan elif not using_array_manager: with pytest.raises(SettingWithCopyError, match=msg): - df["A"][0] = -5 + with tm.raises_chained_assignment_error(): + df["A"][0] = -5 with pytest.raises(SettingWithCopyError, match=msg): - df["A"][1] = np.nan + with tm.raises_chained_assignment_error(): + df["A"][1] = np.nan assert df["A"]._is_copy is None else: @@ -260,7 +256,9 @@ tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment_fails(self, using_copy_on_write): + def test_detect_chained_assignment_fails( + self, using_copy_on_write, warn_copy_on_write + ): # Using a copy (the chain), fails df = DataFrame( { @@ -269,7 +267,7 @@ } 
) - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: with tm.raises_chained_assignment_error(): df.loc[0]["A"] = -5 else: @@ -277,7 +275,9 @@ df.loc[0]["A"] = -5 @pytest.mark.arm_slow - def test_detect_chained_assignment_doc_example(self, using_copy_on_write): + def test_detect_chained_assignment_doc_example( + self, using_copy_on_write, warn_copy_on_write + ): # Doc example df = DataFrame( { @@ -287,24 +287,25 @@ ) assert df._is_copy is None - if using_copy_on_write: - indexer = df.a.str.startswith("o") + indexer = df.a.str.startswith("o") + if using_copy_on_write or warn_copy_on_write: with tm.raises_chained_assignment_error(): df[indexer]["c"] = 42 else: with pytest.raises(SettingWithCopyError, match=msg): - indexer = df.a.str.startswith("o") df[indexer]["c"] = 42 @pytest.mark.arm_slow def test_detect_chained_assignment_object_dtype( - self, using_array_manager, using_copy_on_write + self, using_array_manager, using_copy_on_write, warn_copy_on_write ): expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) - df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) + df = DataFrame( + {"A": Series(["aaa", "bbb", "ccc"], dtype=object), "B": [1, 2, 3]} + ) df_original = df.copy() - if not using_copy_on_write: + if not using_copy_on_write and not warn_copy_on_write: with pytest.raises(SettingWithCopyError, match=msg): df.loc[0]["A"] = 111 @@ -312,9 +313,14 @@ with tm.raises_chained_assignment_error(): df["A"][0] = 111 tm.assert_frame_equal(df, df_original) + elif warn_copy_on_write: + with tm.raises_chained_assignment_error(): + df["A"][0] = 111 + tm.assert_frame_equal(df, expected) elif not using_array_manager: with pytest.raises(SettingWithCopyError, match=msg): - df["A"][0] = 111 + with tm.raises_chained_assignment_error(): + df["A"][0] = 111 df.loc[0, "A"] = 111 tm.assert_frame_equal(df, expected) @@ -367,8 +373,10 @@ df["letters"] = df["letters"].apply(str.lower) @pytest.mark.arm_slow - def test_detect_chained_assignment_implicit_take2(self, using_copy_on_write): - if using_copy_on_write: + def test_detect_chained_assignment_implicit_take2( + self, using_copy_on_write, warn_copy_on_write + ): + if using_copy_on_write or warn_copy_on_write: pytest.skip("_is_copy is not always set for CoW") # Implicitly take 2 df = random_text(100000) @@ -422,7 +430,9 @@ str(df) @pytest.mark.arm_slow - def test_detect_chained_assignment_undefined_column(self, using_copy_on_write): + def test_detect_chained_assignment_undefined_column( + self, using_copy_on_write, warn_copy_on_write + ): # from SO: # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc df = DataFrame(np.arange(0, 9), columns=["count"]) @@ -433,13 +443,17 @@ with tm.raises_chained_assignment_error(): df.iloc[0:5]["group"] = "a" tm.assert_frame_equal(df, df_original) + elif warn_copy_on_write: + with tm.raises_chained_assignment_error(): + df.iloc[0:5]["group"] = "a" else: with pytest.raises(SettingWithCopyError, match=msg): - df.iloc[0:5]["group"] = "a" + with tm.raises_chained_assignment_error(): + df.iloc[0:5]["group"] = "a" @pytest.mark.arm_slow def test_detect_chained_assignment_changing_dtype( - self, using_array_manager, using_copy_on_write + self, using_array_manager, using_copy_on_write, warn_copy_on_write ): # Mixed type setting but same dtype & changing dtype df = DataFrame( @@ -452,16 +466,19 @@ ) df_original = df.copy() - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: with tm.raises_chained_assignment_error(): 
df.loc[2]["D"] = "foo" with tm.raises_chained_assignment_error(): df.loc[2]["C"] = "foo" + tm.assert_frame_equal(df, df_original) with tm.raises_chained_assignment_error(extra_warnings=(FutureWarning,)): df["C"][2] = "foo" - tm.assert_frame_equal(df, df_original) - - if not using_copy_on_write: + if using_copy_on_write: + tm.assert_frame_equal(df, df_original) + else: + assert df.loc[2, "C"] == "foo" + else: with pytest.raises(SettingWithCopyError, match=msg): df.loc[2]["D"] = "foo" @@ -470,14 +487,15 @@ if not using_array_manager: with pytest.raises(SettingWithCopyError, match=msg): - df["C"][2] = "foo" + with tm.raises_chained_assignment_error(): + df["C"][2] = "foo" else: # INFO(ArrayManager) for ArrayManager it doesn't matter if it's # changing the dtype or not df["C"][2] = "foo" assert df.loc[2, "C"] == "foo" - def test_setting_with_copy_bug(self, using_copy_on_write): + def test_setting_with_copy_bug(self, using_copy_on_write, warn_copy_on_write): # operating on a copy df = DataFrame( {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} @@ -489,6 +507,9 @@ with tm.raises_chained_assignment_error(): df[["c"]][mask] = df[["b"]][mask] tm.assert_frame_equal(df, df_original) + elif warn_copy_on_write: + with tm.raises_chained_assignment_error(): + df[["c"]][mask] = df[["b"]][mask] else: with pytest.raises(SettingWithCopyError, match=msg): df[["c"]][mask] = df[["b"]][mask] @@ -502,9 +523,11 @@ # this should not raise df2["y"] = ["g", "h", "i"] - def test_detect_chained_assignment_warnings_errors(self, using_copy_on_write): + def test_detect_chained_assignment_warnings_errors( + self, using_copy_on_write, warn_copy_on_write + ): df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: with tm.raises_chained_assignment_error(): df.loc[0]["A"] = 111 return @@ -519,21 +542,20 @@ @pytest.mark.parametrize("rhs", [3, DataFrame({0: [1, 2, 3, 4]})]) def test_detect_chained_assignment_warning_stacklevel( - self, rhs, using_copy_on_write + self, rhs, using_copy_on_write, warn_copy_on_write ): # GH#42570 df = DataFrame(np.arange(25).reshape(5, 5)) df_original = df.copy() chained = df.loc[:3] with option_context("chained_assignment", "warn"): - if not using_copy_on_write: + if not using_copy_on_write and not warn_copy_on_write: with tm.assert_produces_warning(SettingWithCopyWarning) as t: chained[2] = rhs assert t[0].filename == __file__ else: # INFO(CoW) no warning, and original dataframe not changed - with tm.assert_produces_warning(None): - chained[2] = rhs + chained[2] = rhs tm.assert_frame_equal(df, df_original) # TODO(ArrayManager) fast_xs with array-like scalars is not yet working @@ -557,7 +579,10 @@ def test_cache_updating(self): # GH 4939, make sure to update the cache on setitem - df = tm.makeDataFrame() + df = DataFrame( + np.zeros((10, 4)), + columns=Index(list("ABCD"), dtype=object), + ) df["A"] # cache series df.loc["Hello Friend"] = df.iloc[0] assert "Hello Friend" in df["A"].index @@ -599,19 +624,13 @@ ck = [True] * len(df) - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["bb"].iloc[0] = 0.13 - else: + with tm.raises_chained_assignment_error(): df["bb"].iloc[0] = 0.13 # GH#3970 this lookup used to break the chained setting to 0.15 df.iloc[ck] - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["bb"].iloc[0] = 0.15 - else: + with tm.raises_chained_assignment_error(): df["bb"].iloc[0] = 0.15 if not using_copy_on_write: @@ -619,13 +638,10 @@ 
else: assert df["bb"].iloc[0] == 2.2 - def test_getitem_loc_assignment_slice_state(self, using_copy_on_write): + def test_getitem_loc_assignment_slice_state(self): # GH 13569 df = DataFrame({"a": [10, 20, 30]}) - if using_copy_on_write: - with tm.raises_chained_assignment_error(): - df["a"].loc[4] = 40 - else: + with tm.raises_chained_assignment_error(): df["a"].loc[4] = 40 tm.assert_frame_equal(df, DataFrame({"a": [10, 20, 30]})) tm.assert_series_equal(df["a"], Series([10, 20, 30], name="a")) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/test_coercion.py pandas-2.2.2+dfsg/pandas/tests/indexing/test_coercion.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/test_coercion.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/test_coercion.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,10 +9,13 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import ( IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas as pd import pandas._testing as tm @@ -111,7 +114,7 @@ "val,exp_dtype", [("x", object), (5, IndexError), (1.1, object)] ) def test_setitem_index_object(self, val, exp_dtype): - obj = pd.Series([1, 2, 3, 4], index=list("abcd")) + obj = pd.Series([1, 2, 3, 4], index=pd.Index(list("abcd"), dtype=object)) assert obj.index.dtype == object if exp_dtype is IndexError: @@ -122,7 +125,7 @@ with tm.assert_produces_warning(FutureWarning, match=warn_msg): temp[5] = 5 else: - exp_index = pd.Index(list("abcd") + [val]) + exp_index = pd.Index(list("abcd") + [val], dtype=object) self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) @pytest.mark.parametrize( @@ -195,10 +198,10 @@ ], ) def test_insert_index_object(self, insert, coerced_val, coerced_dtype): - obj = pd.Index(list("abcd")) + obj = pd.Index(list("abcd"), dtype=object) assert obj.dtype == object - exp = pd.Index(["a", coerced_val, "b", "c", "d"]) + exp = pd.Index(["a", coerced_val, "b", "c", "d"], dtype=object) self._assert_insert_conversion(obj, insert, exp, coerced_dtype) @pytest.mark.parametrize( @@ -224,6 +227,8 @@ "insert, coerced_val, coerced_dtype", [ (1, 1.0, None), + # When float_numpy_dtype=float32, this is not the case + # see the correction below (1.1, 1.1, np.float64), (False, False, object), # GH#36319 ("x", "x", object), @@ -236,6 +241,10 @@ obj = pd.Index([1.0, 2.0, 3.0, 4.0], dtype=dtype) coerced_dtype = coerced_dtype if coerced_dtype is not None else dtype + if np_version_gt2 and dtype == "float32" and coerced_val == 1.1: + # Hack, in the 2nd test case, since 1.1 can be losslessly cast to float32 + # the expected dtype will be float32 if the original dtype was float32 + coerced_dtype = np.float32 exp = pd.Index([1.0, coerced_val, 2.0, 3.0, 4.0], dtype=coerced_dtype) self._assert_insert_conversion(obj, insert, exp, coerced_dtype) @@ -254,13 +263,13 @@ def test_insert_index_datetimes(self, fill_val, exp_dtype, insert_value): obj = pd.DatetimeIndex( ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], tz=fill_val.tz - ) + ).as_unit("ns") assert obj.dtype == exp_dtype exp = pd.DatetimeIndex( ["2011-01-01", fill_val.date(), "2011-01-02", "2011-01-03", "2011-01-04"], tz=fill_val.tz, - ) + ).as_unit("ns") self._assert_insert_conversion(obj, fill_val, exp, exp_dtype) if fill_val.tz: @@ -397,7 +406,7 @@ ) def test_where_object(self, index_or_series, fill_val, exp_dtype): klass = index_or_series - obj = klass(list("abcd")) + obj = klass(list("abcd"), dtype=object) assert obj.dtype == object 
self._run_test(obj, fill_val, klass, exp_dtype) @@ -442,8 +451,8 @@ "fill_val,exp_dtype", [(1, object), (1.1, object), (1 + 1j, object), (True, np.bool_)], ) - def test_where_series_bool(self, fill_val, exp_dtype): - klass = pd.Series # TODO: use index_or_series once we have Index[bool] + def test_where_series_bool(self, index_or_series, fill_val, exp_dtype): + klass = index_or_series obj = klass([True, False, True, False]) assert obj.dtype == np.bool_ @@ -559,10 +568,10 @@ ) def test_fillna_object(self, index_or_series, fill_val, fill_dtype): klass = index_or_series - obj = klass(["a", np.nan, "c", "d"]) + obj = klass(["a", np.nan, "c", "d"], dtype=object) assert obj.dtype == object - exp = klass(["a", fill_val, "c", "d"]) + exp = klass(["a", fill_val, "c", "d"], dtype=object) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) @pytest.mark.parametrize( @@ -824,6 +833,8 @@ raise ValueError return replacer + # Expected needs adjustment for the infer string option, seems to work as expecetd + @pytest.mark.skipif(using_pyarrow_string_dtype(), reason="TODO: test is to complex") def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") @@ -836,8 +847,6 @@ # tested below return - result = obj.replace(replacer) - if (from_key == "float64" and to_key in ("int64")) or ( from_key == "complex128" and to_key in ("int64", "float64") ): @@ -851,6 +860,17 @@ exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key + msg = "Downcasting behavior in `replace`" + warn = FutureWarning + if ( + exp.dtype == obj.dtype + or exp.dtype == object + or (exp.dtype.kind in "iufc" and obj.dtype.kind in "iufc") + ): + warn = None + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) + tm.assert_series_equal(result, exp) @pytest.mark.parametrize( @@ -861,15 +881,23 @@ @pytest.mark.parametrize( "from_key", ["datetime64[ns, UTC]", "datetime64[ns, US/Eastern]"], indirect=True ) - def test_replace_series_datetime_tz(self, how, to_key, from_key, replacer): + def test_replace_series_datetime_tz( + self, how, to_key, from_key, replacer, using_infer_string + ): index = pd.Index([3, 4], name="xyz") obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - result = obj.replace(replacer) - exp = pd.Series(self.rep[to_key], index=index, name="yyy") - assert exp.dtype == to_key + if using_infer_string and to_key == "object": + assert exp.dtype == "string" + else: + assert exp.dtype == to_key + + msg = "Downcasting behavior in `replace`" + warn = FutureWarning if exp.dtype != object else None + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) tm.assert_series_equal(result, exp) @@ -888,16 +916,22 @@ obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - result = obj.replace(replacer) - exp = pd.Series(self.rep[to_key], index=index, name="yyy") + warn = FutureWarning if isinstance(obj.dtype, pd.DatetimeTZDtype) and isinstance( exp.dtype, pd.DatetimeTZDtype ): # with mismatched tzs, we retain the original dtype as of 2.0 exp = exp.astype(obj.dtype) + warn = None else: assert exp.dtype == to_key + if to_key == from_key: + warn = None + + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) tm.assert_series_equal(result, exp) diff -Nru 
pandas-2.1.4+dfsg/pandas/tests/indexing/test_datetime.py pandas-2.2.2+dfsg/pandas/tests/indexing/test_datetime.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/test_datetime.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/test_datetime.py 2024-04-10 17:42:52.000000000 +0000 @@ -57,7 +57,10 @@ df = DataFrame({"a": date_range("2014-01-01", periods=10, tz="UTC")}) result = df.iloc[5] expected = Series( - [Timestamp("2014-01-06 00:00:00+0000", tz="UTC")], index=["a"], name=5 + [Timestamp("2014-01-06 00:00:00+0000", tz="UTC")], + index=["a"], + name=5, + dtype="M8[ns, UTC]", ) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/test_floats.py pandas-2.2.2+dfsg/pandas/tests/indexing/test_floats.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/test_floats.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/test_floats.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,6 +6,9 @@ Index, RangeIndex, Series, + date_range, + period_range, + timedelta_range, ) import pandas._testing as tm @@ -39,22 +42,21 @@ tm.assert_almost_equal(result, expected) @pytest.mark.parametrize( - "index_func", + "index", [ - tm.makeStringIndex, - tm.makeCategoricalIndex, - tm.makeDateIndex, - tm.makeTimedeltaIndex, - tm.makePeriodIndex, + Index(list("abcde")), + Index(list("abcde"), dtype="category"), + date_range("2020-01-01", periods=5), + timedelta_range("1 day", periods=5), + period_range("2020-01-01", periods=5), ], ) - def test_scalar_non_numeric(self, index_func, frame_or_series, indexer_sl): + def test_scalar_non_numeric(self, index, frame_or_series, indexer_sl): # GH 4892 # float_indexers should raise exceptions # on appropriate Index types & accessors - i = index_func(5) - s = gen_obj(frame_or_series, i) + s = gen_obj(frame_or_series, index) # getting with pytest.raises(KeyError, match="^3.0$"): @@ -75,19 +77,18 @@ assert 3.0 not in s2.axes[-1] @pytest.mark.parametrize( - "index_func", + "index", [ - tm.makeStringIndex, - tm.makeCategoricalIndex, - tm.makeDateIndex, - tm.makeTimedeltaIndex, - tm.makePeriodIndex, + Index(list("abcde")), + Index(list("abcde"), dtype="category"), + date_range("2020-01-01", periods=5), + timedelta_range("1 day", periods=5), + period_range("2020-01-01", periods=5), ], ) - def test_scalar_non_numeric_series_fallback(self, index_func): + def test_scalar_non_numeric_series_fallback(self, index): # fallsback to position selection, series only - i = index_func(5) - s = Series(np.arange(len(i)), index=i) + s = Series(np.arange(len(index)), index=index) msg = "Series.__getitem__ treating keys as positions is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -131,14 +132,16 @@ expected = 3 assert result == expected - @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) - def test_scalar_integer(self, index_func, frame_or_series, indexer_sl): + @pytest.mark.parametrize( + "index", [Index(np.arange(5), dtype=np.int64), RangeIndex(5)] + ) + def test_scalar_integer(self, index, frame_or_series, indexer_sl): getitem = indexer_sl is not tm.loc # test how scalar float indexers work on int indexes # integer index - i = index_func(5) + i = index obj = gen_obj(frame_or_series, i) # coerce to equal int @@ -168,11 +171,12 @@ result = indexer_sl(s2)[3] compare(result, expected) - @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) - def test_scalar_integer_contains_float(self, index_func, frame_or_series): + @pytest.mark.parametrize( + 
"index", [Index(np.arange(5), dtype=np.int64), RangeIndex(5)] + ) + def test_scalar_integer_contains_float(self, index, frame_or_series): # contains # integer index - index = index_func(5) obj = gen_obj(frame_or_series, index) # coerce to equal int @@ -214,21 +218,20 @@ self.check(result, s, 3, False) @pytest.mark.parametrize( - "index_func", + "index", [ - tm.makeStringIndex, - tm.makeDateIndex, - tm.makeTimedeltaIndex, - tm.makePeriodIndex, + Index(list("abcde"), dtype=object), + date_range("2020-01-01", periods=5), + timedelta_range("1 day", periods=5), + period_range("2020-01-01", periods=5), ], ) @pytest.mark.parametrize("idx", [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]) - def test_slice_non_numeric(self, index_func, idx, frame_or_series, indexer_sli): + def test_slice_non_numeric(self, index, idx, frame_or_series, indexer_sli): # GH 4892 # float_indexers should raise exceptions # on appropriate Index types & accessors - index = index_func(5) s = gen_obj(frame_or_series, index) # getitem @@ -348,11 +351,11 @@ with pytest.raises(TypeError, match=msg): s.iloc[idx] - @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) - def test_slice_integer_frame_getitem(self, index_func): + @pytest.mark.parametrize( + "index", [Index(np.arange(5), dtype=np.int64), RangeIndex(5)] + ) + def test_slice_integer_frame_getitem(self, index): # similar to above, but on the getitem dim (of a DataFrame) - index = index_func(5) - s = DataFrame(np.random.default_rng(2).standard_normal((5, 2)), index=index) # getitem @@ -403,11 +406,11 @@ s[idx] @pytest.mark.parametrize("idx", [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]) - @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) - def test_float_slice_getitem_with_integer_index_raises(self, idx, index_func): + @pytest.mark.parametrize( + "index", [Index(np.arange(5), dtype=np.int64), RangeIndex(5)] + ) + def test_float_slice_getitem_with_integer_index_raises(self, idx, index): # similar to above, but on the getitem dim (of a DataFrame) - index = index_func(5) - s = DataFrame(np.random.default_rng(2).standard_normal((5, 2)), index=index) # setitem diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/test_iat.py pandas-2.2.2+dfsg/pandas/tests/indexing/test_iat.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/test_iat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/test_iat.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,7 @@ Series, period_range, ) +import pandas._testing as tm def test_iat(float_frame): @@ -30,7 +31,9 @@ assert expected == result -def test_iat_setitem_item_cache_cleared(indexer_ial, using_copy_on_write): +def test_iat_setitem_item_cache_cleared( + indexer_ial, using_copy_on_write, warn_copy_on_write +): # GH#45684 data = {"x": np.arange(8, dtype=np.int64), "y": np.int64(0)} df = DataFrame(data).copy() @@ -38,9 +41,11 @@ # previously this iat setting would split the block and fail to clear # the item_cache. 
- indexer_ial(df)[7, 0] = 9999 + with tm.assert_cow_warning(warn_copy_on_write): + indexer_ial(df)[7, 0] = 9999 - indexer_ial(df)[7, 1] = 1234 + with tm.assert_cow_warning(warn_copy_on_write): + indexer_ial(df)[7, 1] = 1234 assert df.iat[7, 1] == 1234 if not using_copy_on_write: diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/test_iloc.py pandas-2.2.2+dfsg/pandas/tests/indexing/test_iloc.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/test_iloc.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/test_iloc.py 2024-04-10 17:42:52.000000000 +0000 @@ -100,9 +100,8 @@ # we retain the object dtype. frame = DataFrame({0: np.array([0, 1, 2], dtype=object), 1: range(3)}) df = frame.copy() - orig_vals = df.values indexer(df)[key, 0] = cat - expected = DataFrame({0: cat.astype(object), 1: range(3)}) + expected = DataFrame({0: Series(cat.astype(object), dtype=object), 1: range(3)}) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("box", [array, Series]) @@ -229,17 +228,15 @@ tm.assert_series_equal(result, expected) # doc example - def check(result, expected): - str(result) - result.dtypes - tm.assert_frame_equal(result, expected) - dfl = DataFrame( np.random.default_rng(2).standard_normal((5, 2)), columns=list("AB") ) - check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index, columns=[])) - check(dfl.iloc[:, 1:3], dfl.iloc[:, [1]]) - check(dfl.iloc[4:6], dfl.iloc[[4]]) + tm.assert_frame_equal( + dfl.iloc[:, 2:3], + DataFrame(index=dfl.index, columns=Index([], dtype=dfl.columns.dtype)), + ) + tm.assert_frame_equal(dfl.iloc[:, 1:3], dfl.iloc[:, [1]]) + tm.assert_frame_equal(dfl.iloc[4:6], dfl.iloc[[4]]) msg = "positional indexers are out-of-bounds" with pytest.raises(IndexError, match=msg): @@ -429,7 +426,7 @@ tm.assert_frame_equal(df.iloc[10:, :2], df2) tm.assert_frame_equal(df.iloc[10:, 2:], df1) - def test_iloc_setitem(self): + def test_iloc_setitem(self, warn_copy_on_write): df = DataFrame( np.random.default_rng(2).standard_normal((4, 4)), index=np.arange(0, 8, 2), @@ -454,12 +451,16 @@ def test_iloc_setitem_axis_argument(self): # GH45032 df = DataFrame([[6, "c", 10], [7, "d", 11], [8, "e", 12]]) + df[1] = df[1].astype(object) expected = DataFrame([[6, "c", 10], [7, "d", 11], [5, 5, 5]]) + expected[1] = expected[1].astype(object) df.iloc(axis=0)[2] = 5 tm.assert_frame_equal(df, expected) df = DataFrame([[6, "c", 10], [7, "d", 11], [8, "e", 12]]) + df[1] = df[1].astype(object) expected = DataFrame([[6, "c", 5], [7, "d", 5], [8, "e", 5]]) + expected[1] = expected[1].astype(object) df.iloc(axis=1)[2] = 5 tm.assert_frame_equal(df, expected) @@ -534,7 +535,8 @@ # if the assigned values cannot be held by existing integer arrays, # we cast - df.iloc[:, 0] = df.iloc[:, 0] + 0.5 + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.iloc[:, 0] = df.iloc[:, 0] + 0.5 if not using_array_manager: assert len(df._mgr.blocks) == 2 @@ -618,7 +620,7 @@ assert result == exp # out-of-bounds exception - msg = "index 5 is out of bounds for axis 0 with size 4" + msg = "index 5 is out of bounds for axis 0 with size 4|index out of bounds" with pytest.raises(IndexError, match=msg): df.iloc[10, 5] @@ -644,8 +646,6 @@ df.describe() result = df.iloc[3:5, 0:2] - str(result) - result.dtypes expected = DataFrame(arr[3:5, 0:2], index=index[3:5], columns=columns[0:2]) tm.assert_frame_equal(result, expected) @@ -653,8 +653,6 @@ # for dups df.columns = list("aaaa") result = df.iloc[3:5, 0:2] - str(result) - result.dtypes expected = DataFrame(arr[3:5, 0:2], 
index=index[3:5], columns=list("aa")) tm.assert_frame_equal(result, expected) @@ -668,8 +666,6 @@ if not using_array_manager: df._mgr.blocks[0].mgr_locs result = df.iloc[1:5, 2:4] - str(result) - result.dtypes expected = DataFrame(arr[1:5, 2:4], index=index[1:5], columns=columns[2:4]) tm.assert_frame_equal(result, expected) @@ -795,8 +791,8 @@ else: accessor = df answer = str(bin(accessor[mask]["nums"].sum())) - except (ValueError, IndexingError, NotImplementedError) as e: - answer = str(e) + except (ValueError, IndexingError, NotImplementedError) as err: + answer = str(err) key = ( idx, @@ -826,7 +822,11 @@ df2.loc[idx] def test_iloc_empty_list_indexer_is_ok(self): - df = tm.makeCustomDataframe(5, 2) + df = DataFrame( + np.ones((5, 2)), + index=Index([f"i-{i}" for i in range(5)], name="a"), + columns=Index([f"i-{i}" for i in range(2)], name="a"), + ) # vertical empty tm.assert_frame_equal( df.iloc[:, []], @@ -846,7 +846,9 @@ df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self, using_copy_on_write): + def test_identity_slice_returns_new_object( + self, using_copy_on_write, warn_copy_on_write + ): # GH13873 original_df = DataFrame({"a": [1, 2, 3]}) sliced_df = original_df.iloc[:] @@ -857,7 +859,8 @@ # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig # depending on CoW - original_df.loc[:, "a"] = [4, 4, 4] + with tm.assert_cow_warning(warn_copy_on_write): + original_df.loc[:, "a"] = [4, 4, 4] if using_copy_on_write: assert (sliced_df["a"] == [1, 2, 3]).all() else: @@ -868,7 +871,8 @@ assert sliced_series is not original_series # should also be a shallow copy - original_series[:3] = [7, 8, 9] + with tm.assert_cow_warning(warn_copy_on_write): + original_series[:3] = [7, 8, 9] if using_copy_on_write: # shallow copy not updated (CoW) assert all(sliced_series[:3] == [1, 2, 3]) @@ -1232,7 +1236,9 @@ class TestILocErrors: # NB: this test should work for _any_ Series we can pass as # series_with_simple_index - def test_iloc_float_raises(self, series_with_simple_index, frame_or_series): + def test_iloc_float_raises( + self, series_with_simple_index, frame_or_series, warn_copy_on_write + ): # GH#4892 # float_indexers should raise exceptions # on appropriate Index types & accessors @@ -1249,7 +1255,10 @@ obj.iloc[3.0] with pytest.raises(IndexError, match=_slice_iloc_msg): - obj.iloc[3.0] = 0 + with tm.assert_cow_warning( + warn_copy_on_write and frame_or_series is DataFrame + ): + obj.iloc[3.0] = 0 def test_iloc_getitem_setitem_fancy_exceptions(self, float_frame): with pytest.raises(IndexingError, match="Too many indexers"): @@ -1311,7 +1320,9 @@ self, dtypes, init_value, expected_value ): # GH#22035 - df = DataFrame([[init_value, "str", "str2"]], columns=["a", "b", "b"]) + df = DataFrame( + [[init_value, "str", "str2"]], columns=["a", "b", "b"], dtype=object + ) # with the enforcement of GH#45333 in 2.0, this sets values inplace, # so we retain object dtype @@ -1358,7 +1369,10 @@ def test_frame_iloc_setitem_callable(self): # GH#11485 - df = DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + df = DataFrame( + {"X": [1, 2, 3, 4], "Y": Series(list("aabb"), dtype=object)}, + index=list("ABCD"), + ) # return location res = df.copy() @@ -1412,7 +1426,7 @@ class TestILocSeries: - def test_iloc(self, using_copy_on_write): + def test_iloc(self, using_copy_on_write, warn_copy_on_write): ser = Series( np.random.default_rng(2).standard_normal(10), index=list(range(0, 20, 2)) ) @@ -1431,7 +1445,8 
@@ # test slice is a view with tm.assert_produces_warning(None): # GH#45324 make sure we aren't giving a spurious FutureWarning - result[:] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + result[:] = 0 if using_copy_on_write: tm.assert_series_equal(ser, ser_original) else: @@ -1457,6 +1472,7 @@ def test_iloc_nullable_int64_size_1_nan(self): # GH 31861 result = DataFrame({"a": ["test"], "b": [np.nan]}) - result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") expected = DataFrame({"a": ["test"], "b": array([NA], dtype="Int64")}) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/test_indexing.py pandas-2.2.2+dfsg/pandas/tests/indexing/test_indexing.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/test_indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/test_indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -189,7 +191,7 @@ ): df.loc[0, "c"] = "foo" expected = DataFrame( - [{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}] + {"a": [1, 3], "b": [np.nan, 2], "c": Series(["foo", np.nan], dtype=object)} ) tm.assert_frame_equal(df, expected) @@ -241,8 +243,7 @@ def test_dups_fancy_indexing(self): # GH 3455 - df = tm.makeCustomDataframe(10, 3) - df.columns = ["a", "a", "b"] + df = DataFrame(np.eye(3), columns=["a", "a", "b"]) result = df[["b", "a"]].columns expected = Index(["b", "a", "a"]) tm.assert_index_equal(result, expected) @@ -250,8 +251,6 @@ def test_dups_fancy_indexing_across_dtypes(self): # across dtypes df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa")) - df.head() - str(df) result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]]) result.columns = list("aaaaaaa") # GH#3468 @@ -286,18 +285,27 @@ with pytest.raises(KeyError, match="not in index"): df.loc[rows] - def test_dups_fancy_indexing_only_missing_label(self): + def test_dups_fancy_indexing_only_missing_label(self, using_infer_string): # List containing only missing label dfnu = DataFrame( np.random.default_rng(2).standard_normal((5, 3)), index=list("AABCD") ) - with pytest.raises( - KeyError, - match=re.escape( - "\"None of [Index(['E'], dtype='object')] are in the [index]\"" - ), - ): - dfnu.loc[["E"]] + if using_infer_string: + with pytest.raises( + KeyError, + match=re.escape( + "\"None of [Index(['E'], dtype='string')] are in the [index]\"" + ), + ): + dfnu.loc[["E"]] + else: + with pytest.raises( + KeyError, + match=re.escape( + "\"None of [Index(['E'], dtype='object')] are in the [index]\"" + ), + ): + dfnu.loc[["E"]] @pytest.mark.parametrize("vals", [[0, 1, 2], list("abc")]) def test_dups_fancy_indexing_missing_label(self, vals): @@ -453,6 +461,9 @@ ) tm.assert_frame_equal(result, df) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't multiply arrow strings" + ) def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df # set float64 to avoid upcast when setting nan @@ -517,7 +528,7 @@ for col in ["A", "B"]: expected.loc[mask, col] = df["D"] - df.loc[df["A"] == 0, ["A", "B"]] = df["D"] + df.loc[df["A"] == 0, ["A", "B"]] = df["D"].copy() tm.assert_frame_equal(df, expected) def test_setitem_list(self): @@ -555,7 +566,7 @@ with pytest.raises(KeyError, 
match="^0$"): df.loc["2011", 0] - def test_astype_assignment(self): + def test_astype_assignment(self, using_infer_string): # GH4312 (iloc) df_orig = DataFrame( [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") @@ -569,8 +580,9 @@ expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - expected["A"] = expected["A"].astype(object) - expected["B"] = expected["B"].astype(object) + if not using_infer_string: + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) # GH5702 (loc) @@ -579,7 +591,8 @@ expected = DataFrame( [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - expected["A"] = expected["A"].astype(object) + if not using_infer_string: + expected["A"] = expected["A"].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() @@ -587,8 +600,9 @@ expected = DataFrame( [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - expected["B"] = expected["B"].astype(object) - expected["C"] = expected["C"].astype(object) + if not using_infer_string: + expected["B"] = expected["B"].astype(object) + expected["C"] = expected["C"].astype(object) tm.assert_frame_equal(df, expected) def test_astype_assignment_full_replacements(self): @@ -675,6 +689,7 @@ df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") def test_rhs_alignment(self): # GH8258, tests that both rows & columns are aligned to what is # assigned to. covers both uniform data-type & multi-type cases diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/test_loc.py pandas-2.2.2+dfsg/pandas/tests/indexing/test_loc.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/test_loc.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/test_loc.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,6 +12,10 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + +from pandas._libs import index as libindex +from pandas.compat.numpy import np_version_gt2 from pandas.errors import IndexingError import pandas.util._test_decorators as td @@ -260,19 +264,19 @@ @pytest.mark.parametrize( "msg, key", [ - (r"Period\('2019', 'A-DEC'\), 'foo', 'bar'", (Period(2019), "foo", "bar")), - (r"Period\('2019', 'A-DEC'\), 'y1', 'bar'", (Period(2019), "y1", "bar")), - (r"Period\('2019', 'A-DEC'\), 'foo', 'z1'", (Period(2019), "foo", "z1")), + (r"Period\('2019', 'Y-DEC'\), 'foo', 'bar'", (Period(2019), "foo", "bar")), + (r"Period\('2019', 'Y-DEC'\), 'y1', 'bar'", (Period(2019), "y1", "bar")), + (r"Period\('2019', 'Y-DEC'\), 'foo', 'z1'", (Period(2019), "foo", "z1")), ( - r"Period\('2018', 'A-DEC'\), Period\('2016', 'A-DEC'\), 'bar'", + r"Period\('2018', 'Y-DEC'\), Period\('2016', 'Y-DEC'\), 'bar'", (Period(2018), Period(2016), "bar"), ), - (r"Period\('2018', 'A-DEC'\), 'foo', 'y1'", (Period(2018), "foo", "y1")), + (r"Period\('2018', 'Y-DEC'\), 'foo', 'y1'", (Period(2018), "foo", "y1")), ( - r"Period\('2017', 'A-DEC'\), 'foo', Period\('2015', 'A-DEC'\)", + r"Period\('2017', 'Y-DEC'\), 'foo', Period\('2015', 'Y-DEC'\)", (Period(2017), "foo", Period(2015)), ), - (r"Period\('2017', 'A-DEC'\), 'z1', 'bar'", (Period(2017), "z1", "bar")), + (r"Period\('2017', 'Y-DEC'\), 'z1', 'bar'", (Period(2017), "z1", "bar")), ], ) def test_contains_raise_error_if_period_index_is_in_multi_index(self, msg, key): @@ -468,7 +472,7 @@ msg = r"\"None of \[Index\(\['4'\], dtype='object'\)\] are in the 
\[index\]\"" with pytest.raises(KeyError, match=msg): - s.loc[["4"]] + s.loc[Index(["4"], dtype=object)] s.loc[-1] = 3 with pytest.raises(KeyError, match="not in index"): @@ -580,7 +584,8 @@ } ) df = frame_for_consistency.copy() - df.loc[:, "date"] = val + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = val tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_str(self, frame_for_consistency): @@ -594,7 +599,8 @@ } ) df = frame_for_consistency.copy() - df.loc[:, "date"] = "foo" + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = "foo" tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_dt64_to_float(self, frame_for_consistency): @@ -607,14 +613,16 @@ } ) df = frame_for_consistency.copy() - df.loc[:, "date"] = 1.0 + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = 1.0 tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_single_row(self): # GH 15494 # setting on frame with single row df = DataFrame({"date": Series([Timestamp("20180101")])}) - df.loc[:, "date"] = "string" + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, "date"] = "string" expected = DataFrame({"date": Series(["string"])}) tm.assert_frame_equal(df, expected) @@ -674,9 +682,10 @@ # timedelta64[m] -> float, so this cannot be done inplace, so # no warning - df.loc[:, ("Respondent", "Duration")] = df.loc[ - :, ("Respondent", "Duration") - ] / Timedelta(60_000_000_000) + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + df.loc[:, ("Respondent", "Duration")] = df.loc[ + :, ("Respondent", "Duration") + ] / Timedelta(60_000_000_000) expected = Series( [23.0, 12.0, 14.0, 36.0], index=df.index, name=("Respondent", "Duration") @@ -780,7 +789,9 @@ # is inplace, so that dtype is retained sera = Series(val1, index=keys1, dtype=np.float64) serb = Series(val2, index=keys2) - expected = DataFrame({"A": sera, "B": serb}).reindex(index=index) + expected = DataFrame( + {"A": sera, "B": serb}, columns=Index(["A", "B"], dtype=object) + ).reindex(index=index) tm.assert_frame_equal(df, expected) def test_loc_setitem_frame(self): @@ -827,7 +838,8 @@ df.loc[0, [1, 2]] = [5, 6] tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_multiples(self): + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") + def test_loc_setitem_frame_multiples(self, warn_copy_on_write): # multiple setting df = DataFrame( {"A": ["foo", "bar", "baz"], "B": Series(range(3), dtype=np.int64)} @@ -977,7 +989,7 @@ to_datetime(42).tz_localize("UTC"), to_datetime(666).tz_localize("UTC"), ] - expected = Series(vals, index=["foo", "bar"]) + expected = Series(vals, index=Index(["foo", "bar"], dtype=object)) ser = Series(dtype=object) indexer_sl(ser)["foo"] = vals[0] @@ -1066,7 +1078,11 @@ assert result == "index_name" def test_loc_empty_list_indexer_is_ok(self): - df = tm.makeCustomDataframe(5, 2) + df = DataFrame( + np.ones((5, 2)), + index=Index([f"i-{i}" for i in range(5)], name="a"), + columns=Index([f"i-{i}" for i in range(2)], name="a"), + ) # vertical empty tm.assert_frame_equal( df.loc[:, []], df.iloc[:, :0], check_index_type=True, check_column_type=True @@ -1080,7 +1096,9 @@ df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self, using_copy_on_write): + def test_identity_slice_returns_new_object( + self, 
using_copy_on_write, warn_copy_on_write + ): # GH13873 original_df = DataFrame({"a": [1, 2, 3]}) @@ -1094,7 +1112,8 @@ # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig # depending on CoW - original_df.loc[:, "a"] = [4, 4, 4] + with tm.assert_cow_warning(warn_copy_on_write): + original_df.loc[:, "a"] = [4, 4, 4] if using_copy_on_write: assert (sliced_df["a"] == [1, 2, 3]).all() else: @@ -1102,7 +1121,7 @@ # These should not return copies df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) - if using_copy_on_write: + if using_copy_on_write or warn_copy_on_write: assert df[0] is not df.loc[:, 0] else: assert df[0] is df.loc[:, 0] @@ -1113,7 +1132,8 @@ assert sliced_series is not original_series assert original_series[:] is not original_series - original_series[:3] = [7, 8, 9] + with tm.assert_cow_warning(warn_copy_on_write): + original_series[:3] = [7, 8, 9] if using_copy_on_write: assert all(sliced_series[:3] == [1, 2, 3]) else: @@ -1124,7 +1144,7 @@ if not using_copy_on_write: mark = pytest.mark.xfail(reason="accidental fix reverted - GH37497") - request.node.add_marker(mark) + request.applymarker(mark) x = DataFrame(zip(range(3), range(3)), columns=["a", "b"]) y = x.copy() @@ -1215,13 +1235,7 @@ with pytest.raises(KeyError, match=msg): df.loc[[0, 1], "x"] = data - msg = "|".join( - [ - "cannot copy sequence with size 2 to array axis with dimension 0", - r"could not broadcast input array from shape \(2,\) into shape \(0,\)", - "Must have equal len keys and value when setting with an iterable", - ] - ) + msg = "setting an array element with a sequence." with pytest.raises(ValueError, match=msg): df.loc[0:2, "x"] = data @@ -1248,6 +1262,7 @@ tm.assert_series_equal(result, expected) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string") def test_loc_setitem_str_to_small_float_conversion_type(self): # GH#20388 @@ -1433,7 +1448,7 @@ df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp) - def test_loc_setitem_single_row_categorical(self): + def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) categories = Categorical(df["Alpha"], categories=["a", "b", "c"]) @@ -1443,7 +1458,9 @@ df.loc[:, "Alpha"] = categories result = df["Alpha"] - expected = Series(categories, index=df.index, name="Alpha").astype(object) + expected = Series(categories, index=df.index, name="Alpha").astype( + object if not using_infer_string else "string[pyarrow_numpy]" + ) tm.assert_series_equal(result, expected) # double-check that the non-loc setting retains categoricalness @@ -1464,12 +1481,16 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # GH#11365 tz = tz_naive_fixture - idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz) + idx = date_range(start="2015-07-12", periods=3, freq="h", tz=tz) expected = DataFrame(1.2, index=idx, columns=["var"]) # if result started off with object dtype, then the .loc.__setitem__ # below would retain object dtype result = DataFrame(index=idx, columns=["var"], dtype=np.float64) - result.loc[:, idxer] = expected + with tm.assert_produces_warning( + FutureWarning if idxer == "var" else None, match="incompatible dtype" + ): + # See https://github.com/pandas-dev/pandas/issues/56223 + result.loc[:, idxer] = expected tm.assert_frame_equal(result, expected) def test_loc_setitem_time_key(self, using_array_manager): @@ -1548,16 +1569,10 @@ # float64 dtype to avoid upcast when 
trying to set float data ser = Series(range(2), dtype="float64") - msg = "|".join( - [ - r"shape mismatch: value array of shape \(2,2\)", - r"cannot reshape array of size 4 into shape \(2,\)", - ] - ) + msg = "setting an array element with a sequence." with pytest.raises(ValueError, match=msg): ser.loc[range(2)] = data - msg = r"could not broadcast input array from shape \(2,2\) into shape \(2,?\)" with pytest.raises(ValueError, match=msg): ser.loc[:] = data @@ -1609,7 +1624,7 @@ result = df.loc[IndexType("foo", "bar")]["A"] assert result == 1 - def test_loc_setitem_single_column_mixed(self): + def test_loc_setitem_single_column_mixed(self, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((5, 3)), index=["a", "b", "c", "d", "e"], @@ -1617,7 +1632,10 @@ ) df["str"] = "qux" df.loc[df.index[::2], "str"] = np.nan - expected = np.array([np.nan, "qux", np.nan, "qux", np.nan], dtype=object) + expected = Series( + [np.nan, "qux", np.nan, "qux", np.nan], + dtype=object if not using_infer_string else "string[pyarrow_numpy]", + ).values tm.assert_almost_equal(df["str"].values, expected) def test_loc_setitem_cast2(self): @@ -1657,6 +1675,14 @@ expected = frame_or_series([0, 1, 10, 9, 11], index=obj.index) tm.assert_equal(obj, expected) + def test_loc_setitem_numpy_frame_categorical_value(self): + # GH#52927 + df = DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]}) + df.loc[1:2, "a"] = Categorical([2, 2], categories=[1, 2]) + + expected = DataFrame({"a": [1, 2, 2, 1, 1], "b": ["a", "a", "a", "a", "a"]}) + tm.assert_frame_equal(df, expected) + class TestLocWithEllipsis: @pytest.fixture(params=[tm.loc, tm.iloc]) @@ -1966,12 +1992,14 @@ class TestLocSetitemWithExpansion: - @pytest.mark.slow - def test_loc_setitem_with_expansion_large_dataframe(self): + def test_loc_setitem_with_expansion_large_dataframe(self, monkeypatch): # GH#10692 - result = DataFrame({"x": range(10**6)}, dtype="int64") - result.loc[len(result)] = len(result) + 1 - expected = DataFrame({"x": range(10**6 + 1)}, dtype="int64") + size_cutoff = 50 + with monkeypatch.context(): + monkeypatch.setattr(libindex, "_SIZE_CUTOFF", size_cutoff) + result = DataFrame({"x": range(size_cutoff)}, dtype="int64") + result.loc[size_cutoff] = size_cutoff + expected = DataFrame({"x": range(size_cutoff + 1)}, dtype="int64") tm.assert_frame_equal(result, expected) def test_loc_setitem_empty_series(self): @@ -2000,11 +2028,15 @@ # partially set with an empty object series ser = Series(dtype=object) ser.loc["foo"] = 1 - tm.assert_series_equal(ser, Series([1], index=["foo"])) + tm.assert_series_equal(ser, Series([1], index=Index(["foo"], dtype=object))) ser.loc["bar"] = 3 - tm.assert_series_equal(ser, Series([1, 3], index=["foo", "bar"])) + tm.assert_series_equal( + ser, Series([1, 3], index=Index(["foo", "bar"], dtype=object)) + ) ser.loc[3] = 4 - tm.assert_series_equal(ser, Series([1, 3, 4], index=["foo", "bar", 3])) + tm.assert_series_equal( + ser, Series([1, 3, 4], index=Index(["foo", "bar", 3], dtype=object)) + ) def test_loc_setitem_incremental_with_dst(self): # GH#20724 @@ -2034,7 +2066,11 @@ df.loc[conv(dt1), "one"] = 100 df.loc[conv(dt2), "one"] = 200 - expected = DataFrame({"one": [100.0, 200.0]}, index=[dt1, dt2]) + expected = DataFrame( + {"one": [100.0, 200.0]}, + index=[dt1, dt2], + columns=Index(["one"], dtype=object), + ) tm.assert_frame_equal(df, expected) def test_loc_setitem_categorical_column_retains_dtype(self, ordered): @@ -2049,7 +2085,7 @@ start = Timestamp("2017-10-29 00:00:00+0200", 
tz="Europe/Madrid") end = Timestamp("2017-10-29 03:00:00+0100", tz="Europe/Madrid") ts = Timestamp("2016-10-10 03:00:00", tz="Europe/Madrid") - idx = date_range(start, end, inclusive="left", freq="H") + idx = date_range(start, end, inclusive="left", freq="h") assert ts not in idx # i.e. result.loc setitem is with-expansion result = DataFrame(index=idx, columns=["value"]) @@ -2152,11 +2188,24 @@ result = DataFrame(index=df.index) result.loc[df.index, "data"] = ser - tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, df, check_column_type=False) result = DataFrame(index=df.index) result.loc[df.index, "data"] = ser._values - tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, df, check_column_type=False) + + def test_loc_setitem_ea_not_full_column(self): + # GH#39163 + df = DataFrame({"A": range(5)}) + + val = date_range("2016-01-01", periods=3, tz="US/Pacific") + + df.loc[[0, 1, 2], "B"] = val + + bex = val.append(DatetimeIndex([pd.NaT, pd.NaT], dtype=val.dtype)) + expected = DataFrame({"A": range(5), "B": bex}) + assert expected.dtypes["B"] == val.dtype + tm.assert_frame_equal(df, expected) class TestLocCallable: @@ -2233,7 +2282,10 @@ def test_frame_loc_setitem_callable(self): # GH#11485 - df = DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) + df = DataFrame( + {"X": [1, 2, 3, 4], "Y": Series(list("aabb"), dtype=object)}, + index=list("ABCD"), + ) # return label res = df.copy() @@ -2309,7 +2361,7 @@ tm.assert_series_equal(result, expected) def test_loc_getitem_partial_string_slicing_with_timedeltaindex(self): - ix = timedelta_range(start="1 day", end="2 days", freq="1H") + ix = timedelta_range(start="1 day", end="2 days", freq="1h") ser = ix.to_series() result = ser.loc[:"1 days"] expected = ser.iloc[:-1] @@ -2411,7 +2463,7 @@ "index", [ pd.period_range(start="2017-01-01", end="2018-01-01", freq="M"), - timedelta_range(start="1 day", end="2 days", freq="1H"), + timedelta_range(start="1 day", end="2 days", freq="1h"), ], ) def test_loc_getitem_label_slice_period_timedelta(self, index): @@ -2567,7 +2619,7 @@ df = DataFrame( np.arange(6.0).reshape(3, 2), columns=list("AB"), - index=date_range("1/1/2000", periods=3, freq="1H"), + index=date_range("1/1/2000", periods=3, freq="1h"), ) expected = df.copy() expected["C"] = [expected.index[0]] + [pd.NaT, pd.NaT] @@ -2603,7 +2655,9 @@ expected = DataFrame(values, index=expected.index, columns=expected.columns) tm.assert_frame_equal(float_frame, expected) - def test_loc_setitem_ndframe_values_alignment(self, using_copy_on_write): + def test_loc_setitem_ndframe_values_alignment( + self, using_copy_on_write, warn_copy_on_write + ): # GH#45501 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df.loc[[False, False, True], ["a"]] = DataFrame( @@ -2626,7 +2680,8 @@ df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() ser = df["a"] - ser.loc[[False, False, True]] = Series([10, 11, 12], index=[2, 1, 0]) + with tm.assert_cow_warning(warn_copy_on_write): + ser.loc[[False, False, True]] = Series([10, 11, 12], index=[2, 1, 0]) if using_copy_on_write: tm.assert_frame_equal(df, df_orig) else: @@ -2636,21 +2691,21 @@ # GH#51450 df = DataFrame({"a": [], "b": []}, dtype=object) expected = df.copy() - df.loc[np.array([], dtype=np.bool_), ["a"]] = df["a"] + df.loc[np.array([], dtype=np.bool_), ["a"]] = df["a"].copy() tm.assert_frame_equal(df, expected) def test_loc_indexer_all_false_broadcast(self): # GH#51450 df = DataFrame({"a": ["x"], "b": ["y"]}, dtype=object) expected = df.copy() - 
df.loc[np.array([False], dtype=np.bool_), ["a"]] = df["b"] + df.loc[np.array([False], dtype=np.bool_), ["a"]] = df["b"].copy() tm.assert_frame_equal(df, expected) def test_loc_indexer_length_one(self): # GH#51435 df = DataFrame({"a": ["x"], "b": ["y"]}, dtype=object) expected = DataFrame({"a": ["y"], "b": ["y"]}, dtype=object) - df.loc[np.array([True], dtype=np.bool_), ["a"]] = df["b"] + df.loc[np.array([True], dtype=np.bool_), ["a"]] = df["b"].copy() tm.assert_frame_equal(df, expected) @@ -2866,7 +2921,7 @@ df = DataFrame( np.random.default_rng(2).standard_normal((5, 3)), columns=["a", "b", "c"], - index=date_range("2012", freq="H", periods=5), + index=date_range("2012", freq="h", periods=5), ) # create dataframe with non-unique DatetimeIndex df = df.iloc[[0, 2, 2, 3]].copy() @@ -2963,7 +3018,15 @@ with tm.assert_produces_warning(FutureWarning, match="item of incompatible dtype"): df.loc[2, "col1"] = value # value that can't be held in uint8 - expected = DataFrame([1, 2, 300, 4], columns=["col1"], dtype="uint16") + if np_version_gt2 and isinstance(value, np.int16): + # Note, result type of uint8 + int16 is int16 + # in numpy < 2, though, numpy would inspect the + # value and see that it could fit in an uint16, resulting in a uint16 + dtype = "int16" + else: + dtype = "uint16" + + expected = DataFrame([1, 2, 300, 4], columns=["col1"], dtype=dtype) tm.assert_frame_equal(df, expected) @@ -3289,3 +3352,15 @@ index = pd.period_range(start="2000", periods=20, freq="B") series = Series(range(20), index=index) assert series.loc["2000-01-14"] == 9 + + def test_loc_nonunique_masked_index(self): + # GH 57027 + ids = list(range(11)) + index = Index(ids * 1000, dtype="Int64") + df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index) + result = df.loc[ids] + expected = DataFrame( + {"val": index.argsort(kind="stable").astype(np.intp)}, + index=Index(np.array(ids).repeat(1000), dtype="Int64"), + ) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/test_partial.py pandas-2.2.2+dfsg/pandas/tests/indexing/test_partial.py --- pandas-2.1.4+dfsg/pandas/tests/indexing/test_partial.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/test_partial.py 2024-04-10 17:42:52.000000000 +0000 @@ -28,7 +28,9 @@ df["series"] = series expected = DataFrame( - {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index") + {"series": [1.23] * 4}, + index=pd.RangeIndex(4, name="df_index"), + columns=Index(["series"], dtype=object), ) tm.assert_frame_equal(df, expected) @@ -39,7 +41,9 @@ series = Series(1.23, index=pd.RangeIndex(4, name="series_index")) df["series"] = series expected = DataFrame( - {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="series_index") + {"series": [1.23] * 4}, + index=pd.RangeIndex(4, name="series_index"), + columns=Index(["series"], dtype=object), ) tm.assert_frame_equal(df, expected) @@ -92,7 +96,9 @@ # these work as they don't really change # anything but the index # GH#5632 - expected = DataFrame(columns=["foo"], index=Index([], dtype="object")) + expected = DataFrame( + columns=Index(["foo"], dtype=object), index=Index([], dtype="object") + ) df = DataFrame(index=Index([], dtype="object")) df["foo"] = Series([], dtype="object") @@ -110,7 +116,9 @@ tm.assert_frame_equal(df, expected) def test_partial_set_empty_frame3(self): - expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected = DataFrame( + columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") + ) 
expected["foo"] = expected["foo"].astype("float64") df = DataFrame(index=Index([], dtype="int64")) @@ -127,7 +135,9 @@ df = DataFrame(index=Index([], dtype="int64")) df["foo"] = range(len(df)) - expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected = DataFrame( + columns=Index(["foo"], dtype=object), index=Index([], dtype="int64") + ) # range is int-dtype-like, so we get int64 dtype expected["foo"] = expected["foo"].astype("int64") tm.assert_frame_equal(df, expected) @@ -147,14 +157,10 @@ df = DataFrame(columns=["A", "B"]) df[0] = Series(1, index=range(4)) - df.dtypes - str(df) tm.assert_frame_equal(df, expected) df = DataFrame(columns=["A", "B"]) df.loc[:, 0] = Series(1, index=range(4)) - df.dtypes - str(df) tm.assert_frame_equal(df, expected) def test_partial_set_empty_frame_row(self): @@ -204,10 +210,10 @@ df = DataFrame(index=[0]) df = df.copy() df["a"] = 0 - expected = DataFrame(0, index=[0], columns=["a"]) + expected = DataFrame(0, index=[0], columns=Index(["a"], dtype=object)) tm.assert_frame_equal(df, expected) - def test_partial_set_empty_frame_empty_consistencies(self): + def test_partial_set_empty_frame_empty_consistencies(self, using_infer_string): # GH#6171 # consistency on empty frames df = DataFrame(columns=["x", "y"]) @@ -217,7 +223,15 @@ df = DataFrame(columns=["x", "y"]) df["x"] = ["1", "2"] - expected = DataFrame({"x": ["1", "2"], "y": [np.nan, np.nan]}, dtype=object) + expected = DataFrame( + { + "x": Series( + ["1", "2"], + dtype=object if not using_infer_string else "string[pyarrow_numpy]", + ), + "y": Series([np.nan, np.nan], dtype=object), + } + ) tm.assert_frame_equal(df, expected) df = DataFrame(columns=["x", "y"]) @@ -264,6 +278,7 @@ with pytest.raises(IndexError, match=msg): s.iat[3] = 5.0 + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_partial_setting_frame(self, using_array_manager): df_orig = DataFrame( np.arange(6).reshape(3, 2), columns=["A", "B"], dtype="int64" @@ -521,7 +536,11 @@ @pytest.mark.parametrize("key", [100, 100.0]) def test_setitem_with_expansion_numeric_into_datetimeindex(self, key): # GH#4940 inserting non-strings - orig = tm.makeTimeDataFrame() + orig = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) df = orig.copy() df.loc[key, :] = df.iloc[0] @@ -535,7 +554,11 @@ # GH 4940 # allow only setting of 'valid' values - orig = tm.makeTimeDataFrame() + orig = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) # allow object conversion here df = orig.copy() @@ -621,7 +644,7 @@ [ ( period_range(start="2000", periods=20, freq="D"), - ["4D", "8D"], + Index(["4D", "8D"], dtype=object), ( r"None of \[Index\(\['4D', '8D'\], dtype='object'\)\] " r"are in the \[index\]" @@ -629,7 +652,7 @@ ), ( date_range(start="2000", periods=20, freq="D"), - ["4D", "8D"], + Index(["4D", "8D"], dtype=object), ( r"None of \[Index\(\['4D', '8D'\], dtype='object'\)\] " r"are in the \[index\]" @@ -637,7 +660,7 @@ ), ( pd.timedelta_range(start="1 day", periods=20), - ["2000-01-04", "2000-01-08"], + Index(["2000-01-04", "2000-01-08"], dtype=object), ( r"None of \[Index\(\['2000-01-04', '2000-01-08'\], " r"dtype='object'\)\] are in the \[index\]" diff -Nru pandas-2.1.4+dfsg/pandas/tests/indexing/test_scalar.py pandas-2.2.2+dfsg/pandas/tests/indexing/test_scalar.py --- 
pandas-2.1.4+dfsg/pandas/tests/indexing/test_scalar.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/indexing/test_scalar.py 2024-04-10 17:42:52.000000000 +0000 @@ -140,7 +140,7 @@ df = DataFrame(arr, columns=["A", "A"]) result = df.at[0, "A"] - expected = df.iloc[0] + expected = df.iloc[0].copy() tm.assert_series_equal(result, expected) @@ -246,6 +246,7 @@ assert series.at[(1, 2)] == 1 +@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_at_with_tuple_index_set(): # GH 26989 # DataFrame.at setter works with Index of tuples @@ -276,6 +277,7 @@ assert series.at[1, 3] == 1 assert series.loc[1, 3] == 1 + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") def test_multiindex_at_set(self): # GH 26989 # DataFrame.at and DataFrame.loc setter works with MultiIndex diff -Nru pandas-2.1.4+dfsg/pandas/tests/interchange/test_impl.py pandas-2.2.2+dfsg/pandas/tests/interchange/test_impl.py --- pandas-2.1.4+dfsg/pandas/tests/interchange/test_impl.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/interchange/test_impl.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,7 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) import numpy as np import pytest @@ -8,7 +11,7 @@ is_ci_environment, is_platform_windows, ) -import pandas.util._test_decorators as td +from pandas.compat.numpy import np_version_lt1p23 import pandas as pd import pandas._testing as tm @@ -179,8 +182,6 @@ } ) - df2 = df.__dataframe__() - rng = np.random.default_rng(2) dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns} for col, num_nulls in dict_null.items(): @@ -268,7 +269,7 @@ tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) -@td.skip_if_np_lt("1.23") +@pytest.mark.skipif(np_version_lt1p23, reason="Numpy > 1.23 required") def test_categorical_to_numpy_dlpack(): # https://github.com/pandas-dev/pandas/issues/48393 df = pd.DataFrame({"A": pd.Categorical(["a", "b", "a"])}) @@ -303,6 +304,51 @@ pd.api.interchange.from_dataframe(table, allow_copy=False) +def test_multi_chunk_column() -> None: + pytest.importorskip("pyarrow", "11.0.0") + ser = pd.Series([1, 2, None], dtype="Int64[pyarrow]") + df = pd.concat([ser, ser], ignore_index=True).to_frame("a") + df_orig = df.copy() + with pytest.raises( + RuntimeError, match="Found multi-chunk pyarrow array, but `allow_copy` is False" + ): + pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=False)) + result = pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=True)) + # Interchange protocol defaults to creating numpy-backed columns, so currently this + # is 'float64'. + expected = pd.DataFrame({"a": [1.0, 2.0, None, 1.0, 2.0, None]}, dtype="float64") + tm.assert_frame_equal(result, expected) + + # Check that the rechunking we did didn't modify the original DataFrame. 
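The interchange-protocol tests in this file all go through the same round trip: export with DataFrame.__dataframe__() and re-import with pd.api.interchange.from_dataframe(). A minimal, self-contained sketch of that round trip (a plain NumPy-backed column, so no pyarrow is needed):

    import pandas as pd

    df = pd.DataFrame({"a": [1.0, 2.5, None]})
    # Export through the interchange protocol and rebuild a pandas DataFrame from it.
    dfi = df.__dataframe__()
    result = pd.api.interchange.from_dataframe(dfi)
    print(result.dtypes)  # numpy-backed columns; here float64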
+ tm.assert_frame_equal(df, df_orig) + assert len(df["a"].array._pa_array.chunks) == 2 + assert len(df_orig["a"].array._pa_array.chunks) == 2 + + +def test_timestamp_ns_pyarrow(): + # GH 56712 + pytest.importorskip("pyarrow", "11.0.0") + timestamp_args = { + "year": 2000, + "month": 1, + "day": 1, + "hour": 1, + "minute": 1, + "second": 1, + } + df = pd.Series( + [datetime(**timestamp_args)], + dtype="timestamp[ns][pyarrow]", + name="col0", + ).to_frame() + + dfi = df.__dataframe__() + result = pd.api.interchange.from_dataframe(dfi)["col0"].item() + + expected = pd.Timestamp(**timestamp_args) + assert result == expected + + @pytest.mark.parametrize("tz", ["UTC", "US/Pacific"]) @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_datetimetzdtype(tz, unit): @@ -327,7 +373,7 @@ "on CI to path to the tzdata for pyarrow." ), ) - request.node.add_marker(mark) + request.applymarker(mark) arr = pa.array([datetime(2020, 1, 1), None, datetime(2020, 1, 2)]) arr = pc.assume_timezone(arr, "Asia/Kathmandu") @@ -362,3 +408,197 @@ interchange.get_column_by_name = lambda _: column monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange) pd.api.interchange.from_dataframe(df) + + +def test_empty_string_column(): + # https://github.com/pandas-dev/pandas/issues/56703 + df = pd.DataFrame({"a": []}, dtype=str) + df2 = df.__dataframe__() + result = pd.api.interchange.from_dataframe(df2) + tm.assert_frame_equal(df, result) + + +def test_large_string(): + # GH#56702 + pytest.importorskip("pyarrow") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + expected = pd.DataFrame({"a": ["x"]}, dtype="object") + tm.assert_frame_equal(result, expected) + + +def test_non_str_names(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.Series([1, 2, 3], name=0).to_frame() + names = df.__dataframe__().column_names() + assert names == ["0"] + + +def test_non_str_names_w_duplicates(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) + dfi = df.__dataframe__() + with pytest.raises( + TypeError, + match=( + "Expected a Series, got a DataFrame. This likely happened because you " + "called __dataframe__ on a DataFrame which, after converting column " + r"names to string, resulted in duplicated names: Index\(\['0', '0'\], " + r"dtype='object'\). Please rename these columns before using the " + "interchange protocol." 
+ ), + ): + pd.api.interchange.from_dataframe(dfi, allow_copy=False) + + +@pytest.mark.parametrize( + ("data", "dtype", "expected_dtype"), + [ + ([1, 2, None], "Int64", "int64"), + ([1, 2, None], "Int64[pyarrow]", "int64"), + ([1, 2, None], "Int8", "int8"), + ([1, 2, None], "Int8[pyarrow]", "int8"), + ( + [1, 2, None], + "UInt64", + "uint64", + ), + ( + [1, 2, None], + "UInt64[pyarrow]", + "uint64", + ), + ([1.0, 2.25, None], "Float32", "float32"), + ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), + ([True, False, None], "boolean", "bool"), + ([True, False, None], "boolean[pyarrow]", "bool"), + (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"), + (["much ado", "about", None], "string[pyarrow]", "large_string"), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), None], + "timestamp[ns][pyarrow]", + "timestamp[ns]", + ), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), None], + "timestamp[us][pyarrow]", + "timestamp[us]", + ), + ( + [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + None, + ], + "timestamp[us, Asia/Kathmandu][pyarrow]", + "timestamp[us, tz=Asia/Kathmandu]", + ), + ], +) +def test_pandas_nullable_with_missing_values( + data: list, dtype: str, expected_dtype: str +) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + # https://github.com/pandas-dev/pandas/issues/57664 + pa = pytest.importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": + expected_dtype = pa.timestamp("us", "Asia/Kathmandu") + + df = pd.DataFrame({"a": data}, dtype=dtype) + result = pai.from_dataframe(df.__dataframe__())["a"] + assert result.type == expected_dtype + assert result[0].as_py() == data[0] + assert result[1].as_py() == data[1] + assert result[2].as_py() is None + + +@pytest.mark.parametrize( + ("data", "dtype", "expected_dtype"), + [ + ([1, 2, 3], "Int64", "int64"), + ([1, 2, 3], "Int64[pyarrow]", "int64"), + ([1, 2, 3], "Int8", "int8"), + ([1, 2, 3], "Int8[pyarrow]", "int8"), + ( + [1, 2, 3], + "UInt64", + "uint64", + ), + ( + [1, 2, 3], + "UInt64[pyarrow]", + "uint64", + ), + ([1.0, 2.25, 5.0], "Float32", "float32"), + ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), + ([True, False, False], "boolean", "bool"), + ([True, False, False], "boolean[pyarrow]", "bool"), + (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"), + (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], + "timestamp[ns][pyarrow]", + "timestamp[ns]", + ), + ( + [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], + "timestamp[us][pyarrow]", + "timestamp[us]", + ), + ( + [ + datetime(2020, 1, 1, tzinfo=timezone.utc), + datetime(2020, 1, 2, tzinfo=timezone.utc), + datetime(2020, 1, 3, tzinfo=timezone.utc), + ], + "timestamp[us, Asia/Kathmandu][pyarrow]", + "timestamp[us, tz=Asia/Kathmandu]", + ), + ], +) +def test_pandas_nullable_without_missing_values( + data: list, dtype: str, expected_dtype: str +) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + pa = pytest.importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]": + expected_dtype = pa.timestamp("us", "Asia/Kathmandu") + + df = pd.DataFrame({"a": data}, dtype=dtype) + result = pai.from_dataframe(df.__dataframe__())["a"] + assert result.type == expected_dtype + assert result[0].as_py() == data[0] + assert 
result[1].as_py() == data[1] + assert result[2].as_py() == data[2] + + +def test_string_validity_buffer() -> None: + # https://github.com/pandas-dev/pandas/issues/57761 + pytest.importorskip("pyarrow", "11.0.0") + df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") + result = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"] + assert result is None + + +def test_string_validity_buffer_no_missing() -> None: + # https://github.com/pandas-dev/pandas/issues/57762 + pytest.importorskip("pyarrow", "11.0.0") + df = pd.DataFrame({"a": ["x", None]}, dtype="large_string[pyarrow]") + validity = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"] + assert validity is not None + result = validity[1] + expected = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, "=") + assert result == expected + + +def test_empty_dataframe(): + # https://github.com/pandas-dev/pandas/issues/56700 + df = pd.DataFrame({"a": []}, dtype="int8") + dfi = df.__dataframe__() + result = pd.api.interchange.from_dataframe(dfi, allow_copy=False) + expected = pd.DataFrame({"a": []}, dtype="int8") + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/internals/test_api.py pandas-2.2.2+dfsg/pandas/tests/internals/test_api.py --- pandas-2.1.4+dfsg/pandas/tests/internals/test_api.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/internals/test_api.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,7 +3,10 @@ in core.internals """ +import pytest + import pandas as pd +import pandas._testing as tm from pandas.core import internals from pandas.core.internals import api @@ -26,9 +29,6 @@ "ops", ] expected = [ - "Block", - "DatetimeTZBlock", - "ExtensionBlock", "make_block", "DataManager", "ArrayManager", @@ -37,13 +37,34 @@ "SingleBlockManager", "SingleArrayManager", "concatenate_managers", - "create_block_manager_from_blocks", ] result = [x for x in dir(internals) if not x.startswith("__")] assert set(result) == set(expected + modules) +@pytest.mark.parametrize( + "name", + [ + "NumericBlock", + "ObjectBlock", + "Block", + "ExtensionBlock", + "DatetimeTZBlock", + ], +) +def test_deprecations(name): + # GH#55139 + msg = f"{name} is deprecated.* Use public APIs instead" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + getattr(internals, name) + + if name not in ["NumericBlock", "ObjectBlock"]: + # NumericBlock and ObjectBlock are not in the internals.api namespace + with tm.assert_produces_warning(DeprecationWarning, match=msg): + getattr(api, name) + + def test_make_block_2d_with_dti(): # GH#41168 dti = pd.date_range("2012", periods=3, tz="UTC") @@ -51,3 +72,15 @@ assert blk.shape == (1, 3) assert blk.values.shape == (1, 3) + + +def test_create_block_manager_from_blocks_deprecated(): + # GH#33892 + # If they must, downstream packages should get this from internals.api, + # not internals. + msg = ( + "create_block_manager_from_blocks is deprecated and will be " + "removed in a future version. 
Use public APIs instead" + ) + with tm.assert_produces_warning(DeprecationWarning, match=msg): + internals.create_block_manager_from_blocks diff -Nru pandas-2.1.4+dfsg/pandas/tests/internals/test_internals.py pandas-2.2.2+dfsg/pandas/tests/internals/test_internals.py --- pandas-2.1.4+dfsg/pandas/tests/internals/test_internals.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/internals/test_internals.py 2024-04-10 17:42:52.000000000 +0000 @@ -397,7 +397,10 @@ def test_pickle(self, mgr): mgr2 = tm.round_trip_pickle(mgr) - tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + tm.assert_frame_equal( + DataFrame._from_mgr(mgr, axes=mgr.axes), + DataFrame._from_mgr(mgr2, axes=mgr2.axes), + ) # GH2431 assert hasattr(mgr2, "_is_consolidated") @@ -411,16 +414,25 @@ def test_non_unique_pickle(self, mgr_string): mgr = create_mgr(mgr_string) mgr2 = tm.round_trip_pickle(mgr) - tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + tm.assert_frame_equal( + DataFrame._from_mgr(mgr, axes=mgr.axes), + DataFrame._from_mgr(mgr2, axes=mgr2.axes), + ) def test_categorical_block_pickle(self): mgr = create_mgr("a: category") mgr2 = tm.round_trip_pickle(mgr) - tm.assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) + tm.assert_frame_equal( + DataFrame._from_mgr(mgr, axes=mgr.axes), + DataFrame._from_mgr(mgr2, axes=mgr2.axes), + ) smgr = create_single_mgr("category") smgr2 = tm.round_trip_pickle(smgr) - tm.assert_series_equal(Series(smgr), Series(smgr2)) + tm.assert_series_equal( + Series()._constructor_from_mgr(smgr, axes=smgr.axes), + Series()._constructor_from_mgr(smgr2, axes=smgr2.axes), + ) def test_iget(self): cols = Index(list("abc")) @@ -579,7 +591,7 @@ else: assert tmgr.iget(3).dtype.type == t - def test_convert(self): + def test_convert(self, using_infer_string): def _compare(old_mgr, new_mgr): """compare the blocks, numeric compare ==, object don't""" old_blocks = set(old_mgr.blocks) @@ -614,9 +626,10 @@ mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(copy=True) - assert new_mgr.iget(0).dtype == np.object_ - assert new_mgr.iget(1).dtype == np.object_ - assert new_mgr.iget(2).dtype == np.object_ + dtype = "string[pyarrow_numpy]" if using_infer_string else np.object_ + assert new_mgr.iget(0).dtype == dtype + assert new_mgr.iget(1).dtype == dtype + assert new_mgr.iget(2).dtype == dtype assert new_mgr.iget(3).dtype == np.int64 assert new_mgr.iget(4).dtype == np.float64 @@ -627,9 +640,9 @@ mgr.iset(1, np.array(["2."] * N, dtype=np.object_)) mgr.iset(2, np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(copy=True) - assert new_mgr.iget(0).dtype == np.object_ - assert new_mgr.iget(1).dtype == np.object_ - assert new_mgr.iget(2).dtype == np.object_ + assert new_mgr.iget(0).dtype == dtype + assert new_mgr.iget(1).dtype == dtype + assert new_mgr.iget(2).dtype == dtype assert new_mgr.iget(3).dtype == np.int32 assert new_mgr.iget(4).dtype == np.bool_ assert new_mgr.iget(5).dtype.type, np.datetime64 @@ -777,24 +790,6 @@ np.array([100.0, 200.0, 300.0]), ) - numeric2 = mgr.get_numeric_data(copy=True) - tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"])) - numeric2.iset( - numeric2.items.get_loc("float"), - np.array([1000.0, 2000.0, 3000.0]), - inplace=True, - ) - if using_copy_on_write: - tm.assert_almost_equal( - mgr.iget(mgr.items.get_loc("float")).internal_values(), - np.array([1.0, 1.0, 1.0]), - ) - else: - tm.assert_almost_equal( - 
mgr.iget(mgr.items.get_loc("float")).internal_values(), - np.array([100.0, 200.0, 300.0]), - ) - def test_get_bool_data(self, using_copy_on_write): mgr = create_mgr( "int: int; float: float; complex: complex;" @@ -822,20 +817,6 @@ np.array([True, False, True]), ) - # Check sharing - bools2 = mgr.get_bool_data(copy=True) - bools2.iset(0, np.array([False, True, False])) - if using_copy_on_write: - tm.assert_numpy_array_equal( - mgr.iget(mgr.items.get_loc("bool")).internal_values(), - np.array([True, True, True]), - ) - else: - tm.assert_numpy_array_equal( - mgr.iget(mgr.items.get_loc("bool")).internal_values(), - np.array([True, False, True]), - ) - def test_unicode_repr_doesnt_raise(self): repr(create_mgr("b,\u05d0: object")) @@ -979,7 +960,6 @@ # 2D only support slice objects # boolean mask - assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_)) assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_)) assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_)) @@ -1335,13 +1315,13 @@ assert not blk._can_hold_element(elem) def test_period_can_hold_element_emptylist(self): - pi = period_range("2016", periods=3, freq="A") + pi = period_range("2016", periods=3, freq="Y") blk = new_block(pi._data.reshape(1, 3), BlockPlacement([1]), ndim=2) assert blk._can_hold_element([]) def test_period_can_hold_element(self, element): - pi = period_range("2016", periods=3, freq="A") + pi = period_range("2016", periods=3, freq="Y") elem = element(pi) self.check_series_setitem(elem, pi, True) @@ -1353,7 +1333,7 @@ with tm.assert_produces_warning(FutureWarning): self.check_series_setitem(elem, pi, False) - dti = pi.to_timestamp("S")[:-1] + dti = pi.to_timestamp("s")[:-1] elem = element(dti) with tm.assert_produces_warning(FutureWarning): self.check_series_setitem(elem, pi, False) diff -Nru pandas-2.1.4+dfsg/pandas/tests/internals/test_managers.py pandas-2.2.2+dfsg/pandas/tests/internals/test_managers.py --- pandas-2.1.4+dfsg/pandas/tests/internals/test_managers.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/internals/test_managers.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,12 @@ """ Testing interaction between the different managers (BlockManager, ArrayManager) """ +import os +import subprocess +import sys + +import pytest + from pandas.core.dtypes.missing import array_equivalent import pandas as pd @@ -14,12 +20,19 @@ def test_dataframe_creation(): - with pd.option_context("mode.data_manager", "block"): - df_block = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + msg = "data_manager option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.data_manager", "block"): + df_block = pd.DataFrame( + {"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]} + ) assert isinstance(df_block._mgr, BlockManager) - with pd.option_context("mode.data_manager", "array"): - df_array = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.data_manager", "array"): + df_array = pd.DataFrame( + {"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]} + ) assert isinstance(df_array._mgr, ArrayManager) # also ensure both are seen as equal @@ -45,12 +58,15 @@ def test_series_creation(): - with pd.option_context("mode.data_manager", "block"): - s_block = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) + msg = "data_manager option is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): 
+ with pd.option_context("mode.data_manager", "block"): + s_block = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) assert isinstance(s_block._mgr, SingleBlockManager) - with pd.option_context("mode.data_manager", "array"): - s_array = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) + with tm.assert_produces_warning(FutureWarning, match=msg): + with pd.option_context("mode.data_manager", "array"): + s_array = pd.Series([1, 2, 3], name="A", index=["a", "b", "c"]) assert isinstance(s_array._mgr, SingleArrayManager) # also ensure both are seen as equal @@ -68,3 +84,20 @@ result = s_array._as_manager("block") assert isinstance(result._mgr, SingleBlockManager) tm.assert_series_equal(result, s_array) + + +@pytest.mark.single_cpu +@pytest.mark.parametrize("manager", ["block", "array"]) +def test_array_manager_depr_env_var(manager): + # GH#55043 + test_env = os.environ.copy() + test_env["PANDAS_DATA_MANAGER"] = manager + response = subprocess.run( + [sys.executable, "-c", "import pandas"], + capture_output=True, + env=test_env, + check=True, + ) + msg = "FutureWarning: The env variable PANDAS_DATA_MANAGER is set" + stderr_msg = response.stderr.decode("utf-8") + assert msg in stderr_msg, stderr_msg diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/conftest.py pandas-2.2.2+dfsg/pandas/tests/io/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/io/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -51,23 +51,7 @@ @pytest.fixture -def s3so(worker_id): - if is_ci_environment(): - url = "http://localhost:5000/" - else: - worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw") - url = f"http://127.0.0.1:555{worker_id}/" - return {"client_kwargs": {"endpoint_url": url}} - - -@pytest.fixture(scope="function" if is_ci_environment() else "session") -def monkeysession(): - with pytest.MonkeyPatch.context() as mp: - yield mp - - -@pytest.fixture(scope="function" if is_ci_environment() else "session") -def s3_base(worker_id, monkeysession): +def s3_base(worker_id, monkeypatch): """ Fixture for mocking S3 interaction. 
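The reworked s3so/s3_base fixtures below boil down to pointing fsspec at whatever endpoint the local moto server is serving. Outside the test suite the same idea looks roughly like the sketch here; the bucket and object names are placeholders, and it assumes s3fs is installed, a moto server is already listening on the given port, and the fake AWS credentials the fixture exports are set:

    import pandas as pd

    # Placeholder endpoint: a local moto "server mode" process, as the s3_base
    # fixture would launch it. Bucket and key are hypothetical.
    storage_options = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5000/"}}
    df = pd.read_csv("s3://some-bucket/test.csv", storage_options=storage_options)
    print(df.head())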
@@ -79,8 +63,8 @@ # temporary workaround as moto fails for botocore >= 1.11 otherwise, # see https://github.com/spulec/moto/issues/1924 & 1952 - monkeysession.setenv("AWS_ACCESS_KEY_ID", "foobar_key") - monkeysession.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") + monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") if is_ci_environment(): if is_platform_arm() or is_platform_mac() or is_platform_windows(): # NOT RUN on Windows/macOS/ARM, only Ubuntu @@ -93,10 +77,11 @@ "Windows, macOS or ARM platforms" ) else: + # set in .github/workflows/unit-tests.yml yield "http://localhost:5000" else: requests = pytest.importorskip("requests") - pytest.importorskip("moto", minversion="1.3.14") + pytest.importorskip("moto") pytest.importorskip("flask") # server mode needs flask too # Launching moto in server mode, i.e., as a separate process @@ -129,6 +114,11 @@ @pytest.fixture +def s3so(s3_base): + return {"client_kwargs": {"endpoint_url": s3_base}} + + +@pytest.fixture def s3_resource(s3_base): import boto3 Binary files /tmp/tmpqtnlo80u/3noFIyQG0n/pandas-2.1.4+dfsg/pandas/tests/io/data/excel/test6.xls and /tmp/tmpqtnlo80u/UxW02VZo2q/pandas-2.2.2+dfsg/pandas/tests/io/data/excel/test6.xls differ Binary files /tmp/tmpqtnlo80u/3noFIyQG0n/pandas-2.1.4+dfsg/pandas/tests/io/data/excel/test_cell_annotation.ods and /tmp/tmpqtnlo80u/UxW02VZo2q/pandas-2.2.2+dfsg/pandas/tests/io/data/excel/test_cell_annotation.ods differ Binary files /tmp/tmpqtnlo80u/3noFIyQG0n/pandas-2.1.4+dfsg/pandas/tests/io/data/excel/test_unempty_cells.ods and /tmp/tmpqtnlo80u/UxW02VZo2q/pandas-2.2.2+dfsg/pandas/tests/io/data/excel/test_unempty_cells.ods differ diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/data/gbq_fake_job.txt pandas-2.2.2+dfsg/pandas/tests/io/data/gbq_fake_job.txt --- pandas-2.1.4+dfsg/pandas/tests/io/data/gbq_fake_job.txt 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/data/gbq_fake_job.txt 1970-01-01 00:00:00.000000000 +0000 @@ -1 +0,0 @@ -{'status': {'state': 'DONE'}, 'kind': 'bigquery#job', 'statistics': {'query': {'cacheHit': True, 'totalBytesProcessed': '0'}, 'endTime': '1377668744674', 'totalBytesProcessed': '0', 'startTime': '1377668744466'}, 'jobReference': {'projectId': '57288129629', 'jobId': 'bqjob_r5f956972f0190bdf_00000140c374bf42_2'}, 'etag': '"4PTsVxg68bQkQs1RJ1Ndewqkgg4/oO4VmgFrAku4N6FWci9s7iFIftc"', 'configuration': {'query': {'createDisposition': 'CREATE_IF_NEEDED', 'query': 'SELECT * FROM [publicdata:samples.shakespeare]', 'writeDisposition': 'WRITE_TRUNCATE', 'destinationTable': {'projectId': '57288129629', 'tableId': 'anonb5ec450da88eeeb78a27784ea482ee75a146d442', 'datasetId': '_d0b4f5f0d50dc68a3eb0fa6cba66a9a8687d9253'}}}, 'id': '57288129629:bqjob_r5f956972f0190bdf_00000140c374bf42_2', 'selfLink': 'https://www.googleapis.com/bigquery/v2/projects/57288129629/jobs/bqjob_r5f956972f0190bdf_00000140c374bf42_2'} \ No newline at end of file diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/excel/conftest.py pandas-2.2.2+dfsg/pandas/tests/io/excel/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/io/excel/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/excel/conftest.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,41 +0,0 @@ -import pytest - -import pandas._testing as tm - -from pandas.io.parsers import read_csv - - -@pytest.fixture -def frame(float_frame): - """ - Returns the first ten items in fixture "float_frame". 
- """ - return float_frame[:10] - - -@pytest.fixture -def tsframe(): - return tm.makeTimeDataFrame()[:5] - - -@pytest.fixture(params=[True, False]) -def merge_cells(request): - return request.param - - -@pytest.fixture -def df_ref(datapath): - """ - Obtain the reference data from read_csv with the Python engine. - """ - filepath = datapath("io", "data", "csv", "test1.csv") - df_ref = read_csv(filepath, index_col=0, parse_dates=True, engine="python") - return df_ref - - -@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods", ".xlsb"]) -def read_ext(request): - """ - Valid extensions for reading Excel files. - """ - return request.param diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/excel/test_odf.py pandas-2.2.2+dfsg/pandas/tests/io/excel/test_odf.py --- pandas-2.1.4+dfsg/pandas/tests/io/excel/test_odf.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/excel/test_odf.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,11 +3,16 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm pytest.importorskip("odf") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture(autouse=True) def cd_and_set_engine(monkeypatch, datapath): @@ -48,3 +53,25 @@ result = pd.read_excel("test_newlines.ods") tm.assert_frame_equal(result, expected) + + +def test_read_unempty_cells(): + expected = pd.DataFrame( + [1, np.nan, 3, np.nan, 5], + columns=["Column 1"], + ) + + result = pd.read_excel("test_unempty_cells.ods") + + tm.assert_frame_equal(result, expected) + + +def test_read_cell_annotation(): + expected = pd.DataFrame( + ["test", np.nan, "test 3"], + columns=["Column 1"], + ) + + result = pd.read_excel("test_cell_annotation.ods") + + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/excel/test_odswriter.py pandas-2.2.2+dfsg/pandas/tests/io/excel/test_odswriter.py --- pandas-2.1.4+dfsg/pandas/tests/io/excel/test_odswriter.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/excel/test_odswriter.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,14 +1,27 @@ +from datetime import ( + date, + datetime, +) import re import pytest +from pandas.compat import is_platform_windows + +import pandas as pd import pandas._testing as tm from pandas.io.excel import ExcelWriter odf = pytest.importorskip("odf") -pytestmark = pytest.mark.parametrize("ext", [".ods"]) +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + + +@pytest.fixture +def ext(): + return ".ods" def test_write_append_mode_raises(ext): @@ -47,3 +60,47 @@ table = odf.table.Table(name="test_name") writer.book.spreadsheet.addElement(table) assert writer.sheets == {"test_name": table} + + +@pytest.mark.parametrize( + ["value", "cell_value_type", "cell_value_attribute", "cell_value"], + argvalues=[ + (True, "boolean", "boolean-value", "true"), + ("test string", "string", "string-value", "test string"), + (1, "float", "value", "1"), + (1.5, "float", "value", "1.5"), + ( + datetime(2010, 10, 10, 10, 10, 10), + "date", + "date-value", + "2010-10-10T10:10:10", + ), + (date(2010, 10, 10), "date", "date-value", "2010-10-10"), + ], +) +def test_cell_value_type(ext, value, cell_value_type, cell_value_attribute, cell_value): + # GH#54994 ODS: cell attributes should follow specification + # http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#refTable13 + from odf.namespaces import OFFICENS + from odf.table import ( + TableCell, + TableRow, + ) + + table_cell_name = 
TableCell().qname + + with tm.ensure_clean(ext) as f: + pd.DataFrame([[value]]).to_excel(f, header=False, index=False) + + with pd.ExcelFile(f) as wb: + sheet = wb._reader.get_sheet_by_index(0) + sheet_rows = sheet.getElementsByType(TableRow) + sheet_cells = [ + x + for x in sheet_rows[0].childNodes + if hasattr(x, "qname") and x.qname == table_cell_name + ] + + cell = sheet_cells[0] + assert cell.attributes.get((OFFICENS, "value-type")) == cell_value_type + assert cell.attributes.get((OFFICENS, cell_value_attribute)) == cell_value diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/excel/test_openpyxl.py pandas-2.2.2+dfsg/pandas/tests/io/excel/test_openpyxl.py --- pandas-2.1.4+dfsg/pandas/tests/io/excel/test_openpyxl.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/excel/test_openpyxl.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -13,13 +15,20 @@ ExcelWriter, _OpenpyxlWriter, ) +from pandas.io.excel._openpyxl import OpenpyxlReader openpyxl = pytest.importorskip("openpyxl") -pytestmark = pytest.mark.parametrize("ext", [".xlsx"]) +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + +@pytest.fixture +def ext(): + return ".xlsx" -def test_to_excel_styleconverter(ext): + +def test_to_excel_styleconverter(): from openpyxl import styles hstyle = { @@ -129,6 +138,31 @@ # ExcelWriter needs us to writer something to close properly? DataFrame().to_excel(writer, sheet_name="Sheet2") + # ensure that data_only also works for reading + # and that formulas/values roundtrip + assert ( + pd.read_excel( + f, + sheet_name="Sheet1", + engine="openpyxl", + engine_kwargs={"data_only": data_only}, + ).iloc[0, 1] + == expected + ) + + +@pytest.mark.parametrize("kwarg_name", ["read_only", "data_only"]) +@pytest.mark.parametrize("kwarg_value", [True, False]) +def test_engine_kwargs_append_reader(datapath, ext, kwarg_name, kwarg_value): + # GH 55027 + # test that `read_only` and `data_only` can be passed to + # `openpyxl.reader.excel.load_workbook` via `engine_kwargs` + filename = datapath("io", "data", "excel", "test1" + ext) + with contextlib.closing( + OpenpyxlReader(filename, engine_kwargs={kwarg_name: kwarg_value}) + ) as reader: + assert getattr(reader.book, kwarg_name) == kwarg_value + @pytest.mark.parametrize( "mode,expected", [("w", ["baz"]), ("a", ["foo", "bar", "baz"])] @@ -241,7 +275,7 @@ df = DataFrame({"fruit": ["pear"]}) with tm.ensure_clean(ext) as f: with pytest.raises(ValueError, match=re.escape(msg)): - df.to_excel(f, "foo", engine="openpyxl") + df.to_excel(f, sheet_name="foo", engine="openpyxl") with ExcelWriter( f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists ) as writer: diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/excel/test_readers.py pandas-2.2.2+dfsg/pandas/tests/io/excel/test_readers.py --- pandas-2.1.4+dfsg/pandas/tests/io/excel/test_readers.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/excel/test_readers.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import ( datetime, time, @@ -14,6 +16,9 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + +from pandas.compat import is_platform_windows import pandas.util._test_decorators as td import pandas as pd @@ -22,6 +27,7 @@ Index, MultiIndex, Series, + read_csv, ) import pandas._testing as tm from 
pandas.core.arrays import ( @@ -29,6 +35,9 @@ StringArray, ) +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ # Add any engines to test here @@ -54,6 +63,7 @@ ), pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")), pytest.param("odf", marks=td.skip_if_no("odf")), + pytest.param("calamine", marks=td.skip_if_no("python_calamine")), ] @@ -67,11 +77,11 @@ return False if engine == "odf" and read_ext != ".ods": return False - if read_ext == ".ods" and engine != "odf": + if read_ext == ".ods" and engine not in {"odf", "calamine"}: return False if engine == "pyxlsb" and read_ext != ".xlsb": return False - if read_ext == ".xlsb" and engine != "pyxlsb": + if read_ext == ".xlsb" and engine not in {"pyxlsb", "calamine"}: return False if engine == "xlrd" and read_ext != ".xls": return False @@ -116,6 +126,36 @@ return read_ext +@pytest.fixture +def df_ref(datapath): + """ + Obtain the reference data from read_csv with the Python engine. + """ + filepath = datapath("io", "data", "csv", "test1.csv") + df_ref = read_csv(filepath, index_col=0, parse_dates=True, engine="python") + return df_ref + + +def get_exp_unit(read_ext: str, engine: str | None) -> str: + return "ns" + + +def adjust_expected(expected: DataFrame, read_ext: str, engine: str) -> None: + expected.index.name = None + unit = get_exp_unit(read_ext, engine) + # error: "Index" has no attribute "as_unit" + expected.index = expected.index.as_unit(unit) # type: ignore[attr-defined] + + +def xfail_datetimes_with_pyxlsb(engine, request): + if engine == "pyxlsb": + request.applymarker( + pytest.mark.xfail( + reason="Sheets containing datetimes not supported by pyxlsb" + ) + ) + + class TestReaders: @pytest.fixture(autouse=True) def cd_and_set_engine(self, engine, datapath, monkeypatch): @@ -160,9 +200,9 @@ "ods": {"foo": "abcd"}, } - if read_ext[1:] in {"xls", "xlsb"}: + if engine in {"xlrd", "pyxlsb"}: msg = re.escape(r"open_workbook() got an unexpected keyword argument 'foo'") - elif read_ext[1:] == "ods": + elif engine == "odf": msg = re.escape(r"load() got an unexpected keyword argument 'foo'") else: msg = re.escape(r"load_workbook() got an unexpected keyword argument 'foo'") @@ -194,15 +234,12 @@ usecols=3, ) - def test_usecols_list(self, request, read_ext, df_ref): - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + def test_usecols_list(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref[["B", "C"]] + adjust_expected(expected, read_ext, engine) - df_ref = df_ref.reindex(columns=["B", "C"]) df1 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=[0, 2, 3] ) @@ -215,18 +252,15 @@ ) # TODO add index to xls file) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) - def test_usecols_str(self, request, read_ext, df_ref): - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + def test_usecols_str(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref[["A", "B", "C"]] + adjust_expected(expected, read_ext, engine) - df1 = df_ref.reindex(columns=["A", "B", "C"]) df2 = pd.read_excel( "test1" + 
read_ext, sheet_name="Sheet1", index_col=0, usecols="A:D" ) @@ -239,10 +273,12 @@ ) # TODO add index to xls, read xls ignores index name ? - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df3, expected) + + expected = df_ref[["B", "C"]] + adjust_expected(expected, read_ext, engine) - df1 = df_ref.reindex(columns=["B", "C"]) df2 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C,D" ) @@ -254,10 +290,9 @@ usecols="A,C,D", ) # TODO add index to xls file - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df3, expected) - df1 = df_ref.reindex(columns=["B", "C"]) df2 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C:D" ) @@ -268,27 +303,24 @@ index_col=0, usecols="A,C:D", ) - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df3, expected) @pytest.mark.parametrize( "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] ) def test_usecols_diff_positional_int_columns_order( - self, request, read_ext, usecols, df_ref + self, request, engine, read_ext, usecols, df_ref ): - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref[["A", "C"]] + adjust_expected(expected, read_ext, engine) + result = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=usecols ) - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]]) def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref): @@ -296,33 +328,27 @@ expected.index = range(len(expected)) result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols) - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) - def test_read_excel_without_slicing(self, request, read_ext, df_ref): - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + def test_read_excel_without_slicing(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref + adjust_expected(expected, read_ext, engine) + result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) - def test_usecols_excel_range_str(self, request, read_ext, df_ref): - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + def test_usecols_excel_range_str(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) expected = df_ref[["C", "D"]] + adjust_expected(expected, read_ext, engine) + result = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,D:E" ) - tm.assert_frame_equal(result, expected, check_names=False) + tm.assert_frame_equal(result, expected) def test_usecols_excel_range_str_invalid(self, read_ext): msg = 
"Invalid column name: E1" @@ -398,47 +424,42 @@ expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"]) tm.assert_frame_equal(parsed, expected) - def test_excel_cell_error_na(self, request, read_ext): - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) + def test_excel_cell_error_na(self, request, engine, read_ext): + xfail_datetimes_with_pyxlsb(engine, request) + + # https://github.com/tafia/calamine/issues/355 + if engine == "calamine" and read_ext == ".ods": + request.applymarker( + pytest.mark.xfail(reason="Calamine can't extract error from ods files") ) parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1") expected = DataFrame([[np.nan]], columns=["Test"]) tm.assert_frame_equal(parsed, expected) - def test_excel_table(self, request, read_ext, df_ref): - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + def test_excel_table(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref + adjust_expected(expected, read_ext, engine) df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0) df2 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet2", skiprows=[1], index_col=0 ) # TODO add index to file - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) df3 = pd.read_excel( "test1" + read_ext, sheet_name="Sheet1", index_col=0, skipfooter=1 ) tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_reader_special_dtypes(self, request, read_ext): - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + def test_reader_special_dtypes(self, request, engine, read_ext): + xfail_datetimes_with_pyxlsb(engine, request) + unit = get_exp_unit(read_ext, engine) expected = DataFrame.from_dict( { "IntCol": [1, 2, -3, 4, 0], @@ -446,13 +467,16 @@ "BoolCol": [True, False, True, True, False], "StrCol": [1, 2, 3, 4, 5], "Str2Col": ["a", 3, "c", "d", "e"], - "DateCol": [ - datetime(2013, 10, 30), - datetime(2013, 10, 31), - datetime(1905, 1, 1), - datetime(2013, 12, 14), - datetime(2015, 3, 14), - ], + "DateCol": Index( + [ + datetime(2013, 10, 30), + datetime(2013, 10, 31), + datetime(1905, 1, 1), + datetime(2013, 12, 14), + datetime(2015, 3, 14), + ], + dtype=f"M8[{unit}]", + ), }, ) basename = "test_types" @@ -520,7 +544,7 @@ "c": [1, 2, 3, 4], "d": [1.0, 2.0, np.nan, 4.0], } - ).reindex(columns=["a", "b", "c", "d"]) + ) tm.assert_frame_equal(actual, expected) @@ -530,7 +554,7 @@ expected["a"] = expected["a"].astype("float64") expected["b"] = expected["b"].astype("float32") - expected["c"] = ["001", "002", "003", "004"] + expected["c"] = Series(["001", "002", "003", "004"], dtype=object) tm.assert_frame_equal(actual, expected) msg = "Unable to convert column d to type int64" @@ -557,8 +581,8 @@ { "a": Series([1, 2, 3, 4], dtype="float64"), "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": ["001", "002", "003", "004"], - "d": ["1", "2", np.nan, "4"], + "c": Series(["001", "002", "003", "004"], dtype=object), + "d": Series(["1", "2", np.nan, "4"], dtype=object), } ), ), @@ -571,7 +595,7 @@ actual = pd.read_excel(basename + read_ext, dtype=dtype) tm.assert_frame_equal(actual, expected) - def 
test_dtype_backend(self, read_ext, dtype_backend): + def test_dtype_backend(self, read_ext, dtype_backend, engine): # GH#36712 if read_ext in (".xlsb", ".xls"): pytest.skip(f"No engine for filetype: '{read_ext}'") @@ -591,7 +615,7 @@ } ) with tm.ensure_clean(read_ext) as file_path: - df.to_excel(file_path, "test", index=False) + df.to_excel(file_path, sheet_name="test", index=False) result = pd.read_excel( file_path, sheet_name="test", dtype_backend=dtype_backend ) @@ -614,6 +638,9 @@ expected["j"] = ArrowExtensionArray(pa.array([None, None])) else: expected = df + unit = get_exp_unit(read_ext, engine) + expected["i"] = expected["i"].astype(f"M8[{unit}]") + tm.assert_frame_equal(result, expected) def test_dtype_backend_and_dtype(self, read_ext): @@ -623,7 +650,7 @@ df = DataFrame({"a": [np.nan, 1.0], "b": [2.5, np.nan]}) with tm.ensure_clean(read_ext) as file_path: - df.to_excel(file_path, "test", index=False) + df.to_excel(file_path, sheet_name="test", index=False) result = pd.read_excel( file_path, sheet_name="test", @@ -632,6 +659,9 @@ ) tm.assert_frame_equal(result, df) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="infer_string takes precedence" + ) def test_dtype_backend_string(self, read_ext, string_storage): # GH#36712 if read_ext in (".xlsb", ".xls"): @@ -647,7 +677,7 @@ } ) with tm.ensure_clean(read_ext) as file_path: - df.to_excel(file_path, "test", index=False) + df.to_excel(file_path, sheet_name="test", index=False) result = pd.read_excel( file_path, sheet_name="test", dtype_backend="numpy_nullable" ) @@ -668,15 +698,20 @@ ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) + @pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value): # GH#35211 basename = "df_mangle_dup_col_dtypes" - dtype_dict = {"a": str, **dtypes} + dtype_dict = {"a": object, **dtypes} dtype_dict_copy = dtype_dict.copy() # GH#42462 result = pd.read_excel(basename + read_ext, dtype=dtype_dict) - expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) + expected = DataFrame( + { + "a": Series([1], dtype=object), + "a.1": Series([exp_value], dtype=object if not dtypes else None), + } + ) assert dtype_dict == dtype_dict_copy, "dtype dict changed" tm.assert_frame_equal(result, expected) @@ -770,12 +805,7 @@ @pytest.mark.filterwarnings("ignore:Cell A4 is marked:UserWarning:openpyxl") def test_date_conversion_overflow(self, request, engine, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False - if engine == "pyxlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) expected = DataFrame( [ @@ -787,36 +817,35 @@ ) if engine == "openpyxl": - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="Maybe not supported by openpyxl") ) if engine is None and read_ext in (".xlsx", ".xlsm"): # GH 35029 - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="Defaults to openpyxl, maybe not supported") ) result = pd.read_excel("testdateoverflow" + read_ext) tm.assert_frame_equal(result, expected) - def test_sheet_name(self, request, read_ext, df_ref): - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + def test_sheet_name(self, request, read_ext, engine, df_ref): + 
xfail_datetimes_with_pyxlsb(engine, request) + filename = "test1" sheet_name = "Sheet1" + expected = df_ref + adjust_expected(expected, read_ext, engine) + df1 = pd.read_excel( filename + read_ext, sheet_name=sheet_name, index_col=0 ) # doc df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) def test_excel_read_buffer(self, read_ext): pth = "test1" + read_ext @@ -869,6 +898,11 @@ "Unsupported format, or corrupt file: Expected BOF " "record; found b'foo'" ) + elif engine == "calamine": + from python_calamine import CalamineError + + error = CalamineError + msg = "Cannot detect file format" else: error = BadZipFile msg = "File is not a zip file" @@ -962,10 +996,13 @@ f.read() def test_reader_seconds(self, request, engine, read_ext): - if engine == "pyxlsb": - request.node.add_marker( + xfail_datetimes_with_pyxlsb(engine, request) + + # GH 55045 + if engine == "calamine" and read_ext == ".ods": + request.applymarker( pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" + reason="ODS file contains bad datetime (seconds as text)" ) ) @@ -994,14 +1031,11 @@ actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) - def test_read_excel_multiindex(self, request, read_ext): + def test_read_excel_multiindex(self, request, engine, read_ext): # see gh-4679 - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) + + unit = get_exp_unit(read_ext, engine) mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) mi_file = "testmultiindex" + read_ext @@ -1016,6 +1050,7 @@ ], columns=mi, ) + expected[mi[2]] = expected[mi[2]].astype(f"M8[{unit}]") actual = pd.read_excel( mi_file, sheet_name="mi_column", header=[0, 1], index_col=0 @@ -1027,7 +1062,7 @@ expected.columns = ["a", "b", "c", "d"] actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected, check_names=False) + tm.assert_frame_equal(actual, expected) # "both" sheet expected.columns = mi @@ -1035,7 +1070,7 @@ actual = pd.read_excel( mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1] ) - tm.assert_frame_equal(actual, expected, check_names=False) + tm.assert_frame_equal(actual, expected) # "mi_index_name" sheet expected.columns = ["a", "b", "c", "d"] @@ -1088,18 +1123,16 @@ ], ) def test_read_excel_multiindex_blank_after_name( - self, request, read_ext, sheet_name, idx_lvl2 + self, request, engine, read_ext, sheet_name, idx_lvl2 ): # GH34673 - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb (GH4679" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) mi_file = "testmultiindex" + read_ext mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) + + unit = get_exp_unit(read_ext, engine) + expected = DataFrame( [ [1, 2.5, pd.Timestamp("2015-01-01"), True], @@ -1113,6 +1146,7 @@ names=["ilvl1", "ilvl2"], ), ) + expected[mi[2]] = expected[mi[2]].astype(f"M8[{unit}]") result = pd.read_excel( mi_file, sheet_name=sheet_name, @@ -1203,7 +1237,7 @@ expected.index = mi actual = pd.read_excel(filename, sheet_name="multi_no_names", index_col=[0, 1]) - 
tm.assert_frame_equal(actual, expected, check_names=False) + tm.assert_frame_equal(actual, expected) def test_read_excel_bool_header_arg(self, read_ext): # GH 6114 @@ -1212,14 +1246,11 @@ with pytest.raises(TypeError, match=msg): pd.read_excel("test1" + read_ext, header=arg) - def test_read_excel_skiprows(self, request, read_ext): + def test_read_excel_skiprows(self, request, engine, read_ext): # GH 4903 - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) + + unit = get_exp_unit(read_ext, engine) actual = pd.read_excel( "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2] @@ -1233,6 +1264,7 @@ ], columns=["a", "b", "c", "d"], ) + expected["c"] = expected["c"].astype(f"M8[{unit}]") tm.assert_frame_equal(actual, expected) actual = pd.read_excel( @@ -1265,16 +1297,13 @@ ], columns=["a", "b", "c", "d"], ) + expected["c"] = expected["c"].astype(f"M8[{unit}]") tm.assert_frame_equal(actual, expected) - def test_read_excel_skiprows_callable_not_in(self, request, read_ext): + def test_read_excel_skiprows_callable_not_in(self, request, engine, read_ext): # GH 4903 - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) + unit = get_exp_unit(read_ext, engine) actual = pd.read_excel( "testskiprows" + read_ext, @@ -1290,6 +1319,7 @@ ], columns=["a", "b", "c", "d"], ) + expected["c"] = expected["c"].astype(f"M8[{unit}]") tm.assert_frame_equal(actual, expected) def test_read_excel_nrows(self, read_ext): @@ -1397,10 +1427,10 @@ def test_ignore_chartsheets_by_str(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="pyxlsb can't distinguish chartsheets from worksheets" ) @@ -1410,10 +1440,10 @@ def test_ignore_chartsheets_by_int(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="pyxlsb can't distinguish chartsheets from worksheets" ) @@ -1542,25 +1572,23 @@ expected = DataFrame(expected, columns=["Test"]) tm.assert_frame_equal(parsed, expected) - def test_excel_table_sheet_by_index(self, request, read_ext, df_ref): - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + def test_excel_table_sheet_by_index(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref + adjust_expected(expected, read_ext, engine) with pd.ExcelFile("test1" + read_ext) as excel: df1 = pd.read_excel(excel, sheet_name=0, index_col=0) df2 = pd.read_excel(excel, sheet_name=1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) with pd.ExcelFile("test1" + read_ext) as excel: df1 = excel.parse(0, index_col=0) df2 = excel.parse(1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, 
check_names=False) + tm.assert_frame_equal(df1, expected) + tm.assert_frame_equal(df2, expected) with pd.ExcelFile("test1" + read_ext) as excel: df3 = pd.read_excel(excel, sheet_name=0, index_col=0, skipfooter=1) @@ -1571,13 +1599,11 @@ tm.assert_frame_equal(df3, df1.iloc[:-1]) - def test_sheet_name(self, request, read_ext, df_ref): - if read_ext == ".xlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + def test_sheet_name(self, request, engine, read_ext, df_ref): + xfail_datetimes_with_pyxlsb(engine, request) + + expected = df_ref + adjust_expected(expected, read_ext, engine) filename = "test1" sheet_name = "Sheet1" @@ -1588,8 +1614,8 @@ with pd.ExcelFile(filename + read_ext) as excel: df2_parse = excel.parse(index_col=0, sheet_name=sheet_name) - tm.assert_frame_equal(df1_parse, df_ref, check_names=False) - tm.assert_frame_equal(df2_parse, df_ref, check_names=False) + tm.assert_frame_equal(df1_parse, expected) + tm.assert_frame_equal(df2_parse, expected) @pytest.mark.parametrize( "sheet_name", @@ -1641,7 +1667,7 @@ def test_excel_read_binary_via_read_excel(self, read_ext, engine): # GH 38424 with open("test1" + read_ext, "rb") as f: - result = pd.read_excel(f) + result = pd.read_excel(f, engine=engine) expected = pd.read_excel("test1" + read_ext, engine=engine) tm.assert_frame_equal(result, expected) @@ -1664,21 +1690,19 @@ def test_read_datetime_multiindex(self, request, engine, read_ext): # GH 34748 - if engine == "pyxlsb": - request.node.add_marker( - pytest.mark.xfail( - reason="Sheets containing datetimes not supported by pyxlsb" - ) - ) + xfail_datetimes_with_pyxlsb(engine, request) f = "test_datetime_mi" + read_ext with pd.ExcelFile(f) as excel: actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine) - expected_column_index = MultiIndex.from_tuples( - [(pd.to_datetime("02/29/2020"), pd.to_datetime("03/01/2020"))], + + unit = get_exp_unit(read_ext, engine) + dti = pd.DatetimeIndex(["2020-02-29", "2020-03-01"], dtype=f"M8[{unit}]") + expected_column_index = MultiIndex.from_arrays( + [dti[:1], dti[1:]], names=[ - pd.to_datetime("02/29/2020").to_pydatetime(), - pd.to_datetime("03/01/2020").to_pydatetime(), + dti[0].to_pydatetime(), + dti[1].to_pydatetime(), ], ) expected = DataFrame([], index=[], columns=expected_column_index) @@ -1693,10 +1717,10 @@ def test_ignore_chartsheets(self, request, engine, read_ext): # GH 41448 - if engine == "odf": + if read_ext == ".ods": pytest.skip("chartsheets do not exist in the ODF format") if engine == "pyxlsb": - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="pyxlsb can't distinguish chartsheets from worksheets" ) @@ -1713,6 +1737,10 @@ import xlrd errors = (BadZipFile, xlrd.biffh.XLRDError) + elif engine == "calamine": + from python_calamine import CalamineError + + errors = (CalamineError,) with tm.ensure_clean(f"corrupt{read_ext}") as file: Path(file).write_text("corrupt", encoding="utf-8") diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/excel/test_style.py pandas-2.2.2+dfsg/pandas/tests/io/excel/test_style.py --- pandas-2.1.4+dfsg/pandas/tests/io/excel/test_style.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/excel/test_style.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows import pandas.util._test_decorators as td from pandas import ( @@ -20,6 +21,9 @@ # could compute styles and render to excel without jinja2, since 
there is no # 'template' file, but this needs the import error to delayed until render time. +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + def assert_equal_cell_styles(cell1, cell2): # TODO: should find a better way to check equality diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/excel/test_writers.py pandas-2.2.2+dfsg/pandas/tests/io/excel/test_writers.py --- pandas-2.1.4+dfsg/pandas/tests/io/excel/test_writers.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/excel/test_writers.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,7 +11,9 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows from pandas.compat._constants import PY310 +from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td import pandas as pd @@ -19,6 +21,7 @@ DataFrame, Index, MultiIndex, + date_range, option_context, ) import pandas._testing as tm @@ -32,6 +35,26 @@ ) from pandas.io.excel._util import _writers +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + + +def get_exp_unit(path: str) -> str: + return "ns" + + +@pytest.fixture +def frame(float_frame): + """ + Returns the first ten items in fixture "float_frame". + """ + return float_frame[:10] + + +@pytest.fixture(params=[True, False]) +def merge_cells(request): + return request.param + @pytest.fixture def path(ext): @@ -79,7 +102,7 @@ df = DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) with tm.ensure_clean(ext) as path: - df.to_excel(path, filename, index=False, header=False) + df.to_excel(path, sheet_name=filename, index=False, header=False) result = pd.read_excel( path, sheet_name=filename, usecols=[0], header=header ) @@ -95,7 +118,7 @@ df = DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) with tm.ensure_clean(ext) as path: - df.to_excel(path, "with_header", index=False, header=True) + df.to_excel(path, sheet_name="with_header", index=False, header=True) result = pd.read_excel( path, sheet_name=filename, usecols=[0], header=header ) @@ -109,8 +132,10 @@ with tm.ensure_clean(ext) as pth: with ExcelWriter(pth) as writer: - refdf.to_excel(writer, "Data_no_head", header=False, index=False) - refdf.to_excel(writer, "Data_with_head", index=False) + refdf.to_excel( + writer, sheet_name="Data_no_head", header=False, index=False + ) + refdf.to_excel(writer, sheet_name="Data_with_head", index=False) refdf.columns = ["A", "B"] @@ -145,7 +170,7 @@ with tm.ensure_clean(ext) as pth: with ExcelWriter(pth) as ew: for sheetname, df in dfs.items(): - df.to_excel(ew, sheetname) + df.to_excel(ew, sheet_name=sheetname) dfs_returned = pd.read_excel(pth, sheet_name=sheets, index_col=0) @@ -199,8 +224,8 @@ actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) - @pytest.mark.parametrize("c_idx_names", [True, False]) - @pytest.mark.parametrize("r_idx_names", [True, False]) + @pytest.mark.parametrize("c_idx_names", ["a", None]) + @pytest.mark.parametrize("r_idx_names", ["b", None]) @pytest.mark.parametrize("c_idx_levels", [1, 3]) @pytest.mark.parametrize("r_idx_levels", [1, 3]) def test_excel_multindex_roundtrip( @@ -208,21 +233,28 @@ ): # see gh-4679 with tm.ensure_clean(ext) as pth: - if (c_idx_levels == 1 and c_idx_names) and not ( - r_idx_levels == 3 and not r_idx_names - ): - mark = pytest.mark.xfail( - reason="Column index name cannot be serialized unless " - "it's a MultiIndex" - ) - request.node.add_marker(mark) - # Empty name case current read in as # unnamed levels, 
not Nones. - check_names = r_idx_names or r_idx_levels <= 1 + check_names = bool(r_idx_names) or r_idx_levels <= 1 - df = tm.makeCustomDataframe( - 5, 5, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels + if c_idx_levels == 1: + columns = Index(list("abcde")) + else: + columns = MultiIndex.from_arrays( + [range(5) for _ in range(c_idx_levels)], + names=[f"{c_idx_names}-{i}" for i in range(c_idx_levels)], + ) + if r_idx_levels == 1: + index = Index(list("ghijk")) + else: + index = MultiIndex.from_arrays( + [range(5) for _ in range(r_idx_levels)], + names=[f"{r_idx_names}-{i}" for i in range(r_idx_levels)], + ) + df = DataFrame( + 1.1 * np.ones((5, 5)), + columns=columns, + index=index, ) df.to_excel(pth) @@ -255,7 +287,7 @@ def test_read_excel_parse_dates(self, ext): # see gh-11544, gh-12051 df = DataFrame( - {"col": [1, 2, 3], "date_strings": pd.date_range("2012-01-01", periods=3)} + {"col": [1, 2, 3], "date_strings": date_range("2012-01-01", periods=3)} ) df2 = df.copy() df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") @@ -293,7 +325,7 @@ [ range(4), pd.interval_range( - start=pd.Timestamp("2020-01-01"), periods=4, freq="6M" + start=pd.Timestamp("2020-01-01"), periods=4, freq="6ME" ), ] ) @@ -307,10 +339,10 @@ [ range(4), [ - "(2020-01-31, 2020-07-31]", - "(2020-07-31, 2021-01-31]", - "(2021-01-31, 2021-07-31]", - "(2021-07-31, 2022-01-31]", + "(2020-01-31 00:00:00, 2020-07-31 00:00:00]", + "(2020-07-31 00:00:00, 2021-01-31 00:00:00]", + "(2021-01-31 00:00:00, 2021-07-31 00:00:00]", + "(2021-07-31 00:00:00, 2022-01-31 00:00:00]", ], ] ), @@ -373,10 +405,10 @@ def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: - frame.to_excel(writer, "Data1") + frame.to_excel(writer, sheet_name="Data1") frame2 = frame.copy() frame2.columns = frame.columns[::-1] - frame2.to_excel(writer, "Data2") + frame2.to_excel(writer, sheet_name="Data2") with ExcelFile(path) as reader: found_df = pd.read_excel(reader, sheet_name="Data1", index_col=0) @@ -389,42 +421,42 @@ frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, "test1") - frame.to_excel(path, "test1", columns=["A", "B"]) - frame.to_excel(path, "test1", header=False) - frame.to_excel(path, "test1", index=False) + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) # test roundtrip - frame.to_excel(path, "test1") + frame.to_excel(path, sheet_name="test1") recons = pd.read_excel(path, sheet_name="test1", index_col=0) tm.assert_frame_equal(frame, recons) - frame.to_excel(path, "test1", index=False) + frame.to_excel(path, sheet_name="test1", index=False) recons = pd.read_excel(path, sheet_name="test1", index_col=None) recons.index = frame.index tm.assert_frame_equal(frame, recons) - frame.to_excel(path, "test1", na_rep="NA") + frame.to_excel(path, sheet_name="test1", na_rep="NA") recons = pd.read_excel(path, sheet_name="test1", index_col=0, na_values=["NA"]) tm.assert_frame_equal(frame, recons) # GH 3611 - frame.to_excel(path, "test1", na_rep="88") + frame.to_excel(path, sheet_name="test1", na_rep="88") recons = pd.read_excel(path, sheet_name="test1", index_col=0, na_values=["88"]) tm.assert_frame_equal(frame, recons) - frame.to_excel(path, "test1", na_rep="88") + frame.to_excel(path, sheet_name="test1", na_rep="88") recons = pd.read_excel( path, sheet_name="test1", index_col=0, na_values=[88, 
88.0] ) tm.assert_frame_equal(frame, recons) # GH 6573 - frame.to_excel(path, "Sheet1") + frame.to_excel(path, sheet_name="Sheet1") recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(frame, recons) - frame.to_excel(path, "0") + frame.to_excel(path, sheet_name="0") recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(frame, recons) @@ -438,30 +470,38 @@ mixed_frame = frame.copy() mixed_frame["foo"] = "bar" - mixed_frame.to_excel(path, "test1") + mixed_frame.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(mixed_frame, recons) - def test_ts_frame(self, tsframe, path): - df = tsframe + def test_ts_frame(self, path): + unit = get_exp_unit(path) + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD")), + index=date_range("2000-01-01", periods=5, freq="B"), + ) # freq doesn't round-trip index = pd.DatetimeIndex(np.asarray(df.index), freq=None) df.index = index - df.to_excel(path, "test1") + expected = df[:] + expected.index = expected.index.as_unit(unit) + + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) - tm.assert_frame_equal(df, recons) + tm.assert_frame_equal(expected, recons) def test_basics_with_nan(self, frame, path): frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, "test1") - frame.to_excel(path, "test1", columns=["A", "B"]) - frame.to_excel(path, "test1", header=False) - frame.to_excel(path, "test1", index=False) + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) @pytest.mark.parametrize("np_type", [np.int8, np.int16, np.int32, np.int64]) def test_int_types(self, np_type, path): @@ -470,7 +510,7 @@ df = DataFrame( np.random.default_rng(2).integers(-10, 10, size=(10, 2)), dtype=np_type ) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) @@ -485,7 +525,7 @@ def test_float_types(self, np_type, path): # Test np.float values read come back as float. df = DataFrame(np.random.default_rng(2).random(10), dtype=np_type) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( @@ -497,7 +537,7 @@ def test_bool_types(self, path): # Test np.bool_ values read come back as float. 
df = DataFrame([1, 0, True, False], dtype=np.bool_) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( @@ -508,35 +548,44 @@ def test_inf_roundtrip(self, path): df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(df, recons) - def test_sheets(self, frame, tsframe, path): + def test_sheets(self, frame, path): # freq doesn't round-trip + unit = get_exp_unit(path) + tsframe = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD")), + index=date_range("2000-01-01", periods=5, freq="B"), + ) index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index + expected = tsframe[:] + expected.index = expected.index.as_unit(unit) + frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, "test1") - frame.to_excel(path, "test1", columns=["A", "B"]) - frame.to_excel(path, "test1", header=False) - frame.to_excel(path, "test1", index=False) + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) # Test writing to separate sheets with ExcelWriter(path) as writer: - frame.to_excel(writer, "test1") - tsframe.to_excel(writer, "test2") + frame.to_excel(writer, sheet_name="test1") + tsframe.to_excel(writer, sheet_name="test2") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(frame, recons) recons = pd.read_excel(reader, sheet_name="test2", index_col=0) - tm.assert_frame_equal(tsframe, recons) + tm.assert_frame_equal(expected, recons) assert 2 == len(reader.sheet_names) assert "test1" == reader.sheet_names[0] assert "test2" == reader.sheet_names[1] @@ -545,14 +594,14 @@ frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, "test1") - frame.to_excel(path, "test1", columns=["A", "B"]) - frame.to_excel(path, "test1", header=False) - frame.to_excel(path, "test1", index=False) + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) # column aliases col_aliases = Index(["AA", "X", "Y", "Z"]) - frame.to_excel(path, "test1", header=col_aliases) + frame.to_excel(path, sheet_name="test1", header=col_aliases) with ExcelFile(path) as reader: rs = pd.read_excel(reader, sheet_name="test1", index_col=0) xp = frame.copy() @@ -563,14 +612,16 @@ frame = frame.copy() frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - frame.to_excel(path, "test1") - frame.to_excel(path, "test1", columns=["A", "B"]) - frame.to_excel(path, "test1", header=False) - frame.to_excel(path, "test1", index=False) + frame.to_excel(path, sheet_name="test1") + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", index=False) # test index_label df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 - df.to_excel(path, "test1", index_label=["test"], merge_cells=merge_cells) + df.to_excel( + path, 
sheet_name="test1", index_label=["test"], merge_cells=merge_cells + ) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np.int64 @@ -581,7 +632,7 @@ df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 df.to_excel( path, - "test1", + sheet_name="test1", index_label=["test", "dummy", "dummy2"], merge_cells=merge_cells, ) @@ -593,7 +644,9 @@ assert df.index.names == recons.index.names df = DataFrame(np.random.default_rng(2).standard_normal((10, 2))) >= 0 - df.to_excel(path, "test1", index_label="test", merge_cells=merge_cells) + df.to_excel( + path, sheet_name="test1", index_label="test", merge_cells=merge_cells + ) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0).astype( np.int64 @@ -603,7 +656,7 @@ frame.to_excel( path, - "test1", + sheet_name="test1", columns=["A", "B", "C", "D"], index=False, merge_cells=merge_cells, @@ -628,27 +681,37 @@ tm.assert_frame_equal(result, df) assert result.index.name == "foo" - def test_excel_roundtrip_datetime(self, merge_cells, tsframe, path): + def test_excel_roundtrip_datetime(self, merge_cells, path): # datetime.date, not sure what to test here exactly + unit = get_exp_unit(path) # freq does not round-trip + tsframe = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD")), + index=date_range("2000-01-01", periods=5, freq="B"), + ) index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index tsf = tsframe.copy() tsf.index = [x.date() for x in tsframe.index] - tsf.to_excel(path, "test1", merge_cells=merge_cells) + tsf.to_excel(path, sheet_name="test1", merge_cells=merge_cells) with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) - tm.assert_frame_equal(tsframe, recons) + expected = tsframe[:] + expected.index = expected.index.as_unit(unit) + tm.assert_frame_equal(expected, recons) def test_excel_date_datetime_format(self, ext, path): # see gh-4133 # # Excel output format strings + unit = get_exp_unit(path) + df = DataFrame( [ [date(2014, 1, 31), date(1999, 9, 24)], @@ -665,17 +728,18 @@ index=["DATE", "DATETIME"], columns=["X", "Y"], ) + df_expected = df_expected.astype(f"M8[{unit}]") with tm.ensure_clean(ext) as filename2: with ExcelWriter(path) as writer1: - df.to_excel(writer1, "test1") + df.to_excel(writer1, sheet_name="test1") with ExcelWriter( filename2, date_format="DD.MM.YYYY", datetime_format="DD.MM.YYYY HH-MM-SS", ) as writer2: - df.to_excel(writer2, "test1") + df.to_excel(writer2, sheet_name="test1") with ExcelFile(path) as reader1: rs1 = pd.read_excel(reader1, sheet_name="test1", index_col=0) @@ -689,7 +753,7 @@ # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) - def test_to_excel_interval_no_labels(self, path): + def test_to_excel_interval_no_labels(self, path, using_infer_string): # see gh-19242 # # Test writing Interval without labels. 
@@ -699,9 +763,11 @@ expected = df.copy() df["new"] = pd.cut(df[0], 10) - expected["new"] = pd.cut(expected[0], 10).astype(str) + expected["new"] = pd.cut(expected[0], 10).astype( + str if not using_infer_string else "string[pyarrow_numpy]" + ) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) @@ -720,7 +786,7 @@ df["new"] = intervals expected["new"] = pd.Series(list(intervals)) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) @@ -741,15 +807,21 @@ lambda x: timedelta(seconds=x).total_seconds() / 86400 ) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(expected, recons) - def test_to_excel_periodindex(self, tsframe, path): - xp = tsframe.resample("M", kind="period").mean() + def test_to_excel_periodindex(self, path): + # xp has a PeriodIndex + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD")), + index=date_range("2000-01-01", periods=5, freq="B"), + ) + xp = df.resample("ME").mean().to_period("M") - xp.to_excel(path, "sht1") + xp.to_excel(path, sheet_name="sht1") with ExcelFile(path) as reader: rs = pd.read_excel(reader, sheet_name="sht1", index_col=0) @@ -760,11 +832,11 @@ new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - frame.to_excel(path, "test1", header=False) - frame.to_excel(path, "test1", columns=["A", "B"]) + frame.to_excel(path, sheet_name="test1", header=False) + frame.to_excel(path, sheet_name="test1", columns=["A", "B"]) # round trip - frame.to_excel(path, "test1", merge_cells=merge_cells) + frame.to_excel(path, sheet_name="test1", merge_cells=merge_cells) with ExcelFile(path) as reader: df = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) tm.assert_frame_equal(frame, df) @@ -799,23 +871,33 @@ header = 0 # round trip - frame.to_excel(path, "test1", merge_cells=merge_cells) + frame.to_excel(path, sheet_name="test1", merge_cells=merge_cells) with ExcelFile(path) as reader: df = pd.read_excel( reader, sheet_name="test1", header=header, index_col=[0, 1] ) if not merge_cells: - fm = frame.columns.format(sparsify=False, adjoin=False, names=False) + fm = frame.columns._format_multi(sparsify=False, include_names=False) frame.columns = [".".join(map(str, q)) for q in zip(*fm)] tm.assert_frame_equal(frame, df) - def test_to_excel_multiindex_dates(self, merge_cells, tsframe, path): + def test_to_excel_multiindex_dates(self, merge_cells, path): # try multiindex with dates - new_index = [tsframe.index, np.arange(len(tsframe.index), dtype=np.int64)] - tsframe.index = MultiIndex.from_arrays(new_index) + unit = get_exp_unit(path) + tsframe = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD")), + index=date_range("2000-01-01", periods=5, freq="B"), + ) + tsframe.index = MultiIndex.from_arrays( + [ + tsframe.index.as_unit(unit), + np.arange(len(tsframe.index), dtype=np.int64), + ], + names=["time", "foo"], + ) - tsframe.index.names = ["time", "foo"] - tsframe.to_excel(path, "test1", merge_cells=merge_cells) + tsframe.to_excel(path, sheet_name="test1", merge_cells=merge_cells) with ExcelFile(path) as reader: 
recons = pd.read_excel(reader, sheet_name="test1", index_col=[0, 1]) @@ -834,7 +916,7 @@ frame2.index = multi_index # Write out to Excel without the index. - frame2.to_excel(path, "test1", index=False) + frame2.to_excel(path, sheet_name="test1", index=False) # Read it back in. with ExcelFile(path) as reader: @@ -848,7 +930,7 @@ expected = DataFrame([], columns=[0, 1, 2]) df = DataFrame([], index=MultiIndex.from_tuples([], names=[0, 1]), columns=[2]) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") with ExcelFile(path) as reader: result = pd.read_excel(reader, sheet_name="test1") @@ -862,7 +944,7 @@ index=["A", "B"], columns=["X", "Y", "Z"], ) - df.to_excel(path, "test1", float_format="%.2f") + df.to_excel(path, sheet_name="test1", float_format="%.2f") with ExcelFile(path) as reader: result = pd.read_excel(reader, sheet_name="test1", index_col=0) @@ -900,7 +982,7 @@ index=["A", "B"], columns=["X", "Y", "Z"], ) - df.to_excel(filename, "test1", float_format="%.2f") + df.to_excel(filename, sheet_name="test1", float_format="%.2f") with ExcelFile(filename) as reader: result = pd.read_excel(reader, sheet_name="test1", index_col=0) @@ -940,8 +1022,25 @@ # ensure limited functionality in 0.10 # override of gh-2370 until sorted out in 0.11 - df = tm.makeCustomDataframe( - nrows, ncols, r_idx_nlevels=r_idx_nlevels, c_idx_nlevels=c_idx_nlevels + if c_idx_nlevels == 1: + columns = Index([f"a-{i}" for i in range(ncols)], dtype=object) + else: + columns = MultiIndex.from_arrays( + [range(ncols) for _ in range(c_idx_nlevels)], + names=[f"i-{i}" for i in range(c_idx_nlevels)], + ) + if r_idx_nlevels == 1: + index = Index([f"b-{i}" for i in range(nrows)], dtype=object) + else: + index = MultiIndex.from_arrays( + [range(nrows) for _ in range(r_idx_nlevels)], + names=[f"j-{i}" for i in range(r_idx_nlevels)], + ) + + df = DataFrame( + np.ones((nrows, ncols)), + columns=columns, + index=index, ) # This if will be removed once multi-column Excel writing @@ -970,7 +1069,7 @@ def test_duplicated_columns(self, path): # see gh-5235 df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B"]) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") expected = DataFrame( [[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B.1"] ) @@ -981,7 +1080,7 @@ # see gh-11007, gh-10970 df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) - df.to_excel(path, "test1") + df.to_excel(path, sheet_name="test1") result = pd.read_excel(path, sheet_name="test1", index_col=0) expected = DataFrame( @@ -990,7 +1089,7 @@ tm.assert_frame_equal(result, expected) # see gh-10982 - df.to_excel(path, "test1", index=False, header=False) + df.to_excel(path, sheet_name="test1", index=False, header=False) result = pd.read_excel(path, sheet_name="test1", header=None) expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) @@ -999,7 +1098,7 @@ def test_swapped_columns(self, path): # Test for issue #5427. 
write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) - write_frame.to_excel(path, "test1", columns=["B", "A"]) + write_frame.to_excel(path, sheet_name="test1", columns=["B", "A"]) read_frame = pd.read_excel(path, sheet_name="test1", header=0) @@ -1011,12 +1110,12 @@ write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) with pytest.raises(KeyError, match="Not all names specified"): - write_frame.to_excel(path, "test1", columns=["B", "C"]) + write_frame.to_excel(path, sheet_name="test1", columns=["B", "C"]) with pytest.raises( KeyError, match="'passes columns are not ALL present dataframe'" ): - write_frame.to_excel(path, "test1", columns=["C", "D"]) + write_frame.to_excel(path, sheet_name="test1", columns=["C", "D"]) @pytest.mark.parametrize( "to_excel_index,read_excel_index_col", @@ -1029,7 +1128,7 @@ # GH 31677 write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2], "C": [3, 3, 3]}) write_frame.to_excel( - path, "col_subset_bug", columns=["A", "B"], index=to_excel_index + path, sheet_name="col_subset_bug", columns=["A", "B"], index=to_excel_index ) expected = write_frame[["A", "B"]] @@ -1046,7 +1145,7 @@ # Create file to read in. df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) - df.to_excel(path, "test_c") + df.to_excel(path, sheet_name="test_c") # Read file without comment arg. result1 = pd.read_excel(path, sheet_name="test_c", index_col=0) @@ -1064,7 +1163,7 @@ # Create file to read in df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) - df.to_excel(path, "test_c") + df.to_excel(path, sheet_name="test_c") # Read file with default and explicit comment=None result1 = pd.read_excel(path, sheet_name="test_c") @@ -1078,7 +1177,7 @@ # Create file to read in. df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) - df.to_excel(path, "test_c") + df.to_excel(path, sheet_name="test_c") # Test read_frame_comment against manually produced expected output. expected = DataFrame({"A": ["one", None, "one"], "B": ["two", None, None]}) @@ -1099,6 +1198,7 @@ def test_datetimes(self, path): # Test writing and reading datetimes. For issue #9139. 
(xref #9185) + unit = get_exp_unit(path) datetimes = [ datetime(2013, 1, 13, 1, 2, 3), datetime(2013, 1, 13, 2, 45, 56), @@ -1114,10 +1214,11 @@ ] write_frame = DataFrame({"A": datetimes}) - write_frame.to_excel(path, "Sheet1") + write_frame.to_excel(path, sheet_name="Sheet1") read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) - tm.assert_series_equal(write_frame["A"], read_frame["A"]) + expected = write_frame.astype(f"M8[{unit}]") + tm.assert_series_equal(expected["A"], read_frame["A"]) def test_bytes_io(self, engine): # see gh-7074 @@ -1173,7 +1274,7 @@ "str": ["apple", "banana", "cherry"], } ) - df.to_excel(path, "Sheet1") + df.to_excel(path, sheet_name="Sheet1") read = pd.read_excel(path, sheet_name="Sheet1", header=0, index_col=0) expected = df.copy() @@ -1185,15 +1286,16 @@ def test_render_as_column_name(self, path): # see gh-34331 df = DataFrame({"render": [1, 2], "data": [3, 4]}) - df.to_excel(path, "Sheet1") + df.to_excel(path, sheet_name="Sheet1") read = pd.read_excel(path, "Sheet1", index_col=0) expected = df tm.assert_frame_equal(read, expected) def test_true_and_false_value_options(self, path): # see gh-13347 - df = DataFrame([["foo", "bar"]], columns=["col1", "col2"]) - expected = df.replace({"foo": True, "bar": False}) + df = DataFrame([["foo", "bar"]], columns=["col1", "col2"], dtype=object) + with option_context("future.no_silent_downcasting", True): + expected = df.replace({"foo": True, "bar": False}).astype("bool") df.to_excel(path) read_frame = pd.read_excel( @@ -1204,13 +1306,17 @@ def test_freeze_panes(self, path): # see gh-15160 expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) - expected.to_excel(path, "Sheet1", freeze_panes=(1, 1)) + expected.to_excel(path, sheet_name="Sheet1", freeze_panes=(1, 1)) result = pd.read_excel(path, index_col=0) tm.assert_frame_equal(result, expected) def test_path_path_lib(self, engine, ext): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) @@ -1218,7 +1324,11 @@ tm.assert_frame_equal(result, df) def test_path_local_path(self, engine, ext): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), + ) writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) @@ -1303,7 +1413,9 @@ def test_ExcelWriter_dispatch(self, klass, ext): with tm.ensure_clean(ext) as path: with ExcelWriter(path) as writer: - if ext == ".xlsx" and td.safe_import("xlsxwriter"): + if ext == ".xlsx" and bool( + import_optional_dependency("xlsxwriter", errors="ignore") + ): # xlsxwriter has preference over openpyxl if both installed assert isinstance(writer, _XlsxWriter) else: @@ -1349,7 +1461,11 @@ with tm.ensure_clean(path) as filepath: with ExcelWriter(filepath) as writer: assert isinstance(writer, DummyClass) - df = tm.makeCustomDataframe(1, 1) + df = DataFrame( + ["a"], + columns=Index(["b"], name="foo"), + index=Index(["c"], name="bar"), + ) df.to_excel(filepath) DummyClass.assert_called_and_reset() @@ -1374,6 +1490,18 @@ with ExcelWriter(path) as writer: assert os.fspath(writer) == str(path) + def test_to_excel_pos_args_deprecation(self): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_excel except " + r"for the 
argument 'excel_writer' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + buf = BytesIO() + writer = ExcelWriter(buf) + df.to_excel(writer, "Sheet_name_1") + @pytest.mark.parametrize("klass", _writers.values()) def test_subclass_attr(klass): diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/excel/test_xlrd.py pandas-2.2.2+dfsg/pandas/tests/io/excel/test_xlrd.py --- pandas-2.1.4+dfsg/pandas/tests/io/excel/test_xlrd.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/excel/test_xlrd.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,10 @@ import io +import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm @@ -10,6 +13,9 @@ xlrd = pytest.importorskip("xlrd") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture(params=[".xls"]) def read_ext_xlrd(request): @@ -44,6 +50,17 @@ pd.read_excel(path, engine="xlrd") +def test_nan_in_xls(datapath): + # GH 54564 + path = datapath("io", "data", "excel", "test6.xls") + + expected = pd.DataFrame({0: np.r_[0, 2].astype("int64"), 1: np.r_[1, np.nan]}) + + result = pd.read_excel(path, header=None) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "file_header", [ diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/excel/test_xlsxwriter.py pandas-2.2.2+dfsg/pandas/tests/io/excel/test_xlsxwriter.py --- pandas-2.1.4+dfsg/pandas/tests/io/excel/test_xlsxwriter.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/excel/test_xlsxwriter.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,6 +2,8 @@ import pytest +from pandas.compat import is_platform_windows + from pandas import DataFrame import pandas._testing as tm @@ -9,7 +11,13 @@ xlsxwriter = pytest.importorskip("xlsxwriter") -pytestmark = pytest.mark.parametrize("ext", [".xlsx"]) +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + + +@pytest.fixture +def ext(): + return ".xlsx" def test_column_format(ext): diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/style/test_bar.py pandas-2.2.2+dfsg/pandas/tests/io/formats/style/test_bar.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/style/test_bar.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/style/test_bar.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,13 @@ +import io + import numpy as np import pytest -from pandas import DataFrame +from pandas import ( + NA, + DataFrame, + read_csv, +) pytest.importorskip("jinja2") @@ -305,3 +311,48 @@ msg = r"`height` must be a value in \[0, 100\]" with pytest.raises(ValueError, match=msg): df.style.bar(height=200).to_html() + + +def test_bar_color_and_cmap_error_raises(): + df = DataFrame({"A": [1, 2, 3, 4]}) + msg = "`color` and `cmap` cannot both be given" + # Test that providing both color and cmap raises a ValueError + with pytest.raises(ValueError, match=msg): + df.style.bar(color="#d65f5f", cmap="viridis").to_html() + + +def test_bar_invalid_color_type_error_raises(): + df = DataFrame({"A": [1, 2, 3, 4]}) + msg = ( + r"`color` must be string or list or tuple of 2 strings," + r"\(eg: color=\['#d65f5f', '#5fba7d'\]\)" + ) + # Test that providing an invalid color type raises a ValueError + with pytest.raises(ValueError, match=msg): + df.style.bar(color=123).to_html() + + # Test that providing a color list with more than two elements raises a ValueError + with pytest.raises(ValueError, match=msg): + df.style.bar(color=["#d65f5f", "#5fba7d", "#abcdef"]).to_html() + + +def 
test_styler_bar_with_NA_values(): + df1 = DataFrame({"A": [1, 2, NA, 4]}) + df2 = DataFrame([[NA, NA], [NA, NA]]) + expected_substring = "style type=" + html_output1 = df1.style.bar(subset="A").to_html() + html_output2 = df2.style.bar(align="left", axis=None).to_html() + assert expected_substring in html_output1 + assert expected_substring in html_output2 + + +def test_style_bar_with_pyarrow_NA_values(): + data = """name,age,test1,test2,teacher + Adam,15,95.0,80,Ashby + Bob,16,81.0,82,Ashby + Dave,16,89.0,84,Jones + Fred,15,,88,Jones""" + df = read_csv(io.StringIO(data), dtype_backend="pyarrow") + expected_substring = "style type=" + html_output = df.style.bar(subset="test1").to_html() + assert expected_substring in html_output diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/style/test_to_latex.py pandas-2.2.2+dfsg/pandas/tests/io/formats/style/test_to_latex.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/style/test_to_latex.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/style/test_to_latex.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,6 +6,7 @@ from pandas import ( DataFrame, MultiIndex, + Series, option_context, ) @@ -22,7 +23,9 @@ @pytest.fixture def df(): - return DataFrame({"A": [0, 1], "B": [-0.61, -1.22], "C": ["ab", "cd"]}) + return DataFrame( + {"A": [0, 1], "B": [-0.61, -1.22], "C": Series(["ab", "cd"], dtype=object)} + ) @pytest.fixture diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/style/test_to_string.py pandas-2.2.2+dfsg/pandas/tests/io/formats/style/test_to_string.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/style/test_to_string.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/style/test_to_string.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,7 +2,10 @@ import pytest -from pandas import DataFrame +from pandas import ( + DataFrame, + Series, +) pytest.importorskip("jinja2") from pandas.io.formats.style import Styler @@ -10,7 +13,9 @@ @pytest.fixture def df(): - return DataFrame({"A": [0, 1], "B": [-0.61, -1.22], "C": ["ab", "cd"]}) + return DataFrame( + {"A": [0, 1], "B": [-0.61, -1.22], "C": Series(["ab", "cd"], dtype=object)} + ) @pytest.fixture diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/test_eng_formatting.py pandas-2.2.2+dfsg/pandas/tests/io/formats/test_eng_formatting.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/test_eng_formatting.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/test_eng_formatting.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,16 +1,39 @@ import numpy as np +import pytest -from pandas import DataFrame -import pandas._testing as tm +from pandas import ( + DataFrame, + reset_option, + set_eng_float_format, +) -import pandas.io.formats.format as fmt +from pandas.io.formats.format import EngFormatter + + +@pytest.fixture(autouse=True) +def reset_float_format(): + yield + reset_option("display.float_format") class TestEngFormatter: + def test_eng_float_formatter2(self, float_frame): + df = float_frame + df.loc[5] = 0 + + set_eng_float_format() + repr(df) + + set_eng_float_format(use_eng_prefix=True) + repr(df) + + set_eng_float_format(accuracy=0) + repr(df) + def test_eng_float_formatter(self): df = DataFrame({"A": [1.41, 141.0, 14100, 1410000.0]}) - fmt.set_eng_float_format() + set_eng_float_format() result = df.to_string() expected = ( " A\n" @@ -21,18 +44,16 @@ ) assert result == expected - fmt.set_eng_float_format(use_eng_prefix=True) + set_eng_float_format(use_eng_prefix=True) result = df.to_string() expected = " A\n0 
1.410\n1 141.000\n2 14.100k\n3 1.410M" assert result == expected - fmt.set_eng_float_format(accuracy=0) + set_eng_float_format(accuracy=0) result = df.to_string() expected = " A\n0 1E+00\n1 141E+00\n2 14E+03\n3 1E+06" assert result == expected - tm.reset_display_options() - def compare(self, formatter, input, output): formatted_input = formatter(input) assert formatted_input == output @@ -53,7 +74,7 @@ self.compare(formatter, -input, "-" + output[1:]) def test_exponents_with_eng_prefix(self): - formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + formatter = EngFormatter(accuracy=3, use_eng_prefix=True) f = np.sqrt(2) in_out = [ (f * 10**-24, " 1.414y"), @@ -111,7 +132,7 @@ self.compare_all(formatter, in_out) def test_exponents_without_eng_prefix(self): - formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False) + formatter = EngFormatter(accuracy=4, use_eng_prefix=False) f = np.pi in_out = [ (f * 10**-24, " 3.1416E-24"), @@ -169,7 +190,7 @@ self.compare_all(formatter, in_out) def test_rounding(self): - formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + formatter = EngFormatter(accuracy=3, use_eng_prefix=True) in_out = [ (5.55555, " 5.556"), (55.5555, " 55.556"), @@ -180,7 +201,7 @@ ] self.compare_all(formatter, in_out) - formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + formatter = EngFormatter(accuracy=1, use_eng_prefix=True) in_out = [ (5.55555, " 5.6"), (55.5555, " 55.6"), @@ -191,7 +212,7 @@ ] self.compare_all(formatter, in_out) - formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True) + formatter = EngFormatter(accuracy=0, use_eng_prefix=True) in_out = [ (5.55555, " 6"), (55.5555, " 56"), @@ -202,14 +223,14 @@ ] self.compare_all(formatter, in_out) - formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + formatter = EngFormatter(accuracy=3, use_eng_prefix=True) result = formatter(0) assert result == " 0.000" def test_nan(self): # Issue #11981 - formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + formatter = EngFormatter(accuracy=1, use_eng_prefix=True) result = formatter(np.nan) assert result == "NaN" @@ -221,14 +242,13 @@ } ) pt = df.pivot_table(values="a", index="b", columns="c") - fmt.set_eng_float_format(accuracy=1) + set_eng_float_format(accuracy=1) result = pt.to_string() assert "NaN" in result - tm.reset_display_options() def test_inf(self): # Issue #11981 - formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + formatter = EngFormatter(accuracy=1, use_eng_prefix=True) result = formatter(np.inf) assert result == "inf" diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/test_format.py pandas-2.2.2+dfsg/pandas/tests/io/formats/test_format.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/test_format.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/test_format.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,28 +1,17 @@ """ -Test output formatting for Series/DataFrame, including to_string & reprs +Tests for the file pandas.io.formats.format, *not* tests for general formatting +of pandas objects. 
""" -from contextlib import nullcontext -from datetime import ( - datetime, - time, - timedelta, -) +from datetime import datetime from io import StringIO -import itertools -import locale -from operator import methodcaller from pathlib import Path import re from shutil import get_terminal_size -import sys -import textwrap -import dateutil import numpy as np import pytest -import pytz -from pandas._config import config +from pandas._config import using_pyarrow_string_dtype import pandas as pd from pandas import ( @@ -38,30 +27,11 @@ read_csv, reset_option, ) -import pandas._testing as tm from pandas.io.formats import printing import pandas.io.formats.format as fmt -def get_local_am_pm(): - """Return the AM and PM strings returned by strftime in current locale.""" - am_local = time(1).strftime("%p") - pm_local = time(13).strftime("%p") - return am_local, pm_local - - -@pytest.fixture(autouse=True) -def clean_config(): - curr_deprecated_options = config._deprecated_options.copy() - curr_registered_options = config._registered_options.copy() - curr_global_config = config._global_config.copy() - yield - config._deprecated_options = curr_deprecated_options - config._registered_options = curr_registered_options - config._global_config = curr_global_config - - @pytest.fixture(params=["string", "pathlike", "buffer"]) def filepath_or_buffer_id(request): """ @@ -172,43 +142,6 @@ class TestDataFrameFormatting: - def test_eng_float_formatter(self, float_frame): - df = float_frame - df.loc[5] = 0 - - fmt.set_eng_float_format() - repr(df) - - fmt.set_eng_float_format(use_eng_prefix=True) - repr(df) - - fmt.set_eng_float_format(accuracy=0) - repr(df) - tm.reset_display_options() - - @pytest.mark.parametrize( - "row, columns, show_counts, result", - [ - [20, 20, None, True], - [20, 20, True, True], - [20, 20, False, False], - [5, 5, None, False], - [5, 5, True, False], - [5, 5, False, False], - ], - ) - def test_show_counts(self, row, columns, show_counts, result): - # Explicit cast to float to avoid implicit cast when setting nan - df = DataFrame(1, columns=range(10), index=range(10)).astype({1: "float"}) - df.iloc[1, 1] = np.nan - - with option_context( - "display.max_info_rows", row, "display.max_info_columns", columns - ): - with StringIO() as buf: - df.info(buf=buf, show_counts=show_counts) - assert ("non-null" in buf.getvalue()) is result - def test_repr_truncation(self): max_len = 20 with option_context("display.max_colwidth", max_len): @@ -225,7 +158,7 @@ r = repr(df) r = r[r.find("\n") + 1 :] - adj = fmt.get_adjustment() + adj = printing.get_adjustment() for line, value in zip(r.split("\n"), df["B"]): if adj.len(value) + 1 > max_len: @@ -239,6 +172,12 @@ with option_context("display.max_colwidth", max_len + 2): assert "..." not in repr(df) + def test_repr_truncation_preserves_na(self): + # https://github.com/pandas-dev/pandas/issues/55630 + df = DataFrame({"a": [pd.NA for _ in range(10)]}) + with option_context("display.max_rows", 2, "display.show_dimensions", False): + assert repr(df) == " a\n0 \n.. 
...\n9 " + def test_max_colwidth_negative_int_raises(self): # Deprecation enforced from: # https://github.com/pandas-dev/pandas/issues/31532 @@ -294,38 +233,6 @@ "3 40.0 0.000000e+00" ) - def test_repr_obeys_max_seq_limit(self): - with option_context("display.max_seq_items", 2000): - assert len(printing.pprint_thing(list(range(1000)))) > 1000 - - with option_context("display.max_seq_items", 5): - assert len(printing.pprint_thing(list(range(1000)))) < 100 - - with option_context("display.max_seq_items", 1): - assert len(printing.pprint_thing(list(range(1000)))) < 9 - - def test_repr_set(self): - assert printing.pprint_thing({1}) == "{1}" - - def test_repr_is_valid_construction_code(self): - # for the case of Index, where the repr is traditional rather than - # stylized - idx = Index(["a", "b"]) - res = eval("pd." + repr(idx)) - tm.assert_series_equal(Series(res), Series(idx)) - - def test_repr_should_return_str(self): - # https://docs.python.org/3/reference/datamodel.html#object.__repr__ - # "...The return value must be a string object." - - # (str on py2.x, str (unicode) on py3) - - data = [8, 5, 3, 5] - index1 = ["\u03c3", "\u03c4", "\u03c5", "\u03c6"] - cols = ["\u03c8"] - df = DataFrame(data, columns=cols, index=index1) - assert type(df.__repr__()) == str # both py2 / 3 - def test_repr_no_backslash(self): with option_context("mode.sim_interactive", True): df = DataFrame(np.random.default_rng(2).standard_normal((10, 4))) @@ -565,17 +472,7 @@ with option_context("display.max_columns", 0): assert has_horizontally_truncated_repr(df) - def test_to_string_repr_unicode(self): - buf = StringIO() - - unicode_values = ["\u03c3"] * 10 - unicode_values = np.array(unicode_values, dtype=object) - df = DataFrame({"unicode": unicode_values}) - df.to_string(col_space=10, buf=buf) - - # it works! 
- repr(df) - + def test_to_string_repr_unicode2(self): idx = Index(["abc", "\u03c3a", "aegdvg"]) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) rs = repr(ser).split("\n") @@ -588,14 +485,6 @@ if not line.startswith("dtype:"): assert len(line) == line_len - # it works even if sys.stdin in None - _stdin = sys.stdin - try: - sys.stdin = None - repr(df) - finally: - sys.stdin = _stdin - def test_east_asian_unicode_false(self): # not aligned properly because of east asian width @@ -945,65 +834,22 @@ # this should work buf.getvalue() - def test_to_string_with_col_space(self): - df = DataFrame(np.random.default_rng(2).random(size=(1, 3))) - c10 = len(df.to_string(col_space=10).split("\n")[1]) - c20 = len(df.to_string(col_space=20).split("\n")[1]) - c30 = len(df.to_string(col_space=30).split("\n")[1]) - assert c10 < c20 < c30 - - # GH 8230 - # col_space wasn't being applied with header=False - with_header = df.to_string(col_space=20) - with_header_row1 = with_header.splitlines()[1] - no_header = df.to_string(col_space=20, header=False) - assert len(with_header_row1) == len(no_header) - - def test_to_string_with_column_specific_col_space_raises(self): - df = DataFrame( - np.random.default_rng(2).random(size=(3, 3)), columns=["a", "b", "c"] - ) - - msg = ( - "Col_space length\\(\\d+\\) should match " - "DataFrame number of columns\\(\\d+\\)" - ) - with pytest.raises(ValueError, match=msg): - df.to_string(col_space=[30, 40]) - - with pytest.raises(ValueError, match=msg): - df.to_string(col_space=[30, 40, 50, 60]) - - msg = "unknown column" - with pytest.raises(ValueError, match=msg): - df.to_string(col_space={"a": "foo", "b": 23, "d": 34}) - - def test_to_string_with_column_specific_col_space(self): - df = DataFrame( - np.random.default_rng(2).random(size=(3, 3)), columns=["a", "b", "c"] - ) - - result = df.to_string(col_space={"a": 10, "b": 11, "c": 12}) - # 3 separating space + each col_space for (id, a, b, c) - assert len(result.split("\n")[1]) == (3 + 1 + 10 + 11 + 12) - - result = df.to_string(col_space=[10, 11, 12]) - assert len(result.split("\n")[1]) == (3 + 1 + 10 + 11 + 12) - @pytest.mark.parametrize( - "index", + "index_scalar", [ - tm.makeStringIndex, - tm.makeIntIndex, - tm.makeDateIndex, - tm.makePeriodIndex, + "a" * 10, + 1, + Timestamp(2020, 1, 1), + pd.Period("2020-01-01"), ], ) @pytest.mark.parametrize("h", [10, 20]) @pytest.mark.parametrize("w", [10, 20]) - def test_to_string_truncate_indices(self, index, h, w): + def test_to_string_truncate_indices(self, index_scalar, h, w): with option_context("display.expand_frame_repr", False): - df = DataFrame(index=index(h), columns=tm.makeStringIndex(w)) + df = DataFrame( + index=[index_scalar] * h, columns=[str(i) * 10 for i in range(w)] + ) with option_context("display.max_rows", 15): if h == 20: assert has_vertically_truncated_repr(df) @@ -1029,23 +875,26 @@ with option_context("display.max_rows", 7, "display.max_columns", 7): assert has_doubly_truncated_repr(df) - def test_truncate_with_different_dtypes(self): + @pytest.mark.parametrize("dtype", ["object", "datetime64[us]"]) + def test_truncate_with_different_dtypes(self, dtype): # 11594, 12045 # when truncated the dtypes of the splits can differ # 11594 - s = Series( + ser = Series( [datetime(2012, 1, 1)] * 10 + [datetime(1012, 1, 2)] - + [datetime(2012, 1, 3)] * 10 + + [datetime(2012, 1, 3)] * 10, + dtype=dtype, ) with option_context("display.max_rows", 8): - result = str(s) - assert "object" in result + result = str(ser) + assert dtype in result + def 
test_truncate_with_different_dtypes2(self): # 12045 - df = DataFrame({"text": ["some words"] + [None] * 9}) + df = DataFrame({"text": ["some words"] + [None] * 9}, dtype=object) with option_context("display.max_rows", 8, "display.max_columns", 3): result = str(df) @@ -1140,16 +989,6 @@ result = str(df.index) assert start_date in result - def test_nonunicode_nonascii_alignment(self): - df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) - rep_str = df.to_string() - lines = rep_str.split("\n") - assert len(lines[1]) == len(lines[2]) - - def test_unicode_problem_decoding_as_ascii(self): - dm = DataFrame({"c/\u03c3": Series({"test": np.nan})}) - str(dm.to_string()) - def test_string_repr_encoding(self, datapath): filepath = datapath("io", "parser", "data", "unicode_series.csv") df = read_csv(filepath, header=None, encoding="latin1") @@ -1291,377 +1130,6 @@ nmatches = len(re.findall("dtype", str_rep)) assert nmatches == 1 - def test_index_with_nan(self): - # GH 2850 - df = DataFrame( - { - "id1": {0: "1a3", 1: "9h4"}, - "id2": {0: np.nan, 1: "d67"}, - "id3": {0: "78d", 1: "79d"}, - "value": {0: 123, 1: 64}, - } - ) - - # multi-index - y = df.set_index(["id1", "id2", "id3"]) - result = y.to_string() - expected = ( - " value\nid1 id2 id3 \n" - "1a3 NaN 78d 123\n9h4 d67 79d 64" - ) - assert result == expected - - # index - y = df.set_index("id2") - result = y.to_string() - expected = ( - " id1 id3 value\nid2 \n" - "NaN 1a3 78d 123\nd67 9h4 79d 64" - ) - assert result == expected - - # with append (this failed in 0.12) - y = df.set_index(["id1", "id2"]).set_index("id3", append=True) - result = y.to_string() - expected = ( - " value\nid1 id2 id3 \n" - "1a3 NaN 78d 123\n9h4 d67 79d 64" - ) - assert result == expected - - # all-nan in mi - df2 = df.copy() - df2.loc[:, "id2"] = np.nan - y = df2.set_index("id2") - result = y.to_string() - expected = ( - " id1 id3 value\nid2 \n" - "NaN 1a3 78d 123\nNaN 9h4 79d 64" - ) - assert result == expected - - # partial nan in mi - df2 = df.copy() - df2.loc[:, "id2"] = np.nan - y = df2.set_index(["id2", "id3"]) - result = y.to_string() - expected = ( - " id1 value\nid2 id3 \n" - "NaN 78d 1a3 123\n 79d 9h4 64" - ) - assert result == expected - - df = DataFrame( - { - "id1": {0: np.nan, 1: "9h4"}, - "id2": {0: np.nan, 1: "d67"}, - "id3": {0: np.nan, 1: "79d"}, - "value": {0: 123, 1: 64}, - } - ) - - y = df.set_index(["id1", "id2", "id3"]) - result = y.to_string() - expected = ( - " value\nid1 id2 id3 \n" - "NaN NaN NaN 123\n9h4 d67 79d 64" - ) - assert result == expected - - def test_to_string(self): - # big mixed - biggie = DataFrame( - { - "A": np.random.default_rng(2).standard_normal(200), - "B": tm.makeStringIndex(200), - }, - ) - - biggie.loc[:20, "A"] = np.nan - biggie.loc[:20, "B"] = np.nan - s = biggie.to_string() - - buf = StringIO() - retval = biggie.to_string(buf=buf) - assert retval is None - assert buf.getvalue() == s - - assert isinstance(s, str) - - # print in right order - result = biggie.to_string( - columns=["B", "A"], col_space=17, float_format="%.5f".__mod__ - ) - lines = result.split("\n") - header = lines[0].strip().split() - joined = "\n".join([re.sub(r"\s+", " ", x).strip() for x in lines[1:]]) - recons = read_csv(StringIO(joined), names=header, header=None, sep=" ") - tm.assert_series_equal(recons["B"], biggie["B"]) - assert recons["A"].count() == biggie["A"].count() - assert (np.abs(recons["A"].dropna() - biggie["A"].dropna()) < 0.1).all() - - # expected = ['B', 'A'] - # assert header == expected - - result = 
biggie.to_string(columns=["A"], col_space=17) - header = result.split("\n")[0].strip().split() - expected = ["A"] - assert header == expected - - biggie.to_string(columns=["B", "A"], formatters={"A": lambda x: f"{x:.1f}"}) - - biggie.to_string(columns=["B", "A"], float_format=str) - biggie.to_string(columns=["B", "A"], col_space=12, float_format=str) - - frame = DataFrame(index=np.arange(200)) - frame.to_string() - - def test_to_string_no_header(self): - df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - df_s = df.to_string(header=False) - expected = "0 1 4\n1 2 5\n2 3 6" - - assert df_s == expected - - def test_to_string_specified_header(self): - df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - df_s = df.to_string(header=["X", "Y"]) - expected = " X Y\n0 1 4\n1 2 5\n2 3 6" - - assert df_s == expected - - msg = "Writing 2 cols but got 1 aliases" - with pytest.raises(ValueError, match=msg): - df.to_string(header=["X"]) - - def test_to_string_no_index(self): - # GH 16839, GH 13032 - df = DataFrame({"x": [11, 22], "y": [33, -44], "z": ["AAA", " "]}) - - df_s = df.to_string(index=False) - # Leading space is expected for positive numbers. - expected = " x y z\n11 33 AAA\n22 -44 " - assert df_s == expected - - df_s = df[["y", "x", "z"]].to_string(index=False) - expected = " y x z\n 33 11 AAA\n-44 22 " - assert df_s == expected - - def test_to_string_line_width_no_index(self): - # GH 13998, GH 22505 - df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1, index=False) - expected = " x \\\n 1 \n 2 \n 3 \n\n y \n 4 \n 5 \n 6 " - - assert df_s == expected - - df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1, index=False) - expected = " x \\\n11 \n22 \n33 \n\n y \n 4 \n 5 \n 6 " - - assert df_s == expected - - df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) - - df_s = df.to_string(line_width=1, index=False) - expected = " x \\\n 11 \n 22 \n-33 \n\n y \n 4 \n 5 \n-6 " - - assert df_s == expected - - def test_to_string_line_width_no_header(self): - # GH 53054 - df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1, header=False) - expected = "0 1 \\\n1 2 \n2 3 \n\n0 4 \n1 5 \n2 6 " - - assert df_s == expected - - df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1, header=False) - expected = "0 11 \\\n1 22 \n2 33 \n\n0 4 \n1 5 \n2 6 " - - assert df_s == expected - - df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) - - df_s = df.to_string(line_width=1, header=False) - expected = "0 11 \\\n1 22 \n2 -33 \n\n0 4 \n1 5 \n2 -6 " - - assert df_s == expected - - def test_to_string_line_width_no_index_no_header(self): - # GH 53054 - df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1, index=False, header=False) - expected = "1 \\\n2 \n3 \n\n4 \n5 \n6 " - - assert df_s == expected - - df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1, index=False, header=False) - expected = "11 \\\n22 \n33 \n\n4 \n5 \n6 " - - assert df_s == expected - - df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) - - df_s = df.to_string(line_width=1, index=False, header=False) - expected = " 11 \\\n 22 \n-33 \n\n 4 \n 5 \n-6 " - - assert df_s == expected - - def test_to_string_line_width_with_both_index_and_header(self): - # GH 53054 - df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1) - expected = ( - " x \\\n0 1 \n1 2 \n2 3 \n\n y \n0 4 \n1 5 \n2 6 " - ) - - assert df_s 
== expected - - df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) - - df_s = df.to_string(line_width=1) - expected = ( - " x \\\n0 11 \n1 22 \n2 33 \n\n y \n0 4 \n1 5 \n2 6 " - ) - - assert df_s == expected - - df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) - - df_s = df.to_string(line_width=1) - expected = ( - " x \\\n0 11 \n1 22 \n2 -33 \n\n y \n0 4 \n1 5 \n2 -6 " - ) - - assert df_s == expected - - def test_to_string_float_formatting(self): - tm.reset_display_options() - with option_context( - "display.precision", - 5, - "display.notebook_repr_html", - False, - ): - df = DataFrame( - {"x": [0, 0.25, 3456.000, 12e45, 1.64e6, 1.7e8, 1.253456, np.pi, -1e6]} - ) - - df_s = df.to_string() - - if _three_digit_exp(): - expected = ( - " x\n0 0.00000e+000\n1 2.50000e-001\n" - "2 3.45600e+003\n3 1.20000e+046\n4 1.64000e+006\n" - "5 1.70000e+008\n6 1.25346e+000\n7 3.14159e+000\n" - "8 -1.00000e+006" - ) - else: - expected = ( - " x\n0 0.00000e+00\n1 2.50000e-01\n" - "2 3.45600e+03\n3 1.20000e+46\n4 1.64000e+06\n" - "5 1.70000e+08\n6 1.25346e+00\n7 3.14159e+00\n" - "8 -1.00000e+06" - ) - assert df_s == expected - - df = DataFrame({"x": [3234, 0.253]}) - df_s = df.to_string() - - expected = " x\n0 3234.000\n1 0.253" - assert df_s == expected - - tm.reset_display_options() - assert get_option("display.precision") == 6 - - df = DataFrame({"x": [1e9, 0.2512]}) - df_s = df.to_string() - - if _three_digit_exp(): - expected = " x\n0 1.000000e+009\n1 2.512000e-001" - else: - expected = " x\n0 1.000000e+09\n1 2.512000e-01" - assert df_s == expected - - def test_to_string_float_format_no_fixed_width(self): - # GH 21625 - df = DataFrame({"x": [0.19999]}) - expected = " x\n0 0.200" - assert df.to_string(float_format="%.3f") == expected - - # GH 22270 - df = DataFrame({"x": [100.0]}) - expected = " x\n0 100" - assert df.to_string(float_format="%.0f") == expected - - def test_to_string_small_float_values(self): - df = DataFrame({"a": [1.5, 1e-17, -5.5e-7]}) - - result = df.to_string() - # sadness per above - if _three_digit_exp(): - expected = ( - " a\n" - "0 1.500000e+000\n" - "1 1.000000e-017\n" - "2 -5.500000e-007" - ) - else: - expected = ( - " a\n" - "0 1.500000e+00\n" - "1 1.000000e-17\n" - "2 -5.500000e-07" - ) - assert result == expected - - # but not all exactly zero - df = df * 0 - result = df.to_string() - expected = " 0\n0 0\n1 0\n2 -0" - - def test_to_string_float_index(self): - index = Index([1.5, 2, 3, 4, 5]) - df = DataFrame(np.arange(5), index=index) - - result = df.to_string() - expected = " 0\n1.5 0\n2.0 1\n3.0 2\n4.0 3\n5.0 4" - assert result == expected - - def test_to_string_complex_float_formatting(self): - # GH #25514, 25745 - with option_context("display.precision", 5): - df = DataFrame( - { - "x": [ - (0.4467846931321966 + 0.0715185102060818j), - (0.2739442392974528 + 0.23515228785438969j), - (0.26974928742135185 + 0.3250604054898979j), - (-1j), - ] - } - ) - result = df.to_string() - expected = ( - " x\n0 0.44678+0.07152j\n" - "1 0.27394+0.23515j\n" - "2 0.26975+0.32506j\n" - "3 -0.00000-1.00000j" - ) - assert result == expected - def test_to_string_ascii_error(self): data = [ ( @@ -1676,139 +1144,6 @@ # it works! 
repr(df) - def test_to_string_int_formatting(self): - df = DataFrame({"x": [-15, 20, 25, -35]}) - assert issubclass(df["x"].dtype.type, np.integer) - - output = df.to_string() - expected = " x\n0 -15\n1 20\n2 25\n3 -35" - assert output == expected - - def test_to_string_index_formatter(self): - df = DataFrame([range(5), range(5, 10), range(10, 15)]) - - rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) - - xp = """\ - 0 1 2 3 4 -a 0 1 2 3 4 -b 5 6 7 8 9 -c 10 11 12 13 14\ -""" - - assert rs == xp - - def test_to_string_left_justify_cols(self): - tm.reset_display_options() - df = DataFrame({"x": [3234, 0.253]}) - df_s = df.to_string(justify="left") - expected = " x \n0 3234.000\n1 0.253" - assert df_s == expected - - def test_to_string_format_na(self): - tm.reset_display_options() - df = DataFrame( - { - "A": [np.nan, -1, -2.1234, 3, 4], - "B": [np.nan, "foo", "foooo", "fooooo", "bar"], - } - ) - result = df.to_string() - - expected = ( - " A B\n" - "0 NaN NaN\n" - "1 -1.0000 foo\n" - "2 -2.1234 foooo\n" - "3 3.0000 fooooo\n" - "4 4.0000 bar" - ) - assert result == expected - - df = DataFrame( - { - "A": [np.nan, -1.0, -2.0, 3.0, 4.0], - "B": [np.nan, "foo", "foooo", "fooooo", "bar"], - } - ) - result = df.to_string() - - expected = ( - " A B\n" - "0 NaN NaN\n" - "1 -1.0 foo\n" - "2 -2.0 foooo\n" - "3 3.0 fooooo\n" - "4 4.0 bar" - ) - assert result == expected - - def test_to_string_format_inf(self): - # Issue #24861 - tm.reset_display_options() - df = DataFrame( - { - "A": [-np.inf, np.inf, -1, -2.1234, 3, 4], - "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], - } - ) - result = df.to_string() - - expected = ( - " A B\n" - "0 -inf -inf\n" - "1 inf inf\n" - "2 -1.0000 foo\n" - "3 -2.1234 foooo\n" - "4 3.0000 fooooo\n" - "5 4.0000 bar" - ) - assert result == expected - - df = DataFrame( - { - "A": [-np.inf, np.inf, -1.0, -2.0, 3.0, 4.0], - "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], - } - ) - result = df.to_string() - - expected = ( - " A B\n" - "0 -inf -inf\n" - "1 inf inf\n" - "2 -1.0 foo\n" - "3 -2.0 foooo\n" - "4 3.0 fooooo\n" - "5 4.0 bar" - ) - assert result == expected - - def test_to_string_decimal(self): - # Issue #23614 - df = DataFrame({"A": [6.0, 3.1, 2.2]}) - expected = " A\n0 6,0\n1 3,1\n2 2,2" - assert df.to_string(decimal=",") == expected - - def test_to_string_line_width(self): - df = DataFrame(123, index=range(10, 15), columns=range(30)) - s = df.to_string(line_width=80) - assert max(len(line) for line in s.split("\n")) == 80 - - def test_to_string_header_false(self): - # GH 49230 - df = DataFrame([1, 2]) - df.index.name = "a" - s = df.to_string(header=False) - expected = "a \n0 1\n1 2" - assert s == expected - - df = DataFrame([[1, 2], [3, 4]]) - df.index.name = "a" - s = df.to_string(header=False) - expected = "a \n0 1 2\n1 3 4" - assert s == expected - def test_show_dimensions(self): df = DataFrame(123, index=range(10, 15), columns=range(30)) @@ -1869,145 +1204,6 @@ assert "5 rows" not in str(df) assert "5 rows" not in df._repr_html_() - def test_repr_html(self, float_frame): - df = float_frame - df._repr_html_() - - with option_context("display.max_rows", 1, "display.max_columns", 1): - df._repr_html_() - - with option_context("display.notebook_repr_html", False): - df._repr_html_() - - tm.reset_display_options() - - df = DataFrame([[1, 2], [3, 4]]) - with option_context("display.show_dimensions", True): - assert "2 rows" in df._repr_html_() - with option_context("display.show_dimensions", False): - assert "2 rows" not in 
df._repr_html_() - - tm.reset_display_options() - - def test_repr_html_mathjax(self): - df = DataFrame([[1, 2], [3, 4]]) - assert "tex2jax_ignore" not in df._repr_html_() - - with option_context("display.html.use_mathjax", False): - assert "tex2jax_ignore" in df._repr_html_() - - def test_repr_html_wide(self): - max_cols = 20 - df = DataFrame([["a" * 25] * (max_cols - 1)] * 10) - with option_context("display.max_rows", 60, "display.max_columns", 20): - assert "..." not in df._repr_html_() - - wide_df = DataFrame([["a" * 25] * (max_cols + 1)] * 10) - with option_context("display.max_rows", 60, "display.max_columns", 20): - assert "..." in wide_df._repr_html_() - - def test_repr_html_wide_multiindex_cols(self): - max_cols = 20 - - mcols = MultiIndex.from_product( - [np.arange(max_cols // 2), ["foo", "bar"]], names=["first", "second"] - ) - df = DataFrame([["a" * 25] * len(mcols)] * 10, columns=mcols) - reg_repr = df._repr_html_() - assert "..." not in reg_repr - - mcols = MultiIndex.from_product( - (np.arange(1 + (max_cols // 2)), ["foo", "bar"]), names=["first", "second"] - ) - df = DataFrame([["a" * 25] * len(mcols)] * 10, columns=mcols) - with option_context("display.max_rows", 60, "display.max_columns", 20): - assert "..." in df._repr_html_() - - def test_repr_html_long(self): - with option_context("display.max_rows", 60): - max_rows = get_option("display.max_rows") - h = max_rows - 1 - df = DataFrame({"A": np.arange(1, 1 + h), "B": np.arange(41, 41 + h)}) - reg_repr = df._repr_html_() - assert ".." not in reg_repr - assert str(41 + max_rows // 2) in reg_repr - - h = max_rows + 1 - df = DataFrame({"A": np.arange(1, 1 + h), "B": np.arange(41, 41 + h)}) - long_repr = df._repr_html_() - assert ".." in long_repr - assert str(41 + max_rows // 2) not in long_repr - assert f"{h} rows " in long_repr - assert "2 columns" in long_repr - - def test_repr_html_float(self): - with option_context("display.max_rows", 60): - max_rows = get_option("display.max_rows") - h = max_rows - 1 - df = DataFrame( - { - "idx": np.linspace(-10, 10, h), - "A": np.arange(1, 1 + h), - "B": np.arange(41, 41 + h), - } - ).set_index("idx") - reg_repr = df._repr_html_() - assert ".." not in reg_repr - assert f"{40 + h}" in reg_repr - - h = max_rows + 1 - df = DataFrame( - { - "idx": np.linspace(-10, 10, h), - "A": np.arange(1, 1 + h), - "B": np.arange(41, 41 + h), - } - ).set_index("idx") - long_repr = df._repr_html_() - assert ".." in long_repr - assert "31" not in long_repr - assert f"{h} rows " in long_repr - assert "2 columns" in long_repr - - def test_repr_html_long_multiindex(self): - max_rows = 60 - max_L1 = max_rows // 2 - - tuples = list(itertools.product(np.arange(max_L1), ["foo", "bar"])) - idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) - df = DataFrame( - np.random.default_rng(2).standard_normal((max_L1 * 2, 2)), - index=idx, - columns=["A", "B"], - ) - with option_context("display.max_rows", 60, "display.max_columns", 20): - reg_repr = df._repr_html_() - assert "..." not in reg_repr - - tuples = list(itertools.product(np.arange(max_L1 + 1), ["foo", "bar"])) - idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) - df = DataFrame( - np.random.default_rng(2).standard_normal(((max_L1 + 1) * 2, 2)), - index=idx, - columns=["A", "B"], - ) - long_repr = df._repr_html_() - assert "..." 
in long_repr - - def test_repr_html_long_and_wide(self): - max_cols = 20 - max_rows = 60 - - h, w = max_rows - 1, max_cols - 1 - df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - with option_context("display.max_rows", 60, "display.max_columns", 20): - assert "..." not in df._repr_html_() - - h, w = max_rows + 1, max_cols + 1 - df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - with option_context("display.max_rows", 60, "display.max_columns", 20): - assert "..." in df._repr_html_() - def test_info_repr(self): # GH#21746 For tests inside a terminal (i.e. not CI) we need to detect # the terminal size to ensure that we try to print something "too big" @@ -2054,43 +1250,10 @@ ): assert not has_non_verbose_info_repr(df) + # FIXME: don't leave commented-out # test verbose overrides # set_option('display.max_info_columns', 4) # exceeded - def test_info_repr_html(self): - max_rows = 60 - max_cols = 20 - # Long - h, w = max_rows + 1, max_cols - 1 - df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - assert r"<class" not in df._repr_html_() - with option_context("display.large_repr", "info"): - assert r"<class" in df._repr_html_() - - # Wide - h, w = max_rows - 1, max_cols + 1 - df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - assert " never truncate assert ".." not in repr(s) - def test_to_string_name(self): - s = Series(range(100), dtype="int64") - s.name = "myser" - res = s.to_string(max_rows=2, name=True) - exp = "0 0\n ..\n99 99\nName: myser" - assert res == exp - res = s.to_string(max_rows=2, name=False) - exp = "0 0\n ..\n99 99" - assert res == exp - - def test_to_string_dtype(self): - s = Series(range(100), dtype="int64") - res = s.to_string(max_rows=2, dtype=True) - exp = "0 0\n ..\n99 99\ndtype: int64" - assert res == exp - res = s.to_string(max_rows=2, dtype=False) - exp = "0 0\n ..\n99 99" - assert res == exp - - def test_to_string_length(self): - s = Series(range(100), dtype="int64") - res = s.to_string(max_rows=2, length=True) - exp = "0 0\n ..\n99 99\nLength: 100" - assert res == exp - - def test_to_string_na_rep(self): - s = Series(index=range(100), dtype=np.float64) - res = s.to_string(na_rep="foo", max_rows=2) - exp = "0 foo\n ..\n99 foo" - assert res == exp - - def test_to_string_float_format(self): - s = Series(range(10), dtype="float64") - res = s.to_string(float_format=lambda x: f"{x:2.1f}", max_rows=2) - exp = "0 0.0\n ..\n9 9.0" - assert res == exp - - def test_to_string_header(self): - s = Series(range(10), dtype="int64") - s.index.name = "foo" - res = s.to_string(header=True, max_rows=2) - exp = "foo\n0 0\n ..\n9 9" - assert res == exp - res = s.to_string(header=False, max_rows=2) - exp = "0 0\n ..\n9 9" - assert res == exp - - def test_to_string_multindex_header(self): - # GH 16718 - df = DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index(["a", "b"]) - res = df.to_string(header=["r1", "r2"]) - exp = " r1 r2\na b \n0 1 2 3" - assert res == exp - - def test_to_string_empty_col(self): - # GH 13653 - s = Series(["", "Hello", "World", "", "", "Mooooo", "", ""]) - res = s.to_string(index=False) - exp = " \n Hello\n World\n \n \nMooooo\n \n " - assert re.match(exp, res) - class TestGenericArrayFormatter: def test_1d_array(self): - # GenericArrayFormatter is used on types for which there isn't a dedicated + # _GenericArrayFormatter is used on types for which there isn't a dedicated # formatter. np.bool_ is one of those types. 
- obj = fmt.GenericArrayFormatter(np.array([True, False])) + obj = fmt._GenericArrayFormatter(np.array([True, False])) res = obj.get_result() assert len(res) == 2 # Results should be right-justified. @@ -2956,14 +1884,14 @@ assert res[1] == " False" def test_2d_array(self): - obj = fmt.GenericArrayFormatter(np.array([[True, False], [False, True]])) + obj = fmt._GenericArrayFormatter(np.array([[True, False], [False, True]])) res = obj.get_result() assert len(res) == 2 assert res[0] == " [True, False]" assert res[1] == " [False, True]" def test_3d_array(self): - obj = fmt.GenericArrayFormatter( + obj = fmt._GenericArrayFormatter( np.array([[[True, True], [False, False]], [[False, True], [True, False]]]) ) res = obj.get_result() @@ -2998,7 +1926,7 @@ series = Series(ExtTypeStub(), copy=False) res = repr(series) # This line crashed before #33770 was fixed. expected = "\n".join( - ["0 [False True]", "1 [ True False]", "dtype: DtypeStub"] + ["0 [False True]", "1 [True False]", "dtype: DtypeStub"] ) assert res == expected @@ -3132,132 +2060,67 @@ assert str(df) == " x\n0 1.2346e+04\n1 2.0000e+06" -class TestRepr_timedelta64: - def test_none(self): - delta_1d = pd.to_timedelta(1, unit="D") - delta_0d = pd.to_timedelta(0, unit="D") - delta_1s = pd.to_timedelta(1, unit="s") - delta_500ms = pd.to_timedelta(500, unit="ms") - - drepr = lambda x: x._repr_base() - assert drepr(delta_1d) == "1 days" - assert drepr(-delta_1d) == "-1 days" - assert drepr(delta_0d) == "0 days" - assert drepr(delta_1s) == "0 days 00:00:01" - assert drepr(delta_500ms) == "0 days 00:00:00.500000" - assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" - assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" - assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" - assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" - - def test_sub_day(self): - delta_1d = pd.to_timedelta(1, unit="D") - delta_0d = pd.to_timedelta(0, unit="D") - delta_1s = pd.to_timedelta(1, unit="s") - delta_500ms = pd.to_timedelta(500, unit="ms") - - drepr = lambda x: x._repr_base(format="sub_day") - assert drepr(delta_1d) == "1 days" - assert drepr(-delta_1d) == "-1 days" - assert drepr(delta_0d) == "00:00:00" - assert drepr(delta_1s) == "00:00:01" - assert drepr(delta_500ms) == "00:00:00.500000" - assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" - assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" - assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" - assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" - - def test_long(self): - delta_1d = pd.to_timedelta(1, unit="D") - delta_0d = pd.to_timedelta(0, unit="D") - delta_1s = pd.to_timedelta(1, unit="s") - delta_500ms = pd.to_timedelta(500, unit="ms") - - drepr = lambda x: x._repr_base(format="long") - assert drepr(delta_1d) == "1 days 00:00:00" - assert drepr(-delta_1d) == "-1 days +00:00:00" - assert drepr(delta_0d) == "0 days 00:00:00" - assert drepr(delta_1s) == "0 days 00:00:01" - assert drepr(delta_500ms) == "0 days 00:00:00.500000" - assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" - assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" - assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" - assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" - - def test_all(self): - delta_1d = pd.to_timedelta(1, unit="D") - delta_0d = pd.to_timedelta(0, unit="D") - delta_1ns = pd.to_timedelta(1, unit="ns") - - drepr = lambda x: x._repr_base(format="all") - assert drepr(delta_1d) == "1 days 00:00:00.000000000" 
- assert drepr(-delta_1d) == "-1 days +00:00:00.000000000" - assert drepr(delta_0d) == "0 days 00:00:00.000000000" - assert drepr(delta_1ns) == "0 days 00:00:00.000000001" - assert drepr(-delta_1d + delta_1ns) == "-1 days +00:00:00.000000001" - - class TestTimedelta64Formatter: def test_days(self): - x = pd.to_timedelta(list(range(5)) + [NaT], unit="D") - result = fmt.Timedelta64Formatter(x, box=True).get_result() - assert result[0].strip() == "'0 days'" - assert result[1].strip() == "'1 days'" + x = pd.to_timedelta(list(range(5)) + [NaT], unit="D")._values + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" + assert result[1].strip() == "1 days" - result = fmt.Timedelta64Formatter(x[1:2], box=True).get_result() - assert result[0].strip() == "'1 days'" + result = fmt._Timedelta64Formatter(x[1:2]).get_result() + assert result[0].strip() == "1 days" - result = fmt.Timedelta64Formatter(x, box=False).get_result() + result = fmt._Timedelta64Formatter(x).get_result() assert result[0].strip() == "0 days" assert result[1].strip() == "1 days" - result = fmt.Timedelta64Formatter(x[1:2], box=False).get_result() + result = fmt._Timedelta64Formatter(x[1:2]).get_result() assert result[0].strip() == "1 days" def test_days_neg(self): - x = pd.to_timedelta(list(range(5)) + [NaT], unit="D") - result = fmt.Timedelta64Formatter(-x, box=True).get_result() - assert result[0].strip() == "'0 days'" - assert result[1].strip() == "'-1 days'" + x = pd.to_timedelta(list(range(5)) + [NaT], unit="D")._values + result = fmt._Timedelta64Formatter(-x).get_result() + assert result[0].strip() == "0 days" + assert result[1].strip() == "-1 days" def test_subdays(self): - y = pd.to_timedelta(list(range(5)) + [NaT], unit="s") - result = fmt.Timedelta64Formatter(y, box=True).get_result() - assert result[0].strip() == "'0 days 00:00:00'" - assert result[1].strip() == "'0 days 00:00:01'" + y = pd.to_timedelta(list(range(5)) + [NaT], unit="s")._values + result = fmt._Timedelta64Formatter(y).get_result() + assert result[0].strip() == "0 days 00:00:00" + assert result[1].strip() == "0 days 00:00:01" def test_subdays_neg(self): - y = pd.to_timedelta(list(range(5)) + [NaT], unit="s") - result = fmt.Timedelta64Formatter(-y, box=True).get_result() - assert result[0].strip() == "'0 days 00:00:00'" - assert result[1].strip() == "'-1 days +23:59:59'" + y = pd.to_timedelta(list(range(5)) + [NaT], unit="s")._values + result = fmt._Timedelta64Formatter(-y).get_result() + assert result[0].strip() == "0 days 00:00:00" + assert result[1].strip() == "-1 days +23:59:59" def test_zero(self): - x = pd.to_timedelta(list(range(1)) + [NaT], unit="D") - result = fmt.Timedelta64Formatter(x, box=True).get_result() - assert result[0].strip() == "'0 days'" - - x = pd.to_timedelta(list(range(1)), unit="D") - result = fmt.Timedelta64Formatter(x, box=True).get_result() - assert result[0].strip() == "'0 days'" + x = pd.to_timedelta(list(range(1)) + [NaT], unit="D")._values + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" + + x = pd.to_timedelta(list(range(1)), unit="D")._values + result = fmt._Timedelta64Formatter(x).get_result() + assert result[0].strip() == "0 days" class TestDatetime64Formatter: def test_mixed(self): - x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), NaT]) - result = fmt.Datetime64Formatter(x).get_result() + x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), NaT])._values + result = fmt._Datetime64Formatter(x).get_result() assert 
result[0].strip() == "2013-01-01 00:00:00" assert result[1].strip() == "2013-01-01 12:00:00" def test_dates(self): - x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), NaT]) - result = fmt.Datetime64Formatter(x).get_result() + x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), NaT])._values + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01" assert result[1].strip() == "2013-01-02" def test_date_nanos(self): - x = Series([Timestamp(200)]) - result = fmt.Datetime64Formatter(x).get_result() + x = Series([Timestamp(200)])._values + result = fmt._Datetime64Formatter(x).get_result() assert result[0].strip() == "1970-01-01 00:00:00.000000200" def test_dates_display(self): @@ -3265,347 +2128,129 @@ # make sure that we are consistently display date formatting x = Series(date_range("20130101 09:00:00", periods=5, freq="D")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-05 09:00:00" x = Series(date_range("20130101 09:00:00", periods=5, freq="s")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:04" x = Series(date_range("20130101 09:00:00", periods=5, freq="ms")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.004" x = Series(date_range("20130101 09:00:00", periods=5, freq="us")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000004" - x = Series(date_range("20130101 09:00:00", periods=5, freq="N")) + x = Series(date_range("20130101 09:00:00", periods=5, freq="ns")) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt._Datetime64Formatter(x._values).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000000004" def test_datetime64formatter_yearmonth(self): - x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)]) + x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)])._values def format_func(x): return x.strftime("%Y-%m") - formatter = fmt.Datetime64Formatter(x, formatter=format_func) + formatter = fmt._Datetime64Formatter(x, formatter=format_func) result = formatter.get_result() assert result == ["2016-01", "2016-02"] def test_datetime64formatter_hoursecond(self): x = Series( pd.to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f") - ) + )._values def format_func(x): return x.strftime("%H:%M") - formatter = fmt.Datetime64Formatter(x, formatter=format_func) + formatter = fmt._Datetime64Formatter(x, formatter=format_func) result = formatter.get_result() assert result == ["10:10", "12:12"] def test_datetime64formatter_tz_ms(self): - x = Series( - np.array(["2999-01-01", "2999-01-02", "NaT"], dtype="datetime64[ms]") - ).dt.tz_localize("US/Pacific") - result = 
fmt.Datetime64TZFormatter(x).get_result() + x = ( + Series( + np.array(["2999-01-01", "2999-01-02", "NaT"], dtype="datetime64[ms]") + ) + .dt.tz_localize("US/Pacific") + ._values + ) + result = fmt._Datetime64TZFormatter(x).get_result() assert result[0].strip() == "2999-01-01 00:00:00-08:00" assert result[1].strip() == "2999-01-02 00:00:00-08:00" -class TestNaTFormatting: - def test_repr(self): - assert repr(NaT) == "NaT" - - def test_str(self): - assert str(NaT) == "NaT" - - -class TestPeriodIndexFormat: - def test_period_format_and_strftime_default(self): - per = pd.PeriodIndex([datetime(2003, 1, 1, 12), None], freq="H") - - # Default formatting - formatted = per.format() - assert formatted[0] == "2003-01-01 12:00" # default: minutes not shown - assert formatted[1] == "NaT" - # format is equivalent to strftime(None)... - assert formatted[0] == per.strftime(None)[0] - assert per.strftime(None)[1] is np.nan # ...except for NaTs - - # Same test with nanoseconds freq - per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") - formatted = per.format() - assert (formatted == per.strftime(None)).all() - assert formatted[0] == "2003-01-01 12:01:01.123456789" - assert formatted[1] == "2003-01-01 12:01:01.123456790" - - def test_period_custom(self): - # GH#46252 custom formatting directives %l (ms) and %u (us) - - # 3 digits - per = pd.period_range("2003-01-01 12:01:01.123", periods=2, freq="l") - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") - assert formatted[0] == "03 12:01:01 (ms=123 us=123000 ns=123000000)" - assert formatted[1] == "03 12:01:01 (ms=124 us=124000 ns=124000000)" - - # 6 digits - per = pd.period_range("2003-01-01 12:01:01.123456", periods=2, freq="u") - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") - assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456000)" - assert formatted[1] == "03 12:01:01 (ms=123 us=123457 ns=123457000)" - - # 9 digits - per = pd.period_range("2003-01-01 12:01:01.123456789", periods=2, freq="n") - formatted = per.format(date_format="%y %I:%M:%S (ms=%l us=%u ns=%n)") - assert formatted[0] == "03 12:01:01 (ms=123 us=123456 ns=123456789)" - assert formatted[1] == "03 12:01:01 (ms=123 us=123456 ns=123456790)" - - def test_period_tz(self): - # Formatting periods created from a datetime with timezone. 
- - # This timestamp is in 2013 in Europe/Paris but is 2012 in UTC - dt = pd.to_datetime(["2013-01-01 00:00:00+01:00"], utc=True) - - # Converting to a period loses the timezone information - # Since tz is currently set as utc, we'll see 2012 - with tm.assert_produces_warning(UserWarning, match="will drop timezone"): - per = dt.to_period(freq="H") - assert per.format()[0] == "2012-12-31 23:00" - - # If tz is currently set as paris before conversion, we'll see 2013 - dt = dt.tz_convert("Europe/Paris") - with tm.assert_produces_warning(UserWarning, match="will drop timezone"): - per = dt.to_period(freq="H") - assert per.format()[0] == "2013-01-01 00:00" - +class TestFormatPercentiles: @pytest.mark.parametrize( - "locale_str", + "percentiles, expected", [ - pytest.param(None, id=str(locale.getlocale())), - "it_IT.utf8", - "it_IT", # Note: encoding will be 'ISO8859-1' - "zh_CN.utf8", - "zh_CN", # Note: encoding will be 'gb2312' + ( + [0.01999, 0.02001, 0.5, 0.666666, 0.9999], + ["1.999%", "2.001%", "50%", "66.667%", "99.99%"], + ), + ( + [0, 0.5, 0.02001, 0.5, 0.666666, 0.9999], + ["0%", "50%", "2.0%", "50%", "66.67%", "99.99%"], + ), + ([0.281, 0.29, 0.57, 0.58], ["28.1%", "29%", "57%", "58%"]), + ([0.28, 0.29, 0.57, 0.58], ["28%", "29%", "57%", "58%"]), + ( + [0.9, 0.99, 0.999, 0.9999, 0.99999], + ["90%", "99%", "99.9%", "99.99%", "99.999%"], + ), ], ) - def test_period_non_ascii_fmt(self, locale_str): - # GH#46468 non-ascii char in input format string leads to wrong output - - # Skip if locale cannot be set - if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): - pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") - - # Change locale temporarily for this test. - with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): - # Scalar - per = pd.Period("2018-03-11 13:00", freq="H") - assert per.strftime("%y é") == "18 é" - - # Index - per = pd.period_range("2003-01-01 01:00:00", periods=2, freq="12h") - formatted = per.format(date_format="%y é") - assert formatted[0] == "03 é" - assert formatted[1] == "03 é" + def test_format_percentiles(self, percentiles, expected): + result = fmt.format_percentiles(percentiles) + assert result == expected @pytest.mark.parametrize( - "locale_str", + "percentiles", [ - pytest.param(None, id=str(locale.getlocale())), - "it_IT.utf8", - "it_IT", # Note: encoding will be 'ISO8859-1' - "zh_CN.utf8", - "zh_CN", # Note: encoding will be 'gb2312' + ([0.1, np.nan, 0.5]), + ([-0.001, 0.1, 0.5]), + ([2, 0.1, 0.5]), + ([0.1, 0.5, "a"]), ], ) - def test_period_custom_locale_directive(self, locale_str): - # GH#46319 locale-specific directive leads to non-utf8 c strftime char* result - - # Skip if locale cannot be set - if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): - pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") - - # Change locale temporarily for this test. 
- with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): - # Get locale-specific reference - am_local, pm_local = get_local_am_pm() - - # Scalar - per = pd.Period("2018-03-11 13:00", freq="H") - assert per.strftime("%p") == pm_local - - # Index - per = pd.period_range("2003-01-01 01:00:00", periods=2, freq="12h") - formatted = per.format(date_format="%y %I:%M:%S%p") - assert formatted[0] == f"03 01:00:00{am_local}" - assert formatted[1] == f"03 01:00:00{pm_local}" - - -class TestDatetimeIndexFormat: - def test_datetime(self): - formatted = pd.to_datetime([datetime(2003, 1, 1, 12), NaT]).format() - assert formatted[0] == "2003-01-01 12:00:00" - assert formatted[1] == "NaT" - - def test_date(self): - formatted = pd.to_datetime([datetime(2003, 1, 1), NaT]).format() - assert formatted[0] == "2003-01-01" - assert formatted[1] == "NaT" - - def test_date_tz(self): - formatted = pd.to_datetime([datetime(2013, 1, 1)], utc=True).format() - assert formatted[0] == "2013-01-01 00:00:00+00:00" - - formatted = pd.to_datetime([datetime(2013, 1, 1), NaT], utc=True).format() - assert formatted[0] == "2013-01-01 00:00:00+00:00" - - def test_date_explicit_date_format(self): - formatted = pd.to_datetime([datetime(2003, 2, 1), NaT]).format( - date_format="%m-%d-%Y", na_rep="UT" - ) - assert formatted[0] == "02-01-2003" - assert formatted[1] == "UT" - - -class TestDatetimeIndexUnicode: - def test_dates(self): - text = str(pd.to_datetime([datetime(2013, 1, 1), datetime(2014, 1, 1)])) - assert "['2013-01-01'," in text - assert ", '2014-01-01']" in text - - def test_mixed(self): - text = str( - pd.to_datetime( - [datetime(2013, 1, 1), datetime(2014, 1, 1, 12), datetime(2014, 1, 1)] - ) - ) - assert "'2013-01-01 00:00:00'," in text - assert "'2014-01-01 00:00:00']" in text - - -class TestStringRepTimestamp: - def test_no_tz(self): - dt_date = datetime(2013, 1, 2) - assert str(dt_date) == str(Timestamp(dt_date)) - - dt_datetime = datetime(2013, 1, 2, 12, 1, 3) - assert str(dt_datetime) == str(Timestamp(dt_datetime)) - - dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45) - assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) - - ts_nanos_only = Timestamp(200) - assert str(ts_nanos_only) == "1970-01-01 00:00:00.000000200" - - ts_nanos_micros = Timestamp(1200) - assert str(ts_nanos_micros) == "1970-01-01 00:00:00.000001200" - - def test_tz_pytz(self): - dt_date = datetime(2013, 1, 2, tzinfo=pytz.utc) - assert str(dt_date) == str(Timestamp(dt_date)) - - dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=pytz.utc) - assert str(dt_datetime) == str(Timestamp(dt_datetime)) - - dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=pytz.utc) - assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) - - def test_tz_dateutil(self): - utc = dateutil.tz.tzutc() - - dt_date = datetime(2013, 1, 2, tzinfo=utc) - assert str(dt_date) == str(Timestamp(dt_date)) - - dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=utc) - assert str(dt_datetime) == str(Timestamp(dt_datetime)) - - dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=utc) - assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) - - def test_nat_representations(self): - for f in (str, repr, methodcaller("isoformat")): - assert f(NaT) == "NaT" - - -@pytest.mark.parametrize( - "percentiles, expected", - [ - ( - [0.01999, 0.02001, 0.5, 0.666666, 0.9999], - ["1.999%", "2.001%", "50%", "66.667%", "99.99%"], - ), - ( - [0, 0.5, 0.02001, 0.5, 0.666666, 0.9999], - ["0%", "50%", "2.0%", "50%", "66.67%", "99.99%"], - ), - 
([0.281, 0.29, 0.57, 0.58], ["28.1%", "29%", "57%", "58%"]), - ([0.28, 0.29, 0.57, 0.58], ["28%", "29%", "57%", "58%"]), - ], -) -def test_format_percentiles(percentiles, expected): - result = fmt.format_percentiles(percentiles) - assert result == expected - + def test_error_format_percentiles(self, percentiles): + msg = r"percentiles should all be in the interval \[0,1\]" + with pytest.raises(ValueError, match=msg): + fmt.format_percentiles(percentiles) -@pytest.mark.parametrize( - "percentiles", - [([0.1, np.nan, 0.5]), ([-0.001, 0.1, 0.5]), ([2, 0.1, 0.5]), ([0.1, 0.5, "a"])], -) -def test_error_format_percentiles(percentiles): - msg = r"percentiles should all be in the interval \[0,1\]" - with pytest.raises(ValueError, match=msg): - fmt.format_percentiles(percentiles) - - -def test_format_percentiles_integer_idx(): - # Issue #26660 - result = fmt.format_percentiles(np.linspace(0, 1, 10 + 1)) - expected = [ - "0%", - "10%", - "20%", - "30%", - "40%", - "50%", - "60%", - "70%", - "80%", - "90%", - "100%", - ] - assert result == expected - - -def test_repr_html_ipython_config(ip): - code = textwrap.dedent( - """\ - from pandas import DataFrame - df = DataFrame({"A": [1, 2]}) - df._repr_html_() - - cfg = get_ipython().config - cfg['IPKernelApp']['parent_appname'] - df._repr_html_() - """ - ) - result = ip.run_cell(code) - assert not result.error_in_exec + def test_format_percentiles_integer_idx(self): + # Issue #26660 + result = fmt.format_percentiles(np.linspace(0, 1, 10 + 1)) + expected = [ + "0%", + "10%", + "20%", + "30%", + "40%", + "50%", + "60%", + "70%", + "80%", + "90%", + "100%", + ] + assert result == expected @pytest.mark.parametrize("method", ["to_string", "to_html", "to_latex"]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/test_info.py pandas-2.2.2+dfsg/pandas/tests/io/formats/test_info.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/test_info.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/test_info.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,527 +0,0 @@ -from io import StringIO -import re -from string import ascii_uppercase as uppercase -import sys -import textwrap - -import numpy as np -import pytest - -from pandas.compat import ( - IS64, - PYPY, -) - -from pandas import ( - CategoricalIndex, - DataFrame, - MultiIndex, - Series, - date_range, - option_context, -) -import pandas._testing as tm - - -@pytest.fixture -def duplicate_columns_frame(): - """Dataframe with duplicate column names.""" - return DataFrame( - np.random.default_rng(2).standard_normal((1500, 4)), - columns=["a", "a", "b", "b"], - ) - - -def test_info_empty(): - # GH #45494 - df = DataFrame() - buf = StringIO() - df.info(buf=buf) - result = buf.getvalue() - expected = textwrap.dedent( - """\ - - RangeIndex: 0 entries - Empty DataFrame\n""" - ) - assert result == expected - - -def test_info_categorical_column_smoke_test(): - n = 2500 - df = DataFrame({"int64": np.random.default_rng(2).integers(100, size=n, dtype=int)}) - df["category"] = Series( - np.array(list("abcdefghij")).take( - np.random.default_rng(2).integers(0, 10, size=n, dtype=int) - ) - ).astype("category") - df.isna() - buf = StringIO() - df.info(buf=buf) - - df2 = df[df["category"] == "d"] - buf = StringIO() - df2.info(buf=buf) - - -@pytest.mark.parametrize( - "fixture_func_name", - [ - "int_frame", - "float_frame", - "datetime_frame", - "duplicate_columns_frame", - ], -) -def test_info_smoke_test(fixture_func_name, request): - frame = request.getfixturevalue(fixture_func_name) - buf = StringIO() - 
frame.info(buf=buf) - result = buf.getvalue().splitlines() - assert len(result) > 10 - - -@pytest.mark.parametrize( - "num_columns, max_info_columns, verbose", - [ - (10, 100, True), - (10, 11, True), - (10, 10, True), - (10, 9, False), - (10, 1, False), - ], -) -def test_info_default_verbose_selection(num_columns, max_info_columns, verbose): - frame = DataFrame(np.random.default_rng(2).standard_normal((5, num_columns))) - with option_context("display.max_info_columns", max_info_columns): - io_default = StringIO() - frame.info(buf=io_default) - result = io_default.getvalue() - - io_explicit = StringIO() - frame.info(buf=io_explicit, verbose=verbose) - expected = io_explicit.getvalue() - - assert result == expected - - -def test_info_verbose_check_header_separator_body(): - buf = StringIO() - size = 1001 - start = 5 - frame = DataFrame(np.random.default_rng(2).standard_normal((3, size))) - frame.info(verbose=True, buf=buf) - - res = buf.getvalue() - header = " # Column Dtype \n--- ------ ----- " - assert header in res - - frame.info(verbose=True, buf=buf) - buf.seek(0) - lines = buf.readlines() - assert len(lines) > 0 - - for i, line in enumerate(lines): - if start <= i < start + size: - line_nr = f" {i - start} " - assert line.startswith(line_nr) - - -@pytest.mark.parametrize( - "size, header_exp, separator_exp, first_line_exp, last_line_exp", - [ - ( - 4, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 3 3 3 non-null float64", - ), - ( - 11, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 10 10 3 non-null float64", - ), - ( - 101, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 100 100 3 non-null float64", - ), - ( - 1001, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 1000 1000 3 non-null float64", - ), - ( - 10001, - " # Column Non-Null Count Dtype ", - "--- ------ -------------- ----- ", - " 0 0 3 non-null float64", - " 10000 10000 3 non-null float64", - ), - ], -) -def test_info_verbose_with_counts_spacing( - size, header_exp, separator_exp, first_line_exp, last_line_exp -): - """Test header column, spacer, first line and last line in verbose mode.""" - frame = DataFrame(np.random.default_rng(2).standard_normal((3, size))) - with StringIO() as buf: - frame.info(verbose=True, show_counts=True, buf=buf) - all_lines = buf.getvalue().splitlines() - # Here table would contain only header, separator and table lines - # dframe repr, index summary, memory usage and dtypes are excluded - table = all_lines[3:-2] - header, separator, first_line, *rest, last_line = table - assert header == header_exp - assert separator == separator_exp - assert first_line == first_line_exp - assert last_line == last_line_exp - - -def test_info_memory(): - # https://github.com/pandas-dev/pandas/issues/21056 - df = DataFrame({"a": Series([1, 2], dtype="i8")}) - buf = StringIO() - df.info(buf=buf) - result = buf.getvalue() - bytes = float(df.memory_usage().sum()) - expected = textwrap.dedent( - f"""\ - - RangeIndex: 2 entries, 0 to 1 - Data columns (total 1 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 a 2 non-null int64 - dtypes: int64(1) - memory usage: {bytes} bytes - """ - ) - assert result == expected - - -def test_info_wide(): - io = StringIO() - df = DataFrame(np.random.default_rng(2).standard_normal((5, 101))) - df.info(buf=io) - - io = 
StringIO() - df.info(buf=io, max_cols=101) - result = io.getvalue() - assert len(result.splitlines()) > 100 - - expected = result - with option_context("display.max_info_columns", 101): - io = StringIO() - df.info(buf=io) - result = io.getvalue() - assert result == expected - - -def test_info_duplicate_columns_shows_correct_dtypes(): - # GH11761 - io = StringIO() - frame = DataFrame([[1, 2.0]], columns=["a", "a"]) - frame.info(buf=io) - lines = io.getvalue().splitlines(True) - assert " 0 a 1 non-null int64 \n" == lines[5] - assert " 1 a 1 non-null float64\n" == lines[6] - - -def test_info_shows_column_dtypes(): - dtypes = [ - "int64", - "float64", - "datetime64[ns]", - "timedelta64[ns]", - "complex128", - "object", - "bool", - ] - data = {} - n = 10 - for i, dtype in enumerate(dtypes): - data[i] = np.random.default_rng(2).integers(2, size=n).astype(dtype) - df = DataFrame(data) - buf = StringIO() - df.info(buf=buf) - res = buf.getvalue() - header = ( - " # Column Non-Null Count Dtype \n" - "--- ------ -------------- ----- " - ) - assert header in res - for i, dtype in enumerate(dtypes): - name = f" {i:d} {i:d} {n:d} non-null {dtype}" - assert name in res - - -def test_info_max_cols(): - df = DataFrame(np.random.default_rng(2).standard_normal((10, 5))) - for len_, verbose in [(5, None), (5, False), (12, True)]: - # For verbose always ^ setting ^ summarize ^ full output - with option_context("max_info_columns", 4): - buf = StringIO() - df.info(buf=buf, verbose=verbose) - res = buf.getvalue() - assert len(res.strip().split("\n")) == len_ - - for len_, verbose in [(12, None), (5, False), (12, True)]: - # max_cols not exceeded - with option_context("max_info_columns", 5): - buf = StringIO() - df.info(buf=buf, verbose=verbose) - res = buf.getvalue() - assert len(res.strip().split("\n")) == len_ - - for len_, max_cols in [(12, 5), (5, 4)]: - # setting truncates - with option_context("max_info_columns", 4): - buf = StringIO() - df.info(buf=buf, max_cols=max_cols) - res = buf.getvalue() - assert len(res.strip().split("\n")) == len_ - - # setting wouldn't truncate - with option_context("max_info_columns", 5): - buf = StringIO() - df.info(buf=buf, max_cols=max_cols) - res = buf.getvalue() - assert len(res.strip().split("\n")) == len_ - - -def test_info_memory_usage(): - # Ensure memory usage is displayed, when asserted, on the last line - dtypes = [ - "int64", - "float64", - "datetime64[ns]", - "timedelta64[ns]", - "complex128", - "object", - "bool", - ] - data = {} - n = 10 - for i, dtype in enumerate(dtypes): - data[i] = np.random.default_rng(2).integers(2, size=n).astype(dtype) - df = DataFrame(data) - buf = StringIO() - - # display memory usage case - df.info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - assert "memory usage: " in res[-1] - - # do not display memory usage case - df.info(buf=buf, memory_usage=False) - res = buf.getvalue().splitlines() - assert "memory usage: " not in res[-1] - - df.info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - - # memory usage is a lower bound, so print it as XYZ+ MB - assert re.match(r"memory usage: [^+]+\+", res[-1]) - - df.iloc[:, :5].info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - - # excluded column with object dtype, so estimate is accurate - assert not re.match(r"memory usage: [^+]+\+", res[-1]) - - # Test a DataFrame with duplicate columns - dtypes = ["int64", "int64", "int64", "float64"] - data = {} - n = 100 - for i, dtype in enumerate(dtypes): - data[i] = 
np.random.default_rng(2).integers(2, size=n).astype(dtype) - df = DataFrame(data) - df.columns = dtypes - - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) - df_with_object_index.info(buf=buf, memory_usage=True) - res = buf.getvalue().splitlines() - assert re.match(r"memory usage: [^+]+\+", res[-1]) - - df_with_object_index.info(buf=buf, memory_usage="deep") - res = buf.getvalue().splitlines() - assert re.match(r"memory usage: [^+]+$", res[-1]) - - # Ensure df size is as expected - # (cols * rows * bytes) + index size - df_size = df.memory_usage().sum() - exp_size = len(dtypes) * n * 8 + df.index.nbytes - assert df_size == exp_size - - # Ensure number of cols in memory_usage is the same as df - size_df = np.size(df.columns.values) + 1 # index=True; default - assert size_df == np.size(df.memory_usage()) - - # assert deep works only on object - assert df.memory_usage().sum() == df.memory_usage(deep=True).sum() - - # test for validity - DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) - DataFrame(1, index=["a"], columns=["A"]).index.nbytes - df = DataFrame( - data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] - ) - df.index.nbytes - df.memory_usage(index=True) - df.index.values.nbytes - - mem = df.memory_usage(deep=True).sum() - assert mem > 0 - - -@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") -def test_info_memory_usage_deep_not_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) - assert ( - df_with_object_index.memory_usage(index=True, deep=True).sum() - > df_with_object_index.memory_usage(index=True).sum() - ) - - df_object = DataFrame({"a": ["a"]}) - assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() - - -@pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result") -def test_info_memory_usage_deep_pypy(): - df_with_object_index = DataFrame({"a": [1]}, index=["foo"]) - assert ( - df_with_object_index.memory_usage(index=True, deep=True).sum() - == df_with_object_index.memory_usage(index=True).sum() - ) - - df_object = DataFrame({"a": ["a"]}) - assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() - - -@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") -def test_usage_via_getsizeof(): - df = DataFrame( - data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] - ) - mem = df.memory_usage(deep=True).sum() - # sys.getsizeof will call the .memory_usage with - # deep=True, and add on some GC overhead - diff = mem - sys.getsizeof(df) - assert abs(diff) < 100 - - -def test_info_memory_usage_qualified(): - buf = StringIO() - df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) - df.info(buf=buf) - assert "+" not in buf.getvalue() - - buf = StringIO() - df = DataFrame(1, columns=list("ab"), index=list("ABC")) - df.info(buf=buf) - assert "+" in buf.getvalue() - - buf = StringIO() - df = DataFrame( - 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) - ) - df.info(buf=buf) - assert "+" not in buf.getvalue() - - buf = StringIO() - df = DataFrame( - 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]]) - ) - df.info(buf=buf) - assert "+" in buf.getvalue() - - -def test_info_memory_usage_bug_on_multiindex(): - # GH 14308 - # memory usage introspection should not materialize .values - - def memory_usage(f): - return f.memory_usage(deep=True).sum() - - N = 100 - M = len(uppercase) - index = MultiIndex.from_product( - [list(uppercase), 
date_range("20160101", periods=N)], - names=["id", "date"], - ) - df = DataFrame( - {"value": np.random.default_rng(2).standard_normal(N * M)}, index=index - ) - - unstacked = df.unstack("id") - assert df.values.nbytes == unstacked.values.nbytes - assert memory_usage(df) > memory_usage(unstacked) - - # high upper bound - assert memory_usage(unstacked) - memory_usage(df) < 2000 - - -def test_info_categorical(): - # GH14298 - idx = CategoricalIndex(["a", "b"]) - df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx) - - buf = StringIO() - df.info(buf=buf) - - -@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") -def test_info_int_columns(): - # GH#37245 - df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"]) - buf = StringIO() - df.info(show_counts=True, buf=buf) - result = buf.getvalue() - expected = textwrap.dedent( - """\ - - Index: 2 entries, A to B - Data columns (total 2 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 1 2 non-null int64 - 1 2 2 non-null int64 - dtypes: int64(2) - memory usage: 48.0+ bytes - """ - ) - assert result == expected - - -def test_memory_usage_empty_no_warning(): - # GH#50066 - df = DataFrame(index=["a", "b"]) - with tm.assert_produces_warning(None): - result = df.memory_usage() - expected = Series(16 if IS64 else 8, index=["Index"]) - tm.assert_series_equal(result, expected) - - -@pytest.mark.single_cpu -def test_info_compute_numba(): - # GH#51922 - pytest.importorskip("numba") - df = DataFrame([[1, 2], [3, 4]]) - - with option_context("compute.use_numba", True): - buf = StringIO() - df.info() - result = buf.getvalue() - - buf = StringIO() - df.info() - expected = buf.getvalue() - assert result == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/test_ipython_compat.py pandas-2.2.2+dfsg/pandas/tests/io/formats/test_ipython_compat.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/test_ipython_compat.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/test_ipython_compat.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,90 @@ +import numpy as np + +import pandas._config.config as cf + +from pandas import ( + DataFrame, + MultiIndex, +) + + +class TestTableSchemaRepr: + def test_publishes(self, ip): + ipython = ip.instance(config=ip.config) + df = DataFrame({"A": [1, 2]}) + objects = [df["A"], df] # dataframe / series + expected_keys = [ + {"text/plain", "application/vnd.dataresource+json"}, + {"text/plain", "text/html", "application/vnd.dataresource+json"}, + ] + + opt = cf.option_context("display.html.table_schema", True) + last_obj = None + for obj, expected in zip(objects, expected_keys): + last_obj = obj + with opt: + formatted = ipython.display_formatter.format(obj) + assert set(formatted[0].keys()) == expected + + with_latex = cf.option_context("styler.render.repr", "latex") + + with opt, with_latex: + formatted = ipython.display_formatter.format(last_obj) + + expected = { + "text/plain", + "text/html", + "text/latex", + "application/vnd.dataresource+json", + } + assert set(formatted[0].keys()) == expected + + def test_publishes_not_implemented(self, ip): + # column MultiIndex + # GH#15996 + midx = MultiIndex.from_product([["A", "B"], ["a", "b", "c"]]) + df = DataFrame( + np.random.default_rng(2).standard_normal((5, len(midx))), columns=midx + ) + + opt = cf.option_context("display.html.table_schema", True) + + with opt: + formatted = ip.instance(config=ip.config).display_formatter.format(df) + + expected = {"text/plain", "text/html"} + assert 
set(formatted[0].keys()) == expected + + def test_config_on(self): + df = DataFrame({"A": [1, 2]}) + with cf.option_context("display.html.table_schema", True): + result = df._repr_data_resource_() + + assert result is not None + + def test_config_default_off(self): + df = DataFrame({"A": [1, 2]}) + with cf.option_context("display.html.table_schema", False): + result = df._repr_data_resource_() + + assert result is None + + def test_enable_data_resource_formatter(self, ip): + # GH#10491 + formatters = ip.instance(config=ip.config).display_formatter.formatters + mimetype = "application/vnd.dataresource+json" + + with cf.option_context("display.html.table_schema", True): + assert "application/vnd.dataresource+json" in formatters + assert formatters[mimetype].enabled + + # still there, just disabled + assert "application/vnd.dataresource+json" in formatters + assert not formatters[mimetype].enabled + + # able to re-set + with cf.option_context("display.html.table_schema", True): + assert "application/vnd.dataresource+json" in formatters + assert formatters[mimetype].enabled + # smoke test that it works + ip.instance(config=ip.config).display_formatter.format(cf) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/test_printing.py pandas-2.2.2+dfsg/pandas/tests/io/formats/test_printing.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/test_printing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/test_printing.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,14 +1,10 @@ +# Note! This file is aimed specifically at pandas.io.formats.printing utility +# functions, not the general printing of pandas objects. import string -import numpy as np -import pytest - import pandas._config.config as cf -import pandas as pd - from pandas.io.formats import printing -import pandas.io.formats.format as fmt def test_adjoin(): @@ -20,20 +16,34 @@ assert adjoined == expected -def test_repr_binary_type(): - letters = string.ascii_letters - try: - raw = bytes(letters, encoding=cf.get_option("display.encoding")) - except TypeError: - raw = bytes(letters) - b = str(raw.decode("utf-8")) - res = printing.pprint_thing(b, quote_strings=True) - assert res == repr(b) - res = printing.pprint_thing(b, quote_strings=False) - assert res == b +class TestPPrintThing: + def test_repr_binary_type(self): + letters = string.ascii_letters + try: + raw = bytes(letters, encoding=cf.get_option("display.encoding")) + except TypeError: + raw = bytes(letters) + b = str(raw.decode("utf-8")) + res = printing.pprint_thing(b, quote_strings=True) + assert res == repr(b) + res = printing.pprint_thing(b, quote_strings=False) + assert res == b + + def test_repr_obeys_max_seq_limit(self): + with cf.option_context("display.max_seq_items", 2000): + assert len(printing.pprint_thing(list(range(1000)))) > 1000 + + with cf.option_context("display.max_seq_items", 5): + assert len(printing.pprint_thing(list(range(1000)))) < 100 + + with cf.option_context("display.max_seq_items", 1): + assert len(printing.pprint_thing(list(range(1000)))) < 9 + def test_repr_set(self): + assert printing.pprint_thing({1}) == "{1}" -class TestFormattBase: + +class TestFormatBase: def test_adjoin(self): data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] expected = "a dd ggg\nb ee hhh\nc ff iii" @@ -48,7 +58,7 @@ adjoined = printing.adjoin(2, *data) assert adjoined == expected - adj = fmt.EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() expected = """あ dd ggg b ええ hhh @@ -73,7 +83,7 @@ assert adj.len(cols[2]) == 
26 def test_justify(self): - adj = fmt.EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() def just(x, *args, **kwargs): # wrapper to test single str @@ -95,7 +105,7 @@ assert just("パンダ", 10, mode="right") == " パンダ" def test_east_asian_len(self): - adj = fmt.EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() assert adj.len("abc") == 3 assert adj.len("abc") == 3 @@ -106,143 +116,14 @@ assert adj.len("パンダpanda") == 10 def test_ambiguous_width(self): - adj = fmt.EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() assert adj.len("¡¡ab") == 4 with cf.option_context("display.unicode.ambiguous_as_wide", True): - adj = fmt.EastAsianTextAdjustment() + adj = printing._EastAsianTextAdjustment() assert adj.len("¡¡ab") == 6 data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "¡¡ab", "いいい"]] expected = "あ dd ggg \nb ええ ¡¡ab\nc ff いいい" adjoined = adj.adjoin(2, *data) assert adjoined == expected - - -class TestTableSchemaRepr: - def test_publishes(self, ip): - ipython = ip.instance(config=ip.config) - df = pd.DataFrame({"A": [1, 2]}) - objects = [df["A"], df] # dataframe / series - expected_keys = [ - {"text/plain", "application/vnd.dataresource+json"}, - {"text/plain", "text/html", "application/vnd.dataresource+json"}, - ] - - opt = pd.option_context("display.html.table_schema", True) - last_obj = None - for obj, expected in zip(objects, expected_keys): - last_obj = obj - with opt: - formatted = ipython.display_formatter.format(obj) - assert set(formatted[0].keys()) == expected - - with_latex = pd.option_context("styler.render.repr", "latex") - - with opt, with_latex: - formatted = ipython.display_formatter.format(last_obj) - - expected = { - "text/plain", - "text/html", - "text/latex", - "application/vnd.dataresource+json", - } - assert set(formatted[0].keys()) == expected - - def test_publishes_not_implemented(self, ip): - # column MultiIndex - # GH 15996 - midx = pd.MultiIndex.from_product([["A", "B"], ["a", "b", "c"]]) - df = pd.DataFrame( - np.random.default_rng(2).standard_normal((5, len(midx))), columns=midx - ) - - opt = pd.option_context("display.html.table_schema", True) - - with opt: - formatted = ip.instance(config=ip.config).display_formatter.format(df) - - expected = {"text/plain", "text/html"} - assert set(formatted[0].keys()) == expected - - def test_config_on(self): - df = pd.DataFrame({"A": [1, 2]}) - with pd.option_context("display.html.table_schema", True): - result = df._repr_data_resource_() - - assert result is not None - - def test_config_default_off(self): - df = pd.DataFrame({"A": [1, 2]}) - with pd.option_context("display.html.table_schema", False): - result = df._repr_data_resource_() - - assert result is None - - def test_enable_data_resource_formatter(self, ip): - # GH 10491 - formatters = ip.instance(config=ip.config).display_formatter.formatters - mimetype = "application/vnd.dataresource+json" - - with pd.option_context("display.html.table_schema", True): - assert "application/vnd.dataresource+json" in formatters - assert formatters[mimetype].enabled - - # still there, just disabled - assert "application/vnd.dataresource+json" in formatters - assert not formatters[mimetype].enabled - - # able to re-set - with pd.option_context("display.html.table_schema", True): - assert "application/vnd.dataresource+json" in formatters - assert formatters[mimetype].enabled - # smoke test that it works - ip.instance(config=ip.config).display_formatter.format(cf) - - -def test_multiindex_long_element(): - # Non-regression test towards 
GH #52960 - data = pd.MultiIndex.from_tuples([("c" * 62,)]) - - expected = ( - "MultiIndex([('cccccccccccccccccccccccccccccccccccccccc" - "cccccccccccccccccccccc',)],\n )" - ) - assert str(data) == expected - - -@pytest.mark.parametrize( - "data,output", - [ - ([2, complex("nan"), 1], [" 2.0+0.0j", " NaN+0.0j", " 1.0+0.0j"]), - ([2, complex("nan"), -1], [" 2.0+0.0j", " NaN+0.0j", "-1.0+0.0j"]), - ([-2, complex("nan"), -1], ["-2.0+0.0j", " NaN+0.0j", "-1.0+0.0j"]), - ([-1.23j, complex("nan"), -1], ["-0.00-1.23j", " NaN+0.00j", "-1.00+0.00j"]), - ([1.23j, complex("nan"), 1.23], [" 0.00+1.23j", " NaN+0.00j", " 1.23+0.00j"]), - ( - [-1.23j, complex(np.nan, np.nan), 1], - ["-0.00-1.23j", " NaN+ NaNj", " 1.00+0.00j"], - ), - ( - [-1.23j, complex(1.2, np.nan), 1], - ["-0.00-1.23j", " 1.20+ NaNj", " 1.00+0.00j"], - ), - ( - [-1.23j, complex(np.nan, -1.2), 1], - ["-0.00-1.23j", " NaN-1.20j", " 1.00+0.00j"], - ), - ], -) -@pytest.mark.parametrize("as_frame", [True, False]) -def test_ser_df_with_complex_nans(data, output, as_frame): - # GH#53762, GH#53841 - obj = pd.Series(np.array(data)) - if as_frame: - obj = obj.to_frame(name="val") - reprs = [f"{i} {val}" for i, val in enumerate(output)] - expected = f"{'val': >{len(reprs[0])}}\n" + "\n".join(reprs) - else: - reprs = [f"{i} {val}" for i, val in enumerate(output)] - expected = "\n".join(reprs) + "\ndtype: complex128" - assert str(obj) == expected, f"\n{str(obj)}\n\n{expected}" diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/test_series_info.py pandas-2.2.2+dfsg/pandas/tests/io/formats/test_series_info.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/test_series_info.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/test_series_info.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,181 +0,0 @@ -from io import StringIO -from string import ascii_uppercase as uppercase -import textwrap - -import numpy as np -import pytest - -from pandas.compat import PYPY - -from pandas import ( - CategoricalIndex, - MultiIndex, - Series, - date_range, -) - - -def test_info_categorical_column_just_works(): - n = 2500 - data = np.array(list("abcdefghij")).take( - np.random.default_rng(2).integers(0, 10, size=n, dtype=int) - ) - s = Series(data).astype("category") - s.isna() - buf = StringIO() - s.info(buf=buf) - - s2 = s[s == "d"] - buf = StringIO() - s2.info(buf=buf) - - -def test_info_categorical(): - # GH14298 - idx = CategoricalIndex(["a", "b"]) - s = Series(np.zeros(2), index=idx) - buf = StringIO() - s.info(buf=buf) - - -@pytest.mark.parametrize("verbose", [True, False]) -def test_info_series(lexsorted_two_level_string_multiindex, verbose): - index = lexsorted_two_level_string_multiindex - ser = Series(range(len(index)), index=index, name="sth") - buf = StringIO() - ser.info(verbose=verbose, buf=buf) - result = buf.getvalue() - - expected = textwrap.dedent( - """\ - - MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three') - """ - ) - if verbose: - expected += textwrap.dedent( - """\ - Series name: sth - Non-Null Count Dtype - -------------- ----- - 10 non-null int64 - """ - ) - expected += textwrap.dedent( - f"""\ - dtypes: int64(1) - memory usage: {ser.memory_usage()}.0+ bytes - """ - ) - assert result == expected - - -def test_info_memory(): - s = Series([1, 2], dtype="i8") - buf = StringIO() - s.info(buf=buf) - result = buf.getvalue() - memory_bytes = float(s.memory_usage()) - expected = textwrap.dedent( - f"""\ - - RangeIndex: 2 entries, 0 to 1 - Series name: None - Non-Null Count Dtype - -------------- ----- - 2 non-null 
int64 - dtypes: int64(1) - memory usage: {memory_bytes} bytes - """ - ) - assert result == expected - - -def test_info_wide(): - s = Series(np.random.default_rng(2).standard_normal(101)) - msg = "Argument `max_cols` can only be passed in DataFrame.info, not Series.info" - with pytest.raises(ValueError, match=msg): - s.info(max_cols=1) - - -def test_info_shows_dtypes(): - dtypes = [ - "int64", - "float64", - "datetime64[ns]", - "timedelta64[ns]", - "complex128", - "object", - "bool", - ] - n = 10 - for dtype in dtypes: - s = Series(np.random.default_rng(2).integers(2, size=n).astype(dtype)) - buf = StringIO() - s.info(buf=buf) - res = buf.getvalue() - name = f"{n:d} non-null {dtype}" - assert name in res - - -@pytest.mark.xfail(PYPY, reason="on PyPy deep=True doesn't change result") -def test_info_memory_usage_deep_not_pypy(): - s_with_object_index = Series({"a": [1]}, index=["foo"]) - assert s_with_object_index.memory_usage( - index=True, deep=True - ) > s_with_object_index.memory_usage(index=True) - - s_object = Series({"a": ["a"]}) - assert s_object.memory_usage(deep=True) > s_object.memory_usage() - - -@pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result") -def test_info_memory_usage_deep_pypy(): - s_with_object_index = Series({"a": [1]}, index=["foo"]) - assert s_with_object_index.memory_usage( - index=True, deep=True - ) == s_with_object_index.memory_usage(index=True) - - s_object = Series({"a": ["a"]}) - assert s_object.memory_usage(deep=True) == s_object.memory_usage() - - -@pytest.mark.parametrize( - "series, plus", - [ - (Series(1, index=[1, 2, 3]), False), - (Series(1, index=list("ABC")), True), - (Series(1, index=MultiIndex.from_product([range(3), range(3)])), False), - ( - Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])), - True, - ), - ], -) -def test_info_memory_usage_qualified(series, plus): - buf = StringIO() - series.info(buf=buf) - if plus: - assert "+" in buf.getvalue() - else: - assert "+" not in buf.getvalue() - - -def test_info_memory_usage_bug_on_multiindex(): - # GH 14308 - # memory usage introspection should not materialize .values - N = 100 - M = len(uppercase) - index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", periods=N)], - names=["id", "date"], - ) - s = Series(np.random.default_rng(2).standard_normal(N * M), index=index) - - unstacked = s.unstack("id") - assert s.values.nbytes == unstacked.values.nbytes - assert s.memory_usage(deep=True) > unstacked.memory_usage(deep=True).sum() - - # high upper bound - diff = unstacked.memory_usage(deep=True).sum() - s.memory_usage(deep=True) - assert diff < 2000 diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/test_to_csv.py pandas-2.2.2+dfsg/pandas/tests/io/formats/test_to_csv.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/test_to_csv.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/test_to_csv.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,6 +10,7 @@ import pandas as pd from pandas import ( DataFrame, + Index, compat, ) import pandas._testing as tm @@ -285,7 +286,7 @@ df = DataFrame( { "date": pd.to_datetime("1970-01-01"), - "datetime": pd.date_range("1970-01-01", periods=2, freq="H"), + "datetime": pd.date_range("1970-01-01", periods=2, freq="h"), } ) expected_rows = [ @@ -665,7 +666,7 @@ def test_to_csv_errors(self, errors): # GH 22610 data = ["\ud800foo"] - ser = pd.Series(data, index=pd.Index(data)) + ser = pd.Series(data, index=Index(data, dtype=object), dtype=object) with tm.ensure_clean("test.csv") as 
path: ser.to_csv(path, errors=errors) # No use in reading back the data as it is not the same anymore @@ -679,7 +680,11 @@ GH 35058 and GH 19827 """ - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), + ) with tm.ensure_clean() as path: with open(path, mode="w+b") as handle: df.to_csv(handle, mode=mode) @@ -713,7 +718,11 @@ def test_to_csv_iterative_compression_name(compression): # GH 38714 - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), + ) with tm.ensure_clean() as path: df.to_csv(path, compression=compression, chunksize=1) tm.assert_frame_equal( @@ -723,7 +732,11 @@ def test_to_csv_iterative_compression_buffer(compression): # GH 38714 - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), + ) with io.BytesIO() as buffer: df.to_csv(buffer, compression=compression, chunksize=1) buffer.seek(0) @@ -731,3 +744,15 @@ pd.read_csv(buffer, compression=compression, index_col=0), df ) assert not buffer.closed + + +def test_to_csv_pos_args_deprecation(): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_csv except for the " + r"argument 'path_or_buf' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + buffer = io.BytesIO() + df.to_csv(buffer, ";") diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/test_to_html.py pandas-2.2.2+dfsg/pandas/tests/io/formats/test_to_html.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/test_to_html.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/test_to_html.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,8 @@ from datetime import datetime from io import StringIO +import itertools import re +import textwrap import numpy as np import pytest @@ -10,6 +12,7 @@ DataFrame, Index, MultiIndex, + get_option, option_context, ) import pandas._testing as tm @@ -56,7 +59,7 @@ df = DataFrame( { "A": np.random.default_rng(2).standard_normal(200), - "B": tm.makeStringIndex(200), + "B": Index([f"{i}?!" 
for i in range(200)]), }, index=np.arange(200), ) @@ -68,7 +71,7 @@ return df -@pytest.fixture(params=fmt._VALID_JUSTIFY_PARAMETERS) +@pytest.fixture(params=fmt.VALID_JUSTIFY_PARAMETERS) def justify(request): return request.param @@ -237,7 +240,7 @@ ( DataFrame( [[0, 1], [2, 3], [4, 5], [6, 7]], - columns=["foo", None], + columns=Index(["foo", None], dtype=object), index=np.arange(4), ), {"__index__": lambda x: "abcd"[x]}, @@ -416,15 +419,15 @@ "columns,justify,expected", [ ( - MultiIndex.from_tuples( - list(zip(np.arange(2).repeat(2), np.mod(range(4), 2))), + MultiIndex.from_arrays( + [np.arange(2).repeat(2), np.mod(range(4), 2)], names=["CL0", "CL1"], ), "left", "multiindex_1", ), ( - MultiIndex.from_tuples(list(zip(range(4), np.mod(range(4), 2)))), + MultiIndex.from_arrays([np.arange(4), np.mod(range(4), 2)]), "right", "multiindex_2", ), @@ -768,7 +771,7 @@ [0, "https://pandas.pydata.org/?q1=a&q2=b", "pydata.org"], [0, "www.pydata.org", "pydata.org"], ] - df = DataFrame(data, columns=["foo", "bar", None]) + df = DataFrame(data, columns=Index(["foo", "bar", None], dtype=object)) result = df.to_html(render_links=render_links) expected = expected_html(datapath, expected) @@ -826,43 +829,226 @@ assert expected in h -def test_html_repr_min_rows_default(datapath): - # gh-27991 +class TestReprHTML: + def test_html_repr_min_rows_default(self, datapath): + # gh-27991 - # default setting no truncation even if above min_rows - df = DataFrame({"a": range(20)}) - result = df._repr_html_() - expected = expected_html(datapath, "html_repr_min_rows_default_no_truncation") - assert result == expected + # default setting no truncation even if above min_rows + df = DataFrame({"a": range(20)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_no_truncation") + assert result == expected - # default of max_rows 60 triggers truncation if above - df = DataFrame({"a": range(61)}) - result = df._repr_html_() - expected = expected_html(datapath, "html_repr_min_rows_default_truncated") - assert result == expected + # default of max_rows 60 triggers truncation if above + df = DataFrame({"a": range(61)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_truncated") + assert result == expected + @pytest.mark.parametrize( + "max_rows,min_rows,expected", + [ + # truncated after first two rows + (10, 4, "html_repr_max_rows_10_min_rows_4"), + # when set to None, follow value of max_rows + (12, None, "html_repr_max_rows_12_min_rows_None"), + # when set value higher as max_rows, use the minimum + (10, 12, "html_repr_max_rows_10_min_rows_12"), + # max_rows of None -> never truncate + (None, 12, "html_repr_max_rows_None_min_rows_12"), + ], + ) + def test_html_repr_min_rows(self, datapath, max_rows, min_rows, expected): + # gh-27991 -@pytest.mark.parametrize( - "max_rows,min_rows,expected", - [ - # truncated after first two rows - (10, 4, "html_repr_max_rows_10_min_rows_4"), - # when set to None, follow value of max_rows - (12, None, "html_repr_max_rows_12_min_rows_None"), - # when set value higher as max_rows, use the minimum - (10, 12, "html_repr_max_rows_10_min_rows_12"), - # max_rows of None -> never truncate - (None, 12, "html_repr_max_rows_None_min_rows_12"), - ], -) -def test_html_repr_min_rows(datapath, max_rows, min_rows, expected): - # gh-27991 + df = DataFrame({"a": range(61)}) + expected = expected_html(datapath, expected) + with option_context("display.max_rows", max_rows, "display.min_rows", min_rows): + result = 
df._repr_html_() + assert result == expected + + def test_repr_html_ipython_config(self, ip): + code = textwrap.dedent( + """\ + from pandas import DataFrame + df = DataFrame({"A": [1, 2]}) + df._repr_html_() + + cfg = get_ipython().config + cfg['IPKernelApp']['parent_appname'] + df._repr_html_() + """ + ) + result = ip.run_cell(code, silent=True) + assert not result.error_in_exec - df = DataFrame({"a": range(61)}) - expected = expected_html(datapath, expected) - with option_context("display.max_rows", max_rows, "display.min_rows", min_rows): - result = df._repr_html_() - assert result == expected + def test_info_repr_html(self): + max_rows = 60 + max_cols = 20 + # Long + h, w = max_rows + 1, max_cols - 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + assert r"<class" not in df._repr_html_() + with option_context("display.large_repr", "info"): + assert r"<class" in df._repr_html_() + + # Wide + h, w = max_rows - 1, max_cols + 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + assert "{40 + h}" in reg_repr + + h = max_rows + 1 + df = DataFrame( + { + "idx": np.linspace(-10, 10, h), + "A": np.arange(1, 1 + h), + "B": np.arange(41, 41 + h), + } + ).set_index("idx") + long_repr = df._repr_html_() + assert ".." in long_repr + assert "31" not in long_repr + assert f"{h} rows " in long_repr + assert "2 columns" in long_repr + + def test_repr_html_long_multiindex(self): + max_rows = 60 + max_L1 = max_rows // 2 + + tuples = list(itertools.product(np.arange(max_L1), ["foo", "bar"])) + idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame( + np.random.default_rng(2).standard_normal((max_L1 * 2, 2)), + index=idx, + columns=["A", "B"], + ) + with option_context("display.max_rows", 60, "display.max_columns", 20): + reg_repr = df._repr_html_() + assert "..." not in reg_repr + + tuples = list(itertools.product(np.arange(max_L1 + 1), ["foo", "bar"])) + idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame( + np.random.default_rng(2).standard_normal(((max_L1 + 1) * 2, 2)), + index=idx, + columns=["A", "B"], + ) + long_repr = df._repr_html_() + assert "..." in long_repr + + def test_repr_html_long_and_wide(self): + max_cols = 20 + max_rows = 60 + + h, w = max_rows - 1, max_cols - 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." not in df._repr_html_() + + h, w = max_rows + 1, max_cols + 1 + df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." in df._repr_html_() def test_to_html_multilevel(multiindex_year_month_day_dataframe_random_data): @@ -978,3 +1164,14 @@ "" ) assert result == expected + + +def test_to_html_pos_args_deprecation(): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_html except for the " + r"argument 'buf' will be keyword-only." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_html(None, None) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/test_to_latex.py pandas-2.2.2+dfsg/pandas/tests/io/formats/test_to_latex.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/test_to_latex.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/test_to_latex.py 2024-04-10 17:42:52.000000000 +0000 @@ -187,6 +187,22 @@ ) assert result == expected + def test_to_latex_pos_args_deprecation(self): + # GH-54229 + df = DataFrame( + { + "name": ["Raphael", "Donatello"], + "age": [26, 45], + "height": [181.23, 177.65], + } + ) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_latex except for " + r"the argument 'buf' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_latex(None, None) + class TestToLatexLongtable: def test_to_latex_empty_longtable(self): diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/test_to_markdown.py pandas-2.2.2+dfsg/pandas/tests/io/formats/test_to_markdown.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/test_to_markdown.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/test_to_markdown.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,8 +1,12 @@ -from io import StringIO +from io import ( + BytesIO, + StringIO, +) import pytest import pandas as pd +import pandas._testing as tm pytest.importorskip("tabulate") @@ -88,3 +92,15 @@ df = pd.DataFrame([1, 2, 3]) with pytest.raises(ValueError, match="Pass 'index' instead of 'showindex"): df.to_markdown(index=True, showindex=True) + + +def test_markdown_pos_args_deprecatation(): + # GH-54229 + df = pd.DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_markdown except for the " + r"argument 'buf' will be keyword-only." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + buffer = BytesIO() + df.to_markdown(buffer, "grid") diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/formats/test_to_string.py pandas-2.2.2+dfsg/pandas/tests/io/formats/test_to_string.py --- pandas-2.1.4+dfsg/pandas/tests/io/formats/test_to_string.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/formats/test_to_string.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,357 +1,1216 @@ -from datetime import datetime +from datetime import ( + datetime, + timedelta, +) from io import StringIO +import re +import sys from textwrap import dedent import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas import ( + CategoricalIndex, DataFrame, + Index, + NaT, Series, + Timestamp, + concat, + date_range, + get_option, option_context, + read_csv, + timedelta_range, to_datetime, ) +import pandas._testing as tm -def test_repr_embedded_ndarray(): - arr = np.empty(10, dtype=[("err", object)]) - for i in range(len(arr)): - arr["err"][i] = np.random.default_rng(2).standard_normal(i) - - df = DataFrame(arr) - repr(df["err"]) - repr(df) - df.to_string() - - -def test_repr_tuples(): - buf = StringIO() +def _three_digit_exp(): + return f"{1.7e8:.4g}" == "1.7e+008" - df = DataFrame({"tups": list(zip(range(10), range(10)))}) - repr(df) - df.to_string(col_space=10, buf=buf) +class TestDataFrameToStringFormatters: + def test_to_string_masked_ea_with_formatter(self): + # GH#39336 + df = DataFrame( + { + "a": Series([0.123456789, 1.123456789], dtype="Float64"), + "b": Series([1, 2], dtype="Int64"), + } + ) + result = df.to_string(formatters=["{:.2f}".format, "{:.2f}".format]) + expected = dedent( + """\ + a b + 0 0.12 1.00 + 1 1.12 2.00""" + ) + assert result == expected -def test_to_string_truncate(): - # GH 9784 - dont truncate when calling DataFrame.to_string - df = DataFrame( - [ + def test_to_string_with_formatters(self): + df = DataFrame( { - "a": "foo", - "b": "bar", - "c": "let's make this a very VERY long line that is longer " - "than the default 50 character limit", - "d": 1, + "int": [1, 2, 3], + "float": [1.0, 2.0, 3.0], + "object": [(1, 2), True, False], }, - {"a": "foo", "b": "bar", "c": "stuff", "d": 1}, + columns=["int", "float", "object"], + ) + + formatters = [ + ("int", lambda x: f"0x{x:x}"), + ("float", lambda x: f"[{x: 4.1f}]"), + ("object", lambda x: f"-{x!s}-"), ] - ) - df.set_index(["a", "b", "c"]) - assert df.to_string() == ( - " a b " - " c d\n" - "0 foo bar let's make this a very VERY long line t" - "hat is longer than the default 50 character limit 1\n" - "1 foo bar " - " stuff 1" - ) - with option_context("max_colwidth", 20): - # the display option has no effect on the to_string method - assert df.to_string() == ( - " a b " - " c d\n" - "0 foo bar let's make this a very VERY long line t" - "hat is longer than the default 50 character limit 1\n" - "1 foo bar " - " stuff 1" + result = df.to_string(formatters=dict(formatters)) + result2 = df.to_string(formatters=list(zip(*formatters))[1]) + assert result == ( + " int float object\n" + "0 0x1 [ 1.0] -(1, 2)-\n" + "1 0x2 [ 2.0] -True-\n" + "2 0x3 [ 3.0] -False-" ) - assert df.to_string(max_colwidth=20) == ( - " a b c d\n" - "0 foo bar let's make this ... 
1\n" - "1 foo bar stuff 1" - ) + assert result == result2 + def test_to_string_with_datetime64_monthformatter(self): + months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] + x = DataFrame({"months": months}) + + def format_func(x): + return x.strftime("%Y-%m") + + result = x.to_string(formatters={"months": format_func}) + expected = dedent( + """\ + months + 0 2016-01 + 1 2016-02""" + ) + assert result.strip() == expected -@pytest.mark.parametrize( - "input_array, expected", - [ - ("a", "a"), - (["a", "b"], "a\nb"), - ([1, "a"], "1\na"), - (1, "1"), - ([0, -1], " 0\n-1"), - (1.0, "1.0"), - ([" a", " b"], " a\n b"), - ([".1", "1"], ".1\n 1"), - (["10", "-10"], " 10\n-10"), - ], -) -def test_format_remove_leading_space_series(input_array, expected): - # GH: 24980 - s = Series(input_array).to_string(index=False) - assert s == expected + def test_to_string_with_datetime64_hourformatter(self): + x = DataFrame( + {"hod": to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f")} + ) + def format_func(x): + return x.strftime("%H:%M") -@pytest.mark.parametrize( - "input_array, expected", - [ - ({"A": ["a"]}, "A\na"), - ({"A": ["a", "b"], "B": ["c", "dd"]}, "A B\na c\nb dd"), - ({"A": ["a", 1], "B": ["aa", 1]}, "A B\na aa\n1 1"), - ], -) -def test_format_remove_leading_space_dataframe(input_array, expected): - # GH: 24980 - df = DataFrame(input_array).to_string(index=False) - assert df == expected - - -@pytest.mark.parametrize( - "max_cols, max_rows, expected", - [ - ( - 10, - None, - " 0 1 2 3 4 ... 6 7 8 9 10\n" - " 0 0 0 0 0 ... 0 0 0 0 0\n" - " 0 0 0 0 0 ... 0 0 0 0 0\n" - " 0 0 0 0 0 ... 0 0 0 0 0\n" - " 0 0 0 0 0 ... 0 0 0 0 0", - ), - ( - None, - 2, - " 0 1 2 3 4 5 6 7 8 9 10\n" - " 0 0 0 0 0 0 0 0 0 0 0\n" - " .. .. .. .. .. .. .. .. .. .. ..\n" - " 0 0 0 0 0 0 0 0 0 0 0", - ), - ( - 10, - 2, - " 0 1 2 3 4 ... 6 7 8 9 10\n" - " 0 0 0 0 0 ... 0 0 0 0 0\n" - " .. .. .. .. .. ... .. .. .. .. ..\n" - " 0 0 0 0 0 ... 0 0 0 0 0", - ), - ( - 9, - 2, - " 0 1 2 3 ... 7 8 9 10\n" - " 0 0 0 0 ... 0 0 0 0\n" - " .. .. .. .. ... .. .. .. ..\n" - " 0 0 0 0 ... 0 0 0 0", - ), - ( - 1, - 1, - " 0 ...\n 0 ...\n.. 
...", - ), - ], -) -def test_truncation_no_index(max_cols, max_rows, expected): - df = DataFrame([[0] * 11] * 4) - assert df.to_string(index=False, max_cols=max_cols, max_rows=max_rows) == expected + result = x.to_string(formatters={"hod": format_func}) + expected = dedent( + """\ + hod + 0 10:10 + 1 12:12""" + ) + assert result.strip() == expected + + def test_to_string_with_formatters_unicode(self): + df = DataFrame({"c/\u03c3": [1, 2, 3]}) + result = df.to_string(formatters={"c/\u03c3": str}) + expected = dedent( + """\ + c/\u03c3 + 0 1 + 1 2 + 2 3""" + ) + assert result == expected + + def test_to_string_index_formatter(self): + df = DataFrame([range(5), range(5, 10), range(10, 15)]) + rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) -def test_to_string_unicode_columns(float_frame): - df = DataFrame({"\u03c3": np.arange(10.0)}) + xp = dedent( + """\ + 0 1 2 3 4 + a 0 1 2 3 4 + b 5 6 7 8 9 + c 10 11 12 13 14\ + """ + ) + assert rs == xp + + def test_no_extra_space(self): + # GH#52690: Check that no extra space is given + col1 = "TEST" + col2 = "PANDAS" + col3 = "to_string" + expected = f"{col1:<6s} {col2:<7s} {col3:<10s}" + df = DataFrame([{"col1": "TEST", "col2": "PANDAS", "col3": "to_string"}]) + d = {"col1": "{:<6s}".format, "col2": "{:<7s}".format, "col3": "{:<10s}".format} + result = df.to_string(index=False, header=False, formatters=d) + assert result == expected + + +class TestDataFrameToStringColSpace: + def test_to_string_with_column_specific_col_space_raises(self): + df = DataFrame( + np.random.default_rng(2).random(size=(3, 3)), columns=["a", "b", "c"] + ) - buf = StringIO() - df.to_string(buf=buf) - buf.getvalue() + msg = ( + "Col_space length\\(\\d+\\) should match " + "DataFrame number of columns\\(\\d+\\)" + ) + with pytest.raises(ValueError, match=msg): + df.to_string(col_space=[30, 40]) - buf = StringIO() - df.info(buf=buf) - buf.getvalue() + with pytest.raises(ValueError, match=msg): + df.to_string(col_space=[30, 40, 50, 60]) - result = float_frame.to_string() - assert isinstance(result, str) + msg = "unknown column" + with pytest.raises(ValueError, match=msg): + df.to_string(col_space={"a": "foo", "b": 23, "d": 34}) + + def test_to_string_with_column_specific_col_space(self): + df = DataFrame( + np.random.default_rng(2).random(size=(3, 3)), columns=["a", "b", "c"] + ) + result = df.to_string(col_space={"a": 10, "b": 11, "c": 12}) + # 3 separating space + each col_space for (id, a, b, c) + assert len(result.split("\n")[1]) == (3 + 1 + 10 + 11 + 12) + + result = df.to_string(col_space=[10, 11, 12]) + assert len(result.split("\n")[1]) == (3 + 1 + 10 + 11 + 12) + + def test_to_string_with_col_space(self): + df = DataFrame(np.random.default_rng(2).random(size=(1, 3))) + c10 = len(df.to_string(col_space=10).split("\n")[1]) + c20 = len(df.to_string(col_space=20).split("\n")[1]) + c30 = len(df.to_string(col_space=30).split("\n")[1]) + assert c10 < c20 < c30 + + # GH#8230 + # col_space wasn't being applied with header=False + with_header = df.to_string(col_space=20) + with_header_row1 = with_header.splitlines()[1] + no_header = df.to_string(col_space=20, header=False) + assert len(with_header_row1) == len(no_header) -def test_to_string_utf8_columns(): - n = "\u05d0".encode() + def test_to_string_repr_tuples(self): + buf = StringIO() - with option_context("display.max_rows", 1): - df = DataFrame([1, 2], columns=[n]) + df = DataFrame({"tups": list(zip(range(10), range(10)))}) repr(df) + df.to_string(col_space=10, buf=buf) -def test_to_string_unicode_two(): - dm = 
DataFrame({"c/\u03c3": []}) - buf = StringIO() - dm.to_string(buf) - - -def test_to_string_unicode_three(): - dm = DataFrame(["\xc2"]) - buf = StringIO() - dm.to_string(buf) - - -def test_to_string_with_formatters(): - df = DataFrame( - { - "int": [1, 2, 3], - "float": [1.0, 2.0, 3.0], - "object": [(1, 2), True, False], - }, - columns=["int", "float", "object"], - ) +class TestDataFrameToStringHeader: + def test_to_string_header_false(self): + # GH#49230 + df = DataFrame([1, 2]) + df.index.name = "a" + s = df.to_string(header=False) + expected = "a \n0 1\n1 2" + assert s == expected - formatters = [ - ("int", lambda x: f"0x{x:x}"), - ("float", lambda x: f"[{x: 4.1f}]"), - ("object", lambda x: f"-{x!s}-"), - ] - result = df.to_string(formatters=dict(formatters)) - result2 = df.to_string(formatters=list(zip(*formatters))[1]) - assert result == ( - " int float object\n" - "0 0x1 [ 1.0] -(1, 2)-\n" - "1 0x2 [ 2.0] -True-\n" - "2 0x3 [ 3.0] -False-" - ) - assert result == result2 + df = DataFrame([[1, 2], [3, 4]]) + df.index.name = "a" + s = df.to_string(header=False) + expected = "a \n0 1 2\n1 3 4" + assert s == expected + def test_to_string_multindex_header(self): + # GH#16718 + df = DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index(["a", "b"]) + res = df.to_string(header=["r1", "r2"]) + exp = " r1 r2\na b \n0 1 2 3" + assert res == exp -def test_to_string_with_datetime64_monthformatter(): - months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] - x = DataFrame({"months": months}) - - def format_func(x): - return x.strftime("%Y-%m") - - result = x.to_string(formatters={"months": format_func}) - expected = dedent( - """\ - months - 0 2016-01 - 1 2016-02""" - ) - assert result.strip() == expected + def test_to_string_no_header(self): + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + df_s = df.to_string(header=False) + expected = "0 1 4\n1 2 5\n2 3 6" -def test_to_string_with_datetime64_hourformatter(): - x = DataFrame( - {"hod": to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f")} - ) + assert df_s == expected - def format_func(x): - return x.strftime("%H:%M") + def test_to_string_specified_header(self): + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - result = x.to_string(formatters={"hod": format_func}) - expected = dedent( - """\ - hod - 0 10:10 - 1 12:12""" - ) - assert result.strip() == expected + df_s = df.to_string(header=["X", "Y"]) + expected = " X Y\n0 1 4\n1 2 5\n2 3 6" + assert df_s == expected -def test_to_string_with_formatters_unicode(): - df = DataFrame({"c/\u03c3": [1, 2, 3]}) - result = df.to_string(formatters={"c/\u03c3": str}) - expected = dedent( - """\ - c/\u03c3 - 0 1 - 1 2 - 2 3""" - ) - assert result == expected + msg = "Writing 2 cols but got 1 aliases" + with pytest.raises(ValueError, match=msg): + df.to_string(header=["X"]) -def test_to_string_complex_number_trims_zeros(): - s = Series([1.000000 + 1.000000j, 1.0 + 1.0j, 1.05 + 1.0j]) - result = s.to_string() - expected = dedent( - """\ - 0 1.00+1.00j - 1 1.00+1.00j - 2 1.05+1.00j""" - ) - assert result == expected +class TestDataFrameToStringLineWidth: + def test_to_string_line_width(self): + df = DataFrame(123, index=range(10, 15), columns=range(30)) + lines = df.to_string(line_width=80) + assert max(len(line) for line in lines.split("\n")) == 80 + def test_to_string_line_width_no_index(self): + # GH#13998, GH#22505 + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) -def test_nullable_float_to_string(float_ea_dtype): - # https://github.com/pandas-dev/pandas/issues/36775 - dtype = 
float_ea_dtype - s = Series([0.0, 1.0, None], dtype=dtype) - result = s.to_string() - expected = dedent( - """\ - 0 0.0 - 1 1.0 - 2 """ - ) - assert result == expected + df_s = df.to_string(line_width=1, index=False) + expected = " x \\\n 1 \n 2 \n 3 \n\n y \n 4 \n 5 \n 6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False) + expected = " x \\\n11 \n22 \n33 \n\n y \n 4 \n 5 \n 6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) + + df_s = df.to_string(line_width=1, index=False) + expected = " x \\\n 11 \n 22 \n-33 \n\n y \n 4 \n 5 \n-6 " + + assert df_s == expected + + def test_to_string_line_width_no_header(self): + # GH#53054 + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, header=False) + expected = "0 1 \\\n1 2 \n2 3 \n\n0 4 \n1 5 \n2 6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, header=False) + expected = "0 11 \\\n1 22 \n2 33 \n\n0 4 \n1 5 \n2 6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) + + df_s = df.to_string(line_width=1, header=False) + expected = "0 11 \\\n1 22 \n2 -33 \n\n0 4 \n1 5 \n2 -6 " + + assert df_s == expected + + def test_to_string_line_width_with_both_index_and_header(self): + # GH#53054 + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1) + expected = ( + " x \\\n0 1 \n1 2 \n2 3 \n\n y \n0 4 \n1 5 \n2 6 " + ) + + assert df_s == expected + + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1) + expected = ( + " x \\\n0 11 \n1 22 \n2 33 \n\n y \n0 4 \n1 5 \n2 6 " + ) + + assert df_s == expected + + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) + + df_s = df.to_string(line_width=1) + expected = ( + " x \\\n0 11 \n1 22 \n2 -33 \n\n y \n0 4 \n1 5 \n2 -6 " + ) + + assert df_s == expected + + def test_to_string_line_width_no_index_no_header(self): + # GH#53054 + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False, header=False) + expected = "1 \\\n2 \n3 \n\n4 \n5 \n6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False, header=False) + expected = "11 \\\n22 \n33 \n\n4 \n5 \n6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) + + df_s = df.to_string(line_width=1, index=False, header=False) + expected = " 11 \\\n 22 \n-33 \n\n 4 \n 5 \n-6 " + + assert df_s == expected + + +class TestToStringNumericFormatting: + def test_to_string_float_format_no_fixed_width(self): + # GH#21625 + df = DataFrame({"x": [0.19999]}) + expected = " x\n0 0.200" + assert df.to_string(float_format="%.3f") == expected + + # GH#22270 + df = DataFrame({"x": [100.0]}) + expected = " x\n0 100" + assert df.to_string(float_format="%.0f") == expected + + def test_to_string_small_float_values(self): + df = DataFrame({"a": [1.5, 1e-17, -5.5e-7]}) + + result = df.to_string() + # sadness per above + if _three_digit_exp(): + expected = ( + " a\n" + "0 1.500000e+000\n" + "1 1.000000e-017\n" + "2 -5.500000e-007" + ) + else: + expected = ( + " a\n" + "0 1.500000e+00\n" + "1 1.000000e-17\n" + "2 -5.500000e-07" + ) + assert result == expected + + # but not all exactly zero + df = df * 0 + result = df.to_string() + expected = " 0\n0 0\n1 0\n2 -0" + # TODO: assert that these match?? 
+ + def test_to_string_complex_float_formatting(self): + # GH #25514, 25745 + with option_context("display.precision", 5): + df = DataFrame( + { + "x": [ + (0.4467846931321966 + 0.0715185102060818j), + (0.2739442392974528 + 0.23515228785438969j), + (0.26974928742135185 + 0.3250604054898979j), + (-1j), + ] + } + ) + result = df.to_string() + expected = ( + " x\n0 0.44678+0.07152j\n" + "1 0.27394+0.23515j\n" + "2 0.26975+0.32506j\n" + "3 -0.00000-1.00000j" + ) + assert result == expected + + def test_to_string_format_inf(self): + # GH#24861 + df = DataFrame( + { + "A": [-np.inf, np.inf, -1, -2.1234, 3, 4], + "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], + } + ) + result = df.to_string() + + expected = ( + " A B\n" + "0 -inf -inf\n" + "1 inf inf\n" + "2 -1.0000 foo\n" + "3 -2.1234 foooo\n" + "4 3.0000 fooooo\n" + "5 4.0000 bar" + ) + assert result == expected + + df = DataFrame( + { + "A": [-np.inf, np.inf, -1.0, -2.0, 3.0, 4.0], + "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], + } + ) + result = df.to_string() + + expected = ( + " A B\n" + "0 -inf -inf\n" + "1 inf inf\n" + "2 -1.0 foo\n" + "3 -2.0 foooo\n" + "4 3.0 fooooo\n" + "5 4.0 bar" + ) + assert result == expected + + def test_to_string_int_formatting(self): + df = DataFrame({"x": [-15, 20, 25, -35]}) + assert issubclass(df["x"].dtype.type, np.integer) + + output = df.to_string() + expected = " x\n0 -15\n1 20\n2 25\n3 -35" + assert output == expected + + def test_to_string_float_formatting(self): + with option_context( + "display.precision", + 5, + "display.notebook_repr_html", + False, + ): + df = DataFrame( + {"x": [0, 0.25, 3456.000, 12e45, 1.64e6, 1.7e8, 1.253456, np.pi, -1e6]} + ) + + df_s = df.to_string() + + if _three_digit_exp(): + expected = ( + " x\n0 0.00000e+000\n1 2.50000e-001\n" + "2 3.45600e+003\n3 1.20000e+046\n4 1.64000e+006\n" + "5 1.70000e+008\n6 1.25346e+000\n7 3.14159e+000\n" + "8 -1.00000e+006" + ) + else: + expected = ( + " x\n0 0.00000e+00\n1 2.50000e-01\n" + "2 3.45600e+03\n3 1.20000e+46\n4 1.64000e+06\n" + "5 1.70000e+08\n6 1.25346e+00\n7 3.14159e+00\n" + "8 -1.00000e+06" + ) + assert df_s == expected + + df = DataFrame({"x": [3234, 0.253]}) + df_s = df.to_string() + + expected = " x\n0 3234.000\n1 0.253" + assert df_s == expected + + assert get_option("display.precision") == 6 + + df = DataFrame({"x": [1e9, 0.2512]}) + df_s = df.to_string() + + if _three_digit_exp(): + expected = " x\n0 1.000000e+009\n1 2.512000e-001" + else: + expected = " x\n0 1.000000e+09\n1 2.512000e-01" + assert df_s == expected + + +class TestDataFrameToString: + def test_to_string_decimal(self): + # GH#23614 + df = DataFrame({"A": [6.0, 3.1, 2.2]}) + expected = " A\n0 6,0\n1 3,1\n2 2,2" + assert df.to_string(decimal=",") == expected + + def test_to_string_left_justify_cols(self): + df = DataFrame({"x": [3234, 0.253]}) + df_s = df.to_string(justify="left") + expected = " x \n0 3234.000\n1 0.253" + assert df_s == expected + + def test_to_string_format_na(self): + df = DataFrame( + { + "A": [np.nan, -1, -2.1234, 3, 4], + "B": [np.nan, "foo", "foooo", "fooooo", "bar"], + } + ) + result = df.to_string() + + expected = ( + " A B\n" + "0 NaN NaN\n" + "1 -1.0000 foo\n" + "2 -2.1234 foooo\n" + "3 3.0000 fooooo\n" + "4 4.0000 bar" + ) + assert result == expected + + df = DataFrame( + { + "A": [np.nan, -1.0, -2.0, 3.0, 4.0], + "B": [np.nan, "foo", "foooo", "fooooo", "bar"], + } + ) + result = df.to_string() + expected = ( + " A B\n" + "0 NaN NaN\n" + "1 -1.0 foo\n" + "2 -2.0 foooo\n" + "3 3.0 fooooo\n" + "4 4.0 bar" + ) + 
assert result == expected + + def test_to_string_with_dict_entries(self): + df = DataFrame({"A": [{"a": 1, "b": 2}]}) + + val = df.to_string() + assert "'a': 1" in val + assert "'b': 2" in val + + def test_to_string_with_categorical_columns(self): + # GH#35439 + data = [[4, 2], [3, 2], [4, 3]] + cols = ["aaaaaaaaa", "b"] + df = DataFrame(data, columns=cols) + df_cat_cols = DataFrame(data, columns=CategoricalIndex(cols)) + + assert df.to_string() == df_cat_cols.to_string() + + def test_repr_embedded_ndarray(self): + arr = np.empty(10, dtype=[("err", object)]) + for i in range(len(arr)): + arr["err"][i] = np.random.default_rng(2).standard_normal(i) + + df = DataFrame(arr) + repr(df["err"]) + repr(df) + df.to_string() + + def test_to_string_truncate(self): + # GH 9784 - dont truncate when calling DataFrame.to_string + df = DataFrame( + [ + { + "a": "foo", + "b": "bar", + "c": "let's make this a very VERY long line that is longer " + "than the default 50 character limit", + "d": 1, + }, + {"a": "foo", "b": "bar", "c": "stuff", "d": 1}, + ] + ) + df.set_index(["a", "b", "c"]) + assert df.to_string() == ( + " a b " + " c d\n" + "0 foo bar let's make this a very VERY long line t" + "hat is longer than the default 50 character limit 1\n" + "1 foo bar " + " stuff 1" + ) + with option_context("max_colwidth", 20): + # the display option has no effect on the to_string method + assert df.to_string() == ( + " a b " + " c d\n" + "0 foo bar let's make this a very VERY long line t" + "hat is longer than the default 50 character limit 1\n" + "1 foo bar " + " stuff 1" + ) + assert df.to_string(max_colwidth=20) == ( + " a b c d\n" + "0 foo bar let's make this ... 1\n" + "1 foo bar stuff 1" + ) -def test_nullable_int_to_string(any_int_ea_dtype): - # https://github.com/pandas-dev/pandas/issues/36775 - dtype = any_int_ea_dtype - s = Series([0, 1, None], dtype=dtype) - result = s.to_string() - expected = dedent( - """\ - 0 0 - 1 1 - 2 """ + @pytest.mark.parametrize( + "input_array, expected", + [ + ({"A": ["a"]}, "A\na"), + ({"A": ["a", "b"], "B": ["c", "dd"]}, "A B\na c\nb dd"), + ({"A": ["a", 1], "B": ["aa", 1]}, "A B\na aa\n1 1"), + ], ) - assert result == expected + def test_format_remove_leading_space_dataframe(self, input_array, expected): + # GH#24980 + df = DataFrame(input_array).to_string(index=False) + assert df == expected + @pytest.mark.parametrize( + "data,expected", + [ + ( + {"col1": [1, 2], "col2": [3, 4]}, + " col1 col2\n0 1 3\n1 2 4", + ), + ( + {"col1": ["Abc", 0.756], "col2": [np.nan, 4.5435]}, + " col1 col2\n0 Abc NaN\n1 0.756 4.5435", + ), + ( + {"col1": [np.nan, "a"], "col2": [0.009, 3.543], "col3": ["Abc", 23]}, + " col1 col2 col3\n0 NaN 0.009 Abc\n1 a 3.543 23", + ), + ], + ) + def test_to_string_max_rows_zero(self, data, expected): + # GH#35394 + result = DataFrame(data=data).to_string(max_rows=0) + assert result == expected -@pytest.mark.parametrize("na_rep", ["NaN", "Ted"]) -def test_to_string_na_rep_and_float_format(na_rep): - # GH 13828 - df = DataFrame([["A", 1.2225], ["A", None]], columns=["Group", "Data"]) - result = df.to_string(na_rep=na_rep, float_format="{:.2f}".format) - expected = dedent( - f"""\ - Group Data - 0 A 1.22 - 1 A {na_rep}""" + @pytest.mark.parametrize( + "max_cols, max_rows, expected", + [ + ( + 10, + None, + " 0 1 2 3 4 ... 6 7 8 9 10\n" + " 0 0 0 0 0 ... 0 0 0 0 0\n" + " 0 0 0 0 0 ... 0 0 0 0 0\n" + " 0 0 0 0 0 ... 0 0 0 0 0\n" + " 0 0 0 0 0 ... 0 0 0 0 0", + ), + ( + None, + 2, + " 0 1 2 3 4 5 6 7 8 9 10\n" + " 0 0 0 0 0 0 0 0 0 0 0\n" + " .. .. .. .. .. .. 
.. .. .. .. ..\n" + " 0 0 0 0 0 0 0 0 0 0 0", + ), + ( + 10, + 2, + " 0 1 2 3 4 ... 6 7 8 9 10\n" + " 0 0 0 0 0 ... 0 0 0 0 0\n" + " .. .. .. .. .. ... .. .. .. .. ..\n" + " 0 0 0 0 0 ... 0 0 0 0 0", + ), + ( + 9, + 2, + " 0 1 2 3 ... 7 8 9 10\n" + " 0 0 0 0 ... 0 0 0 0\n" + " .. .. .. .. ... .. .. .. ..\n" + " 0 0 0 0 ... 0 0 0 0", + ), + ( + 1, + 1, + " 0 ...\n 0 ...\n.. ...", + ), + ], ) - assert result == expected + def test_truncation_no_index(self, max_cols, max_rows, expected): + df = DataFrame([[0] * 11] * 4) + assert ( + df.to_string(index=False, max_cols=max_cols, max_rows=max_rows) == expected + ) + def test_to_string_no_index(self): + # GH#16839, GH#13032 + df = DataFrame({"x": [11, 22], "y": [33, -44], "z": ["AAA", " "]}) + + df_s = df.to_string(index=False) + # Leading space is expected for positive numbers. + expected = " x y z\n11 33 AAA\n22 -44 " + assert df_s == expected + + df_s = df[["y", "x", "z"]].to_string(index=False) + expected = " y x z\n 33 11 AAA\n-44 22 " + assert df_s == expected + + def test_to_string_unicode_columns(self, float_frame): + df = DataFrame({"\u03c3": np.arange(10.0)}) + + buf = StringIO() + df.to_string(buf=buf) + buf.getvalue() + + buf = StringIO() + df.info(buf=buf) + buf.getvalue() + + result = float_frame.to_string() + assert isinstance(result, str) + + @pytest.mark.parametrize("na_rep", ["NaN", "Ted"]) + def test_to_string_na_rep_and_float_format(self, na_rep): + # GH#13828 + df = DataFrame([["A", 1.2225], ["A", None]], columns=["Group", "Data"]) + result = df.to_string(na_rep=na_rep, float_format="{:.2f}".format) + expected = dedent( + f"""\ + Group Data + 0 A 1.22 + 1 A {na_rep}""" + ) + assert result == expected -@pytest.mark.parametrize( - "data,expected", - [ - ( - {"col1": [1, 2], "col2": [3, 4]}, - " col1 col2\n0 1 3\n1 2 4", - ), - ( - {"col1": ["Abc", 0.756], "col2": [np.nan, 4.5435]}, - " col1 col2\n0 Abc NaN\n1 0.756 4.5435", - ), - ( - {"col1": [np.nan, "a"], "col2": [0.009, 3.543], "col3": ["Abc", 23]}, - " col1 col2 col3\n0 NaN 0.009 Abc\n1 a 3.543 23", - ), - ], -) -def test_to_string_max_rows_zero(data, expected): - # GH35394 - result = DataFrame(data=data).to_string(max_rows=0) - assert result == expected + def test_to_string_string_dtype(self): + # GH#50099 + pytest.importorskip("pyarrow") + df = DataFrame( + {"x": ["foo", "bar", "baz"], "y": ["a", "b", "c"], "z": [1, 2, 3]} + ) + df = df.astype( + {"x": "string[pyarrow]", "y": "string[python]", "z": "int64[pyarrow]"} + ) + result = df.dtypes.to_string() + expected = dedent( + """\ + x string[pyarrow] + y string[python] + z int64[pyarrow]""" + ) + assert result == expected + def test_to_string_pos_args_deprecation(self): + # GH#54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + "Starting with pandas version 3.0 all arguments of to_string " + "except for the " + "argument 'buf' will be keyword-only." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + buf = StringIO() + df.to_string(buf, None, None, True, True) -def test_to_string_string_dtype(): - # GH#50099 - pytest.importorskip("pyarrow") - df = DataFrame({"x": ["foo", "bar", "baz"], "y": ["a", "b", "c"], "z": [1, 2, 3]}) - df = df.astype( - {"x": "string[pyarrow]", "y": "string[python]", "z": "int64[pyarrow]"} - ) - result = df.dtypes.to_string() - expected = dedent( - """\ - x string[pyarrow] - y string[python] - z int64[pyarrow]""" + def test_to_string_utf8_columns(self): + n = "\u05d0".encode() + df = DataFrame([1, 2], columns=[n]) + + with option_context("display.max_rows", 1): + repr(df) + + def test_to_string_unicode_two(self): + dm = DataFrame({"c/\u03c3": []}) + buf = StringIO() + dm.to_string(buf) + + def test_to_string_unicode_three(self): + dm = DataFrame(["\xc2"]) + buf = StringIO() + dm.to_string(buf) + + def test_to_string_with_float_index(self): + index = Index([1.5, 2, 3, 4, 5]) + df = DataFrame(np.arange(5), index=index) + + result = df.to_string() + expected = " 0\n1.5 0\n2.0 1\n3.0 2\n4.0 3\n5.0 4" + assert result == expected + + def test_to_string(self): + # big mixed + biggie = DataFrame( + { + "A": np.random.default_rng(2).standard_normal(200), + "B": Index([f"{i}?!" for i in range(200)]), + }, + ) + + biggie.loc[:20, "A"] = np.nan + biggie.loc[:20, "B"] = np.nan + s = biggie.to_string() + + buf = StringIO() + retval = biggie.to_string(buf=buf) + assert retval is None + assert buf.getvalue() == s + + assert isinstance(s, str) + + # print in right order + result = biggie.to_string( + columns=["B", "A"], col_space=17, float_format="%.5f".__mod__ + ) + lines = result.split("\n") + header = lines[0].strip().split() + joined = "\n".join([re.sub(r"\s+", " ", x).strip() for x in lines[1:]]) + recons = read_csv(StringIO(joined), names=header, header=None, sep=" ") + tm.assert_series_equal(recons["B"], biggie["B"]) + assert recons["A"].count() == biggie["A"].count() + assert (np.abs(recons["A"].dropna() - biggie["A"].dropna()) < 0.1).all() + + # FIXME: don't leave commented-out + # expected = ['B', 'A'] + # assert header == expected + + result = biggie.to_string(columns=["A"], col_space=17) + header = result.split("\n")[0].strip().split() + expected = ["A"] + assert header == expected + + biggie.to_string(columns=["B", "A"], formatters={"A": lambda x: f"{x:.1f}"}) + + biggie.to_string(columns=["B", "A"], float_format=str) + biggie.to_string(columns=["B", "A"], col_space=12, float_format=str) + + frame = DataFrame(index=np.arange(200)) + frame.to_string() + + # TODO: split or simplify this test? 
+ @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="fix when arrow is default") + def test_to_string_index_with_nan(self): + # GH#2850 + df = DataFrame( + { + "id1": {0: "1a3", 1: "9h4"}, + "id2": {0: np.nan, 1: "d67"}, + "id3": {0: "78d", 1: "79d"}, + "value": {0: 123, 1: 64}, + } + ) + + # multi-index + y = df.set_index(["id1", "id2", "id3"]) + result = y.to_string() + expected = ( + " value\nid1 id2 id3 \n" + "1a3 NaN 78d 123\n9h4 d67 79d 64" + ) + assert result == expected + + # index + y = df.set_index("id2") + result = y.to_string() + expected = ( + " id1 id3 value\nid2 \n" + "NaN 1a3 78d 123\nd67 9h4 79d 64" + ) + assert result == expected + + # with append (this failed in 0.12) + y = df.set_index(["id1", "id2"]).set_index("id3", append=True) + result = y.to_string() + expected = ( + " value\nid1 id2 id3 \n" + "1a3 NaN 78d 123\n9h4 d67 79d 64" + ) + assert result == expected + + # all-nan in mi + df2 = df.copy() + df2.loc[:, "id2"] = np.nan + y = df2.set_index("id2") + result = y.to_string() + expected = ( + " id1 id3 value\nid2 \n" + "NaN 1a3 78d 123\nNaN 9h4 79d 64" + ) + assert result == expected + + # partial nan in mi + df2 = df.copy() + df2.loc[:, "id2"] = np.nan + y = df2.set_index(["id2", "id3"]) + result = y.to_string() + expected = ( + " id1 value\nid2 id3 \n" + "NaN 78d 1a3 123\n 79d 9h4 64" + ) + assert result == expected + + df = DataFrame( + { + "id1": {0: np.nan, 1: "9h4"}, + "id2": {0: np.nan, 1: "d67"}, + "id3": {0: np.nan, 1: "79d"}, + "value": {0: 123, 1: 64}, + } + ) + + y = df.set_index(["id1", "id2", "id3"]) + result = y.to_string() + expected = ( + " value\nid1 id2 id3 \n" + "NaN NaN NaN 123\n9h4 d67 79d 64" + ) + assert result == expected + + def test_to_string_nonunicode_nonascii_alignment(self): + df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) + rep_str = df.to_string() + lines = rep_str.split("\n") + assert len(lines[1]) == len(lines[2]) + + def test_unicode_problem_decoding_as_ascii(self): + df = DataFrame({"c/\u03c3": Series({"test": np.nan})}) + str(df.to_string()) + + def test_to_string_repr_unicode(self): + buf = StringIO() + + unicode_values = ["\u03c3"] * 10 + unicode_values = np.array(unicode_values, dtype=object) + df = DataFrame({"unicode": unicode_values}) + df.to_string(col_space=10, buf=buf) + + # it works! 
+ repr(df) + # it works even if sys.stdin in None + _stdin = sys.stdin + try: + sys.stdin = None + repr(df) + finally: + sys.stdin = _stdin + + +class TestSeriesToString: + def test_to_string_without_index(self): + # GH#11729 Test index=False option + ser = Series([1, 2, 3, 4]) + result = ser.to_string(index=False) + expected = "\n".join(["1", "2", "3", "4"]) + assert result == expected + + def test_to_string_name(self): + ser = Series(range(100), dtype="int64") + ser.name = "myser" + res = ser.to_string(max_rows=2, name=True) + exp = "0 0\n ..\n99 99\nName: myser" + assert res == exp + res = ser.to_string(max_rows=2, name=False) + exp = "0 0\n ..\n99 99" + assert res == exp + + def test_to_string_dtype(self): + ser = Series(range(100), dtype="int64") + res = ser.to_string(max_rows=2, dtype=True) + exp = "0 0\n ..\n99 99\ndtype: int64" + assert res == exp + res = ser.to_string(max_rows=2, dtype=False) + exp = "0 0\n ..\n99 99" + assert res == exp + + def test_to_string_length(self): + ser = Series(range(100), dtype="int64") + res = ser.to_string(max_rows=2, length=True) + exp = "0 0\n ..\n99 99\nLength: 100" + assert res == exp + + def test_to_string_na_rep(self): + ser = Series(index=range(100), dtype=np.float64) + res = ser.to_string(na_rep="foo", max_rows=2) + exp = "0 foo\n ..\n99 foo" + assert res == exp + + def test_to_string_float_format(self): + ser = Series(range(10), dtype="float64") + res = ser.to_string(float_format=lambda x: f"{x:2.1f}", max_rows=2) + exp = "0 0.0\n ..\n9 9.0" + assert res == exp + + def test_to_string_header(self): + ser = Series(range(10), dtype="int64") + ser.index.name = "foo" + res = ser.to_string(header=True, max_rows=2) + exp = "foo\n0 0\n ..\n9 9" + assert res == exp + res = ser.to_string(header=False, max_rows=2) + exp = "0 0\n ..\n9 9" + assert res == exp + + def test_to_string_empty_col(self): + # GH#13653 + ser = Series(["", "Hello", "World", "", "", "Mooooo", "", ""]) + res = ser.to_string(index=False) + exp = " \n Hello\n World\n \n \nMooooo\n \n " + assert re.match(exp, res) + + def test_to_string_timedelta64(self): + Series(np.array([1100, 20], dtype="timedelta64[ns]")).to_string() + + ser = Series(date_range("2012-1-1", periods=3, freq="D")) + + # GH#2146 + + # adding NaTs + y = ser - ser.shift(1) + result = y.to_string() + assert "1 days" in result + assert "00:00:00" not in result + assert "NaT" in result + + # with frac seconds + o = Series([datetime(2012, 1, 1, microsecond=150)] * 3) + y = ser - o + result = y.to_string() + assert "-1 days +23:59:59.999850" in result + + # rounding? 
+ o = Series([datetime(2012, 1, 1, 1)] * 3) + y = ser - o + result = y.to_string() + assert "-1 days +23:00:00" in result + assert "1 days 23:00:00" in result + + o = Series([datetime(2012, 1, 1, 1, 1)] * 3) + y = ser - o + result = y.to_string() + assert "-1 days +22:59:00" in result + assert "1 days 22:59:00" in result + + o = Series([datetime(2012, 1, 1, 1, 1, microsecond=150)] * 3) + y = ser - o + result = y.to_string() + assert "-1 days +22:58:59.999850" in result + assert "0 days 22:58:59.999850" in result + + # neg time + td = timedelta(minutes=5, seconds=3) + s2 = Series(date_range("2012-1-1", periods=3, freq="D")) + td + y = ser - s2 + result = y.to_string() + assert "-1 days +23:54:57" in result + + td = timedelta(microseconds=550) + s2 = Series(date_range("2012-1-1", periods=3, freq="D")) + td + y = ser - td + result = y.to_string() + assert "2012-01-01 23:59:59.999450" in result + + # no boxing of the actual elements + td = Series(timedelta_range("1 days", periods=3)) + result = td.to_string() + assert result == "0 1 days\n1 2 days\n2 3 days" + + def test_to_string(self): + ts = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10, freq="B"), + ) + buf = StringIO() + + s = ts.to_string() + + retval = ts.to_string(buf=buf) + assert retval is None + assert buf.getvalue().strip() == s + + # pass float_format + format = "%.4f".__mod__ + result = ts.to_string(float_format=format) + result = [x.split()[1] for x in result.split("\n")[:-1]] + expected = [format(x) for x in ts] + assert result == expected + + # empty string + result = ts[:0].to_string() + assert result == "Series([], Freq: B)" + + result = ts[:0].to_string(length=0) + assert result == "Series([], Freq: B)" + + # name and length + cp = ts.copy() + cp.name = "foo" + result = cp.to_string(length=True, name=True, dtype=True) + last_line = result.split("\n")[-1].strip() + assert last_line == (f"Freq: B, Name: foo, Length: {len(cp)}, dtype: float64") + + @pytest.mark.parametrize( + "input_array, expected", + [ + ("a", "a"), + (["a", "b"], "a\nb"), + ([1, "a"], "1\na"), + (1, "1"), + ([0, -1], " 0\n-1"), + (1.0, "1.0"), + ([" a", " b"], " a\n b"), + ([".1", "1"], ".1\n 1"), + (["10", "-10"], " 10\n-10"), + ], ) - assert result == expected + def test_format_remove_leading_space_series(self, input_array, expected): + # GH: 24980 + ser = Series(input_array) + result = ser.to_string(index=False) + assert result == expected + + def test_to_string_complex_number_trims_zeros(self): + ser = Series([1.000000 + 1.000000j, 1.0 + 1.0j, 1.05 + 1.0j]) + result = ser.to_string() + expected = dedent( + """\ + 0 1.00+1.00j + 1 1.00+1.00j + 2 1.05+1.00j""" + ) + assert result == expected + + def test_nullable_float_to_string(self, float_ea_dtype): + # https://github.com/pandas-dev/pandas/issues/36775 + dtype = float_ea_dtype + ser = Series([0.0, 1.0, None], dtype=dtype) + result = ser.to_string() + expected = dedent( + """\ + 0 0.0 + 1 1.0 + 2 """ + ) + assert result == expected + + def test_nullable_int_to_string(self, any_int_ea_dtype): + # https://github.com/pandas-dev/pandas/issues/36775 + dtype = any_int_ea_dtype + ser = Series([0, 1, None], dtype=dtype) + result = ser.to_string() + expected = dedent( + """\ + 0 0 + 1 1 + 2 """ + ) + assert result == expected + + def test_to_string_mixed(self): + ser = Series(["foo", np.nan, -1.23, 4.56]) + result = ser.to_string() + expected = "".join(["0 foo\n", "1 NaN\n", "2 -1.23\n", "3 4.56"]) + assert result == expected + + # but don't count NAs as floats + ser = 
Series(["foo", np.nan, "bar", "baz"]) + result = ser.to_string() + expected = "".join(["0 foo\n", "1 NaN\n", "2 bar\n", "3 baz"]) + assert result == expected + + ser = Series(["foo", 5, "bar", "baz"]) + result = ser.to_string() + expected = "".join(["0 foo\n", "1 5\n", "2 bar\n", "3 baz"]) + assert result == expected + + def test_to_string_float_na_spacing(self): + ser = Series([0.0, 1.5678, 2.0, -3.0, 4.0]) + ser[::2] = np.nan + + result = ser.to_string() + expected = ( + "0 NaN\n" + "1 1.5678\n" + "2 NaN\n" + "3 -3.0000\n" + "4 NaN" + ) + assert result == expected + + def test_to_string_with_datetimeindex(self): + index = date_range("20130102", periods=6) + ser = Series(1, index=index) + result = ser.to_string() + assert "2013-01-02" in result + + # nat in index + s2 = Series(2, index=[Timestamp("20130111"), NaT]) + ser = concat([s2, ser]) + result = ser.to_string() + assert "NaT" in result + + # nat in summary + result = str(s2.index) + assert "NaT" in result diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/generate_legacy_storage_files.py pandas-2.2.2+dfsg/pandas/tests/io/generate_legacy_storage_files.py --- pandas-2.1.4+dfsg/pandas/tests/io/generate_legacy_storage_files.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/generate_legacy_storage_files.py 2024-04-10 17:42:52.000000000 +0000 @@ -128,7 +128,7 @@ return DataFrame(data, index=dates).apply(SparseArray) -def create_data(): +def create_pickle_data(): """create the pickle data""" data = { "A": [0.0, 1.0, 2.0, 3.0, np.nan], @@ -146,7 +146,7 @@ "period": period_range("2013-01-01", freq="M", periods=10), "float": Index(np.arange(10, dtype=np.float64)), "uint": Index(np.arange(10, dtype=np.uint64)), - "timedelta": timedelta_range("00:00:00", freq="30T", periods=10), + "timedelta": timedelta_range("00:00:00", freq="30min", periods=10), } index["range"] = RangeIndex(10) @@ -286,12 +286,6 @@ } -def create_pickle_data(): - data = create_data() - - return data - - def platform_name(): return "_".join( [ diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/json/conftest.py pandas-2.2.2+dfsg/pandas/tests/io/json/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/io/json/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/json/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,10 +7,3 @@ Fixture for orients excluding the table format. """ return request.param - - -@pytest.fixture(params=["ujson", "pyarrow"]) -def engine(request): - if request.param == "pyarrow": - pytest.importorskip("pyarrow.json") - return request.param diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/json/test_compression.py pandas-2.2.2+dfsg/pandas/tests/io/json/test_compression.py --- pandas-2.1.4+dfsg/pandas/tests/io/json/test_compression.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/json/test_compression.py 2024-04-10 17:42:52.000000000 +0000 @@ -93,27 +93,31 @@ pd.read_json(path, compression="unsupported") +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) @pytest.mark.parametrize("to_infer", [True, False]) @pytest.mark.parametrize("read_infer", [True, False]) def test_to_json_compression( - compression_only, read_infer, to_infer, compression_to_extension + compression_only, read_infer, to_infer, compression_to_extension, infer_string ): - # see gh-15008 - compression = compression_only - - # We'll complete file extension subsequently. - filename = "test." 
- filename += compression_to_extension[compression] - - df = pd.DataFrame({"A": [1]}) - - to_compression = "infer" if to_infer else compression - read_compression = "infer" if read_infer else compression - - with tm.ensure_clean(filename) as path: - df.to_json(path, compression=to_compression) - result = pd.read_json(path, compression=read_compression) - tm.assert_frame_equal(result, df) + with pd.option_context("future.infer_string", infer_string): + # see gh-15008 + compression = compression_only + + # We'll complete file extension subsequently. + filename = "test." + filename += compression_to_extension[compression] + + df = pd.DataFrame({"A": [1]}) + + to_compression = "infer" if to_infer else compression + read_compression = "infer" if read_infer else compression + + with tm.ensure_clean(filename) as path: + df.to_json(path, compression=to_compression) + result = pd.read_json(path, compression=read_compression) + tm.assert_frame_equal(result, df) def test_to_json_compression_mode(compression): diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/json/test_json_table_schema.py pandas-2.2.2+dfsg/pandas/tests/io/json/test_json_table_schema.py --- pandas-2.1.4+dfsg/pandas/tests/io/json/test_json_table_schema.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/json/test_json_table_schema.py 2024-04-10 17:42:52.000000000 +0000 @@ -32,7 +32,7 @@ "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], "C": pd.date_range("2016-01-01", freq="d", periods=4), - "D": pd.timedelta_range("1H", periods=4, freq="T"), + "D": pd.timedelta_range("1h", periods=4, freq="min"), }, index=pd.Index(range(4), name="idx"), ) @@ -45,7 +45,7 @@ "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], "C": pd.date_range("2016-01-01", freq="d", periods=4), - "D": pd.timedelta_range("1H", periods=4, freq="T"), + "D": pd.timedelta_range("1h", periods=4, freq="min"), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.0, 2.0, 3, 4.0], @@ -56,7 +56,7 @@ class TestBuildSchema: - def test_build_table_schema(self, df_schema): + def test_build_table_schema(self, df_schema, using_infer_string): result = build_table_schema(df_schema, version=False) expected = { "fields": [ @@ -68,6 +68,8 @@ ], "primaryKey": ["idx"], } + if using_infer_string: + expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"} assert result == expected result = build_table_schema(df_schema) assert "pandas_version" in result @@ -97,7 +99,7 @@ } assert result == expected - def test_multiindex(self, df_schema): + def test_multiindex(self, df_schema, using_infer_string): df = df_schema idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)]) df.index = idx @@ -114,6 +116,13 @@ ], "primaryKey": ["level_0", "level_1"], } + if using_infer_string: + expected["fields"][0] = { + "name": "level_0", + "type": "any", + "extDtype": "string", + } + expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"} assert result == expected df.index.names = ["idx0", None] @@ -150,13 +159,16 @@ pd.to_datetime(["2016"], utc=True), pd.Series(pd.to_datetime(["2016"])), pd.Series(pd.to_datetime(["2016"], utc=True)), - pd.period_range("2016", freq="A", periods=3), + pd.period_range("2016", freq="Y", periods=3), ], ) def test_as_json_table_type_date_data(self, date_data): assert as_json_table_type(date_data.dtype) == "datetime" - @pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])]) + @pytest.mark.parametrize( + "str_data", + [pd.Series(["a", "b"], 
dtype=object), pd.Index(["a", "b"], dtype=object)], + ) def test_as_json_table_type_string_data(self, str_data): assert as_json_table_type(str_data.dtype) == "string" @@ -261,7 +273,7 @@ tm.assert_frame_equal(result1, df) tm.assert_frame_equal(result2, df) - def test_to_json(self, df_table): + def test_to_json(self, df_table, using_infer_string): df = df_table df.index.name = "idx" result = df.to_json(orient="table", date_format="iso") @@ -292,6 +304,9 @@ {"name": "H", "type": "datetime", "tz": "US/Central"}, ] + if using_infer_string: + fields[2] = {"name": "B", "type": "any", "extDtype": "string"} + schema = {"fields": fields, "primaryKey": ["idx"]} data = [ OrderedDict( @@ -389,7 +404,7 @@ result["schema"].pop("pandas_version") fields = [ - {"freq": "Q-JAN", "name": "index", "type": "datetime"}, + {"freq": "QE-JAN", "name": "index", "type": "datetime"}, {"name": "values", "type": "integer"}, ] @@ -480,9 +495,9 @@ assert result == expected def test_convert_pandas_type_to_json_period_range(self): - arr = pd.period_range("2016", freq="A-DEC", periods=4) + arr = pd.period_range("2016", freq="Y-DEC", periods=4) result = convert_pandas_type_to_json_field(arr) - expected = {"name": "values", "type": "datetime", "freq": "A-DEC"} + expected = {"name": "values", "type": "datetime", "freq": "YE-DEC"} assert result == expected @pytest.mark.parametrize("kind", [pd.Categorical, pd.CategoricalIndex]) @@ -695,7 +710,7 @@ @pytest.mark.parametrize("index_nm", [None, "idx", "index"]) @pytest.mark.parametrize( "vals", - [{"timedeltas": pd.timedelta_range("1H", periods=4, freq="T")}], + [{"timedeltas": pd.timedelta_range("1h", periods=4, freq="min")}], ) def test_read_json_table_orient_raises(self, index_nm, vals, recwarn): df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) @@ -786,7 +801,7 @@ "A": [1, 2, 3, 4], "B": ["a", "b", "c", "c"], "C": pd.date_range("2016-01-01", freq="d", periods=4), - # 'D': pd.timedelta_range('1H', periods=4, freq='T'), + # 'D': pd.timedelta_range('1h', periods=4, freq='min'), "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])), "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)), "G": [1.1, 2.2, 3.3, 4.4], @@ -845,3 +860,14 @@ expected = DataFrame({"a": [1, 2.0, "s"]}) result = pd.read_json(StringIO(df_json), orient="table") tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize("freq", ["M", "2M", "Q", "2Q", "Y", "2Y"]) + def test_read_json_table_orient_period_depr_freq(self, freq, recwarn): + # GH#9586 + df = DataFrame( + {"ints": [1, 2]}, + index=pd.PeriodIndex(["2020-01", "2021-06"], freq=freq), + ) + out = df.to_json(orient="table") + result = pd.read_json(out, orient="table") + tm.assert_frame_equal(df, result) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/json/test_normalize.py pandas-2.2.2+dfsg/pandas/tests/io/json/test_normalize.py --- pandas-2.1.4+dfsg/pandas/tests/io/json/test_normalize.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/json/test_normalize.py 2024-04-10 17:42:52.000000000 +0000 @@ -200,7 +200,7 @@ ) def test_accepted_input(self, data, record_path, exception_type): if exception_type is not None: - with pytest.raises(exception_type, match=tm.EMPTY_STRING_PATTERN): + with pytest.raises(exception_type, match=""): json_normalize(data, record_path=record_path) else: result = json_normalize(data, record_path=record_path) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/json/test_pandas.py pandas-2.2.2+dfsg/pandas/tests/io/json/test_pandas.py --- pandas-2.1.4+dfsg/pandas/tests/io/json/test_pandas.py 
2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/json/test_pandas.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,10 @@ import datetime from datetime import timedelta from decimal import Decimal -from io import StringIO +from io import ( + BytesIO, + StringIO, +) import json import os import sys @@ -10,6 +13,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import IS64 import pandas.util._test_decorators as td @@ -18,8 +23,11 @@ NA, DataFrame, DatetimeIndex, + Index, + RangeIndex, Series, Timestamp, + date_range, read_json, ) import pandas._testing as tm @@ -27,6 +35,9 @@ ArrowStringArray, StringArray, ) +from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + +from pandas.io.json import ujson_dumps def test_literal_json_deprecation(): @@ -85,22 +96,24 @@ class TestPandasContainer: @pytest.fixture def categorical_frame(self): - _seriesd = tm.getSeriesData() - - _cat_frame = DataFrame(_seriesd) - - cat = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * (len(_cat_frame) - 15) - _cat_frame.index = pd.CategoricalIndex(cat, name="E") - _cat_frame["E"] = list(reversed(cat)) - _cat_frame["sort"] = np.arange(len(_cat_frame), dtype="int64") - return _cat_frame + data = { + c: np.random.default_rng(i).standard_normal(30) + for i, c in enumerate(list("ABCD")) + } + cat = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * 15 + data["E"] = list(reversed(cat)) + data["sort"] = np.arange(30, dtype="int64") + return DataFrame(data, index=pd.CategoricalIndex(cat, name="E")) @pytest.fixture def datetime_series(self): # Same as usual datetime_series, but with index freq set to None, # since that doesn't round-trip, see GH#33711 - ser = tm.makeTimeSeries() - ser.name = "ts" + ser = Series( + 1.1 * np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) ser.index = ser.index._with_freq(None) return ser @@ -108,7 +121,11 @@ def datetime_frame(self): # Same as usual datetime_frame, but with index freq set to None, # since that doesn't round-trip, see GH#33711 - df = DataFrame(tm.getTimeSeriesData()) + df = DataFrame( + np.random.default_rng(2).standard_normal((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=30, freq="B"), + ) df.index = df.index._with_freq(None) return df @@ -163,7 +180,7 @@ # in milliseconds; these are internally stored in nanosecond, # so divide to get where we need # TODO: a to_epoch method would also solve; see GH 14772 - expected.iloc[:, 0] = expected.iloc[:, 0].view(np.int64) // 1000000 + expected.isetitem(0, expected.iloc[:, 0].astype(np.int64) // 1000000) elif orient == "split": expected = df expected.columns = ["x", "x.1"] @@ -233,11 +250,11 @@ @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_categorical( - self, request, orient, categorical_frame, convert_axes + self, request, orient, categorical_frame, convert_axes, using_infer_string ): # TODO: create a better frame to test with and improve coverage if orient in ("index", "columns"): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason=f"Can't have duplicate index values for orient '{orient}')" ) @@ -247,7 +264,9 @@ result = read_json(data, orient=orient, convert_axes=convert_axes) expected = categorical_frame.copy() - expected.index = expected.index.astype(str) # Categorical not preserved + expected.index = expected.index.astype( + str if not using_infer_string else 
"string[pyarrow_numpy]" + ) # Categorical not preserved expected.index.name = None # index names aren't preserved in JSON assert_json_roundtrip_equal(result, expected, orient) @@ -257,7 +276,7 @@ data = StringIO(empty_frame.to_json(orient=orient)) result = read_json(data, orient=orient, convert_axes=convert_axes) if orient == "split": - idx = pd.Index([], dtype=(float if convert_axes else object)) + idx = Index([], dtype=(float if convert_axes else object)) expected = DataFrame(index=idx, columns=idx) elif orient in ["index", "columns"]: expected = DataFrame() @@ -285,7 +304,7 @@ @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_mixed(self, orient, convert_axes): - index = pd.Index(["a", "b", "c", "d", "e"]) + index = Index(["a", "b", "c", "d", "e"]) values = { "A": [0.0, 1.0, 2.0, 3.0, 4.0], "B": [0.0, 1.0, 0.0, 1.0, 0.0], @@ -475,18 +494,18 @@ left = read_json(inp, orient=orient, convert_axes=False) tm.assert_frame_equal(left, right) - right.index = pd.RangeIndex(len(df)) + right.index = RangeIndex(len(df)) inp = StringIO(df.to_json(orient="records")) left = read_json(inp, orient="records", convert_axes=False) tm.assert_frame_equal(left, right) - right.columns = pd.RangeIndex(df.shape[1]) + right.columns = RangeIndex(df.shape[1]) inp = StringIO(df.to_json(orient="values")) left = read_json(inp, orient="values", convert_axes=False) tm.assert_frame_equal(left, right) def test_v12_compat(self, datapath): - dti = pd.date_range("2000-01-03", "2000-01-07") + dti = date_range("2000-01-03", "2000-01-07") # freq doesn't roundtrip dti = DatetimeIndex(np.asarray(dti), freq=None) df = DataFrame( @@ -513,10 +532,10 @@ df_iso = df.drop(["modified"], axis=1) v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json") df_unser_iso = read_json(v12_iso_json) - tm.assert_frame_equal(df_iso, df_unser_iso) + tm.assert_frame_equal(df_iso, df_unser_iso, check_column_type=False) - def test_blocks_compat_GH9037(self): - index = pd.date_range("20000101", periods=10, freq="H") + def test_blocks_compat_GH9037(self, using_infer_string): + index = date_range("20000101", periods=10, freq="h") # freq doesn't round-trip index = DatetimeIndex(list(index), freq=None) @@ -599,7 +618,9 @@ ) # JSON deserialisation always creates unicode strings - df_mixed.columns = df_mixed.columns.astype(np.str_) + df_mixed.columns = df_mixed.columns.astype( + np.str_ if not using_infer_string else "string[pyarrow_numpy]" + ) data = StringIO(df_mixed.to_json(orient="split")) df_roundtrip = read_json(data, orient="split") tm.assert_frame_equal( @@ -671,16 +692,19 @@ unserialized = read_json( StringIO(s.to_json(orient="records")), orient="records", typ="series" ) - tm.assert_numpy_array_equal(s.values, unserialized.values) + tm.assert_equal(s.values, unserialized.values) def test_series_default_orient(self, string_series): assert string_series.to_json() == string_series.to_json(orient="index") - def test_series_roundtrip_simple(self, orient, string_series): + def test_series_roundtrip_simple(self, orient, string_series, using_infer_string): data = StringIO(string_series.to_json(orient=orient)) result = read_json(data, typ="series", orient=orient) expected = string_series + if using_infer_string and orient in ("split", "index", "columns"): + # These schemas don't contain dtypes, so we infer string + expected.index = expected.index.astype("string[pyarrow_numpy]") if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -865,14 +889,13 @@ ) def 
test_convert_dates_infer(self, infer_word): # GH10747 - from pandas.io.json import dumps data = [{"id": 1, infer_word: 1036713600000}, {"id": 2}] expected = DataFrame( [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word] ) - result = read_json(StringIO(dumps(data)))[["id", infer_word]] + result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -1021,7 +1044,7 @@ dfj2["date"] = Timestamp("20130101") dfj2["ints"] = range(5) dfj2["bools"] = True - dfj2.index = pd.date_range("20130101", periods=5) + dfj2.index = date_range("20130101", periods=5) json = StringIO(dfj2.to_json()) result = read_json(json, dtype={"ints": np.int64, "bools": np.bool_}) @@ -1065,7 +1088,7 @@ result = read_json(StringIO(ser.to_json()), typ="series").apply(converter) tm.assert_series_equal(result, ser) - ser = Series([timedelta(23), timedelta(seconds=5)], index=pd.Index([0, 1])) + ser = Series([timedelta(23), timedelta(seconds=5)], index=Index([0, 1])) assert ser.dtype == "timedelta64[ns]" result = read_json(StringIO(ser.to_json()), typ="series").apply(converter) tm.assert_series_equal(result, ser) @@ -1081,7 +1104,7 @@ { "a": [timedelta(days=23), timedelta(seconds=5)], "b": [1, 2], - "c": pd.date_range(start="20130101", periods=2), + "c": date_range(start="20130101", periods=2), } ) data = StringIO(frame.to_json(date_unit="ns")) @@ -1125,6 +1148,18 @@ result = ser.to_json(date_format=date_format) assert result == expected + @pytest.mark.parametrize("as_object", [True, False]) + @pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta]) + def test_timedelta_to_json_fractional_precision(self, as_object, timedelta_typ): + data = [timedelta_typ(milliseconds=42)] + ser = Series(data, index=data) + if as_object: + ser = ser.astype(object) + + result = ser.to_json() + expected = '{"42":42}' + assert result == expected + def test_default_handler(self): value = object() frame = DataFrame({"a": [7, value]}) @@ -1133,8 +1168,6 @@ tm.assert_frame_equal(expected, result, check_index_type=False) def test_default_handler_indirect(self): - from pandas.io.json import dumps - def default(obj): if isinstance(obj, complex): return [("mathjs", "Complex"), ("re", obj.real), ("im", obj.imag)] @@ -1151,7 +1184,9 @@ '[9,[[1,null],["STR",null],[[["mathjs","Complex"],' '["re",4.0],["im",-5.0]],"N\\/A"]]]' ) - assert dumps(df_list, default_handler=default, orient="values") == expected + assert ( + ujson_dumps(df_list, default_handler=default, orient="values") == expected + ) def test_default_handler_numpy_unsupported_dtype(self): # GH12554 to_json raises 'Unhandled numpy dtype 15' @@ -1196,10 +1231,10 @@ def test_datetime_tz(self): # GH4377 df.to_json segfaults with non-ndarray blocks - tz_range = pd.date_range("20130101", periods=3, tz="US/Eastern") + tz_range = date_range("20130101", periods=3, tz="US/Eastern") tz_naive = tz_range.tz_convert("utc").tz_localize(None) - df = DataFrame({"A": tz_range, "B": pd.date_range("20130101", periods=3)}) + df = DataFrame({"A": tz_range, "B": date_range("20130101", periods=3)}) df_naive = df.copy() df_naive["A"] = tz_naive @@ -1235,35 +1270,29 @@ ], ) def test_tz_is_utc(self, ts): - from pandas.io.json import dumps - exp = '"2013-01-10T05:00:00.000Z"' - assert dumps(ts, iso_dates=True) == exp + assert ujson_dumps(ts, iso_dates=True) == exp dt = ts.to_pydatetime() - assert dumps(dt, iso_dates=True) == exp + assert ujson_dumps(dt, iso_dates=True) == exp def test_tz_is_naive(self): - from pandas.io.json 
import dumps - ts = Timestamp("2013-01-10 05:00:00") exp = '"2013-01-10T05:00:00.000"' - assert dumps(ts, iso_dates=True) == exp + assert ujson_dumps(ts, iso_dates=True) == exp dt = ts.to_pydatetime() - assert dumps(dt, iso_dates=True) == exp + assert ujson_dumps(dt, iso_dates=True) == exp @pytest.mark.parametrize( "tz_range", [ - pd.date_range("2013-01-01 05:00:00Z", periods=2), - pd.date_range("2013-01-01 00:00:00", periods=2, tz="US/Eastern"), - pd.date_range("2013-01-01 00:00:00-0500", periods=2), + date_range("2013-01-01 05:00:00Z", periods=2), + date_range("2013-01-01 00:00:00", periods=2, tz="US/Eastern"), + date_range("2013-01-01 00:00:00-0500", periods=2), ], ) def test_tz_range_is_utc(self, tz_range): - from pandas.io.json import dumps - exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]' dfexp = ( '{"DT":{' @@ -1271,33 +1300,31 @@ '"1":"2013-01-02T05:00:00.000Z"}}' ) - assert dumps(tz_range, iso_dates=True) == exp + assert ujson_dumps(tz_range, iso_dates=True) == exp dti = DatetimeIndex(tz_range) # Ensure datetimes in object array are serialized correctly # in addition to the normal DTI case - assert dumps(dti, iso_dates=True) == exp - assert dumps(dti.astype(object), iso_dates=True) == exp + assert ujson_dumps(dti, iso_dates=True) == exp + assert ujson_dumps(dti.astype(object), iso_dates=True) == exp df = DataFrame({"DT": dti}) - result = dumps(df, iso_dates=True) + result = ujson_dumps(df, iso_dates=True) assert result == dfexp - assert dumps(df.astype({"DT": object}), iso_dates=True) + assert ujson_dumps(df.astype({"DT": object}), iso_dates=True) def test_tz_range_is_naive(self): - from pandas.io.json import dumps - - dti = pd.date_range("2013-01-01 05:00:00", periods=2) + dti = date_range("2013-01-01 05:00:00", periods=2) exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]' dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}' # Ensure datetimes in object array are serialized correctly # in addition to the normal DTI case - assert dumps(dti, iso_dates=True) == exp - assert dumps(dti.astype(object), iso_dates=True) == exp + assert ujson_dumps(dti, iso_dates=True) == exp + assert ujson_dumps(dti.astype(object), iso_dates=True) == exp df = DataFrame({"DT": dti}) - result = dumps(df, iso_dates=True) + result = ujson_dumps(df, iso_dates=True) assert result == dfexp - assert dumps(df.astype({"DT": object}), iso_dates=True) + assert ujson_dumps(df.astype({"DT": object}), iso_dates=True) def test_read_inline_jsonl(self): # GH9180 @@ -1463,6 +1490,9 @@ result = read_json(StringIO(dfjson), orient="table") tm.assert_frame_equal(result, expected) + # TODO: We are casting to string which coerces None to NaN before casting back + # to object, ending up with incorrect na values + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion") @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) def test_to_json_from_json_columns_dtypes(self, orient): # GH21892 GH33205 @@ -1629,7 +1659,8 @@ result = read_json( StringIO('{"2019-01-01T11:00:00.000Z":88}'), typ="series", orient="index" ) - expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) + exp_dti = DatetimeIndex(["2019-01-01 11:00:00"], dtype="M8[ns, UTC]") + expected = Series([88], index=exp_dti) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -1719,6 +1750,11 @@ assert result == expected + @pytest.mark.skipif( + using_pyarrow_string_dtype(), + reason="Adjust expected when infer_string is default, no 
bug here, " + "just a complicated parametrization", + ) @pytest.mark.parametrize( "orient,expected", [ @@ -1897,7 +1933,7 @@ # GH 31615 if isinstance(nulls_fixture, Decimal): mark = pytest.mark.xfail(reason="not implemented") - request.node.add_marker(mark) + request.applymarker(mark) result = DataFrame([[nulls_fixture]]).to_json() assert result == '{"0":{"0":null}}' @@ -1912,7 +1948,7 @@ # GH 15273 df = DataFrame( True, - index=pd.date_range("2017-01-20", "2017-01-23"), + index=date_range("2017-01-20", "2017-01-23"), columns=["foo", "bar"], ).stack(future_stack=True) result = df.to_json() @@ -1994,7 +2030,9 @@ @pytest.mark.parametrize( "orient", ["split", "records", "values", "index", "columns"] ) - def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient): + def test_read_json_dtype_backend( + self, string_storage, dtype_backend, orient, using_infer_string + ): # GH#50750 pa = pytest.importorskip("pyarrow") df = DataFrame( @@ -2010,10 +2048,20 @@ } ) - if string_storage == "python": + if using_infer_string: + string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"])) + string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None])) + elif string_storage == "python": string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) + else: string_array = ArrowStringArray(pa.array(["a", "b", "c"])) string_array_na = ArrowStringArray(pa.array(["a", "b", None])) @@ -2048,7 +2096,7 @@ ) if orient == "values": - expected.columns = list(range(0, 8)) + expected.columns = list(range(8)) tm.assert_frame_equal(result, expected) @@ -2109,7 +2157,46 @@ expected = DataFrame( [["a", "b"], ["c", "d"]], dtype="string[pyarrow_numpy]", - index=pd.Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), - columns=pd.Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"), + index=Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), + columns=Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"), ) tm.assert_frame_equal(result, expected) + + +def test_json_pos_args_deprecation(): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_json except for the " + r"argument 'path_or_buf' will be keyword-only." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + buf = BytesIO() + df.to_json(buf, "split") + + +@td.skip_if_no("pyarrow") +def test_to_json_ea_null(): + # GH#57224 + df = DataFrame( + { + "a": Series([1, NA], dtype="int64[pyarrow]"), + "b": Series([2, NA], dtype="Int64"), + } + ) + result = df.to_json(orient="records", lines=True) + expected = """{"a":1,"b":2} +{"a":null,"b":null} +""" + assert result == expected + + +def test_read_json_lines_rangeindex(): + # GH 57429 + data = """ +{"a": 1, "b": 2} +{"a": 3, "b": 4} +""" + result = read_json(StringIO(data), lines=True).index + expected = RangeIndex(2) + tm.assert_index_equal(result, expected, exact=True) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/json/test_readlines.py pandas-2.2.2+dfsg/pandas/tests/io/json/test_readlines.py --- pandas-2.1.4+dfsg/pandas/tests/io/json/test_readlines.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/json/test_readlines.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,6 +14,10 @@ from pandas.io.json._json import JsonReader +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + @pytest.fixture def lines_json_df(): @@ -21,6 +25,13 @@ return df.to_json(lines=True, orient="records") +@pytest.fixture(params=["ujson", "pyarrow"]) +def engine(request): + if request.param == "pyarrow": + pytest.importorskip("pyarrow.json") + return request.param + + def test_read_jsonl(): # GH9180 result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True) @@ -43,7 +54,7 @@ if engine == "pyarrow": # GH 48893 reason = "Pyarrow only supports a file path as an input and line delimited json" - request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) + request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) df = DataFrame( [([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")], @@ -121,7 +132,7 @@ "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." ) - request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) + request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) unchunked = read_json(StringIO(lines_json_df), lines=True) with read_json( @@ -148,7 +159,7 @@ "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." ) - request.node.add_marker(pytest.mark.xfail(reason=reason)) + request.applymarker(pytest.mark.xfail(reason=reason)) # Test reading line-format JSON to Series with chunksize param s = pd.Series({"A": 1, "B": 2}) @@ -172,7 +183,7 @@ "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." ) - request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) + request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) # Other tests check that the final result of read_json(chunksize=True) # is correct. This checks the intermediate chunks. @@ -191,7 +202,7 @@ "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." 
) - request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) + request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) with tm.ensure_clean("test.json") as path: df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) @@ -274,7 +285,7 @@ "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." ) - request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) + request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) with tm.ensure_clean("test.json") as path: monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949") @@ -309,7 +320,7 @@ "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." ) - request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) + request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) jsonl = """{"a": 1, "b": 2} {"a": 3, "b": 4} @@ -351,7 +362,7 @@ "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." ) - request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) + request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError)) df_list_expected = [ DataFrame([[1, 2]], columns=["a", "b"], index=[0]), @@ -399,7 +410,7 @@ # Test ValueError when orient is not 'records' df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) msg = ( - r"mode='a' \(append\) is only supported when" + r"mode='a' \(append\) is only supported when " "lines is True and orient is 'records'" ) with pytest.raises(ValueError, match=msg): @@ -411,7 +422,7 @@ # Test ValueError when lines is not True df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]}) msg = ( - r"mode='a' \(append\) is only supported when" + r"mode='a' \(append\) is only supported when " "lines is True and orient is 'records'" ) with pytest.raises(ValueError, match=msg): diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/json/test_ujson.py pandas-2.2.2+dfsg/pandas/tests/io/json/test_ujson.py --- pandas-2.1.4+dfsg/pandas/tests/io/json/test_ujson.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/json/test_ujson.py 2024-04-10 17:42:52.000000000 +0000 @@ -585,7 +585,7 @@ assert ujson.ujson_loads(int_exp) == json.loads(int_exp) def test_loads_non_str_bytes_raises(self): - msg = "Expected 'str' or 'bytes'" + msg = "a bytes-like object is required, not 'NoneType'" with pytest.raises(TypeError, match=msg): ujson.ujson_loads(None) @@ -814,10 +814,19 @@ def test_0d_array(self): # gh-18878 - msg = re.escape("array(1) (0d array) is not JSON serializable at the moment") + msg = re.escape( + "array(1) (numpy-scalar) is not JSON serializable at the moment" + ) with pytest.raises(TypeError, match=msg): ujson.ujson_dumps(np.array(1)) + def test_array_long_double(self): + msg = re.compile( + "1234.5.* \\(numpy-scalar\\) is not JSON serializable at the moment" + ) + with pytest.raises(TypeError, match=msg): + ujson.ujson_dumps(np.longdouble(1234.5)) + class TestPandasJSONTests: def test_dataframe(self, orient): @@ -1033,7 +1042,7 @@ def test_encode_big_set(self): s = set() - for x in range(0, 100000): + for x in range(100000): s.add(x) # Make sure no Exception is raised. 
diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_chunksize.py pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_chunksize.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_chunksize.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_chunksize.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,7 +16,9 @@ ) import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) @pytest.mark.parametrize("index_col", [0, "index"]) @@ -44,6 +46,13 @@ ) expected = expected.set_index("index") + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader: + list(reader) + return + with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader: chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[:2]) @@ -63,6 +72,8 @@ """ parser = all_parsers msg = r"'chunksize' must be an integer >=1" + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): with parser.read_csv(StringIO(data), chunksize=chunksize) as _: @@ -83,6 +94,12 @@ parser = all_parsers kwargs = {"index_col": 0, "nrows": 5} + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + return + expected = parser.read_csv(StringIO(data), **kwargs) with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader: tm.assert_frame_equal(concat(reader), expected) @@ -100,6 +117,12 @@ parser = all_parsers kwargs = {"index_col": 0, "nrows": 5} + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + return + expected = parser.read_csv(StringIO(data), **kwargs) with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader: tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) @@ -117,6 +140,13 @@ 7,8,9 1,2,3""" + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), chunksize=2) as reader: + reader.get_chunk() + return + with parser.read_csv(StringIO(data), chunksize=2) as reader: result = reader.get_chunk() @@ -137,8 +167,17 @@ """ parser = all_parsers result = parser.read_csv(StringIO(data), **kwargs) + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader: + concat(reader) + return + with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader: - tm.assert_frame_equal(concat(reader), result) + via_reader = concat(reader) + tm.assert_frame_equal(via_reader, result) def test_read_chunksize_jagged_names(all_parsers): @@ -147,6 +186,16 @@ data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with 
pytest.raises(ValueError, match=msg): + with parser.read_csv( + StringIO(data), names=range(10), chunksize=4 + ) as reader: + concat(reader) + return + with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader: result = concat(reader) tm.assert_frame_equal(result, expected) @@ -171,10 +220,9 @@ data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) # Coercions should work without warnings. - with tm.assert_produces_warning(None): - with monkeypatch.context() as m: - m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) - result = parser.read_csv(StringIO(data)) + with monkeypatch.context() as m: + m.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic) + result = parser.read_csv(StringIO(data)) assert type(result.a[0]) is np.float64 assert result.a.dtype == float @@ -197,12 +245,17 @@ buf = StringIO(data) - df = parser.read_csv_check_warnings( - warning_type, - r"Columns \(0\) have mixed types. " - "Specify dtype option on import or set low_memory=False.", - buf, - ) + if parser.engine == "pyarrow": + df = parser.read_csv( + buf, + ) + else: + df = parser.read_csv_check_warnings( + warning_type, + r"Columns \(0\) have mixed types. " + "Specify dtype option on import or set low_memory=False.", + buf, + ) assert df.a.dtype == object @@ -216,6 +269,18 @@ nrows = 10 data = StringIO("foo,bar\n") + if parser.engine == "pyarrow": + msg = ( + "The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine" + ) + with pytest.raises(ValueError, match=msg): + if iterator: + with parser.read_csv(data, chunksize=nrows) as reader: + next(iter(reader)) + else: + parser.read_csv(data, nrows=nrows) + return + if iterator: with parser.read_csv(data, chunksize=nrows) as reader: result = next(iter(reader)) @@ -237,6 +302,14 @@ for i in range(1000): f.write(str(i) + "\n") + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with parser.read_csv(path, chunksize=20) as result: + for _ in result: + pass + return + with parser.read_csv(path, chunksize=20) as result: for _ in result: pass @@ -250,6 +323,18 @@ 9,10,11 """ + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + names=["a", "b"], + chunksize=2, + usecols=[0, 1], + header=None, + ) + return + result_chunks = parser.read_csv( StringIO(data), names=["a", "b"], @@ -276,6 +361,12 @@ 9,10,11 """ + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), chunksize=2) + return + result_chunks = parser.read_csv(StringIO(data), chunksize=2) expected_frames = [ diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_common_basic.py pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_common_basic.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_common_basic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_common_basic.py 2024-04-10 17:42:52.000000000 +0000 @@ -29,6 +29,10 @@ from pandas.io.parsers import TextFileReader from pandas.io.parsers.c_parser_wrapper import CParserWrapper +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = 
pytest.mark.usefixtures("pyarrow_skip") @@ -116,7 +120,6 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_1000_sep(all_parsers): parser = all_parsers data = """A|B|C @@ -125,11 +128,17 @@ """ expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) + if parser.engine == "pyarrow": + msg = "The 'thousands' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep="|", thousands=",") + return + result = parser.read_csv(StringIO(data), sep="|", thousands=",") tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def test_unnamed_columns(all_parsers): data = """A,B,C,, 1,2,3,4,5 @@ -158,7 +167,6 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_read_csv_low_memory_no_rows_with_index(all_parsers): # see gh-21141 parser = all_parsers @@ -171,6 +179,13 @@ 2,2,3,4 3,3,4,5 """ + + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) + return + result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) expected = DataFrame(columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) @@ -209,7 +224,6 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize("nrows", [3, 3.0]) def test_read_nrows(all_parsers, nrows): # see gh-10476 @@ -227,11 +241,16 @@ ) parser = all_parsers + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), nrows=nrows) + return + result = parser.read_csv(StringIO(data), nrows=nrows) tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize("nrows", [1.2, "foo", -1]) def test_read_nrows_bad(all_parsers, nrows): data = """index,A,B,C,D @@ -244,6 +263,8 @@ """ msg = r"'nrows' must be an integer >=0" parser = all_parsers + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), nrows=nrows) @@ -258,7 +279,7 @@ parser.read_csv(StringIO(data), skipfooter=1, nrows=5) -@xfail_pyarrow +@skip_pyarrow def test_missing_trailing_delimiters(all_parsers): parser = all_parsers data = """A,B,C,D @@ -274,7 +295,6 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_skip_initial_space(all_parsers): data = ( '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' @@ -286,6 +306,18 @@ ) parser = all_parsers + if parser.engine == "pyarrow": + msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + names=list(range(33)), + header=None, + na_values=["-9999.0"], + skipinitialspace=True, + ) + return + result = parser.read_csv( StringIO(data), names=list(range(33)), @@ -335,7 +367,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -355,7 +387,7 @@ data = '''SEARCH_TERM,ACTUAL_URL "bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" "tv p\xc3\xa5 
hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa: E501 +"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' parser = all_parsers result = parser.read_csv( @@ -367,18 +399,23 @@ tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) -@xfail_pyarrow def test_ignore_leading_whitespace(all_parsers): # see gh-3374, gh-6607 parser = all_parsers data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9" + + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=r"\s+") + return result = parser.read_csv(StringIO(data), sep=r"\s+") expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]}) tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) def test_uneven_lines_with_usecols(all_parsers, usecols): # see gh-12203 @@ -401,7 +438,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -434,7 +471,6 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -464,7 +500,21 @@ data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501 parser = all_parsers - result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + + if parser.engine == "pyarrow": + msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + return + + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) tm.assert_frame_equal(result, expected) @@ -473,8 +523,12 @@ data = "a b c\n1 2 3" parser = all_parsers + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" with pytest.raises(ValueError, match="you can only specify one"): - parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) def test_read_filepath_or_buffer(all_parsers): @@ -485,7 +539,6 @@ parser.read_csv(filepath_or_buffer=b"input") -@xfail_pyarrow @pytest.mark.parametrize("delim_whitespace", [True, False]) def test_single_char_leading_whitespace(all_parsers, delim_whitespace): # see gh-9710 @@ -498,14 +551,30 @@ b\n""" expected = DataFrame({"MyColumn": list("abab")}) - result = parser.read_csv( - StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace - ) + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + + if parser.engine == "pyarrow": + msg = "The 'skipinitialspace' option is not supported 
with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv( + StringIO(data), + skipinitialspace=True, + delim_whitespace=delim_whitespace, + ) + return + + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace + ) tm.assert_frame_equal(result, expected) -# Skip for now, actually only one test fails though, but its tricky to xfail -@skip_pyarrow @pytest.mark.parametrize( "sep,skip_blank_lines,exp_data", [ @@ -525,7 +594,7 @@ ), ], ) -def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): +def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data, request): parser = all_parsers data = """\ A,B,C @@ -540,12 +609,20 @@ if sep == r"\s+": data = data.replace(",", " ") + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines + ) + return + result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines) expected = DataFrame(exp_data, columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_whitespace_lines(all_parsers): parser = all_parsers data = """ @@ -561,7 +638,6 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize( "data,expected", [ @@ -586,6 +662,12 @@ def test_whitespace_regex_separator(all_parsers, data, expected): # see gh-6607 parser = all_parsers + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=r"\s+") + return + result = parser.read_csv(StringIO(data), sep=r"\s+") tm.assert_frame_equal(result, expected) @@ -659,7 +741,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_first_row_bom(all_parsers): # see gh-26545 parser = all_parsers @@ -670,7 +752,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_first_row_bom_unquoted(all_parsers): # see gh-36343 parser = all_parsers @@ -681,7 +763,6 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize("nrows", range(1, 6)) def test_blank_lines_between_header_and_data_rows(all_parsers, nrows): # GH 28071 @@ -691,11 +772,20 @@ ) csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4" parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The 'nrows' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False + ) + return + df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False) tm.assert_frame_equal(df, ref[:nrows]) -@xfail_pyarrow +@skip_pyarrow def test_no_header_two_extra_columns(all_parsers): # GH 26218 column_names = ["one", "two", "three"] @@ -724,12 +814,25 @@ parser.read_csv(StringIO(data), names=set("QAZ")) -@xfail_pyarrow def test_read_table_delim_whitespace_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") parser = all_parsers - result = parser.read_table(f, delim_whitespace=True) + + depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" + + if parser.engine == "pyarrow": + msg = "The 
'delim_whitespace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_table(f, delim_whitespace=True) + return + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_table(f, delim_whitespace=True) expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) tm.assert_frame_equal(result, expected) @@ -743,11 +846,15 @@ "Specified a delimiter with both sep and " "delim_whitespace=True; you can only specify one." ) - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, sep=delimiter) + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, sep=delimiter) - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) def test_read_csv_delimiter_and_sep_no_default(all_parsers): @@ -784,14 +891,18 @@ "Specified a delimiter with both sep and " "delim_whitespace=True; you can only specify one." ) - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, sep=delimiter) + depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, sep=delimiter) - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, delimiter=delimiter) + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, delimiter=delimiter) -@xfail_pyarrow +@skip_pyarrow def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2" @@ -804,7 +915,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 parser = all_parsers @@ -832,7 +943,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_short_single_line(all_parsers): # GH 47566 parser = all_parsers @@ -843,7 +954,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # ValueError: Length mismatch: Expected axis has 2 elements def test_short_multi_line(all_parsers): # GH 47566 parser = all_parsers diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_data_list.py pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_data_list.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_data_list.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_data_list.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,6 +12,10 @@ from pandas.io.parsers import TextParser +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_decimal.py pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_decimal.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_decimal.py 2023-12-08 
14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_decimal.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,10 +9,11 @@ from pandas import DataFrame import pandas._testing as tm -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) -@xfail_pyarrow @pytest.mark.parametrize( "data,thousands,decimal", [ @@ -38,6 +39,14 @@ parser = all_parsers expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) + if parser.engine == "pyarrow": + msg = "The 'thousands' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), sep="|", thousands=thousands, decimal=decimal + ) + return + result = parser.read_csv( StringIO(data), sep="|", thousands=thousands, decimal=decimal ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_file_buffer_url.py pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_file_buffer_url.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_file_buffer_url.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_file_buffer_url.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,6 +11,7 @@ from urllib.error import URLError import uuid +import numpy as np import pytest from pandas.errors import ( @@ -19,12 +20,18 @@ ) import pandas.util._test_decorators as td -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm -# TODO(1.4) Please xfail individual tests at release time -# instead of skip -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @pytest.mark.network @@ -60,16 +67,26 @@ pytest.skip("Failing on: " + " ".join(platform.uname())) +@xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_path_lib(all_parsers): parser = all_parsers - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) +@xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_local_path(all_parsers): parser = all_parsers - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) result = tm.round_trip_localpath( df.to_csv, lambda p: parser.read_csv(p, index_col=0) ) @@ -206,10 +223,22 @@ "in-quoted-field", ], ) -def test_eof_states(all_parsers, data, kwargs, expected, msg): +def test_eof_states(all_parsers, data, kwargs, expected, msg, request): # see gh-10728, gh-10548 parser = all_parsers + if parser.engine == "pyarrow" and "comment" in kwargs: + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + return + + if parser.engine == "pyarrow" and "\r" not in data: + # pandas.errors.ParserError: CSV parse error: Expected 3 columns, got 1: + # ValueError: skiprows argument must be an integer when using engine='pyarrow' + # 
AssertionError: Regex pattern did not match. + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + if expected is None: with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), **kwargs) @@ -228,6 +257,12 @@ new_file.flush() new_file.seek(0) + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(new_file, sep=r"\s+", header=None) + return + result = parser.read_csv(new_file, sep=r"\s+", header=None) expected = DataFrame([[0, 0]]) @@ -359,10 +394,18 @@ with tm.ensure_clean() as path: expected.to_csv(path, index=False, compression=compression) - tm.assert_frame_equal( - parser.read_csv(path, memory_map=True, compression=compression), - expected, - ) + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(path, memory_map=True, compression=compression) + return + + result = parser.read_csv(path, memory_map=True, compression=compression) + + tm.assert_frame_equal( + result, + expected, + ) def test_context_manager(all_parsers, datapath): @@ -371,6 +414,12 @@ path = datapath("io", "data", "csv", "iris.csv") + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(path, chunksize=1) + return + reader = parser.read_csv(path, chunksize=1) assert not reader.handles.handle.closed try: @@ -386,6 +435,12 @@ parser = all_parsers with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path: + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(path, chunksize=1) + return + reader = parser.read_csv(path, chunksize=1) assert not reader.handles.handle.closed try: @@ -396,6 +451,7 @@ assert not reader.handles.handle.closed +@skip_pyarrow # ParserError: Empty CSV file def test_file_descriptor_leak(all_parsers, using_copy_on_write): # GH 31488 parser = all_parsers @@ -412,5 +468,11 @@ {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]} ) + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(mmap_file, memory_map=True) + return + result = parser.read_csv(mmap_file, memory_map=True) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_float.py pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_float.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_float.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_float.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,9 +12,14 @@ from pandas import DataFrame import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +@skip_pyarrow # ParserError: CSV parse error: Empty CSV file or block def test_float_parser(all_parsers): # see gh-9565 parser = all_parsers @@ -35,7 +40,14 @@ tm.assert_frame_equal(df_roundtrip, df) -@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) 
+@pytest.mark.parametrize( + "neg_exp", + [ + -617, + -100000, + pytest.param(-99999999999999999, marks=pytest.mark.skip_ubsan), + ], +) def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): # GH#38753 parser, precision = all_parsers_all_precisions @@ -46,6 +58,8 @@ tm.assert_frame_equal(result, expected) +@pytest.mark.skip_ubsan +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): # GH#38753 @@ -55,7 +69,7 @@ if precision == "round_trip": if exp == 999999999999999999 and is_platform_linux(): mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") - request.node.add_marker(mark) + request.applymarker(mark) value = np.inf if exp > 0 else 0.0 expected = DataFrame({"data": [value]}) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_index.py pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_index.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_index.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_index.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,10 +15,11 @@ ) import pandas._testing as tm -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) -# GH#43650: Some expected failures with the pyarrow engine can occasionally -# cause a deadlock instead, so we skip these instead of xfailing +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @@ -108,7 +109,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_multi_index_no_level_names_implicit(all_parsers): parser = all_parsers data = """A,B,C,D @@ -142,7 +143,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "data,expected,header", [ @@ -164,7 +165,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: DataFrame.columns are different def test_no_unnamed_index(all_parsers): parser = all_parsers data = """ id c0 c1 c2 @@ -207,7 +208,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_duplicate_index_implicit(all_parsers): data = """A,B,C,D foo,2,3,4,5 @@ -235,7 +236,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_read_csv_no_index_name(all_parsers, csv_dir_path): parser = all_parsers csv2 = os.path.join(csv_dir_path, "test2.csv") @@ -263,7 +264,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_empty_with_index(all_parsers): # see gh-10184 data = "x,y" @@ -274,6 +275,7 @@ tm.assert_frame_equal(result, expected) +# CSV parse error: Empty CSV file or block: cannot infer number of columns @skip_pyarrow def test_empty_with_multi_index(all_parsers): # see gh-10467 @@ -287,6 +289,7 @@ tm.assert_frame_equal(result, expected) +# CSV parse error: Empty CSV file or block: cannot infer number of columns @skip_pyarrow def test_empty_with_reversed_multi_index(all_parsers): data = "x,y,z" diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_inf.py pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_inf.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_inf.py 2023-12-08 14:17:35.000000000 +0000 +++ 
pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_inf.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,10 +13,14 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@xfail_pyarrow +@xfail_pyarrow # AssertionError: DataFrame.index are different @pytest.mark.parametrize("na_filter", [True, False]) def test_inf_parsing(all_parsers, na_filter): parser = all_parsers @@ -40,7 +44,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: DataFrame.index are different @pytest.mark.parametrize("na_filter", [True, False]) def test_infinity_parsing(all_parsers, na_filter): parser = all_parsers @@ -63,7 +67,11 @@ parser = all_parsers data = "1.0\nNaN\n3.0" msg = "use_inf_as_na option is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + warn = FutureWarning + if parser.engine == "pyarrow": + warn = (FutureWarning, DeprecationWarning) + + with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): with option_context("use_inf_as_na", True): result = parser.read_csv(StringIO(data), header=None) expected = DataFrame([1.0, np.nan, 3.0]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_ints.py pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_ints.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_ints.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_ints.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,8 +13,11 @@ ) import pandas._testing as tm -# GH#43650: Some expected failures with the pyarrow engine can occasionally -# cause a deadlock instead, so we skip these instead of xfailing +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @@ -98,12 +101,16 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow # Flaky @pytest.mark.parametrize("sep", [" ", r"\s+"]) def test_integer_overflow_bug(all_parsers, sep): # see gh-2601 data = "65248E10 11\n55555E55 22\n" parser = all_parsers + if parser.engine == "pyarrow" and sep != " ": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=None, sep=sep) + return result = parser.read_csv(StringIO(data), header=None, sep=sep) expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]]) @@ -120,9 +127,8 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) -def test_int64_overflow(all_parsers, conv): +def test_int64_overflow(all_parsers, conv, request): data = """ID 00013007854817840016671868 00013007854817840016749251 @@ -136,6 +142,10 @@ if conv is None: # 13007854817840016671868 > UINT64_MAX, so this # will overflow and return object as the dtype. + if parser.engine == "pyarrow": + mark = pytest.mark.xfail(reason="parses to float64") + request.applymarker(mark) + result = parser.read_csv(StringIO(data)) expected = DataFrame( [ @@ -154,17 +164,23 @@ # 13007854817840016671868 > UINT64_MAX, so attempts # to cast to either int64 or uint64 will result in # an OverflowError being raised. 
- msg = ( - "(Python int too large to convert to C long)|" - "(long too big to convert)|" - "(int too big to convert)" + msg = "|".join( + [ + "Python int too large to convert to C long", + "long too big to convert", + "int too big to convert", + ] ) + err = OverflowError + if parser.engine == "pyarrow": + err = ValueError + msg = "The 'converters' option is not supported with the 'pyarrow' engine" - with pytest.raises(OverflowError, match=msg): + with pytest.raises(err, match=msg): parser.read_csv(StringIO(data), converters={"ID": conv}) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] ) @@ -178,7 +194,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) @@ -192,7 +208,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # gets float64 dtype instead of object @pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_iterator.py pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_iterator.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_iterator.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_iterator.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,7 +12,9 @@ ) import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) def test_iterator(all_parsers): @@ -29,6 +31,13 @@ kwargs = {"index_col": 0} expected = parser.read_csv(StringIO(data), **kwargs) + + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), iterator=True, **kwargs) + return + with parser.read_csv(StringIO(data), iterator=True, **kwargs) as reader: first_chunk = reader.read(3) tm.assert_frame_equal(first_chunk, expected[:3]) @@ -45,6 +54,12 @@ baz,7,8,9 """ + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), iterator=True) + return + with parser.read_csv(StringIO(data), iterator=True) as reader: result = list(reader) @@ -64,6 +79,11 @@ bar,4,5,6 baz,7,8,9 """ + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), chunksize=1) + return with parser.read_csv(StringIO(data), chunksize=1) as reader: result = list(reader) @@ -85,6 +105,12 @@ parser = all_parsers data = "a\n1\n2" + if parser.engine == "pyarrow": + msg = ( + "The '(chunksize|iterator)' option is not supported with the " + "'pyarrow' engine" + ) + with pytest.raises(ValueError, match=msg): with parser.read_csv(StringIO(data), skipfooter=1, **kwargs) as _: pass diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_read_errors.py pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_read_errors.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_read_errors.py 2023-12-08 
14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_read_errors.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,5 +1,5 @@ """ -Tests that work on both the Python and C engines but do not have a +Tests that work on the Python, C and PyArrow engines but do not have a specific classification into the other test modules. """ import codecs @@ -15,12 +15,14 @@ from pandas.errors import ( EmptyDataError, ParserError, + ParserWarning, ) from pandas import DataFrame import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") def test_empty_decimal_marker(all_parsers): @@ -32,6 +34,12 @@ msg = "Only length-1 decimal markers supported" parser = all_parsers + if parser.engine == "pyarrow": + msg = ( + "only single character unicode strings can be " + "converted to Py_UCS4, got length 0" + ) + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), decimal="") @@ -66,7 +74,11 @@ 2,3,4 """ msg = "Expected 3 fields in line 4, saw 5" - with pytest.raises(ParserError, match=msg): + err = ParserError + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + err = ValueError + with pytest.raises(err, match=msg): parser.read_csv(StringIO(data), header=1, comment="#") @@ -81,6 +93,20 @@ 2,3,4 """ parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The 'iterator' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=1, + comment="#", + iterator=True, + chunksize=1, + skiprows=[2], + ) + return + msg = "Expected 3 fields in line 6, saw 5" with parser.read_csv( StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] @@ -89,6 +115,7 @@ reader.read(nrows) +@xfail_pyarrow # does not raise def test_catch_too_many_names(all_parsers): # see gh-5156 data = """\ @@ -108,6 +135,7 @@ parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) +@skip_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -129,7 +157,7 @@ parser.read_table("foo.tsv", foo=1) -def test_suppress_error_output(all_parsers, capsys): +def test_suppress_error_output(all_parsers): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" @@ -138,9 +166,6 @@ result = parser.read_csv(StringIO(data), on_bad_lines="skip") tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert captured.err == "" - def test_error_bad_lines(all_parsers): # see gh-15925 @@ -148,23 +173,33 @@ data = "a\n1\n1,2,3\n4\n5,6,7" msg = "Expected 1 fields in line 3, saw 3" + + if parser.engine == "pyarrow": + # "CSV parse error: Expected 1 columns, got 3: 1,2,3" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), on_bad_lines="error") -def test_warn_bad_lines(all_parsers, capsys): +def test_warn_bad_lines(all_parsers): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) + match_msg = "Skipping line" - result = parser.read_csv(StringIO(data), on_bad_lines="warn") + expected_warning = ParserWarning + if parser.engine == "pyarrow": + match_msg = "Expected 1 columns, but found 3: 1,2,3" + expected_warning = (ParserWarning, 
DeprecationWarning) + + with tm.assert_produces_warning( + expected_warning, match=match_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), on_bad_lines="warn") tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - assert "Skipping line 5" in captured.err - def test_read_csv_wrong_num_columns(all_parsers): # Too few columns. @@ -176,6 +211,10 @@ parser = all_parsers msg = "Expected 6 fields in line 3, saw 7" + if parser.engine == "pyarrow": + # Expected 6 columns, got 7: 6,7,8,9,10,11,12 + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data)) @@ -188,7 +227,7 @@ if parser.engine == "c" or (parser.engine == "python" and PY311): if parser.engine == "python" and PY311: - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="In Python 3.11, this is read as an empty character not null" ) @@ -197,7 +236,12 @@ out = parser.read_csv(StringIO(data), names=names) tm.assert_frame_equal(out, expected) else: - msg = "NULL byte detected" + if parser.engine == "pyarrow": + # CSV parse error: Empty CSV file or block: " + # cannot infer number of columns" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + else: + msg = "NULL byte detected" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), names=names) @@ -206,13 +250,17 @@ def test_open_file(request, all_parsers): # GH 39024 parser = all_parsers + + msg = "Could not determine delimiter" + err = csv.Error if parser.engine == "c": - request.node.add_marker( - pytest.mark.xfail( - reason=f"{parser.engine} engine does not support sep=None " - f"with delim_whitespace=False" - ) + msg = "the 'c' engine does not support sep=None with delim_whitespace=False" + err = ValueError + elif parser.engine == "pyarrow": + msg = ( + "the 'pyarrow' engine does not support sep=None with delim_whitespace=False" ) + err = ValueError with tm.ensure_clean() as path: file = Path(path) @@ -220,7 +268,7 @@ with tm.assert_produces_warning(None): # should not trigger a ResourceWarning - with pytest.raises(csv.Error, match="Could not determine delimiter"): + with pytest.raises(err, match=msg): parser.read_csv(file, sep=None, encoding_errors="replace") @@ -240,12 +288,15 @@ "Could not construct index. Requested to use 1 " "number of columns, but 3 left to parse." 
) + elif parser.engine == "pyarrow": + # "CSV parse error: Expected 1 columns, got 4: col1,col2,col3,col4" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error") -def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys): +def test_on_bad_lines_warn_correct_formatting(all_parsers): # see gh-15925 parser = all_parsers data = """1,2 @@ -255,18 +306,15 @@ a,b """ expected = DataFrame({"1": "a", "2": ["b"] * 2}) + match_msg = "Skipping line" - result = parser.read_csv(StringIO(data), on_bad_lines="warn") + expected_warning = ParserWarning + if parser.engine == "pyarrow": + match_msg = "Expected 2 columns, but found 3: a,b,c" + expected_warning = (ParserWarning, DeprecationWarning) + + with tm.assert_produces_warning( + expected_warning, match=match_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), on_bad_lines="warn") tm.assert_frame_equal(result, expected) - - captured = capsys.readouterr() - if parser.engine == "c": - warn = """Skipping line 3: expected 2 fields, saw 3 -Skipping line 4: expected 2 fields, saw 3 - -""" - else: - warn = """Skipping line 3: Expected 2 fields in line 3, saw 3 -Skipping line 4: Expected 2 fields in line 4, saw 3 -""" - assert captured.err == warn diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_verbose.py pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_verbose.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/common/test_verbose.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/common/test_verbose.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,7 +6,9 @@ import pytest -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +import pandas._testing as tm + +depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated" def test_verbose_read(all_parsers, capsys): @@ -21,8 +23,20 @@ one,1,2,3 two,1,2,3""" + if parser.engine == "pyarrow": + msg = "The 'verbose' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), verbose=True) + return + # Engines are verbose in different ways. - parser.read_csv(StringIO(data), verbose=True) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), verbose=True) captured = capsys.readouterr() if parser.engine == "c": @@ -44,7 +58,19 @@ seven,1,2,3 eight,1,2,3""" - parser.read_csv(StringIO(data), verbose=True, index_col=0) + if parser.engine == "pyarrow": + msg = "The 'verbose' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), verbose=True, index_col=0) + return + + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), verbose=True, index_col=0) captured = capsys.readouterr() # Engines are verbose in different ways. 
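The hunks above (test_verbose.py, test_iterator.py, test_read_errors.py and others) repeatedly replace a blanket pyarrow skip with the same inline pattern: branch on parser.engine == "pyarrow", assert that read_csv raises ValueError for the unsupported keyword, and return early instead of comparing frames. The following is a minimal, self-contained sketch of that pattern; the test name and CSV snippet are illustrative assumptions, only the error message text is taken from the hunks, and it assumes pandas 2.x behaviour.

    # Sketch of the engine-branching pattern used throughout this patch.
    # The test name and sample data below are made up for illustration.
    from io import StringIO

    import pandas as pd
    import pytest


    @pytest.mark.parametrize("engine", ["c", "python", "pyarrow"])
    def test_chunksize_by_engine(engine):
        data = "a,b\n1,2\n3,4"

        if engine == "pyarrow":
            # pyarrow rejects chunksize up front, so assert the error and stop.
            msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
            with pytest.raises(ValueError, match=msg):
                pd.read_csv(StringIO(data), engine=engine, chunksize=1)
            return

        # The c/python engines return an iterator of DataFrame chunks.
        with pd.read_csv(StringIO(data), engine=engine, chunksize=1) as reader:
            chunks = list(reader)
        assert len(chunks) == 2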
diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/conftest.py pandas-2.2.2+dfsg/pandas/tests/io/parser/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -29,13 +29,24 @@ return read_csv(*args, **kwargs) def read_csv_check_warnings( - self, warn_type: type[Warning], warn_msg: str, *args, **kwargs + self, + warn_type: type[Warning], + warn_msg: str, + *args, + raise_on_extra_warnings=True, + check_stacklevel: bool = True, + **kwargs, ): # We need to check the stacklevel here instead of in the tests # since this is where read_csv is called and where the warning # should point to. kwargs = self.update_kwargs(kwargs) - with tm.assert_produces_warning(warn_type, match=warn_msg): + with tm.assert_produces_warning( + warn_type, + match=warn_msg, + raise_on_extra_warnings=raise_on_extra_warnings, + check_stacklevel=check_stacklevel, + ): return read_csv(*args, **kwargs) def read_table(self, *args, **kwargs): @@ -43,13 +54,20 @@ return read_table(*args, **kwargs) def read_table_check_warnings( - self, warn_type: type[Warning], warn_msg: str, *args, **kwargs + self, + warn_type: type[Warning], + warn_msg: str, + *args, + raise_on_extra_warnings=True, + **kwargs, ): # We need to check the stacklevel here instead of in the tests # since this is where read_table is called and where the warning # should point to. kwargs = self.update_kwargs(kwargs) - with tm.assert_produces_warning(warn_type, match=warn_msg): + with tm.assert_produces_warning( + warn_type, match=warn_msg, raise_on_extra_warnings=raise_on_extra_warnings + ): return read_table(*args, **kwargs) @@ -268,6 +286,8 @@ def pyarrow_xfail(request): """ Fixture that xfails a test if the engine is pyarrow. + + Use if failure is due to unsupported keywords or inconsistent results. """ if "all_parsers" in request.fixturenames: parser = request.getfixturevalue("all_parsers") @@ -278,13 +298,15 @@ return if parser.engine == "pyarrow": mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") - request.node.add_marker(mark) + request.applymarker(mark) @pytest.fixture def pyarrow_skip(request): """ Fixture that skips a test if the engine is pyarrow.
+ + Use if failure is due to a parsing failure from pyarrow.csv.read_csv """ if "all_parsers" in request.fixturenames: parser = request.getfixturevalue("all_parsers") @@ -294,4 +316,4 @@ else: return if parser.engine == "pyarrow": - pytest.skip("pyarrow doesn't support this.") + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/dtypes/test_categorical.py pandas-2.2.2+dfsg/pandas/tests/io/parser/dtypes/test_categorical.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/dtypes/test_categorical.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/dtypes/test_categorical.py 2024-04-10 17:42:52.000000000 +0000 @@ -20,11 +20,14 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different @pytest.mark.parametrize( "dtype", [ @@ -51,9 +54,8 @@ tm.assert_frame_equal(actual, expected) -@skip_pyarrow # Flaky @pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) -def test_categorical_dtype_single(all_parsers, dtype): +def test_categorical_dtype_single(all_parsers, dtype, request): # see gh-10153 parser = all_parsers data = """a,b,c @@ -63,11 +65,18 @@ expected = DataFrame( {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} ) + if parser.engine == "pyarrow": + mark = pytest.mark.xfail( + strict=False, + reason="Flaky test sometimes gives object dtype instead of Categorical", + ) + request.applymarker(mark) + actual = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different def test_categorical_dtype_unsorted(all_parsers): # see gh-10153 parser = all_parsers @@ -86,7 +95,7 @@ tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different def test_categorical_dtype_missing(all_parsers): # see gh-10153 parser = all_parsers @@ -105,7 +114,7 @@ tm.assert_frame_equal(actual, expected) -@xfail_pyarrow +@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different @pytest.mark.slow def test_categorical_dtype_high_cardinality_numeric(all_parsers, monkeypatch): # see gh-18186 @@ -137,7 +146,6 @@ tm.assert_frame_equal(actual, expected) -@xfail_pyarrow def test_categorical_dtype_chunksize_infer_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -150,6 +158,13 @@ DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), ] + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2) + return + with parser.read_csv( StringIO(data), dtype={"b": "category"}, chunksize=2 ) as actuals: @@ -157,7 +172,6 @@ tm.assert_frame_equal(actual, expected) -@xfail_pyarrow def test_categorical_dtype_chunksize_explicit_categories(all_parsers): # see gh-10153 parser = all_parsers @@ -175,6 +189,13 @@ ), ] dtype = CategoricalDtype(cats) + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with
pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) + return + with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals: for actual, expected in zip(actuals, expecteds): tm.assert_frame_equal(actual, expected) @@ -249,7 +270,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow # Flaky def test_categorical_coerces_datetime(all_parsers): parser = all_parsers dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None) @@ -275,9 +295,9 @@ def test_categorical_coerces_timedelta(all_parsers): parser = all_parsers - dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} + dtype = {"b": CategoricalDtype(pd.to_timedelta(["1h", "2h", "3h"]))} - data = "b\n1H\n2H\n3H" + data = "b\n1h\n2h\n3h" expected = DataFrame({"b": Categorical(dtype["b"].categories)}) result = parser.read_csv(StringIO(data), dtype=dtype) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/dtypes/test_dtypes_basic.py pandas-2.2.2+dfsg/pandas/tests/io/parser/dtypes/test_dtypes_basic.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/dtypes/test_dtypes_basic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/dtypes/test_dtypes_basic.py 2024-04-10 17:42:52.000000000 +0000 @@ -22,6 +22,10 @@ StringArray, ) +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @@ -69,7 +73,6 @@ tm.assert_frame_equal(result, expected) -@pytest.mark.usefixtures("pyarrow_xfail") def test_invalid_dtype_per_column(all_parsers): parser = all_parsers data = """\ @@ -83,7 +86,6 @@ parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) -@pytest.mark.usefixtures("pyarrow_xfail") def test_raise_on_passed_int_dtype_with_nas(all_parsers): # see gh-2631 parser = all_parsers @@ -92,22 +94,31 @@ 2001,,11 2001,106380451,67""" - msg = ( - "Integer column has NA values" - if parser.engine == "c" - else "Unable to convert column DOY" - ) + if parser.engine == "c": + msg = "Integer column has NA values" + elif parser.engine == "pyarrow": + msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine" + else: + msg = "Unable to convert column DOY" + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) -@pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_with_converters(all_parsers): parser = all_parsers data = """a,b 1.1,2.2 1.2,2.3""" + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} + ) + return + # Dtype spec ignored if converted specified. 
result = parser.read_csv_check_warnings( ParserWarning, @@ -234,7 +245,7 @@ # GH#31920 value = numeric_decimal[0] if thousands is None and value in ("1_,", "1_234,56", "1_234,56e0"): - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason=f"thousands={thousands} and sep is in {value}") ) df = parser.read_csv( @@ -520,6 +531,9 @@ tm.assert_frame_equal(result, expected) +# pyarrow engine failing: +# https://github.com/pandas-dev/pandas/issues/56136 +@pytest.mark.usefixtures("pyarrow_xfail") def test_ea_int_avoid_overflow(all_parsers): # GH#32134 parser = all_parsers @@ -560,6 +574,58 @@ tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) +def test_string_inference_object_dtype(all_parsers, dtype): + # GH#56047 + pytest.importorskip("pyarrow") + + data = """a,b +x,a +y,a +z,a""" + parser = all_parsers + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data), dtype=dtype) + + expected = DataFrame( + { + "a": pd.Series(["x", "y", "z"], dtype=object), + "b": pd.Series(["a", "a", "a"], dtype=object), + }, + columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data), dtype={"a": dtype}) + + expected = DataFrame( + { + "a": pd.Series(["x", "y", "z"], dtype=object), + "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"), + }, + columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + + +def test_accurate_parsing_of_large_integers(all_parsers): + # GH#52505 + data = """SYMBOL,MOMENT,ID,ID_DEAL +AAPL,20230301181139587,1925036343869802844, +AAPL,20230301181139587,2023552585717889863,2023552585717263358 +NVDA,20230301181139587,2023552585717889863,2023552585717263359 +AMC,20230301181139587,2023552585717889863,2023552585717263360 +AMZN,20230301181139587,2023552585717889759,2023552585717263360 +MSFT,20230301181139587,2023552585717889863,2023552585717263361 +NVDA,20230301181139587,2023552585717889827,2023552585717263361""" + orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1 + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1 + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2 + assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263361, "ID_DEAL"]) == 2 + + def test_dtypes_with_usecols(all_parsers): # GH#54868 diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/dtypes/test_empty.py pandas-2.2.2+dfsg/pandas/tests/io/parser/dtypes/test_empty.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/dtypes/test_empty.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/dtypes/test_empty.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,10 +17,10 @@ ) import pandas._testing as tm -# TODO(1.4): Change me into individual xfails at release time -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_dtype_all_columns_empty(all_parsers): # see gh-12048 parser = all_parsers @@ -30,6 +30,7 @@ tm.assert_frame_equal(result, expected) +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_pass_dtype(all_parsers): parser = all_parsers @@ -42,6 +43,7 @@ 
tm.assert_frame_equal(result, expected) +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_index_pass_dtype(all_parsers): parser = all_parsers @@ -56,6 +58,7 @@ tm.assert_frame_equal(result, expected) +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_multi_index_pass_dtype(all_parsers): parser = all_parsers @@ -72,6 +75,7 @@ tm.assert_frame_equal(result, expected) +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): parser = all_parsers @@ -84,6 +88,7 @@ tm.assert_frame_equal(result, expected) +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): parser = all_parsers @@ -96,6 +101,7 @@ tm.assert_frame_equal(result, expected) +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): # see gh-9424 parser = all_parsers @@ -165,6 +171,7 @@ ), ], ) +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_dtype(all_parsers, dtype, expected): # see gh-14712 parser = all_parsers diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_c_parser_only.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_c_parser_only.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_c_parser_only.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_c_parser_only.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,9 +17,11 @@ import numpy as np import pytest -from pandas.compat import is_ci_environment from pandas.compat.numpy import np_version_gte1p24 -from pandas.errors import ParserError +from pandas.errors import ( + ParserError, + ParserWarning, +) import pandas.util._test_decorators as td from pandas import ( @@ -49,7 +51,11 @@ data = "a b c~1 2 3~4 5 6~7 8 9" parser = c_parser_only - df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) tm.assert_frame_equal(df, expected) @@ -144,7 +150,9 @@ @td.skip_if_32bit @pytest.mark.slow -def test_precise_conversion(c_parser_only): +# test numbers between 1 and 2 +@pytest.mark.parametrize("num", np.linspace(1.0, 2.0, num=21)) +def test_precise_conversion(c_parser_only, num): parser = c_parser_only normal_errors = [] @@ -153,27 +161,23 @@ def error(val: float, actual_val: Decimal) -> Decimal: return abs(Decimal(f"{val:.100}") - actual_val) - # test numbers between 1 and 2 - for num in np.linspace(1.0, 2.0, num=500): - # 25 decimal digits of precision - text = f"a\n{num:.25}" + # 25 decimal digits of precision + text = f"a\n{num:.25}" - normal_val = float( - parser.read_csv(StringIO(text), float_precision="legacy")["a"][0] - ) - precise_val = float( - parser.read_csv(StringIO(text), float_precision="high")["a"][0] - ) - roundtrip_val = float( - parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0] - ) - actual_val = Decimal(text[2:]) + normal_val = float( + parser.read_csv(StringIO(text), float_precision="legacy")["a"][0] + ) + precise_val = float(parser.read_csv(StringIO(text), float_precision="high")["a"][0]) + roundtrip_val = float( + parser.read_csv(StringIO(text), 
float_precision="round_trip")["a"][0] + ) + actual_val = Decimal(text[2:]) - normal_errors.append(error(normal_val, actual_val)) - precise_errors.append(error(precise_val, actual_val)) + normal_errors.append(error(normal_val, actual_val)) + precise_errors.append(error(precise_val, actual_val)) - # round-trip should match float() - assert roundtrip_val == float(text[2:]) + # round-trip should match float() + assert roundtrip_val == float(text[2:]) assert sum(precise_errors) <= sum(normal_errors) assert max(precise_errors) <= max(normal_errors) @@ -284,7 +288,8 @@ @pytest.mark.slow -def test_grow_boundary_at_cap(c_parser_only): +@pytest.mark.parametrize("count", [3 * 2**n for n in range(6)]) +def test_grow_boundary_at_cap(c_parser_only, count): # See gh-12494 # # Cause of error was that the C parser @@ -293,19 +298,18 @@ # to capacity, which would later cause a # buffer overflow error when checking the # EOF terminator of the CSV stream. + # 3 * 2^n commas was observed to break the parser parser = c_parser_only - def test_empty_header_read(count): - with StringIO("," * count) as s: - expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)]) - df = parser.read_csv(s) - tm.assert_frame_equal(df, expected) - - for cnt in range(1, 101): - test_empty_header_read(cnt) + with StringIO("," * count) as s: + expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)]) + df = parser.read_csv(s) + tm.assert_frame_equal(df, expected) -def test_parse_trim_buffers(c_parser_only): +@pytest.mark.slow +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_parse_trim_buffers(c_parser_only, encoding): # This test is part of a bugfix for gh-13703. It attempts to # to stress the system memory allocator, to cause it to move the # stream buffer and either let the OS reclaim the region, or let @@ -316,6 +320,9 @@ # times it fails due to memory corruption, which causes the # loaded DataFrame to differ from the expected one. + # Also force 'utf-8' encoding, so that `_string_convert` would take + # a different execution branch. + parser = c_parser_only # Generate a large mixed-type CSV file on-the-fly (one record is @@ -372,24 +379,15 @@ # Iterate over the CSV file in chunks of `chunksize` lines with parser.read_csv( - StringIO(csv_data), header=None, dtype=object, chunksize=chunksize - ) as chunks_: - result = concat(chunks_, axis=0, ignore_index=True) - - # Check for data corruption if there was no segfault - tm.assert_frame_equal(result, expected) - - # This extra test was added to replicate the fault in gh-5291. - # Force 'utf-8' encoding, so that `_string_convert` would take - # a different execution branch. 
- with parser.read_csv( StringIO(csv_data), header=None, dtype=object, chunksize=chunksize, - encoding="utf_8", + encoding=encoding, ) as chunks_: result = concat(chunks_, axis=0, ignore_index=True) + + # Check for data corruption if there was no segfault tm.assert_frame_equal(result, expected) @@ -461,7 +459,7 @@ tm.assert_frame_equal(result, expected) -def test_comment_whitespace_delimited(c_parser_only, capsys): +def test_comment_whitespace_delimited(c_parser_only): parser = c_parser_only test_input = """\ 1 2 @@ -474,18 +472,17 @@ 8# 1 field, NaN 9 2 3 # skipped line # comment""" - df = parser.read_csv( - StringIO(test_input), - comment="#", - header=None, - delimiter="\\s+", - skiprows=0, - on_bad_lines="warn", - ) - captured = capsys.readouterr() - # skipped lines 2, 3, 4, 9 - for line_num in (2, 3, 4, 9): - assert f"Skipping line {line_num}" in captured.err + with tm.assert_produces_warning( + ParserWarning, match="Skipping line", check_stacklevel=False + ): + df = parser.read_csv( + StringIO(test_input), + comment="#", + header=None, + delimiter="\\s+", + skiprows=0, + on_bad_lines="warn", + ) expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) tm.assert_frame_equal(df, expected) @@ -537,24 +534,6 @@ tm.assert_frame_equal(out, expected) -@pytest.mark.single_cpu -@pytest.mark.skipif(is_ci_environment(), reason="Too memory intensive for CI.") -def test_bytes_exceed_2gb(c_parser_only): - # see gh-16798 - # - # Read from a "CSV" that has a column larger than 2GB. - parser = c_parser_only - - if parser.low_memory: - pytest.skip("not a low_memory test") - - # csv takes 10 seconds to construct, spikes memory to 8GB+, the whole test - # spikes up to 10.4GB on the c_high case - csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])) - df = parser.read_csv(csv) - assert not df.empty - - def test_chunk_whitespace_on_boundary(c_parser_only): # see gh-9735: this issue is C parser-specific (bug when # parsing whitespace and characters at chunk boundary) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_comment.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_comment.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_comment.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_comment.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,8 +10,6 @@ from pandas import DataFrame import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - @pytest.mark.parametrize("na_values", [None, ["NaN"]]) def test_comment(all_parsers, na_values): @@ -23,6 +21,11 @@ expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", na_values=na_values) + return result = parser.read_csv(StringIO(data), comment="#", na_values=na_values) tm.assert_frame_equal(result, expected) @@ -38,19 +41,40 @@ #ignore this line 5.,NaN,10.0 """ + warn = None + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + if read_kwargs.get("delim_whitespace"): data = data.replace(",", " ") + warn = FutureWarning elif read_kwargs.get("lineterminator"): - if parser.engine != "c": - mark = pytest.mark.xfail( - reason="Custom terminator not supported with Python engine" - ) - request.node.add_marker(mark) - data = data.replace("\n", read_kwargs.get("lineterminator")) read_kwargs["comment"] = "#" - 
result = parser.read_csv(StringIO(data), **read_kwargs) + if parser.engine == "pyarrow": + if "lineterminator" in read_kwargs: + msg = ( + "The 'lineterminator' option is not supported with the 'pyarrow' engine" + ) + else: + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + warn, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), **read_kwargs) + return + elif parser.engine == "python" and read_kwargs.get("lineterminator"): + msg = r"Custom line terminators not supported in python parser \(yet\)" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + warn, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), **read_kwargs) + return + + with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): + result = parser.read_csv(StringIO(data), **read_kwargs) expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] @@ -72,6 +96,12 @@ expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", skiprows=4) + return + result = parser.read_csv(StringIO(data), comment="#", skiprows=4) tm.assert_frame_equal(result, expected) @@ -89,6 +119,11 @@ expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", header=1) + return result = parser.read_csv(StringIO(data), comment="#", header=1) tm.assert_frame_equal(result, expected) @@ -110,6 +145,12 @@ expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) + return + result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) tm.assert_frame_equal(result, expected) @@ -118,6 +159,14 @@ def test_custom_comment_char(all_parsers, comment_char): parser = all_parsers data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" + + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data.replace("#", comment_char)), comment=comment_char + ) + return result = parser.read_csv( StringIO(data.replace("#", comment_char)), comment=comment_char ) @@ -137,6 +186,11 @@ else: expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", header=header) + return result = parser.read_csv(StringIO(data), comment="#", header=header) tm.assert_frame_equal(result, expected) @@ -146,7 +200,7 @@ if all_parsers.engine == "c": reason = "see gh-34002: works on the python engine but not the c engine" # NA value containing comment char is interpreted as comment - request.node.add_marker(pytest.mark.xfail(reason=reason, 
raises=AssertionError)) + request.applymarker(pytest.mark.xfail(reason=reason, raises=AssertionError)) parser = all_parsers data = ( @@ -156,6 +210,11 @@ "4,5#,6,10\n" "7,8,#N/A,11\n" ) + if parser.engine == "pyarrow": + msg = "The 'comment' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), comment="#", na_values="#N/A") + return result = parser.read_csv(StringIO(data), comment="#", na_values="#N/A") expected = DataFrame( { diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_compression.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_compression.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_compression.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_compression.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,7 +13,9 @@ from pandas import DataFrame import pandas._testing as tm -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) @pytest.fixture(params=[True, False]) @@ -32,7 +34,6 @@ return parser, data, expected -@skip_pyarrow @pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) def test_zip(parser_and_data, compression): parser, data, expected = parser_and_data @@ -50,7 +51,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("compression", ["zip", "infer"]) def test_zip_error_multiple_files(parser_and_data, compression): parser, data, expected = parser_and_data @@ -66,7 +66,6 @@ parser.read_csv(path, compression=compression) -@skip_pyarrow def test_zip_error_no_files(parser_and_data): parser, _, _ = parser_and_data @@ -78,7 +77,6 @@ parser.read_csv(path, compression="zip") -@skip_pyarrow def test_zip_error_invalid_zip(parser_and_data): parser, _, _ = parser_and_data @@ -88,7 +86,6 @@ parser.read_csv(f, compression="zip") -@skip_pyarrow @pytest.mark.parametrize("filename", [None, "test.{ext}"]) def test_compression( request, @@ -105,7 +102,7 @@ filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="Cannot deduce compression from buffer of compressed data." 
) @@ -124,7 +121,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("ext", [None, "gz", "bz2"]) def test_infer_compression(all_parsers, csv1, buffer, ext): # see gh-9770 @@ -144,7 +140,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): # see gh-18071, gh-24130 parser = all_parsers @@ -162,7 +157,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers @@ -174,7 +168,6 @@ parser.read_csv("test_file.zip", **compress_kwargs) -@skip_pyarrow def test_compression_tar_archive(all_parsers, csv_dir_path): parser = all_parsers path = os.path.join(csv_dir_path, "tar_csv.tar.gz") @@ -196,7 +189,6 @@ tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df) -@skip_pyarrow def test_writes_tar_gz(all_parsers): parser = all_parsers data = DataFrame( diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_converters.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_converters.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_converters.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_converters.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,15 +15,17 @@ ) import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_skip") - def test_converters_type_must_be_dict(all_parsers): parser = all_parsers data = """index,A,B,C,D foo,2,3,4,5 """ - + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), converters=0) + return with pytest.raises(TypeError, match="Type converters.+"): parser.read_csv(StringIO(data), converters=0) @@ -39,6 +41,12 @@ b,3,4,01/02/2009 c,4,5,01/03/2009 """ + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), converters={column: converter}) + return + result = parser.read_csv(StringIO(data), converters={column: converter}) expected = parser.read_csv(StringIO(data)) @@ -53,6 +61,13 @@ data = """000102,1.2,A\n001245,2,B""" converters = {0: lambda x: x.strip()} + + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=None, converters=converters) + return + result = parser.read_csv(StringIO(data), header=None, converters=converters) # Column 0 should not be casted to numeric and should remain as object. 
@@ -73,6 +88,12 @@ "Number3" ] = lambda x: float(x.replace(",", ".")) + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=";", converters=converters) + return + result = parser.read_csv(StringIO(data), sep=";", converters=converters) expected = DataFrame( [ @@ -141,6 +162,16 @@ results = [] for day_converter in [convert_days, convert_days_sentinel]: + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + converters={"score": convert_score, "days": day_converter}, + na_values=["", None], + ) + continue + result = parser.read_csv( StringIO(data), converters={"score": convert_score, "days": day_converter}, @@ -149,7 +180,8 @@ assert pd.isna(result["days"][1]) results.append(result) - tm.assert_frame_equal(results[0], results[1]) + if parser.engine != "pyarrow": + tm.assert_frame_equal(results[0], results[1]) @pytest.mark.parametrize("conv_f", [lambda x: x, str]) @@ -158,6 +190,14 @@ parser = all_parsers data = "A;B\n1;2\n3;4" + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), sep=";", index_col="A", converters={"A": conv_f} + ) + return + rs = parser.read_csv( StringIO(data), sep=";", index_col="A", converters={"A": conv_f} ) @@ -171,6 +211,12 @@ parser = all_parsers data = "A,B\n1,2\n3,4" + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), converters={"A": lambda x: x}) + return + rs = parser.read_csv(StringIO(data), converters={"A": lambda x: x}) xp = DataFrame({"A": ["1", "3"], "B": [2, 4]}) @@ -182,6 +228,20 @@ parser = all_parsers data = "A,B,B\nX,Y,Z\n1,2,3" + if parser.engine == "pyarrow": + msg = "The 'converters' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=list(range(2)), + converters={ + ("A", "X"): np.int32, + ("B", "Y"): np.int32, + ("B", "Z"): np.float32, + }, + ) + return + result = parser.read_csv( StringIO(data), header=list(range(2)), diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_dialect.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_dialect.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_dialect.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_dialect.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,7 +13,9 @@ from pandas import DataFrame import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) @pytest.fixture @@ -40,6 +42,13 @@ dia = csv.excel() dia.quoting = csv.QUOTE_NONE + + if parser.engine == "pyarrow": + msg = "The 'dialect' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dialect=dia) + return + df = parser.read_csv(StringIO(data), dialect=dia) data = """\ @@ -63,6 +72,12 @@ exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]}) with tm.with_csv_dialect(dialect_name, delimiter=":"): + if parser.engine == "pyarrow": + msg = "The 
'dialect' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dialect=dialect_name) + return + df = parser.read_csv(StringIO(data), dialect=dialect_name) tm.assert_frame_equal(df, exp) @@ -108,6 +123,18 @@ kwds[arg] = "blah" with tm.with_csv_dialect(dialect_name, **dialect_kwargs): + if parser.engine == "pyarrow": + msg = "The 'dialect' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv_check_warnings( + # No warning bc we raise + None, + "Conflicting values for", + StringIO(data), + dialect=dialect_name, + **kwds, + ) + return result = parser.read_csv_check_warnings( warning_klass, "Conflicting values for", @@ -146,6 +173,18 @@ data = "a:b\n1:2" with tm.with_csv_dialect(dialect_name, **dialect_kwargs): + if parser.engine == "pyarrow": + msg = "The 'dialect' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv_check_warnings( + # no warning bc we raise + None, + "Conflicting values for 'delimiter'", + StringIO(data), + dialect=dialect_name, + **kwargs, + ) + return result = parser.read_csv_check_warnings( warning_klass, "Conflicting values for 'delimiter'", diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_encoding.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_encoding.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_encoding.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_encoding.py 2024-04-10 17:42:52.000000000 +0000 @@ -19,8 +19,11 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") def test_bytes_io_input(all_parsers): @@ -34,7 +37,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_read_csv_unicode(all_parsers): parser = all_parsers data = BytesIO("\u0141aski, Jan;1".encode()) @@ -44,7 +47,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow @pytest.mark.parametrize("sep", [",", "\t"]) @pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) def test_utf16_bom_skiprows(all_parsers, sep, encoding): @@ -126,10 +129,8 @@ and data == "\n1" and kwargs.get("skip_blank_lines", True) ): - # Manually xfail, since we don't have mechanism to xfail specific version - request.node.add_marker( - pytest.mark.xfail(reason="Pyarrow can't read blank lines") - ) + # CSV parse error: Empty CSV file or block: cannot infer number of columns + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) tm.assert_frame_equal(result, expected) @@ -179,13 +180,16 @@ tm.assert_frame_equal(expected, result) -@skip_pyarrow @pytest.mark.parametrize("pass_encoding", [True, False]) def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): # see gh-24130 parser = all_parsers encoding = encoding_fmt.format(utf_value) + if parser.engine == "pyarrow" and pass_encoding is True and utf_value in [16, 32]: + # FIXME: this is bad! 
+ pytest.skip("These cases freeze") + expected = DataFrame({"foo": ["bar"]}) with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f: @@ -196,7 +200,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_encoding_named_temp_file(all_parsers): # see gh-31819 parser = all_parsers @@ -234,7 +237,6 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"]) def test_encoding_memory_map(all_parsers, encoding): # GH40986 @@ -248,11 +250,17 @@ ) with tm.ensure_clean() as file: expected.to_csv(file, index=False, encoding=encoding) + + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(file, encoding=encoding, memory_map=True) + return + df = parser.read_csv(file, encoding=encoding, memory_map=True) tm.assert_frame_equal(df, expected) -@xfail_pyarrow def test_chunk_splits_multibyte_char(all_parsers): """ Chunk splits a multibyte character with memory_map=True @@ -268,11 +276,17 @@ df.iloc[2047] = "a" * 127 + "ą" with tm.ensure_clean("bug-gh43540.csv") as fname: df.to_csv(fname, index=False, header=False, encoding="utf-8") - dfr = parser.read_csv(fname, header=None, memory_map=True, engine="c") + + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(fname, header=None, memory_map=True) + return + + dfr = parser.read_csv(fname, header=None, memory_map=True) tm.assert_frame_equal(dfr, df) -@xfail_pyarrow def test_readcsv_memmap_utf8(all_parsers): """ GH 43787 @@ -296,9 +310,14 @@ df = DataFrame(lines) with tm.ensure_clean("utf8test.csv") as fname: df.to_csv(fname, index=False, header=False, encoding="utf-8") - dfr = parser.read_csv( - fname, header=None, memory_map=True, engine="c", encoding="utf-8" - ) + + if parser.engine == "pyarrow": + msg = "The 'memory_map' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8") + return + + dfr = parser.read_csv(fname, header=None, memory_map=True, encoding="utf-8") tm.assert_frame_equal(df, dfr) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_header.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_header.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_header.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_header.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,11 +18,15 @@ ) import pandas._testing as tm -# TODO(1.4): Change me to xfails at release time +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_read_with_bad_header(all_parsers): parser = all_parsers msg = r"but only \d+ lines in file" @@ -76,7 +80,7 @@ parser.read_csv(StringIO(data), header=header) -@skip_pyarrow +@xfail_pyarrow # AssertionError: DataFrame are different def test_header_with_index_col(all_parsers): parser = all_parsers data = """foo,1,2,3 @@ -114,10 +118,9 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_multi_index(all_parsers): parser = all_parsers - 
expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) data = """\ C0,,C_l0_g0,C_l0_g1,C_l0_g2 @@ -133,6 +136,23 @@ R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1]) + data_gen_f = lambda r, c: f"R{r}C{c}" + + data = [[data_gen_f(r, c) for c in range(3)] for r in range(5)] + index = MultiIndex.from_arrays( + [[f"R_l0_g{i}" for i in range(5)], [f"R_l1_g{i}" for i in range(5)]], + names=["R0", "R1"], + ) + columns = MultiIndex.from_arrays( + [ + [f"C_l0_g{i}" for i in range(3)], + [f"C_l1_g{i}" for i in range(3)], + [f"C_l2_g{i}" for i in range(3)], + [f"C_l3_g{i}" for i in range(3)], + ], + names=["C0", "C1", "C2", "C3"], + ) + expected = DataFrame(data, columns=columns, index=index) tm.assert_frame_equal(result, expected) @@ -180,7 +200,7 @@ _TestTuple = namedtuple("_TestTuple", ["first", "second"]) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "kwargs", [ @@ -228,7 +248,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "kwargs", [ @@ -275,7 +295,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "kwargs", [ @@ -323,7 +343,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers expected = DataFrame( @@ -344,7 +364,7 @@ tm.assert_frame_equal(expected, result) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers expected = DataFrame( @@ -366,7 +386,7 @@ tm.assert_frame_equal(expected, result) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers expected = DataFrame( @@ -387,7 +407,7 @@ tm.assert_frame_equal(expected, result) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_multi_index_blank_line(all_parsers): # GH 40442 parser = all_parsers @@ -399,20 +419,24 @@ tm.assert_frame_equal(expected, result) -@skip_pyarrow @pytest.mark.parametrize( "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] ) -def test_header_names_backward_compat(all_parsers, data, header): +def test_header_names_backward_compat(all_parsers, data, header, request): # see gh-2539 parser = all_parsers + + if parser.engine == "pyarrow" and header is not None: + mark = pytest.mark.xfail(reason="DataFrame.columns are different") + request.applymarker(mark) + expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"]) result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header) tm.assert_frame_equal(result, expected) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Empty CSV file or block: cannot infer @pytest.mark.parametrize("kwargs", [{}, {"index_col": False}]) def test_read_only_header_no_rows(all_parsers, kwargs): # See gh-7773 @@ -457,7 +481,7 @@ parser.read_csv(StringIO(data), header=header) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_singleton_header(all_parsers): # see gh-7757 data = """a,b,c\n0,1,2\n1,2,3""" @@ -468,7 +492,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required 
@pytest.mark.parametrize( "data,expected", [ @@ -515,7 +539,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is requireds @pytest.mark.parametrize("index_col", [None, [0]]) @pytest.mark.parametrize( "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] @@ -554,7 +578,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Expected 2 columns, got 3 def test_names_longer_than_header_but_equal_with_data_rows(all_parsers): # GH#38453 parser = all_parsers @@ -567,7 +591,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_read_csv_multiindex_columns(all_parsers): # GH#6051 parser = all_parsers @@ -599,7 +623,7 @@ tm.assert_frame_equal(df2, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_read_csv_multi_header_length_check(all_parsers): # GH#43102 parser = all_parsers @@ -615,7 +639,7 @@ parser.read_csv(StringIO(case), header=[0, 2]) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Expected 3 columns, got 2 def test_header_none_and_implicit_index(all_parsers): # GH#22144 parser = all_parsers @@ -627,7 +651,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@skip_pyarrow # regex mismatch "CSV parse error: Expected 2 columns, got " def test_header_none_and_implicit_index_in_second_row(all_parsers): # GH#22144 parser = all_parsers @@ -636,7 +660,6 @@ parser.read_csv(StringIO(data), names=["a", "b"], header=None) -@skip_pyarrow def test_header_none_and_on_bad_lines_skip(all_parsers): # GH#22144 parser = all_parsers @@ -648,7 +671,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is requireds def test_header_missing_rows(all_parsers): # GH#47400 parser = all_parsers @@ -660,7 +683,8 @@ parser.read_csv(StringIO(data), header=[0, 1, 2]) -@skip_pyarrow +# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine +@xfail_pyarrow def test_header_multiple_whitespaces(all_parsers): # GH#54931 parser = all_parsers @@ -672,7 +696,8 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +# ValueError: The 'delim_whitespace' option is not supported with the 'pyarrow' engine +@xfail_pyarrow def test_header_delim_whitespace(all_parsers): # GH#54918 parser = all_parsers @@ -681,6 +706,28 @@ 3,4 """ - result = parser.read_csv(StringIO(data), delim_whitespace=True) + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), delim_whitespace=True) expected = DataFrame({"a,b": ["1,2", "3,4"]}) tm.assert_frame_equal(result, expected) + + +def test_usecols_no_header_pyarrow(pyarrow_parser_only): + parser = pyarrow_parser_only + data = """ +a,i,x +b,j,y +""" + result = parser.read_csv( + StringIO(data), + header=None, + usecols=[0, 1], + dtype="string[pyarrow]", + dtype_backend="pyarrow", + engine="pyarrow", + ) + expected = DataFrame([["a", "i"], ["b", "j"]], dtype="string[pyarrow]") + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_index_col.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_index_col.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_index_col.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_index_col.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,7 +15,11 
@@ ) import pandas._testing as tm -# TODO(1.4): Change me to xfails at release time +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @@ -73,7 +77,7 @@ parser.read_csv(StringIO(data), index_col=True) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Expected 3 columns, got 4 def test_infer_index_col(all_parsers): data = """A,B,C foo,1,2,3 @@ -91,7 +95,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Empty CSV file or block @pytest.mark.parametrize( "index_col,kwargs", [ @@ -140,7 +144,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_empty_with_index_col_false(all_parsers): # see gh-10413 data = "x,y" @@ -151,7 +155,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "index_names", [ @@ -162,9 +165,13 @@ ["NotReallyUnnamed", "Unnamed: 0"], ], ) -def test_multi_index_naming(all_parsers, index_names): +def test_multi_index_naming(all_parsers, index_names, request): parser = all_parsers + if parser.engine == "pyarrow" and "" in index_names: + mark = pytest.mark.xfail(reason="One case raises, others are wrong") + request.applymarker(mark) + # We don't want empty index names being replaced with "Unnamed: 0" data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]) result = parser.read_csv(StringIO(data), index_col=[0, 1]) @@ -176,7 +183,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def test_multi_index_naming_not_all_at_beginning(all_parsers): parser = all_parsers data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" @@ -191,7 +198,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def test_no_multi_index_level_names_empty(all_parsers): # GH 10984 parser = all_parsers @@ -207,7 +214,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_header_with_index_col(all_parsers): # GH 33476 parser = all_parsers @@ -253,7 +260,7 @@ tm.assert_frame_equal(result, df.set_index("a")) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_index_col_multiindex_columns_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -270,7 +277,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_index_col_header_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -283,7 +290,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_multiindex_columns_no_data(all_parsers): # GH#38292 parser = all_parsers @@ -294,7 +301,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_multiindex_columns_index_col_with_data(all_parsers): # GH#38292 parser = all_parsers @@ -311,7 +318,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Empty CSV file or block def test_infer_types_boolean_sum(all_parsers): # GH#44079 parser = all_parsers @@ -341,7 +348,7 @@ data = "a,b\n01,2" parser = all_parsers if dtype == object and parser.engine == "pyarrow": - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="Cannot disable 
type-inference for pyarrow engine") ) result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype}) @@ -349,7 +356,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_multiindex_columns_not_leading_index_col(all_parsers): # GH#38549 parser = all_parsers diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_mangle_dupes.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_mangle_dupes.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_mangle_dupes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_mangle_dupes.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,10 +10,15 @@ from pandas import DataFrame import pandas._testing as tm -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") -@skip_pyarrow +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +@xfail_pyarrow # ValueError: Found non-unique column index def test_basic(all_parsers): parser = all_parsers @@ -24,7 +29,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def test_basic_names(all_parsers): # See gh-7160 parser = all_parsers @@ -45,7 +50,7 @@ parser.read_csv(StringIO(data), names=["a", "b", "a"]) -@skip_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index @pytest.mark.parametrize( "data,expected", [ @@ -74,7 +79,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "data,names,expected", [ @@ -114,7 +118,7 @@ parser.read_csv(StringIO(data), names=names) -@skip_pyarrow +@xfail_pyarrow # AssertionError: DataFrame.columns are different def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 orig_key = "0" @@ -137,7 +141,7 @@ tm.assert_frame_equal(df, expected) -@skip_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def test_mangle_dupe_cols_already_exists(all_parsers): # GH#14704 parser = all_parsers @@ -151,7 +155,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # ValueError: Found non-unique column index def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers): # GH#14704 parser = all_parsers @@ -165,7 +169,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("usecol, engine", [([0, 1, 1], "python"), ([0, 1, 1], "c")]) def test_mangle_cols_names(all_parsers, usecol, engine): # GH 11823 diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_multi_thread.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_multi_thread.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_multi_thread.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_multi_thread.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,15 +13,17 @@ from pandas import DataFrame import pandas._testing as tm +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + # We'll probably always skip these for pyarrow # Maybe we'll add our own tests for pyarrow too pytestmark = [ pytest.mark.single_cpu, pytest.mark.slow, - pytest.mark.usefixtures("pyarrow_skip"), ] +@xfail_pyarrow # ValueError: Found non-unique column index def test_multi_thread_string_io_read_csv(all_parsers): # see gh-11786 parser = all_parsers @@ -116,6 +118,7 @@ return final_dataframe +@xfail_pyarrow # ValueError: The 'nrows' option is not supported def test_multi_thread_path_multipart_read_csv(all_parsers): # see gh-11786 num_tasks 
= 4 diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_na_values.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_na_values.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_na_values.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_na_values.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,8 +16,12 @@ ) import pandas._testing as tm -skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") def test_string_nas(all_parsers): @@ -55,7 +59,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "na_values", [ @@ -83,12 +86,26 @@ """, ], ) -def test_non_string_na_values(all_parsers, data, na_values): +def test_non_string_na_values(all_parsers, data, na_values, request): # see gh-3611: with an odd float format, we can't match # the string "999.0" exactly but still need float matching parser = all_parsers expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"]) + if parser.engine == "pyarrow" and not all(isinstance(x, str) for x in na_values): + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), na_values=na_values) + return + elif parser.engine == "pyarrow" and "-999.000" in data: + # bc the pyarrow engine does not include the float-ified version + # of "-999" -> -999, it does not match the entry with the trailing + # zeros, so "-999.000" is not treated as null. + mark = pytest.mark.xfail( + reason="pyarrow engined does not recognize equivalent floats" + ) + request.applymarker(mark) + result = parser.read_csv(StringIO(data), na_values=na_values) tm.assert_frame_equal(result, expected) @@ -141,8 +158,6 @@ tm.assert_frame_equal(result, expected) -# TODO: needs skiprows list support in pyarrow -@skip_pyarrow @pytest.mark.parametrize("na_values", ["baz", ["baz"]]) def test_custom_na_values(all_parsers, na_values): parser = all_parsers @@ -155,6 +170,12 @@ expected = DataFrame( [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"] ) + if parser.engine == "pyarrow": + msg = "skiprows argument must be an integer when using engine='pyarrow'" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1]) + return + result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1]) tm.assert_frame_equal(result, expected) @@ -179,8 +200,6 @@ tm.assert_frame_equal(result, expected) -# TODO: Needs pyarrow support for dictionary in na_values -@skip_pyarrow def test_na_value_dict(all_parsers): data = """A,B,C foo,bar,NA @@ -188,6 +207,13 @@ foo,bar,NA bar,foo,foo""" parser = all_parsers + + if parser.engine == "pyarrow": + msg = "pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]}) + return + df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]}) expected = DataFrame( { @@ -232,8 +258,6 @@ tm.assert_frame_equal(result, expected) -# TODO: xfail components of this test, the first one passes -@skip_pyarrow @pytest.mark.parametrize( "kwargs,expected", [ @@ -279,7 +303,7 @@ ), ], ) -def test_na_values_keep_default(all_parsers, kwargs, expected): +def 
test_na_values_keep_default(all_parsers, kwargs, expected, request): data = """\ A,B,C a,1,one @@ -291,6 +315,15 @@ g,7,seven """ parser = all_parsers + if parser.engine == "pyarrow": + if "na_values" in kwargs and isinstance(kwargs["na_values"], dict): + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + return + mark = pytest.mark.xfail() + request.applymarker(mark) + result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) @@ -321,12 +354,19 @@ tm.assert_frame_equal(result, expected) -# TODO: Blocked on na_values dict support in pyarrow -@skip_pyarrow def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), na_values={"b": ["2"]}, keep_default_na=False + ) + return + result = parser.read_csv( StringIO(data), na_values={"b": ["2"]}, keep_default_na=False ) @@ -334,21 +374,24 @@ tm.assert_frame_equal(result, expected) -# TODO: Blocked on na_values dict support in pyarrow -@skip_pyarrow def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # see gh-19227 # # Scalar values shouldn't cause the parsing to crash or fail. data = "a,b\n1,2" parser = all_parsers + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False) + return + df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False) expected = DataFrame({"a": [1], "b": [np.nan]}) tm.assert_frame_equal(df, expected) -# TODO: Blocked on na_values dict support in pyarrow -@skip_pyarrow @pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 @@ -369,6 +412,17 @@ } ) + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=None, + keep_default_na=False, + na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values}, + ) + return + result = parser.read_csv( StringIO(data), header=None, @@ -378,8 +432,7 @@ tm.assert_frame_equal(result, expected) -# TODO: Empty null_values doesn't work properly on pyarrow -@skip_pyarrow +@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", [ @@ -401,8 +454,7 @@ tm.assert_frame_equal(result, expected) -# TODO: Arrow parse error -@skip_pyarrow +@skip_pyarrow # CSV parse error: Expected 8 columns, got 5: def test_na_trailing_columns(all_parsers): parser = all_parsers data = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax @@ -430,8 +482,6 @@ tm.assert_frame_equal(result, expected) -# TODO: xfail the na_values dict case -@skip_pyarrow @pytest.mark.parametrize( "na_values,row_data", [ @@ -445,12 +495,27 @@ names = ["a", "b"] data = "1,2\n2,1" + if parser.engine == "pyarrow" and isinstance(na_values, dict): + if isinstance(na_values, dict): + err = ValueError + msg = "The pyarrow engine doesn't support passing a dict for na_values" + else: + err = TypeError + msg = "The 'pyarrow' engine requires all 
na_values to be strings" + with pytest.raises(err, match=msg): + parser.read_csv(StringIO(data), names=names, na_values=na_values) + return + elif parser.engine == "pyarrow": + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), names=names, na_values=na_values) + return + result = parser.read_csv(StringIO(data), names=names, na_values=na_values) expected = DataFrame(row_data, columns=names) tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_na_values_dict_aliasing(all_parsers): parser = all_parsers na_values = {"a": 2, "b": 1} @@ -460,25 +525,36 @@ data = "1,2\n2,1" expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names) + + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), names=names, na_values=na_values) + return + result = parser.read_csv(StringIO(data), names=names, na_values=na_values) tm.assert_frame_equal(result, expected) tm.assert_dict_equal(na_values, na_values_copy) -@skip_pyarrow def test_na_values_dict_col_index(all_parsers): # see gh-14203 data = "a\nfoo\n1" parser = all_parsers na_values = {0: "foo"} + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), na_values=na_values) + return + result = parser.read_csv(StringIO(data), na_values=na_values) expected = DataFrame({"a": [np.nan, 1]}) tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -491,9 +567,19 @@ (str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])), ], ) -def test_na_values_uint64(all_parsers, data, kwargs, expected): +def test_na_values_uint64(all_parsers, data, kwargs, expected, request): # see gh-14983 parser = all_parsers + + if parser.engine == "pyarrow" and "na_values" in kwargs: + msg = "The 'pyarrow' engine requires all na_values to be strings" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), header=None, **kwargs) + return + elif parser.engine == "pyarrow": + mark = pytest.mark.xfail(reason="Returns float64 instead of object") + request.applymarker(mark) + result = parser.read_csv(StringIO(data), header=None, **kwargs) tm.assert_frame_equal(result, expected) @@ -508,18 +594,20 @@ tm.assert_frame_equal(result, expected) -# TODO: Missing support for na_filter kewyord -@skip_pyarrow @pytest.mark.parametrize( "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] ) -def test_no_na_filter_on_index(all_parsers, na_filter, index_data): +def test_no_na_filter_on_index(all_parsers, na_filter, index_data, request): # see gh-5239 # # Don't parse NA-values in index unless na_filter=True parser = all_parsers data = "a,b,c\n1,,3\n4,5,6" + if parser.engine == "pyarrow" and na_filter is False: + mark = pytest.mark.xfail(reason="mismatched index result") + request.applymarker(mark) + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b")) result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter) tm.assert_frame_equal(result, expected) @@ -538,7 +626,7 @@ tm.assert_frame_equal(out, expected) -@skip_pyarrow +@xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): # see gh-20377 @@ -554,7 
+642,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # mismatched exception message @pytest.mark.parametrize( "data, na_values", [ @@ -568,11 +656,14 @@ ) def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): parser = all_parsers - msg = ( - "(Bool column has NA values in column [0a])|" - "(cannot safely convert passed user dtype of " - "bool for object dtyped data in column 0)" + msg = "|".join( + [ + "Bool column has NA values in column [0a]", + "cannot safely convert passed user dtype of " + "bool for object dtyped data in column 0", + ] ) + with pytest.raises(ValueError, match=msg): parser.read_csv( StringIO(data), @@ -583,7 +674,9 @@ ) -@skip_pyarrow +# TODO: this test isn't about the na_values keyword, it is about the empty entries +# being returned with NaN entries, whereas the pyarrow engine returns "nan" +@xfail_pyarrow # mismatched shapes def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers @@ -612,12 +705,19 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_nan_multi_index(all_parsers): # GH 42446 parser = all_parsers data = "A,B,B\nX,Y,Z\n1,2,inf" + if parser.engine == "pyarrow": + msg = "The pyarrow engine doesn't support passing a dict for na_values" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"} + ) + return + result = parser.read_csv( StringIO(data), header=list(range(2)), na_values={("B", "Z"): "inf"} ) @@ -633,7 +733,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # Failed: DID NOT RAISE ; it casts the NaN to False def test_bool_and_nan_to_bool(all_parsers): # GH#42808 parser = all_parsers diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_network.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_network.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_network.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_network.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,16 +2,13 @@ Tests parsers ability to read and parse non-local files and hence require a network connection to be read. """ -from io import ( - BytesIO, - StringIO, -) +from io import BytesIO import logging +import re import numpy as np import pytest -from pandas.compat import is_ci_environment import pandas.util._test_decorators as td from pandas import DataFrame @@ -20,6 +17,10 @@ from pandas.io.feather_format import read_feather from pandas.io.parsers import read_csv +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + @pytest.mark.network @pytest.mark.single_cpu @@ -288,39 +289,23 @@ tm.assert_frame_equal(result, expected) @pytest.mark.single_cpu - @pytest.mark.skipif( - is_ci_environment(), - reason="GH: 45651: This test can hang in our CI min_versions build", - ) def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so): # 8 MB, S3FS uses 5MB chunks - import s3fs - - df = DataFrame( - np.random.default_rng(2).standard_normal((100000, 4)), columns=list("abcd") - ) - str_buf = StringIO() - - df.to_csv(str_buf) - - buf = BytesIO(str_buf.getvalue().encode("utf-8")) - - s3_public_bucket.put_object(Key="large-file.csv", Body=buf) - - # Possibly some state leaking in between tests. - # If we don't clear this cache, we saw `GetObject operation: Forbidden`. 
- # Presumably the s3fs instance is being cached, with the directory listing - # from *before* we add the large-file.csv in the s3_public_bucket_with_data. - s3fs.S3FileSystem.clear_instance_cache() - - with caplog.at_level(logging.DEBUG, logger="s3fs"): - read_csv( - f"s3://{s3_public_bucket.name}/large-file.csv", - nrows=5, - storage_options=s3so, - ) - # log of fetch_range (start, stop) - assert (0, 5505024) in (x.args[-2:] for x in caplog.records) + df = DataFrame(np.zeros((100000, 4)), columns=list("abcd")) + with BytesIO(df.to_csv().encode("utf-8")) as buf: + s3_public_bucket.put_object(Key="large-file.csv", Body=buf) + uri = f"{s3_public_bucket.name}/large-file.csv" + match_re = re.compile(rf"^Fetch: {uri}, 0-(?P<stop>\d+)$") + with caplog.at_level(logging.DEBUG, logger="s3fs"): + read_csv( + f"s3://{uri}", + nrows=5, + storage_options=s3so, + ) + for log in caplog.messages: + if match := re.match(match_re, log): + # Less than 8 MB + assert int(match.group("stop")) < 8000000 def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so): # GH 25945 diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_parse_dates.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_parse_dates.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_parse_dates.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_parse_dates.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,13 +12,11 @@ from io import StringIO from dateutil.parser import parse as du_parse -from hypothesis import given import numpy as np import pytest import pytz from pandas._libs.tslibs import parsing -from pandas._libs.tslibs.parsing import py_parse_datetime_string import pandas as pd from pandas import ( @@ -30,15 +28,16 @@ Timestamp, ) import pandas._testing as tm -from pandas._testing._hypothesis import DATETIME_NO_TZ from pandas.core.indexes.datetimes import date_range +from pandas.core.tools.datetimes import start_caching_at from pandas.io.parsers import read_csv -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) -# GH#43650: Some expected failures with the pyarrow engine can occasionally -# cause a deadlock instead, so we skip these instead of xfailing +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @@ -129,13 +128,19 @@ [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2] ) - df = parser.read_csv( - StringIO(data), - sep=";", - thousands="-", - parse_dates={"Date": [0, 1]}, - header=None, + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + df = parser.read_csv( + StringIO(data), + sep=";", + thousands="-", + parse_dates={"Date": [0, 1]}, + header=None, + ) tm.assert_frame_equal(df, expected) @@ -157,7 +162,7 @@ mark = pytest.mark.xfail( reason="pyarrow doesn't support disabling auto-inference on column numbers." 
) - request.node.add_marker(mark) + request.applymarker(mark) + + depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" kwds = { "header": None, @@ -334,7 +342,10 @@ "keep_date_col": keep_date_col, "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], } - result = parser.read_csv(StringIO(data), **kwds) + with tm.assert_produces_warning( + (DeprecationWarning, FutureWarning), match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), **kwds) expected = DataFrame( [ @@ -502,7 +513,11 @@ "date_parser": pd.to_datetime, } result = parser.read_csv_check_warnings( - FutureWarning, "use 'date_format' instead", StringIO(data), **kwds + FutureWarning, + "use 'date_format' instead", + StringIO(data), + **kwds, + raise_on_extra_warnings=False, ) expected = DataFrame( @@ -556,6 +571,7 @@ parse_dates=[[0, 1]], header=None, date_parser=Timestamp, + raise_on_extra_warnings=False, ) expected = DataFrame( [ @@ -595,7 +611,13 @@ KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" - result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) expected = DataFrame( [ [ @@ -697,8 +719,14 @@ def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): parser = all_parsers + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), parse_dates=parse_dates) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), parse_dates=parse_dates) def test_date_parser_int_bug(all_parsers): @@ -722,6 +750,7 @@ date_parser=lambda x: datetime.fromtimestamp(int(x), tz=timezone.utc).replace( tzinfo=None ), + raise_on_extra_warnings=False, ) expected = DataFrame( [ @@ -778,7 +807,7 @@ tm.assert_frame_equal(result, df) -@xfail_pyarrow +@skip_pyarrow def test_csv_custom_parser(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -798,7 +827,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@skip_pyarrow def test_parse_dates_implicit_first_col(all_parsers): data = """A,B,C 20090101,a,1,2 @@ -822,9 +851,7 @@ parser = all_parsers result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) # freq doesn't round-trip - index = DatetimeIndex( - list(date_range("1/1/2009", periods=3)), name="date", freq=None - ) + index = date_range("1/1/2009", periods=3, name="date")._with_freq(None) expected = DataFrame( {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index @@ -991,11 +1018,11 @@ expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) - tm.assert_frame_equal(result, expected) if parser.engine == "pyarrow": expected_tz = pytz.utc else: expected_tz = timezone.utc + tm.assert_frame_equal(result, expected) assert result.index.tz is expected_tz @@ -1094,9 +1121,15 @@ if not isinstance(parse_dates, dict): expected.index.name = "date_NominalTime" - result = parser.read_csv( - StringIO(data), parse_dates=parse_dates, index_col=index_col + depr_msg = ( + "Support for nested sequences for 
'parse_dates' in pd.read_csv is deprecated" ) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), parse_dates=parse_dates, index_col=index_col + ) tm.assert_frame_equal(result, expected) @@ -1180,13 +1213,19 @@ ) expected = expected.set_index("nominal") - with parser.read_csv( - StringIO(data), - parse_dates={"nominal": [1, 2]}, - index_col="nominal", - chunksize=2, - ) as reader: - chunks = list(reader) + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + with parser.read_csv( + StringIO(data), + parse_dates={"nominal": [1, 2]}, + index_col="nominal", + chunksize=2, + ) as reader: + chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[:2]) tm.assert_frame_equal(chunks[1], expected[2:4]) @@ -1205,14 +1244,24 @@ KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - with_indices = parser.read_csv( - StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" - ) - with_names = parser.read_csv( - StringIO(data), - index_col="nominal", - parse_dates={"nominal": ["date", "nominalTime"]}, + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + with_indices = parser.read_csv( + StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" + ) + + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + with_names = parser.read_csv( + StringIO(data), + index_col="nominal", + parse_dates={"nominal": ["date", "nominalTime"]}, + ) tm.assert_frame_equal(with_indices, with_names) @@ -1227,10 +1276,19 @@ KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - result = parser.read_csv( - StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) - expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} + ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) expected = expected.set_index(["nominal", "ID"]) tm.assert_frame_equal(result, expected) @@ -1271,7 +1329,7 @@ # if we have an invalid date make sure that we handle this with # and w/o the cache properly parser = all_parsers - s = StringIO((f"{value},\n") * 50000) + s = StringIO((f"{value},\n") * (start_caching_at + 1)) parser.read_csv( s, @@ -1310,6 +1368,7 @@ names=["foo", "bar"], parse_dates=["foo"], cache_dates=cache_dates, + raise_on_extra_warnings=False, ) @@ -1341,6 +1400,7 @@ parse_dates=["Date"], infer_datetime_format=True, sep=",", + raise_on_extra_warnings=False, ) @@ -1511,6 +1571,7 @@ StringIO(data), date_parser=pd.to_datetime, **kwargs, + raise_on_extra_warnings=False, ) # Python can sometimes be flaky about how @@ -1530,6 +1591,7 @@ 
header=0, parse_dates={"ymd": [0, 1, 2]}, date_parser=lambda x: x, + raise_on_extra_warnings=False, ) expected = DataFrame( @@ -1564,6 +1626,7 @@ header=0, parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, **{key: value}, + raise_on_extra_warnings=False, ) expected = DataFrame( [ @@ -1600,6 +1663,7 @@ header=0, parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, **{key: value}, + raise_on_extra_warnings=False, ) expected = DataFrame( [ @@ -1625,6 +1689,7 @@ header=0, parse_dates={"ym": [0, 1]}, date_parser=parse_function, + raise_on_extra_warnings=False, ) expected = DataFrame( [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], @@ -1720,24 +1785,19 @@ 2018-01-04 09:05:00+09:00,23400""" result = parser.read_csv(StringIO(data), parse_dates=["dt"]) - dti = DatetimeIndex( - list( - date_range( - start="2018-01-04 09:01:00", - end="2018-01-04 09:05:00", - freq="1min", - tz=timezone(timedelta(minutes=540)), - ) - ), - freq=None, - ) + dti = date_range( + start="2018-01-04 09:01:00", + end="2018-01-04 09:05:00", + freq="1min", + tz=timezone(timedelta(minutes=540)), + )._with_freq(None) expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} expected = DataFrame(expected_data) tm.assert_frame_equal(result, expected) -@skip_pyarrow +@skip_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], @@ -1753,7 +1813,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", [ @@ -1766,16 +1825,28 @@ ], ) def test_parse_delimited_date_swap_no_warning( - all_parsers, date_string, dayfirst, expected + all_parsers, date_string, dayfirst, expected, request ): parser = all_parsers expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + if parser.engine == "pyarrow": + if not dayfirst: + # "CSV parse error: Empty CSV file or block" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + msg = "The 'dayfirst' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] + ) + return + result = parser.read_csv( StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] ) tm.assert_frame_equal(result, expected) +# ArrowInvalid: CSV parse error: Empty CSV file or block: cannot infer number of columns @skip_pyarrow @pytest.mark.parametrize( "date_string,dayfirst,expected", @@ -1818,50 +1889,7 @@ pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) -def _helper_hypothesis_delimited_date(call, date_string, **kwargs): - msg, result = None, None - try: - result = call(date_string, **kwargs) - except ValueError as er: - msg = str(er) - return msg, result - - -@skip_pyarrow -@given(DATETIME_NO_TZ) -@pytest.mark.parametrize("delimiter", list(" -./")) -@pytest.mark.parametrize("dayfirst", [True, False]) -@pytest.mark.parametrize( - "date_format", - ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"], -) -def test_hypothesis_delimited_date( - request, date_format, dayfirst, delimiter, test_datetime -): - if date_format == "%m %Y" and delimiter == ".": - request.node.add_marker( - pytest.mark.xfail( - reason="parse_datetime_string cannot reliably tell whether " - "e.g. 
%m.%Y is a float or a date" - ) - ) - date_string = test_datetime.strftime(date_format.replace(" ", delimiter)) - - except_out_dateutil, result = _helper_hypothesis_delimited_date( - py_parse_datetime_string, date_string, dayfirst=dayfirst - ) - except_in_dateutil, expected = _helper_hypothesis_delimited_date( - du_parse, - date_string, - default=datetime(1, 1, 1), - dayfirst=dayfirst, - yearfirst=False, - ) - - assert except_out_dateutil == except_in_dateutil - assert result == expected - - +# ArrowKeyError: Column 'fdate1' in include_columns does not exist in CSV file @skip_pyarrow @pytest.mark.parametrize( "names, usecols, parse_dates, missing_cols", @@ -1889,19 +1917,34 @@ parser = all_parsers content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + warn = FutureWarning + if isinstance(parse_dates, list) and all( + isinstance(x, (int, str)) for x in parse_dates + ): + warn = None + with pytest.raises(ValueError, match=msg): - parser.read_csv( - content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates - ) + with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): + parser.read_csv( + content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates + ) -@skip_pyarrow +@xfail_pyarrow # mismatched shape def test_date_parser_and_names(all_parsers): # GH#33699 parser = all_parsers data = StringIO("""x,y\n1,2""") + warn = UserWarning + if parser.engine == "pyarrow": + # DeprecationWarning for passing a Manager object + warn = (UserWarning, DeprecationWarning) result = parser.read_csv_check_warnings( - UserWarning, + warn, "Could not infer format", data, parse_dates=["B"], @@ -1911,7 +1954,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required def test_date_parser_multiindex_columns(all_parsers): parser = all_parsers data = """a,b @@ -1924,7 +1967,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "parse_spec, col_name", [ @@ -1937,18 +1980,24 @@ data = """a,b,c 1,2,3 2019-12,-31,6""" - result = parser.read_csv( - StringIO(data), - parse_dates=parse_spec, - header=[0, 1], + + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), + parse_dates=parse_spec, + header=[0, 1], + ) expected = DataFrame( {col_name: Timestamp("2019-12-31").as_unit("ns"), ("c", "3"): [6]} ) tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_date_parser_usecols_thousands(all_parsers): # GH#39365 data = """A,B,C @@ -1957,6 +2006,19 @@ """ parser = all_parsers + + if parser.engine == "pyarrow": + # DeprecationWarning for passing a Manager object + msg = "The 'thousands' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + parse_dates=[1], + usecols=[1, 2], + thousands="-", + ) + return + result = parser.read_csv_check_warnings( UserWarning, "Could not infer format", @@ -1969,17 +2031,21 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow -def test_parse_dates_and_keep_orgin_column(all_parsers): +@xfail_pyarrow # mismatched shape +def 
test_parse_dates_and_keep_original_column(all_parsers): # GH#13378 parser = all_parsers data = """A 20150908 20150909 """ - result = parser.read_csv( - StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True - ) + depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True + ) expected_data = [Timestamp("2015-09-08"), Timestamp("2015-09-09")] expected = DataFrame({"date": expected_data, "A": expected_data}) tm.assert_frame_equal(result, expected) @@ -2068,7 +2134,7 @@ tm.assert_index_equal(expected, res) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Expected 3 columns, got 4 def test_infer_first_column_as_index(all_parsers): # GH#11019 parser = all_parsers @@ -2081,7 +2147,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # pyarrow engine doesn't support passing a dict for na_values @pytest.mark.parametrize( ("key", "value", "warn"), [ @@ -2121,7 +2187,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # string[python] instead of dt64[ns] def test_parse_dates_and_string_dtype(all_parsers): # GH#34066 parser = all_parsers @@ -2156,7 +2222,12 @@ warn = UserWarning msg = r"when dayfirst=False \(the default\) was specified" result = parser.read_csv_check_warnings( - warn, msg, StringIO(data), parse_dates=True, index_col=0 + warn, + msg, + StringIO(data), + parse_dates=True, + index_col=0, + raise_on_extra_warnings=False, ) expected = DataFrame({"b": [1, 2]}, index=expected_index) tm.assert_frame_equal(result, expected) @@ -2183,7 +2254,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})] ) @@ -2194,7 +2264,12 @@ 31-,12-2019 31-,12-2020""" - with tm.assert_produces_warning(None): + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): result = parser.read_csv( StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates ) @@ -2206,7 +2281,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # object dtype index def test_parse_dates_dict_format_index(all_parsers): # GH#51240 parser = all_parsers @@ -2249,7 +2324,7 @@ tm.assert_frame_equal(result, expected) -@xfail_pyarrow +@xfail_pyarrow # object dtype index def test_from_csv_with_mixed_offsets(all_parsers): parser = all_parsers data = "a\n2020-01-01T00:00:00+01:00\n2020-01-01T00:00:00+00:00" diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_python_parser_only.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_python_parser_only.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_python_parser_only.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_python_parser_only.py 2024-04-10 17:42:52.000000000 +0000 @@ -274,7 +274,7 @@ parser.read_csv(StringIO(data), quoting=quoting, **kwargs) -def test_none_delimiter(python_parser_only, capsys): +def test_none_delimiter(python_parser_only): # see gh-13374 and gh-17465 parser = python_parser_only data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9" @@ -283,12 +283,14 @@ # We expect the third line in the data to be # skipped because it is malformed, but we do # not expect any errors to occur. 
- result = parser.read_csv(StringIO(data), header=0, sep=None, on_bad_lines="warn") + with tm.assert_produces_warning( + ParserWarning, match="Skipping line 3", check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), header=0, sep=None, on_bad_lines="warn" + ) tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - @pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) @pytest.mark.parametrize("skipfooter", [0, 1]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_quoting.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_quoting.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_quoting.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_quoting.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,7 +14,11 @@ from pandas import DataFrame import pandas._testing as tm -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @pytest.mark.parametrize( @@ -28,6 +32,7 @@ ({"quotechar": 2}, '"quotechar" must be string( or None)?, not int'), ], ) +@skip_pyarrow # ParserError: CSV parse error: Empty CSV file or block def test_bad_quote_char(all_parsers, kwargs, msg): data = "1,2,3" parser = all_parsers @@ -43,6 +48,7 @@ (10, 'bad "quoting" value'), # quoting must be in the range [0, 3] ], ) +@xfail_pyarrow # ValueError: The 'quoting' option is not supported def test_bad_quoting(all_parsers, quoting, msg): data = "1,2,3" parser = all_parsers @@ -72,6 +78,7 @@ tm.assert_frame_equal(result, expected) +@xfail_pyarrow # ValueError: The 'quoting' option is not supported @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) @pytest.mark.parametrize("quote_char", ["", None]) def test_null_quote_char(all_parsers, quoting, quote_char): @@ -112,6 +119,7 @@ ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]), ], ) +@xfail_pyarrow # ValueError: The 'quoting' option is not supported def test_quoting_various(all_parsers, kwargs, exp_data): data = '1,2,"foo"' parser = all_parsers @@ -125,10 +133,14 @@ @pytest.mark.parametrize( "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] ) -def test_double_quote(all_parsers, doublequote, exp_data): +def test_double_quote(all_parsers, doublequote, exp_data, request): parser = all_parsers data = 'a,b\n3,"4 "" 5"' + if parser.engine == "pyarrow" and not doublequote: + mark = pytest.mark.xfail(reason="Mismatched result") + request.applymarker(mark) + result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) expected = DataFrame(exp_data, columns=["a", "b"]) tm.assert_frame_equal(result, expected) @@ -146,11 +158,15 @@ @pytest.mark.parametrize("balanced", [True, False]) -def test_unbalanced_quoting(all_parsers, balanced): +def test_unbalanced_quoting(all_parsers, balanced, request): # see gh-22789. parser = all_parsers data = 'a,b,c\n1,2,"3' + if parser.engine == "pyarrow" and not balanced: + mark = pytest.mark.xfail(reason="Mismatched result") + request.applymarker(mark) + if balanced: # Re-balance the quoting and read in without errors. 
expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_read_fwf.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_read_fwf.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_read_fwf.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_read_fwf.py 2024-04-10 17:42:52.000000000 +0000 @@ -602,7 +602,10 @@ 101.6 956.1 """.strip() skiprows = 2 - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) @@ -617,7 +620,10 @@ 456 78 9 456 """.strip() skiprows = [0, 2] - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) @@ -898,7 +904,7 @@ def test_skiprows_with_iterator(): - # GH#10261 + # GH#10261, GH#56323 data = """0 1 2 @@ -920,8 +926,8 @@ ) expected_frames = [ DataFrame({"a": [3, 4]}), - DataFrame({"a": [5, 7, 8]}, index=[2, 3, 4]), - DataFrame({"a": []}, dtype="object"), + DataFrame({"a": [5, 7]}, index=[2, 3]), + DataFrame({"a": [8]}, index=[4]), ] for i, result in enumerate(df_iter): tm.assert_frame_equal(result, expected_frames[i]) @@ -965,6 +971,12 @@ if string_storage == "python": arr = StringArray(np.array(["a", "b"], dtype=np.object_)) arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + arr = ArrowExtensionArray(pa.array(["a", "b"])) + arr_na = ArrowExtensionArray(pa.array([None, "a"])) else: pa = pytest.importorskip("pyarrow") arr = ArrowStringArray(pa.array(["a", "b"])) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_skiprows.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_skiprows.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_skiprows.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_skiprows.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,10 +17,13 @@ ) import pandas._testing as tm -# XFAIL ME PLS once hanging tests issues identified -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +@xfail_pyarrow # ValueError: skiprows argument must be an integer @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) def test_skip_rows_bug(all_parsers, skiprows): # see gh-505 @@ -48,6 +51,7 @@ tm.assert_frame_equal(result, expected) +@xfail_pyarrow # ValueError: skiprows argument must be an integer def test_deep_skip_rows(all_parsers): # see gh-4382 parser = all_parsers @@ -63,6 +67,7 @@ tm.assert_frame_equal(result, condensed_result) +@xfail_pyarrow # AssertionError: DataFrame are different def test_skip_rows_blank(all_parsers): # see gh-9832 parser = all_parsers @@ -122,6 +127,7 @@ ), ], ) +@xfail_pyarrow # ValueError: skiprows argument must be an integer def 
test_skip_row_with_newline(all_parsers, data, kwargs, expected): # see gh-12775 and gh-10911 parser = all_parsers @@ -129,6 +135,7 @@ tm.assert_frame_equal(result, expected) +@xfail_pyarrow # ValueError: skiprows argument must be an integer def test_skip_row_with_quote(all_parsers): # see gh-12775 and gh-10911 parser = all_parsers @@ -170,6 +177,7 @@ ), ], ) +@xfail_pyarrow # ValueError: skiprows argument must be an integer def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): # see gh-12775 and gh-10911 parser = all_parsers @@ -179,6 +187,7 @@ tm.assert_frame_equal(result, expected) +@xfail_pyarrow # ValueError: The 'delim_whitespace' option is not supported @pytest.mark.parametrize( "lineterminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" ) @@ -204,18 +213,24 @@ if parser.engine == "python" and lineterminator == "\r": mark = pytest.mark.xfail(reason="'CR' not respect with the Python parser yet") - request.node.add_marker(mark) + request.applymarker(mark) data = data.replace("\n", lineterminator) - result = parser.read_csv( - StringIO(data), - skiprows=1, - delim_whitespace=True, - names=["date", "time", "var", "flag", "oflag"], - ) + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), + skiprows=1, + delim_whitespace=True, + names=["date", "time", "var", "flag", "oflag"], + ) tm.assert_frame_equal(result, expected) +@xfail_pyarrow # AssertionError: DataFrame are different def test_skiprows_infield_quote(all_parsers): # see gh-14459 parser = all_parsers @@ -226,6 +241,7 @@ tm.assert_frame_equal(result, expected) +@xfail_pyarrow # ValueError: skiprows argument must be an integer @pytest.mark.parametrize( "kwargs,expected", [ @@ -241,6 +257,7 @@ tm.assert_frame_equal(result, expected) +@xfail_pyarrow # ValueError: skiprows argument must be an integer def test_skip_rows_callable_not_in(all_parsers): parser = all_parsers data = "0,a\n1,b\n2,c\n3,d\n4,e" @@ -252,6 +269,7 @@ tm.assert_frame_equal(result, expected) +@xfail_pyarrow # ValueError: skiprows argument must be an integer def test_skip_rows_skip_all(all_parsers): parser = all_parsers data = "a\n1\n2\n3\n4\n5" @@ -261,6 +279,7 @@ parser.read_csv(StringIO(data), skiprows=lambda x: True) +@xfail_pyarrow # ValueError: skiprows argument must be an integer def test_skip_rows_bad_callable(all_parsers): msg = "by zero" parser = all_parsers @@ -270,6 +289,7 @@ parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0) +@xfail_pyarrow # ValueError: skiprows argument must be an integer def test_skip_rows_and_n_rows(all_parsers): # GH#44021 data = """a,b @@ -286,3 +306,29 @@ result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6]) expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]}) tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_skip_rows_with_chunks(all_parsers): + # GH 55677 + data = """col_a +10 +20 +30 +40 +50 +60 +70 +80 +90 +100 +""" + parser = all_parsers + reader = parser.read_csv( + StringIO(data), engine=parser, skiprows=lambda x: x in [1, 4, 5], chunksize=4 + ) + df1 = next(reader) + df2 = next(reader) + + tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]})) + tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6])) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_textreader.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_textreader.py --- 
pandas-2.1.4+dfsg/pandas/tests/io/parser/test_textreader.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_textreader.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,6 +12,7 @@ import pandas._libs.parsers as parser from pandas._libs.parsers import TextReader +from pandas.errors import ParserWarning from pandas import DataFrame import pandas._testing as tm @@ -125,7 +126,7 @@ expected = DataFrame([123456, 12500]) tm.assert_frame_equal(result, expected) - def test_skip_bad_lines(self, capsys): + def test_skip_bad_lines(self): # too many lines, see #2430 for why data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r" @@ -145,14 +146,11 @@ } assert_array_dicts_equal(result, expected) - reader = TextReader( - StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn - ) - reader.read() - captured = capsys.readouterr() - - assert "Skipping line 4" in captured.err - assert "Skipping line 6" in captured.err + with tm.assert_produces_warning(ParserWarning, match="Skipping line"): + reader = TextReader( + StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn + ) + reader.read() def test_header_not_enough_lines(self): data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6" @@ -300,6 +298,8 @@ } assert_array_dicts_equal(result, expected) + @pytest.mark.parametrize("repeat", range(10)) + def test_empty_field_eof_mem_access_bug(self, repeat): # GH5664 a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"]) b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1]) @@ -314,21 +314,20 @@ index=[0, 5, 7, 12], ) - for _ in range(100): - df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c") - tm.assert_frame_equal(df, a) + df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c") + tm.assert_frame_equal(df, a) - df = read_csv( - StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c" - ) - tm.assert_frame_equal(df, b) + df = read_csv( + StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c" + ) + tm.assert_frame_equal(df, b) - df = read_csv( - StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"), - names=list("abcd"), - engine="c", - ) - tm.assert_frame_equal(df, c) + df = read_csv( + StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"), + names=list("abcd"), + engine="c", + ) + tm.assert_frame_equal(df, c) def test_empty_csv_input(self): # GH14867 diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/test_unsupported.py pandas-2.2.2+dfsg/pandas/tests/io/parser/test_unsupported.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/test_unsupported.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/test_unsupported.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,11 +12,6 @@ import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_mac, - is_platform_windows, -) from pandas.errors import ParserError import pandas._testing as tm @@ -24,6 +19,10 @@ from pandas.io.parsers import read_csv import pandas.io.parsers.readers as parsers +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + @pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val) def python_engine(request): @@ -44,9 +43,12 @@ data = "a b c\n1 2 3" msg = "does not support" + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + # specify C engine with unsupported options (raise) with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine="c", sep=None, 
delim_whitespace=False) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False) with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="c", sep=r"\s") with pytest.raises(ValueError, match=msg): @@ -55,7 +57,7 @@ read_csv(StringIO(data), engine="c", skipfooter=1) # specify C-unsupported options without python-unsupported options - with tm.assert_produces_warning(parsers.ParserWarning): + with tm.assert_produces_warning((parsers.ParserWarning, FutureWarning)): read_csv(StringIO(data), sep=None, delim_whitespace=False) with tm.assert_produces_warning(parsers.ParserWarning): read_csv(StringIO(data), sep=r"\s") @@ -153,16 +155,31 @@ kwargs[default] = True elif default == "on_bad_lines": kwargs[default] = "warn" + + warn = None + depr_msg = None + if "delim_whitespace" in kwargs: + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + warn = FutureWarning + if "verbose" in kwargs: + depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated" + warn = FutureWarning + with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine="pyarrow", **kwargs) + with tm.assert_produces_warning(warn, match=depr_msg): + read_csv(StringIO(data), engine="pyarrow", **kwargs) - def test_on_bad_lines_callable_python_only(self, all_parsers): + def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers): # GH 5686 + # GH 54643 sio = StringIO("a,b\n1,2") bad_lines_func = lambda x: x parser = all_parsers - if all_parsers.engine != "python": - msg = "on_bad_line can only be a callable function if engine='python'" + if all_parsers.engine not in ["python", "pyarrow"]: + msg = ( + "on_bad_line can only be a callable " + "function if engine='python' or 'pyarrow'" + ) with pytest.raises(ValueError, match=msg): parser.read_csv(sio, on_bad_lines=bad_lines_func) else: @@ -175,11 +192,8 @@ error = ValueError if parser.engine == "pyarrow": - pyarrow = pytest.importorskip("pyarrow") - error = pyarrow.lib.ArrowKeyError - if is_ci_environment() and (is_platform_windows() or is_platform_mac()): - # GH#45547 causes timeouts on windows/mac builds - pytest.skip("GH#45547 causing timeouts on windows/mac builds 2022-01-22") + # Raises pyarrow.lib.ArrowKeyError + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") with tm.ensure_clean("test.csv") as fname: Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8") @@ -194,7 +208,7 @@ # GH#45957 parser = all_parsers if parser.engine == "python": - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason=f"{parser.engine} engine supports lists.") ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/usecols/test_parse_dates.py pandas-2.2.2+dfsg/pandas/tests/io/parser/usecols/test_parse_dates.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/usecols/test_parse_dates.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/usecols/test_parse_dates.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,9 +13,16 @@ ) import pandas._testing as tm -# TODO(1.4): Change these to xfails whenever parse_dates support(which was -# intentionally disable to keep small PR sizes) is added back -pytestmark = pytest.mark.usefixtures("pyarrow_skip") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") +skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") + +_msg_pyarrow_requires_names = ( + "The 
pyarrow engine does not allow 'usecols' to be integer column " + "positions. Pass a list of string column names instead." +) @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) @@ -27,15 +34,34 @@ parser = all_parsers parse_dates = [[1, 2]] + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" + ) + cols = { "a": [0, 0], "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], } expected = DataFrame(cols, columns=["c_d", "a"]) - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + if parser.engine == "pyarrow": + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) + return + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), usecols=usecols, parse_dates=parse_dates + ) tm.assert_frame_equal(result, expected) +@skip_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns def test_usecols_with_parse_dates2(all_parsers): # see gh-13604 parser = all_parsers @@ -115,11 +141,17 @@ } expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - result = parser.read_csv( - StringIO(data), - usecols=usecols, - parse_dates=parse_dates, + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), + usecols=usecols, + parse_dates=parse_dates, + ) tm.assert_frame_equal(result, expected) @@ -131,20 +163,32 @@ list("acd"), # Names span only the selected columns. 
], ) -def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): +def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request): # see gh-9755 s = """0,1,2014-01-01,09:00,4 0,1,2014-01-02,10:00,4""" parse_dates = [[1, 2]] parser = all_parsers + if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0): + mark = pytest.mark.xfail( + reason="Length mismatch in some cases, UserWarning in other" + ) + request.applymarker(mark) + cols = { "a": [0, 0], "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], } expected = DataFrame(cols, columns=["c_d", "a"]) - result = parser.read_csv( - StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols + depr_msg = ( + "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" ) + with tm.assert_produces_warning( + (FutureWarning, DeprecationWarning), match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols + ) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/usecols/test_strings.py pandas-2.2.2+dfsg/pandas/tests/io/parser/usecols/test_strings.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/usecols/test_strings.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/usecols/test_strings.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,10 @@ from pandas import DataFrame import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + def test_usecols_with_unicode_strings(all_parsers): # see gh-13219 diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/parser/usecols/test_usecols_basic.py pandas-2.2.2+dfsg/pandas/tests/io/parser/usecols/test_usecols_basic.py --- pandas-2.1.4+dfsg/pandas/tests/io/parser/usecols/test_usecols_basic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/parser/usecols/test_usecols_basic.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,6 +16,10 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + _msg_validate_usecols_arg = ( "'usecols' must either be list-like " "of all strings, all unicode, all " @@ -24,10 +28,18 @@ _msg_validate_usecols_names = ( "Usecols do not match columns, columns expected but not found: {0}" ) +_msg_pyarrow_requires_names = ( + "The pyarrow engine does not allow 'usecols' to be integer column " + "positions. Pass a list of string column names instead." 
+) -# TODO: Switch to xfails +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning" +) + def test_raise_on_mixed_dtype_usecols(all_parsers): # See gh-12678 @@ -42,9 +54,8 @@ parser.read_csv(StringIO(data), usecols=usecols) -@skip_pyarrow @pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) -def test_usecols(all_parsers, usecols): +def test_usecols(all_parsers, usecols, request): data = """\ a,b,c 1,2,3 @@ -52,13 +63,17 @@ 7,8,9 10,11,12""" parser = all_parsers + if parser.engine == "pyarrow" and isinstance(usecols[0], int): + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), usecols=usecols) + return + result = parser.read_csv(StringIO(data), usecols=usecols) expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_usecols_with_names(all_parsers): data = """\ a,b,c @@ -68,13 +83,18 @@ 10,11,12""" parser = all_parsers names = ["foo", "bar"] + + if parser.engine == "pyarrow": + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) + return + result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] ) @@ -85,13 +105,16 @@ 7,8,9 10,11,12""" parser = all_parsers + if parser.engine == "pyarrow" and not isinstance(usecols[0], int): + # ArrowKeyError: Column 'fb' in include_columns does not exist + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols) expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_usecols_relative_to_names2(all_parsers): # see gh-5766 data = """\ @@ -100,6 +123,7 @@ 7,8,9 10,11,12""" parser = all_parsers + result = parser.read_csv( StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] ) @@ -108,7 +132,8 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +# regex mismatch: "Length mismatch: Expected axis has 1 elements" +@xfail_pyarrow def test_usecols_name_length_conflict(all_parsers): data = """\ 1,2,3 @@ -117,7 +142,6 @@ 10,11,12""" parser = all_parsers msg = "Number of passed names did not match number of header fields in the file" - with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) @@ -133,7 +157,7 @@ parser.read_csv(StringIO(data), usecols="foo") -@skip_pyarrow +@skip_pyarrow # CSV parse error in one case, AttributeError in another @pytest.mark.parametrize( "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] ) @@ -147,13 +171,18 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("index_col", ["b", 0]) @pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) -def test_usecols_index_col_conflict(all_parsers, usecols, index_col): +def test_usecols_index_col_conflict(all_parsers, usecols, index_col, request): # see gh-4201: test that index_col as integer reflects usecols parser = all_parsers data = 
"a,b,c,d\nA,a,1,one\nB,b,2,two" + + if parser.engine == "pyarrow" and isinstance(usecols[0], int): + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) + return + expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) @@ -174,7 +203,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Expected 3 columns, got 4 def test_usecols_implicit_index_col(all_parsers): # see gh-2654 parser = all_parsers @@ -207,28 +236,50 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_usecols_regex_sep(all_parsers): # see gh-2733 parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + + if parser.engine == "pyarrow": + msg = "the 'pyarrow' engine does not support regex separators" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) + return + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + + if parser.engine == "pyarrow": + msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv( + StringIO(data), delim_whitespace=True, usecols=("a", "b") + ) + return + + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), delim_whitespace=True, usecols=("a", "b") + ) expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "usecols,expected", [ @@ -241,17 +292,22 @@ ), ], ) -def test_usecols_with_integer_like_header(all_parsers, usecols, expected): +def test_usecols_with_integer_like_header(all_parsers, usecols, expected, request): parser = all_parsers data = """2,0,1 1000,2000,3000 4000,5000,6000""" + if parser.engine == "pyarrow" and isinstance(usecols[0], int): + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), usecols=usecols) + return + result = parser.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(result, expected) -@skip_pyarrow +@xfail_pyarrow # mismatched shape def test_empty_usecols(all_parsers): data = "a,b,c\n1,2,3\n4,5,6" expected = DataFrame(columns=Index([])) @@ -272,7 +328,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "usecols,expected", [ @@ -301,10 +356,17 @@ 3.568935038,7,False,a""" parser = all_parsers + if parser.engine == "pyarrow": + msg = "The pyarrow engine does not allow 'usecols' to be a callable" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), usecols=usecols) + return + result = parser.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(result, expected) +# ArrowKeyError: Column 'fa' in include_columns does not exist in CSV file 
@skip_pyarrow @pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) def test_incomplete_first_row(all_parsers, usecols): @@ -318,7 +380,7 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow +@skip_pyarrow # CSV parse error: Expected 3 columns, got 4 @pytest.mark.parametrize( "data,usecols,kwargs,expected", [ @@ -351,7 +413,6 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize( "usecols,kwargs,expected,msg", [ @@ -395,11 +456,20 @@ ), ], ) -def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg): +def test_raises_on_usecols_names_mismatch( + all_parsers, usecols, kwargs, expected, msg, request +): data = "a,b,c,d\n1,2,3,4\n5,6,7,8" kwargs.update(usecols=usecols) parser = all_parsers + if parser.engine == "pyarrow" and not ( + usecols is not None and expected is not None + ): + # everything but the first case + # ArrowKeyError: Column 'f' in include_columns does not exist in CSV file + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + if expected is None: with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), **kwargs) @@ -408,19 +478,25 @@ tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) -def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): +def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request): data = "a,b,c,d\n1,2,3,4\n5,6,7,8" names = ["A", "B", "C", "D"] parser = all_parsers + if parser.engine == "pyarrow": + if isinstance(usecols[0], int): + with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): + parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) + return + # "pyarrow.lib.ArrowKeyError: Column 'A' in include_columns does not exist" + pytest.skip(reason="https://github.com/apache/arrow/issues/38676") + result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) expected = DataFrame({"A": [1, 5], "C": [3, 7]}) tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.parametrize("names", [None, ["a", "b"]]) def test_usecols_indices_out_of_bounds(all_parsers, names): # GH#25623 & GH 41130; enforced in 2.0 @@ -429,25 +505,41 @@ a,b 1,2 """ - with pytest.raises(ParserError, match="Defining usecols without of bounds"): + + err = ParserError + msg = "Defining usecols with out-of-bounds" + if parser.engine == "pyarrow": + err = ValueError + msg = _msg_pyarrow_requires_names + + with pytest.raises(err, match=msg): parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0) -@skip_pyarrow def test_usecols_additional_columns(all_parsers): # GH#46997 parser = all_parsers usecols = lambda header: header.strip() in ["a", "b", "c"] + + if parser.engine == "pyarrow": + msg = "The pyarrow engine does not allow 'usecols' to be a callable" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols) + return result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols) expected = DataFrame({"a": ["x"], "b": "y"}) tm.assert_frame_equal(result, expected) -@skip_pyarrow def test_usecols_additional_columns_integer_columns(all_parsers): # GH#46997 parser = all_parsers usecols = lambda header: header.strip() in ["0", "1"] + if parser.engine == "pyarrow": + msg = "The pyarrow engine does not allow 'usecols' to be a callable" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO("0,1\nx,y,z"), 
index_col=False, usecols=usecols) + return result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols) expected = DataFrame({"0": ["x"], "1": "y"}) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_append.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_append.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_append.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_append.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,6 +11,7 @@ import pandas as pd from pandas import ( DataFrame, + Index, Series, _testing as tm, concat, @@ -32,7 +33,11 @@ with ensure_clean_store(setup_path) as store: # this is allowed by almost always don't want to do it # tables.NaturalNameWarning): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((20, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=20, freq="B"), + ) _maybe_remove(store, "df1") store.append("df1", df[:10]) store.append("df1", df[10:]) @@ -99,8 +104,10 @@ def test_append_series(setup_path): with ensure_clean_store(setup_path) as store: # basic - ss = tm.makeStringSeries() - ts = tm.makeTimeSeries() + ss = Series(range(20), dtype=np.float64, index=[f"i_{i}" for i in range(20)]) + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) ns = Series(np.arange(100)) store.append("ss", ss) @@ -278,7 +285,11 @@ def test_append_frame_column_oriented(setup_path): with ensure_clean_store(setup_path) as store: # column oriented - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) df.index = df.index._with_freq(None) # freq doesn't round-trip _maybe_remove(store, "df1") @@ -397,7 +408,14 @@ store.append("df_new", df_new) # min_itemsize on Series index (GH 11412) - df = tm.makeMixedDataFrame().set_index("C") + df = DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "D": date_range("20130101", periods=5), + } + ).set_index("C") store.append("ss", df["B"], min_itemsize={"index": 4}) tm.assert_series_equal(store.select("ss"), df["B"]) @@ -419,7 +437,11 @@ # with nans _maybe_remove(store, "df") - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) df["string"] = "foo" df.loc[df.index[1:4], "string"] = np.nan df["string2"] = "bar" @@ -479,7 +501,11 @@ def test_append_with_data_columns(setup_path): with ensure_clean_store(setup_path) as store: - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) df.iloc[0, df.columns.get_loc("B")] = 1.0 _maybe_remove(store, "df") store.append("df", df[:2], data_columns=["B"]) @@ -643,7 +669,7 @@ tm.assert_frame_equal(result, expected) path = tmp_path / "test.hdf" - df.to_hdf(path, "df", format="table") + df.to_hdf(path, key="df", format="table") result = read_hdf(path, "df", columns=["A", "B"]) expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(result, expected) @@ -651,7 +677,11 @@ def test_append_misc(setup_path): with 
ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) store.append("df", df, chunksize=1) result = store.select("df") tm.assert_frame_equal(result, df) @@ -664,7 +694,11 @@ @pytest.mark.parametrize("chunksize", [10, 200, 1000]) def test_append_misc_chunksize(setup_path, chunksize): # more chunksize in append tests - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) df["string"] = "foo" df["float322"] = 1.0 df["float322"] = df["float322"].astype("float32") @@ -708,7 +742,11 @@ # test append with invalid input to get good error messages # list in column - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) df["invalid"] = [["a"]] * len(df) assert df.dtypes["invalid"] == np.object_ msg = re.escape( @@ -725,7 +763,11 @@ store.append("df", df) # datetime with embedded nans as object - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) s = Series(datetime.datetime(2001, 1, 2), index=df.index) s = s.astype(object) s[0:5] = np.nan @@ -749,7 +791,11 @@ store.append("df", Series(np.arange(10))) # appending an incompatible table - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) store.append("df", df) df["foo"] = "foo" @@ -772,7 +818,7 @@ "dtype->bytes24,kind->string,shape->(1, 30)] " "vs current table " "[name->values_block_1,cname->values_block_1," - "dtype->datetime64,kind->datetime64,shape->None]" + "dtype->datetime64[s],kind->datetime64[s],shape->None]" ) with pytest.raises(ValueError, match=msg): store.append("df", df) @@ -826,8 +872,12 @@ def test_append_to_multiple(setup_path): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1 = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + df2 = df1.copy().rename(columns="{}_2".format) df2["foo"] = "bar" df = concat([df1, df2], axis=1) @@ -859,8 +909,16 @@ def test_append_to_multiple_dropna(setup_path): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1 = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + df2 = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ).rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan df = concat([df1, df2], axis=1) @@ -876,8 +934,12 @@ def test_append_to_multiple_dropna_false(setup_path): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1 = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + 
index=date_range("2000-01-01", periods=10, freq="B"), + ) + df2 = df1.copy().rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan df = concat([df1, df2], axis=1) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_categorical.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_categorical.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_categorical.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_categorical.py 2024-04-10 17:42:52.000000000 +0000 @@ -152,7 +152,7 @@ # We are expecting an empty DataFrame matching types of df expected = df.iloc[[], :] path = tmp_path / setup_path - df.to_hdf(path, "df", format="table", data_columns=True) + df.to_hdf(path, key="df", format="table", data_columns=True) result = read_hdf(path, "df", where="obsids=B") tm.assert_frame_equal(result, expected) @@ -163,7 +163,7 @@ # We are expecting an empty DataFrame matching types of df expected = df.iloc[[], :] path = tmp_path / setup_path - df.to_hdf(path, "df", format="table", data_columns=True) + df.to_hdf(path, key="df", format="table", data_columns=True) result = read_hdf(path, "df", where="obsids=B") tm.assert_frame_equal(result, expected) @@ -185,7 +185,7 @@ df["d"] = df.b.astype("category") expected = df path = tmp_path / setup_path - df.to_hdf(path, "df", format="table", data_columns=True) + df.to_hdf(path, key="df", format="table", data_columns=True) result = read_hdf(path, "df") tm.assert_frame_equal(result, expected) @@ -209,6 +209,6 @@ expected.col = expected.col.cat.set_categories(categorical_values) path = tmp_path / setup_path - df.to_hdf(path, "df", format="table", min_itemsize=max_widths) + df.to_hdf(path, key="df", format="table", min_itemsize=max_widths) result = read_hdf(path, where=where) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_complex.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_complex.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_complex.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_complex.py 2024-04-10 17:42:52.000000000 +0000 @@ -20,7 +20,7 @@ ) path = tmp_path / setup_path - df.to_hdf(path, "df") + df.to_hdf(path, key="df") reread = read_hdf(path, "df") tm.assert_frame_equal(df, reread) @@ -30,7 +30,7 @@ columns=list("ABCDE"), ) path = tmp_path / setup_path - df.to_hdf(path, "df") + df.to_hdf(path, key="df") reread = read_hdf(path, "df") tm.assert_frame_equal(df, reread) @@ -43,8 +43,8 @@ ) path = tmp_path / setup_path - df.to_hdf(path, "df", format="table") - reread = read_hdf(path, "df") + df.to_hdf(path, key="df", format="table") + reread = read_hdf(path, key="df") tm.assert_frame_equal(df, reread) df = DataFrame( @@ -54,7 +54,7 @@ ) path = tmp_path / setup_path - df.to_hdf(path, "df", format="table", mode="w") + df.to_hdf(path, key="df", format="table", mode="w") reread = read_hdf(path, "df") tm.assert_frame_equal(df, reread) @@ -77,7 +77,7 @@ index=list("abcd"), ) path = tmp_path / setup_path - df.to_hdf(path, "df") + df.to_hdf(path, key="df") reread = read_hdf(path, "df") tm.assert_frame_equal(df, reread) @@ -106,7 +106,7 @@ tm.assert_frame_equal(df.loc[df.A > 2], result) path = tmp_path / setup_path - df.to_hdf(path, "df", format="table") + df.to_hdf(path, key="df", format="table") reread = read_hdf(path, "df") tm.assert_frame_equal(df, reread) @@ -120,7 +120,7 @@ comps = [tm.assert_series_equal, tm.assert_frame_equal] for obj, comp in zip(objs, comps): path = tmp_path / 
setup_path - obj.to_hdf(path, "obj", format="fixed") + obj.to_hdf(path, key="obj", format="fixed") reread = read_hdf(path, "obj") comp(obj, reread) @@ -131,7 +131,7 @@ df = DataFrame({"A": s, "B": s}) path = tmp_path / setup_path - df.to_hdf(path, "obj", format="table") + df.to_hdf(path, key="obj", format="table") reread = read_hdf(path, "obj") tm.assert_frame_equal(df, reread) @@ -172,10 +172,10 @@ path = tmp_path / setup_path with pytest.raises(TypeError, match=msg): - s.to_hdf(path, "obj", format="t") + s.to_hdf(path, key="obj", format="t") path = tmp_path / setup_path - s.to_hdf(path, "obj", format="t", index=False) + s.to_hdf(path, key="obj", format="t", index=False) reread = read_hdf(path, "obj") tm.assert_series_equal(s, reread) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_errors.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_errors.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_errors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_errors.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,7 @@ CategoricalIndex, DataFrame, HDFStore, + Index, MultiIndex, _testing as tm, date_range, @@ -25,7 +26,11 @@ def test_pass_spec_to_storer(setup_path): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) with ensure_clean_store(setup_path) as store: store.put("df", df) @@ -49,7 +54,7 @@ with ensure_clean_store(setup_path) as store: store.put("frame", df1, format="table") - msg = re.escape("incompatible kind in col [integer - datetime64]") + msg = re.escape("incompatible kind in col [integer - datetime64[ns]]") with pytest.raises(TypeError, match=msg): store.put("frame", df2, format="table", append=True) @@ -60,14 +65,22 @@ # currently not supported dtypes #### for n, f in dtypes: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) df[n] = f msg = re.escape(f"[{n}] is not implemented as a table column") with pytest.raises(TypeError, match=msg): store.append(f"df1_{n}", df) # frame - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) df["obj1"] = "foo" df["obj2"] = "bar" df["datetime1"] = datetime.date(2001, 1, 2) @@ -85,7 +98,11 @@ def test_invalid_terms(tmp_path, setup_path): with ensure_clean_store(setup_path) as store: - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) df["string"] = "foo" df.loc[df.index[0:4], "string"] = "bar" @@ -115,7 +132,7 @@ columns=list("ABCD"), index=date_range("20130101", periods=10), ) - dfq.to_hdf(path, "dfq", format="table", data_columns=True) + dfq.to_hdf(path, key="dfq", format="table", data_columns=True) # check ok read_hdf(path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']") @@ -128,7 +145,7 @@ columns=list("ABCD"), index=date_range("20130101", periods=10), ) - dfq.to_hdf(path, "dfq", format="table") + dfq.to_hdf(path, key="dfq", format="table") msg = ( r"The passed where expression: A>0 or C>0\n\s*" @@ -169,7 +186,7 @@ with tm.ensure_clean(setup_path) as path: msg = r"complib only supports \[.*\] 
compression." with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", complib="foolib") + df.to_hdf(path, key="df", complib="foolib") @pytest.mark.parametrize( @@ -185,7 +202,7 @@ df = DataFrame(0, index=mi, columns=["a"]) path = tmp_path / setup_path with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): - df.to_hdf(path, "df") + df.to_hdf(path, key="df") def test_unsuppored_hdf_file_error(datapath): @@ -212,7 +229,7 @@ with pytest.raises(OSError, match=msg): read_hdf(path, "key") - df.to_hdf(path, "df") + df.to_hdf(path, key="df") store = HDFStore(path, mode="r") store.close() diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_file_handling.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_file_handling.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_file_handling.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_file_handling.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,8 +17,10 @@ from pandas import ( DataFrame, HDFStore, + Index, Series, _testing as tm, + date_range, read_hdf, ) from pandas.tests.io.pytables.common import ( @@ -35,7 +37,11 @@ @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) def test_mode(setup_path, tmp_path, mode): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) msg = r"[\S]* does not exist" path = tmp_path / setup_path @@ -64,10 +70,10 @@ # conv write if mode in ["r", "r+"]: with pytest.raises(OSError, match=msg): - df.to_hdf(path, "df", mode=mode) - df.to_hdf(path, "df", mode="w") + df.to_hdf(path, key="df", mode=mode) + df.to_hdf(path, key="df", mode="w") else: - df.to_hdf(path, "df", mode=mode) + df.to_hdf(path, key="df", mode=mode) # conv read if mode in ["w"]: @@ -84,9 +90,13 @@ def test_default_mode(tmp_path, setup_path): # read_hdf uses default mode - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) path = tmp_path / setup_path - df.to_hdf(path, "df", mode="w") + df.to_hdf(path, key="df", mode="w") result = read_hdf(path, "df") tm.assert_frame_equal(result, df) @@ -95,7 +105,9 @@ path = tmp_path / setup_path store = HDFStore(path, mode="a") - store["a"] = tm.makeTimeSeries() + store["a"] = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) msg = ( r"Re-opening the file \[[\S]*\] with mode \[a\] will delete the " @@ -116,7 +128,9 @@ assert not store.is_open store = HDFStore(path, mode="a") - store["a"] = tm.makeTimeSeries() + store["a"] = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) # reopen as read store.open("r") @@ -145,7 +159,11 @@ def test_open_args(setup_path): with tm.ensure_clean(setup_path) as path: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) # create an in memory store store = HDFStore( @@ -165,19 +183,23 @@ def test_flush(setup_path): with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() + store["a"] = Series(range(5)) store.flush() store.flush(fsync=True) def test_complibs_default_settings(tmp_path, setup_path): # GH15943 - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * 
np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) # Set complevel and check if complib is automatically set to # default value tmpfile = tmp_path / setup_path - df.to_hdf(tmpfile, "df", complevel=9) + df.to_hdf(tmpfile, key="df", complevel=9) result = read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) @@ -188,7 +210,7 @@ # Set complib and check to see if compression is disabled tmpfile = tmp_path / setup_path - df.to_hdf(tmpfile, "df", complib="zlib") + df.to_hdf(tmpfile, key="df", complib="zlib") result = read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) @@ -199,7 +221,7 @@ # Check if not setting complib or complevel results in no compression tmpfile = tmp_path / setup_path - df.to_hdf(tmpfile, "df") + df.to_hdf(tmpfile, key="df") result = read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) @@ -211,7 +233,11 @@ def test_complibs_default_settings_override(tmp_path, setup_path): # Check if file-defaults can be overridden on a per table basis - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) tmpfile = tmp_path / setup_path store = HDFStore(tmpfile) store.append("dfc", df, complevel=9, complib="blosc") @@ -236,8 +262,12 @@ # with xfail, would sometimes raise UnicodeDecodeError # invalid state byte ) -def test_complibs(tmp_path, lvl, lib): +def test_complibs(tmp_path, lvl, lib, request): # GH14478 + if PY311 and is_platform_linux() and lib == "blosc2" and lvl != 0: + request.applymarker( + pytest.mark.xfail(reason=f"Fails for {lib} on Linux and PY > 3.11") + ) df = DataFrame( np.ones((30, 4)), columns=list("ABCD"), index=np.arange(30).astype(np.str_) ) @@ -253,7 +283,7 @@ gname = f"{lvl}_{lib}" # Write and read file to see if data is consistent - df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl) + df.to_hdf(tmpfile, key=gname, complib=lib, complevel=lvl) result = read_hdf(tmpfile, gname) tm.assert_frame_equal(result, df) @@ -308,10 +338,18 @@ ser = Series(val, dtype=dtype) store = tmp_path / setup_path - ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep) + ser.to_hdf(store, key=key, format="table", encoding=enc, nan_rep=nan_rep) retr = read_hdf(store, key) - s_nan = ser.replace(nan_rep, np.nan) + # TODO:(3.0): once Categorical replace deprecation is enforced, + # we may be able to re-simplify the construction of s_nan + if dtype == "category": + if nan_rep in ser.cat.categories: + s_nan = ser.cat.remove_categories([nan_rep]) + else: + s_nan = ser + else: + s_nan = ser.replace(nan_rep, np.nan) tm.assert_series_equal(s_nan, retr) @@ -321,8 +359,12 @@ path = tmp_path / setup_path - df = tm.makeDataFrame() - df.to_hdf(path, "df", mode="w", format="table") + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) + df.to_hdf(path, key="df", mode="w", format="table") # single store = HDFStore(path) @@ -398,8 +440,12 @@ # ops on a closed store path = tmp_path / setup_path - df = tm.makeDataFrame() - df.to_hdf(path, "df", mode="w", format="table") + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) + df.to_hdf(path, key="df", mode="w", format="table") store = HDFStore(path) store.close() diff -Nru 
pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_keys.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_keys.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_keys.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_keys.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,9 +1,12 @@ +import numpy as np import pytest from pandas import ( DataFrame, HDFStore, - _testing as tm, + Index, + Series, + date_range, ) from pandas.tests.io.pytables.common import ( ensure_clean_store, @@ -15,9 +18,17 @@ def test_keys(setup_path): with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeStringSeries() - store["c"] = tm.makeDataFrame() + store["a"] = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + store["b"] = Series( + range(10), dtype="float64", index=[f"i_{i}" for i in range(10)] + ) + store["c"] = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) assert len(store) == 3 expected = {"/a", "/b", "/c"} diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_put.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_put.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_put.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_put.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,3 @@ -import datetime import re import numpy as np @@ -15,6 +14,7 @@ Series, _testing as tm, concat, + date_range, ) from pandas.tests.io.pytables.common import ( _maybe_remove, @@ -47,7 +47,11 @@ def test_api_default_format(tmp_path, setup_path): # default_format option with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) with pd.option_context("io.hdf.default_format", "fixed"): _maybe_remove(store, "df") @@ -68,28 +72,38 @@ assert store.get_storer("df").is_table path = tmp_path / setup_path - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) with pd.option_context("io.hdf.default_format", "fixed"): - df.to_hdf(path, "df") + df.to_hdf(path, key="df") with HDFStore(path) as store: assert not store.get_storer("df").is_table with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df2", append=True) + df.to_hdf(path, key="df2", append=True) with pd.option_context("io.hdf.default_format", "table"): - df.to_hdf(path, "df3") + df.to_hdf(path, key="df3") with HDFStore(path) as store: assert store.get_storer("df3").is_table - df.to_hdf(path, "df4", append=True) + df.to_hdf(path, key="df4", append=True) with HDFStore(path) as store: assert store.get_storer("df4").is_table def test_put(setup_path): with ensure_clean_store(setup_path) as store: - ts = tm.makeTimeSeries() - df = tm.makeTimeDataFrame() + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + df = DataFrame( + np.random.default_rng(2).standard_normal((20, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=20, freq="B"), + ) store["a"] = ts store["b"] = df[:10] store["foo/bar/bah"] = df[:10] @@ -145,7 +159,11 @@ def test_put_compression(setup_path): with ensure_clean_store(setup_path) as store: 
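Illustrative sketch (not part of the upstream patch): throughout these pytables tests the tm.makeTimeSeries/tm.makeDataFrame/tm.makeTimeDataFrame helpers are swapped for explicit constructors; the pattern the new tests follow is roughly:

    import numpy as np
    from pandas import DataFrame, Index, Series, date_range

    # Stands in for tm.makeTimeDataFrame(): floats on a business-day index
    # with object-dtype column labels.
    df = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD"), dtype=object),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )

    # Stands in for tm.makeTimeSeries(): a float Series on a daily DatetimeIndex.
    ts = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )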
- df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) store.put("c", df, format="table", complib="zlib") tm.assert_frame_equal(store["c"], df) @@ -158,7 +176,11 @@ @td.skip_if_windows def test_put_compression_blosc(setup_path): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) with ensure_clean_store(setup_path) as store: # can't compress if format='fixed' @@ -171,7 +193,11 @@ def test_put_mixed_type(setup_path): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) df["obj1"] = "foo" df["obj2"] = "bar" df["bool1"] = df["A"] > 0 @@ -196,32 +222,27 @@ tm.assert_frame_equal(expected, df) +@pytest.mark.parametrize("format", ["table", "fixed"]) @pytest.mark.parametrize( - "format, index", + "index", [ - ["table", tm.makeFloatIndex], - ["table", tm.makeStringIndex], - ["table", tm.makeIntIndex], - ["table", tm.makeDateIndex], - ["fixed", tm.makeFloatIndex], - ["fixed", tm.makeStringIndex], - ["fixed", tm.makeIntIndex], - ["fixed", tm.makeDateIndex], - ["table", tm.makePeriodIndex], # GH#7796 - ["fixed", tm.makePeriodIndex], + Index([str(i) for i in range(10)]), + Index(np.arange(10, dtype=float)), + Index(np.arange(10)), + date_range("2020-01-01", periods=10), + pd.period_range("2020-01-01", periods=10), ], ) -@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_store_index_types(setup_path, format, index): # GH5386 # test storing various index types with ensure_clean_store(setup_path) as store: df = DataFrame( - np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB") + np.random.default_rng(2).standard_normal((10, 2)), + columns=list("AB"), + index=index, ) - df.index = index(len(df)) - _maybe_remove(store, "df") store.put("df", df, format=format) tm.assert_frame_equal(df, store["df"]) @@ -279,15 +300,9 @@ with ensure_clean_store(setup_path) as store: def make_index(names=None): - return MultiIndex.from_tuples( - [ - (datetime.datetime(2013, 12, d), s, t) - for d in range(1, 3) - for s in range(2) - for t in range(3) - ], - names=names, - ) + dti = date_range("2013-12-01", "2013-12-02") + mi = MultiIndex.from_product([dti, range(2), range(3)], names=names) + return mi # no names _maybe_remove(store, "df") @@ -306,11 +321,11 @@ tm.assert_frame_equal(store.select("df"), df) # series - _maybe_remove(store, "s") - s = Series(np.zeros(12), index=make_index(["date", None, None])) - store.append("s", s) + _maybe_remove(store, "ser") + ser = Series(np.zeros(12), index=make_index(["date", None, None])) + store.append("ser", ser) xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"])) - tm.assert_series_equal(store.select("s"), xp) + tm.assert_series_equal(store.select("ser"), xp) # dup with column _maybe_remove(store, "df") @@ -354,6 +369,6 @@ ) path = tmp_path / setup_path - df.to_hdf(path, "df", mode="w", format=format) + df.to_hdf(path, key="df", mode="w", format=format) expected = pd.read_hdf(path, "df") tm.assert_frame_equal(df, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_pytables_missing.py 
pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_pytables_missing.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_pytables_missing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_pytables_missing.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,4 +11,4 @@ df = pd.DataFrame({"A": [1, 2]}) with pytest.raises(ImportError, match="tables"): with tm.ensure_clean("foo.h5") as path: - df.to_hdf(path, "df") + df.to_hdf(path, key="df") diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_read.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_read.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_read.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_read.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,6 +15,7 @@ Index, Series, _testing as tm, + date_range, read_hdf, ) from pandas.tests.io.pytables.common import ( @@ -32,35 +33,35 @@ # GH 25766 path = tmp_path / setup_path df = DataFrame({"a": range(2), "b": range(2)}) - df.to_hdf(path, "k1") + df.to_hdf(path, key="k1") with pytest.raises(KeyError, match="'No object named k2 in the file'"): read_hdf(path, "k2") # smoke test to test that file is properly closed after # read with KeyError before another write - df.to_hdf(path, "k2") + df.to_hdf(path, key="k2") def test_read_index_error_close_store(tmp_path, setup_path): # GH 25766 path = tmp_path / setup_path df = DataFrame({"A": [], "B": []}, index=[]) - df.to_hdf(path, "k1") + df.to_hdf(path, key="k1") with pytest.raises(IndexError, match=r"list index out of range"): read_hdf(path, "k1", stop=0) # smoke test to test that file is properly closed after # read with IndexError before another write - df.to_hdf(path, "k1") + df.to_hdf(path, key="k1") def test_read_missing_key_opened_store(tmp_path, setup_path): # GH 28699 path = tmp_path / setup_path df = DataFrame({"a": range(2), "b": range(2)}) - df.to_hdf(path, "k1") + df.to_hdf(path, key="k1") with HDFStore(path, "r") as store: with pytest.raises(KeyError, match="'No object named k2 in the file'"): @@ -72,7 +73,11 @@ def test_read_column(setup_path): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") @@ -154,7 +159,7 @@ datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r" ) as store: d2 = store["detector/readout"] - assert isinstance(d2, DataFrame) + assert isinstance(d2, DataFrame) @pytest.mark.skipif(is_platform_windows(), reason="native2 read fails oddly on windows") @@ -164,7 +169,7 @@ ) as store: str(store) d1 = store["detector"] - assert isinstance(d1, DataFrame) + assert isinstance(d1, DataFrame) def test_legacy_table_fixed_format_read_py2(datapath): @@ -174,28 +179,29 @@ datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" ) as store: result = store.select("df") - expected = DataFrame( - [[1, 2, 3, "D"]], - columns=["A", "B", "C", "D"], - index=Index(["ABC"], name="INDEX_NAME"), - ) - tm.assert_frame_equal(expected, result) + expected = DataFrame( + [[1, 2, 3, "D"]], + columns=["A", "B", "C", "D"], + index=Index(["ABC"], name="INDEX_NAME"), + ) + tm.assert_frame_equal(expected, result) def test_legacy_table_fixed_format_read_datetime_py2(datapath): # GH 31750 # legacy table with fixed format and datetime64 column written in Python 2 + expected = DataFrame( + 
[[Timestamp("2020-02-06T18:00")]], + columns=["A"], + index=Index(["date"]), + dtype="M8[ns]", + ) with ensure_clean_store( datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"), mode="r", ) as store: result = store.select("df") - expected = DataFrame( - [[Timestamp("2020-02-06T18:00")]], - columns=["A"], - index=Index(["date"]), - ) - tm.assert_frame_equal(expected, result) + tm.assert_frame_equal(expected, result) def test_legacy_table_read_py2(datapath): @@ -222,7 +228,7 @@ df = df.set_index(keys="E", append=True) path = tmp_path / setup_path - df.to_hdf(path, "df", mode="w") + df.to_hdf(path, key="df", mode="w") direct = read_hdf(path, "df") with HDFStore(path, mode="r") as store: indirect = read_hdf(store, "df") @@ -241,7 +247,7 @@ ) path = tmp_path / setup_path - df.to_hdf(path, "df", mode="w", format="table") + df.to_hdf(path, key="df", mode="w", format="table") df2 = read_hdf(path, "df") assert df2.index._data.base is None @@ -258,13 +264,13 @@ df = df.set_index(keys="E", append=True) path = tmp_path / setup_path - df.to_hdf(path, "df", mode="w", format="t") + df.to_hdf(path, key="df", mode="w", format="t") direct = read_hdf(path, "df") iterator = read_hdf(path, "df", iterator=True) with closing(iterator.store): assert isinstance(iterator, TableIterator) indirect = next(iterator.__iter__()) - tm.assert_frame_equal(direct, indirect) + tm.assert_frame_equal(direct, indirect) def test_read_nokey(tmp_path, setup_path): @@ -278,10 +284,10 @@ # Categorical dtype not supported for "fixed" format. So no need # to test with that dtype in the dataframe here. path = tmp_path / setup_path - df.to_hdf(path, "df", mode="a") + df.to_hdf(path, key="df", mode="a") reread = read_hdf(path) tm.assert_frame_equal(df, reread) - df.to_hdf(path, "df2", mode="a") + df.to_hdf(path, key="df2", mode="a") msg = "key must be provided when HDF5 file contains multiple datasets." with pytest.raises(ValueError, match=msg): @@ -293,10 +299,10 @@ df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")}) path = tmp_path / setup_path - df.to_hdf(path, "df", mode="a", format="table") + df.to_hdf(path, key="df", mode="a", format="table") reread = read_hdf(path) tm.assert_frame_equal(df, reread) - df.to_hdf(path, "df2", mode="a", format="table") + df.to_hdf(path, key="df2", mode="a", format="table") msg = "key must be provided when HDF5 file contains multiple datasets." 
with pytest.raises(ValueError, match=msg): @@ -325,8 +331,8 @@ filename = tmp_path / setup_path path_obj = Path(filename) - expected.to_hdf(path_obj, "df", mode="a") - actual = read_hdf(path_obj, "df") + expected.to_hdf(path_obj, key="df", mode="a") + actual = read_hdf(path_obj, key="df") tm.assert_frame_equal(expected, actual) @@ -344,8 +350,8 @@ filename = tmp_path / setup_path path_obj = LocalPath(filename) - expected.to_hdf(path_obj, "df", mode="a") - actual = read_hdf(path_obj, "df") + expected.to_hdf(path_obj, key="df", mode="a") + actual = read_hdf(path_obj, key="df") tm.assert_frame_equal(expected, actual) @@ -355,7 +361,7 @@ # GH 16583 # Tests that reading a Series saved to an HDF file # still works if a mode='r' argument is supplied - series = tm.makeFloatSeries() + series = Series(range(10), dtype=np.float64) path = tmp_path / setup_path series.to_hdf(path, key="data", format=format) result = read_hdf(path, key="data", mode="r") @@ -387,7 +393,7 @@ mode="r", ) as store: result = store["p"] - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_read_infer_string(tmp_path, setup_path): diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_retain_attributes.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_retain_attributes.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_retain_attributes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_retain_attributes.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,9 +1,8 @@ import pytest -from pandas._libs.tslibs import Timestamp - from pandas import ( DataFrame, + DatetimeIndex, Series, _testing as tm, date_range, @@ -18,11 +17,10 @@ pytestmark = pytest.mark.single_cpu -def test_retain_index_attributes(setup_path): +def test_retain_index_attributes(setup_path, unit): # GH 3499, losing frequency info on index recreation - df = DataFrame( - {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} - ) + dti = date_range("2000-1-1", periods=3, freq="h", unit=unit) + df = DataFrame({"A": Series(range(3), index=dti)}) with ensure_clean_store(setup_path) as store: _maybe_remove(store, "data") @@ -37,37 +35,30 @@ getattr(result, idx), attr, None ) + dti2 = date_range("2002-1-1", periods=3, freq="D", unit=unit) # try to append a table with a different frequency with tm.assert_produces_warning(errors.AttributeConflictWarning): - df2 = DataFrame( - { - "A": Series( - range(3), index=date_range("2002-1-1", periods=3, freq="D") - ) - } - ) + df2 = DataFrame({"A": Series(range(3), index=dti2)}) store.append("data", df2) assert store.get_storer("data").info["index"]["freq"] is None # this is ok _maybe_remove(store, "df2") + dti3 = DatetimeIndex( + ["2001-01-01", "2001-01-02", "2002-01-01"], dtype=f"M8[{unit}]" + ) df2 = DataFrame( { "A": Series( range(3), - index=[ - Timestamp("20010101"), - Timestamp("20010102"), - Timestamp("20020101"), - ], + index=dti3, ) } ) store.append("df2", df2) - df3 = DataFrame( - {"A": Series(range(3), index=date_range("2002-1-1", periods=3, freq="D"))} - ) + dti4 = date_range("2002-1-1", periods=3, freq="D", unit=unit) + df3 = DataFrame({"A": Series(range(3), index=dti4)}) store.append("df2", df3) @@ -76,26 +67,26 @@ with tm.assert_produces_warning(errors.AttributeConflictWarning): df = DataFrame( - {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} + {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="h"))} ) - df.to_hdf(path, "data", mode="w", append=True) + df.to_hdf(path, 
key="data", mode="w", append=True) df2 = DataFrame( {"A": Series(range(3), index=date_range("2002-1-1", periods=3, freq="D"))} ) - df2.to_hdf(path, "data", append=True) + df2.to_hdf(path, key="data", append=True) - idx = date_range("2000-1-1", periods=3, freq="H") + idx = date_range("2000-1-1", periods=3, freq="h") idx.name = "foo" df = DataFrame({"A": Series(range(3), index=idx)}) - df.to_hdf(path, "data", mode="w", append=True) + df.to_hdf(path, key="data", mode="w", append=True) - assert read_hdf(path, "data").index.name == "foo" + assert read_hdf(path, key="data").index.name == "foo" with tm.assert_produces_warning(errors.AttributeConflictWarning): - idx2 = date_range("2001-1-1", periods=3, freq="H") + idx2 = date_range("2001-1-1", periods=3, freq="h") idx2.name = "bar" df2 = DataFrame({"A": Series(range(3), index=idx2)}) - df2.to_hdf(path, "data", append=True) + df2.to_hdf(path, key="data", append=True) assert read_hdf(path, "data").index.name is None diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_round_trip.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_round_trip.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_round_trip.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_round_trip.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,10 +10,12 @@ import pandas as pd from pandas import ( DataFrame, + DatetimeIndex, Index, Series, _testing as tm, bdate_range, + date_range, read_hdf, ) from pandas.tests.io.pytables.common import ( @@ -29,28 +31,35 @@ with tm.ensure_clean() as path: def roundtrip(key, obj, **kwargs): - obj.to_hdf(path, key, **kwargs) + obj.to_hdf(path, key=key, **kwargs) return read_hdf(path, key) - o = tm.makeTimeSeries() + o = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) tm.assert_series_equal(o, roundtrip("series", o)) - o = tm.makeStringSeries() + o = Series(range(10), dtype="float64", index=[f"i_{i}" for i in range(10)]) tm.assert_series_equal(o, roundtrip("string_series", o)) - o = tm.makeDataFrame() + o = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) tm.assert_frame_equal(o, roundtrip("frame", o)) # table df = DataFrame({"A": range(5), "B": range(5)}) - df.to_hdf(path, "table", append=True) + df.to_hdf(path, key="table", append=True) result = read_hdf(path, "table", where=["index>2"]) tm.assert_frame_equal(df[df.index > 2], result) def test_long_strings(setup_path): # GH6166 - df = DataFrame({"a": tm.makeStringIndex(10)}, index=tm.makeStringIndex(10)) + data = ["a" * 50] * 10 + df = DataFrame({"a": data}, index=data) with ensure_clean_store(setup_path) as store: store.append("df", df, data_columns=["a"]) @@ -64,49 +73,49 @@ # API issue when to_hdf doesn't accept append AND format args path = tmp_path / setup_path - df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path, "df", append=True, format="table") - df.iloc[10:].to_hdf(path, "df", append=True, format="table") + df = DataFrame(range(20)) + df.iloc[:10].to_hdf(path, key="df", append=True, format="table") + df.iloc[10:].to_hdf(path, key="df", append=True, format="table") tm.assert_frame_equal(read_hdf(path, "df"), df) # append to False - df.iloc[:10].to_hdf(path, "df", append=False, format="table") - df.iloc[10:].to_hdf(path, "df", append=True, format="table") + df.iloc[:10].to_hdf(path, key="df", append=False, format="table") + df.iloc[10:].to_hdf(path, key="df", append=True, format="table") 
tm.assert_frame_equal(read_hdf(path, "df"), df) def test_api_append(tmp_path, setup_path): path = tmp_path / setup_path - df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path, "df", append=True) - df.iloc[10:].to_hdf(path, "df", append=True, format="table") + df = DataFrame(range(20)) + df.iloc[:10].to_hdf(path, key="df", append=True) + df.iloc[10:].to_hdf(path, key="df", append=True, format="table") tm.assert_frame_equal(read_hdf(path, "df"), df) # append to False - df.iloc[:10].to_hdf(path, "df", append=False, format="table") - df.iloc[10:].to_hdf(path, "df", append=True) + df.iloc[:10].to_hdf(path, key="df", append=False, format="table") + df.iloc[10:].to_hdf(path, key="df", append=True) tm.assert_frame_equal(read_hdf(path, "df"), df) def test_api_2(tmp_path, setup_path): path = tmp_path / setup_path - df = tm.makeDataFrame() - df.to_hdf(path, "df", append=False, format="fixed") + df = DataFrame(range(20)) + df.to_hdf(path, key="df", append=False, format="fixed") tm.assert_frame_equal(read_hdf(path, "df"), df) - df.to_hdf(path, "df", append=False, format="f") + df.to_hdf(path, key="df", append=False, format="f") tm.assert_frame_equal(read_hdf(path, "df"), df) - df.to_hdf(path, "df", append=False) + df.to_hdf(path, key="df", append=False) tm.assert_frame_equal(read_hdf(path, "df"), df) - df.to_hdf(path, "df") + df.to_hdf(path, key="df") tm.assert_frame_equal(read_hdf(path, "df"), df) with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() + df = DataFrame(range(20)) _maybe_remove(store, "df") store.append("df", df.iloc[:10], append=True, format="table") @@ -134,23 +143,27 @@ def test_api_invalid(tmp_path, setup_path): path = tmp_path / setup_path # Invalid. - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) msg = "Can only append to Tables" with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", append=True, format="f") + df.to_hdf(path, key="df", append=True, format="f") with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", append=True, format="fixed") + df.to_hdf(path, key="df", append=True, format="fixed") msg = r"invalid HDFStore format specified \[foo\]" with pytest.raises(TypeError, match=msg): - df.to_hdf(path, "df", append=True, format="foo") + df.to_hdf(path, key="df", append=True, format="foo") with pytest.raises(TypeError, match=msg): - df.to_hdf(path, "df", append=False, format="foo") + df.to_hdf(path, key="df", append=False, format="foo") # File path doesn't exist path = "" @@ -162,7 +175,9 @@ def test_get(setup_path): with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() + store["a"] = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) left = store.get("a") right = store["a"] tm.assert_series_equal(left, right) @@ -248,10 +263,12 @@ @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") def test_series(setup_path): - s = tm.makeStringSeries() + s = Series(range(10), dtype="float64", index=[f"i_{i}" for i in range(10)]) _check_roundtrip(s, tm.assert_series_equal, path=setup_path) - ts = tm.makeTimeSeries() + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) _check_roundtrip(ts, tm.assert_series_equal, path=setup_path) ts2 = Series(ts.index, Index(ts.index, dtype=object)) @@ -320,7 +337,11 @@ ser = Series(values, [1, 5]) _check_roundtrip(ser, func, path=setup_path) - ser = 
Series(values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)]) + dti = DatetimeIndex(["2012-01-01", "2012-01-02"], dtype="M8[ns]") + ser = Series(values, index=dti) + _check_roundtrip(ser, func, path=setup_path) + + ser.index = ser.index.as_unit("s") _check_roundtrip(ser, func, path=setup_path) @@ -331,7 +352,7 @@ _check_roundtrip(ts, tm.assert_series_equal, path=setup_path) except OverflowError: if is_platform_windows(): - request.node.add_marker( + request.applymarker( pytest.mark.xfail("known failure on some windows platforms") ) raise @@ -341,7 +362,11 @@ "compression", [False, pytest.param(True, marks=td.skip_if_windows)] ) def test_frame(compression, setup_path): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) # put in some random NAs df.iloc[0, 0] = np.nan @@ -354,7 +379,11 @@ df, tm.assert_frame_equal, path=setup_path, compression=compression ) - tdf = tm.makeTimeDataFrame() + tdf = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) _check_roundtrip( tdf, tm.assert_frame_equal, path=setup_path, compression=compression ) @@ -418,7 +447,11 @@ ) def test_store_mixed(compression, setup_path): def _make_one(): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) df["obj1"] = "foo" df["obj2"] = "bar" df["bool1"] = df["A"] > 0 @@ -511,7 +544,9 @@ def test_store_datetime_mixed(setup_path): df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}) - ts = tm.makeTimeSeries() + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) df["d"] = ts.index[:3] _check_roundtrip(df, tm.assert_frame_equal, path=setup_path) @@ -521,7 +556,7 @@ df = DataFrame({"B": [1, 2], "A": ["x", "y"]}) path = tmp_path / setup_path - df.to_hdf(path, "df", format="table") + df.to_hdf(path, key="df", format="table") other = read_hdf(path, "df") tm.assert_frame_equal(df, other) assert df.equals(other) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_select.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_select.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_select.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_select.py 2024-04-10 17:42:52.000000000 +0000 @@ -64,7 +64,7 @@ df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "A", "B", "B"] ) - df.index = date_range("20130101 9:30", periods=10, freq="T") + df.index = date_range("20130101 9:30", periods=10, freq="min") with ensure_clean_store(setup_path) as store: store.append("df", df) @@ -95,7 +95,7 @@ ], axis=1, ) - df.index = date_range("20130101 9:30", periods=10, freq="T") + df.index = date_range("20130101 9:30", periods=10, freq="min") with ensure_clean_store(setup_path) as store: store.append("df", df) @@ -130,7 +130,11 @@ def test_select(setup_path): with ensure_clean_store(setup_path) as store: # select with columns= - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) _maybe_remove(store, "df") store.append("df", df) result = store.select("df", 
columns=["A", "B"]) @@ -266,7 +270,11 @@ # test selection with comparison against numpy scalar # GH 11283 with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) expected = df[df["A"] > 0] @@ -327,7 +335,11 @@ def test_select_iterator(tmp_path, setup_path): # single table with ensure_clean_store(setup_path) as store: - df = tm.makeTimeDataFrame(500) + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) _maybe_remove(store, "df") store.append("df", df) @@ -337,33 +349,41 @@ result = concat(results) tm.assert_frame_equal(expected, result) - results = list(store.select("df", chunksize=100)) + results = list(store.select("df", chunksize=2)) assert len(results) == 5 result = concat(results) tm.assert_frame_equal(expected, result) - results = list(store.select("df", chunksize=150)) + results = list(store.select("df", chunksize=2)) result = concat(results) tm.assert_frame_equal(result, expected) path = tmp_path / setup_path - df = tm.makeTimeDataFrame(500) - df.to_hdf(path, "df_non_table") + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + df.to_hdf(path, key="df_non_table") msg = "can only use an iterator or chunksize on a table" with pytest.raises(TypeError, match=msg): - read_hdf(path, "df_non_table", chunksize=100) + read_hdf(path, "df_non_table", chunksize=2) with pytest.raises(TypeError, match=msg): read_hdf(path, "df_non_table", iterator=True) path = tmp_path / setup_path - df = tm.makeTimeDataFrame(500) - df.to_hdf(path, "df", format="table") + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + df.to_hdf(path, key="df", format="table") - results = list(read_hdf(path, "df", chunksize=100)) + results = list(read_hdf(path, "df", chunksize=2)) result = concat(results) assert len(results) == 5 @@ -373,9 +393,13 @@ # multiple with ensure_clean_store(setup_path) as store: - df1 = tm.makeTimeDataFrame(500) + df1 = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) store.append("df1", df1, data_columns=True) - df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format) + df2 = df1.copy().rename(columns="{}_2".format) df2["foo"] = "bar" store.append("df2", df2) @@ -384,7 +408,7 @@ # full selection expected = store.select_as_multiple(["df1", "df2"], selector="df1") results = list( - store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150) + store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=2) ) result = concat(results) tm.assert_frame_equal(expected, result) @@ -397,7 +421,11 @@ # no iterator with ensure_clean_store(setup_path) as store: - expected = tm.makeTimeDataFrame(100064, "S") + expected = DataFrame( + np.random.default_rng(2).standard_normal((100064, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100064, freq="s"), + ) _maybe_remove(store, "df") store.append("df", expected) @@ -428,7 +456,11 @@ # with iterator, full range with 
ensure_clean_store(setup_path) as store: - expected = tm.makeTimeDataFrame(100064, "S") + expected = DataFrame( + np.random.default_rng(2).standard_normal((100064, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100064, freq="s"), + ) _maybe_remove(store, "df") store.append("df", expected) @@ -466,7 +498,11 @@ # with iterator, non complete range with ensure_clean_store(setup_path) as store: - expected = tm.makeTimeDataFrame(100064, "S") + expected = DataFrame( + np.random.default_rng(2).standard_normal((100064, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100064, freq="s"), + ) _maybe_remove(store, "df") store.append("df", expected) @@ -496,7 +532,11 @@ # with iterator, empty where with ensure_clean_store(setup_path) as store: - expected = tm.makeTimeDataFrame(100064, "S") + expected = DataFrame( + np.random.default_rng(2).standard_normal((100064, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100064, freq="s"), + ) _maybe_remove(store, "df") store.append("df", expected) @@ -516,7 +556,11 @@ # with iterator, range limited to the first chunk with ensure_clean_store(setup_path) as store: - expected = tm.makeTimeDataFrame(100000, "S") + expected = DataFrame( + np.random.default_rng(2).standard_normal((100064, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=100064, freq="s"), + ) _maybe_remove(store, "df") store.append("df", expected) @@ -564,7 +608,11 @@ def test_frame_select(setup_path): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) with ensure_clean_store(setup_path) as store: store.put("frame", df, format="table") @@ -585,7 +633,11 @@ tm.assert_frame_equal(result, expected) # invalid terms - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) store.append("df_time", df) msg = "day is out of range for month: 0" with pytest.raises(ValueError, match=msg): @@ -600,7 +652,11 @@ def test_frame_select_complex(setup_path): # select via complex criteria - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) df["string"] = "foo" df.loc[df.index[0:4], "string"] = "bar" @@ -657,7 +713,7 @@ # use non-trivial selection criteria params = DataFrame({"A": [1, 1, 2, 2, 3]}) - params.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"]) + params.to_hdf(pp, key="df", mode="w", format="table", data_columns=["A"]) selection = read_hdf(pp, "df", where="A=[2,3]") hist = DataFrame( @@ -668,7 +724,7 @@ ), ) - hist.to_hdf(hh, "df", mode="w", format="table") + hist.to_hdf(hh, key="df", mode="w", format="table") expected = read_hdf(hh, "df", where="l1=[2, 3, 4]") @@ -713,7 +769,11 @@ def test_invalid_filtering(setup_path): # can't use more than one filter (atm) - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) with ensure_clean_store(setup_path) as store: store.put("df", df, format="table") @@ -731,7 +791,11 @@ def 
test_string_select(setup_path): # GH 2973 with ensure_clean_store(setup_path) as store: - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) # test string ==/!= df["x"] = "none" @@ -771,8 +835,12 @@ def test_select_as_multiple(setup_path): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1 = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + df2 = df1.copy().rename(columns="{}_2".format) df2["foo"] = "bar" with ensure_clean_store(setup_path) as store: @@ -832,7 +900,8 @@ tm.assert_frame_equal(result, expected) # test exception for diff rows - store.append("df3", tm.makeTimeDataFrame(nper=50)) + df3 = df1.copy().head(2) + store.append("df3", df3) msg = "all tables must have exactly the same nrows!" with pytest.raises(ValueError, match=msg): store.select_as_multiple( diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_store.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_store.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_store.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_store.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,6 +17,7 @@ Timestamp, concat, date_range, + period_range, timedelta_range, ) import pandas._testing as tm @@ -44,7 +45,11 @@ pass with tm.ensure_clean(setup_path) as path: with HDFStore(path) as tbl: - tbl["a"] = tm.makeDataFrame() + tbl["a"] = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) assert len(tbl) == 1 assert type(tbl["a"]) == DataFrame @@ -102,11 +107,23 @@ with ensure_clean_store(setup_path) as store: repr(store) store.info() - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeStringSeries() - store["c"] = tm.makeDataFrame() + store["a"] = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + store["b"] = Series( + range(10), dtype="float64", index=[f"i_{i}" for i in range(10)] + ) + store["c"] = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) df["obj1"] = "foo" df["obj2"] = "bar" df["bool1"] = df["A"] > 0 @@ -133,7 +150,11 @@ # storers with ensure_clean_store(setup_path) as store: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) store.append("df", df) s = store.get_storer("df") @@ -143,9 +164,19 @@ def test_contains(setup_path): with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeDataFrame() - store["foo/bar"] = tm.makeDataFrame() + store["a"] = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + store["b"] = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) + store["foo/bar"] = DataFrame( 
+ 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) assert "a" in store assert "b" in store assert "c" not in store @@ -158,15 +189,29 @@ with tm.assert_produces_warning( tables.NaturalNameWarning, check_stacklevel=False ): - store["node())"] = tm.makeDataFrame() + store["node())"] = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) assert "node())" in store def test_versioning(setup_path): with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeSeries() - store["b"] = tm.makeDataFrame() - df = tm.makeTimeDataFrame() + store["a"] = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + store["b"] = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) + df = DataFrame( + np.random.default_rng(2).standard_normal((20, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=20, freq="B"), + ) _maybe_remove(store, "df1") store.append("df1", df[:10]) store.append("df1", df[10:]) @@ -251,7 +296,9 @@ def test_getattr(setup_path): with ensure_clean_store(setup_path) as store: - s = tm.makeTimeSeries() + s = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) store["a"] = s # test attribute access @@ -260,7 +307,11 @@ result = getattr(store, "a") tm.assert_series_equal(result, s) - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) store["df"] = df result = store.df tm.assert_frame_equal(result, df) @@ -288,35 +339,56 @@ # # Test to make sure defaults are to not drop. # # Corresponding to Issue 9382 path = tmp_path / setup_path - df_with_missing.to_hdf(path, "df", format="table") + df_with_missing.to_hdf(path, key="df", format="table") reloaded = read_hdf(path, "df") tm.assert_frame_equal(df_with_missing, reloaded) path = tmp_path / setup_path - df_with_missing.to_hdf(path, "df", format="table", dropna=False) + df_with_missing.to_hdf(path, key="df", format="table", dropna=False) reloaded = read_hdf(path, "df") tm.assert_frame_equal(df_with_missing, reloaded) path = tmp_path / setup_path - df_with_missing.to_hdf(path, "df", format="table", dropna=True) + df_with_missing.to_hdf(path, key="df", format="table", dropna=True) reloaded = read_hdf(path, "df") tm.assert_frame_equal(df_without_missing, reloaded) +def test_keyword_deprecation(tmp_path, setup_path): + # GH 54229 + path = tmp_path / setup_path + + msg = ( + "Starting with pandas version 3.0 all arguments of to_hdf except for the " + "argument 'path_or_buf' will be keyword-only." 
+ ) + df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) + + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_hdf(path, "key") + + def test_to_hdf_with_min_itemsize(tmp_path, setup_path): path = tmp_path / setup_path # min_itemsize in index with to_hdf (GH 10381) - df = tm.makeMixedDataFrame().set_index("C") - df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6}) + df = DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "D": date_range("20130101", periods=5), + } + ).set_index("C") + df.to_hdf(path, key="ss3", format="table", min_itemsize={"index": 6}) # just make sure there is a longer string: df2 = df.copy().reset_index().assign(C="longer").set_index("C") - df2.to_hdf(path, "ss3", append=True, format="table") + df2.to_hdf(path, key="ss3", append=True, format="table") tm.assert_frame_equal(read_hdf(path, "ss3"), concat([df, df2])) # same as above, with a Series - df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6}) - df2["B"].to_hdf(path, "ss4", append=True, format="table") + df["B"].to_hdf(path, key="ss4", format="table", min_itemsize={"index": 6}) + df2["B"].to_hdf(path, key="ss4", append=True, format="table") tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) @@ -326,7 +398,7 @@ ser = Series(data, index=Index(data)) path = tmp_path / setup_path # GH 20835 - ser.to_hdf(path, "table", format=format, errors="surrogatepass") + ser.to_hdf(path, key="table", format=format, errors="surrogatepass") result = read_hdf(path, "table", errors="surrogatepass") tm.assert_series_equal(result, ser) @@ -339,7 +411,11 @@ return getattr(store.get_storer(t).table.cols, column) # data columns - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) df["string"] = "foo" df["string2"] = "bar" store.append("f", df, data_columns=["string", "string2"]) @@ -370,7 +446,11 @@ return getattr(store.get_storer(t).table.cols, column) # data columns - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) df["string"] = "foo" df["string2"] = "bar" store.append("f", df, data_columns=["string"]) @@ -408,7 +488,11 @@ def test_table_mixed_dtypes(setup_path): # frame - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) df["obj1"] = "foo" df["obj2"] = "bar" df["bool1"] = df["A"] > 0 @@ -457,8 +541,14 @@ def test_remove(setup_path): with ensure_clean_store(setup_path) as store: - ts = tm.makeTimeSeries() - df = tm.makeDataFrame() + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) store["a"] = ts store["b"] = df _maybe_remove(store, "a") @@ -518,7 +608,11 @@ def test_store_index_name(setup_path): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) 
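The explicit DataFrame construction above is the pattern used throughout these updated tests in place of the private tm.makeDataFrame / tm.makeTimeDataFrame helpers. A standalone sketch of both shapes, lifted from the hunks in this patch:

    import numpy as np
    import pandas as pd

    # 30x4 float frame with object-dtype string labels (replaces tm.makeDataFrame()).
    df = pd.DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=pd.Index(list("ABCD"), dtype=object),
        index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
    )

    # Random values over a business-day index (replaces tm.makeTimeDataFrame()).
    tdf = pd.DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=pd.Index(list("ABCD"), dtype=object),
        index=pd.date_range("2000-01-01", periods=10, freq="B"),
    )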
df.index.name = "foo" with ensure_clean_store(setup_path) as store: @@ -527,32 +621,41 @@ tm.assert_frame_equal(recons, df) +@pytest.mark.parametrize("tz", [None, "US/Pacific"]) @pytest.mark.parametrize("table_format", ["table", "fixed"]) -def test_store_index_name_numpy_str(tmp_path, table_format, setup_path): +def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz): # GH #13492 - idx = Index( - pd.to_datetime([dt.date(2000, 1, 1), dt.date(2000, 1, 2)]), + idx = DatetimeIndex( + [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], name="cols\u05d2", - ) - idx1 = Index( - pd.to_datetime([dt.date(2010, 1, 1), dt.date(2010, 1, 2)]), - name="rows\u05d0", + ).tz_localize(tz) + idx1 = ( + DatetimeIndex( + [dt.date(2010, 1, 1), dt.date(2010, 1, 2)], + name="rows\u05d0", + ) + .as_unit(unit) + .tz_localize(tz) ) df = DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) # This used to fail, returning numpy strings instead of python strings. path = tmp_path / setup_path - df.to_hdf(path, "df", format=table_format) + df.to_hdf(path, key="df", format=table_format) df2 = read_hdf(path, "df") tm.assert_frame_equal(df, df2, check_names=True) - assert type(df2.index.name) == str - assert type(df2.columns.name) == str + assert isinstance(df2.index.name, str) + assert isinstance(df2.columns.name, str) def test_store_series_name(setup_path): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) series = df["A"] with ensure_clean_store(setup_path) as store: @@ -563,15 +666,25 @@ def test_overwrite_node(setup_path): with ensure_clean_store(setup_path) as store: - store["a"] = tm.makeTimeDataFrame() - ts = tm.makeTimeSeries() + store["a"] = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) store["a"] = ts tm.assert_series_equal(store["a"], ts) def test_coordinates(setup_path): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") @@ -602,8 +715,12 @@ # multiple tables _maybe_remove(store, "df1") _maybe_remove(store, "df2") - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) + df1 = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + df2 = df1.copy().rename(columns="{}_2".format) store.append("df1", df1, data_columns=["A", "B"]) store.append("df2", df2) @@ -754,7 +871,11 @@ tm.assert_series_equal(result, expected) # sparse; not implemented - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) df.iloc[3:5, 1:3] = np.nan df.iloc[8:10, -2] = np.nan @@ -777,10 +898,14 @@ def test_path_pathlib(): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) result = tm.round_trip_pathlib( - lambda 
p: df.to_hdf(p, "df"), lambda p: read_hdf(p, "df") + lambda p: df.to_hdf(p, key="df"), lambda p: read_hdf(p, "df") ) tm.assert_frame_equal(df, result) @@ -803,11 +928,15 @@ def test_path_pathlib_hdfstore(): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) def writer(path): with HDFStore(path) as store: - df.to_hdf(store, "df") + df.to_hdf(store, key="df") def reader(path): with HDFStore(path) as store: @@ -818,19 +947,27 @@ def test_pickle_path_localpath(): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) result = tm.round_trip_pathlib( - lambda p: df.to_hdf(p, "df"), lambda p: read_hdf(p, "df") + lambda p: df.to_hdf(p, key="df"), lambda p: read_hdf(p, "df") ) tm.assert_frame_equal(df, result) def test_path_localpath_hdfstore(): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) def writer(path): with HDFStore(path) as store: - df.to_hdf(store, "df") + df.to_hdf(store, key="df") def reader(path): with HDFStore(path) as store: @@ -842,7 +979,11 @@ @pytest.mark.parametrize("propindexes", [True, False]) def test_copy(propindexes): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) with tm.ensure_clean() as path: with HDFStore(path) as st: @@ -876,9 +1017,9 @@ path = tmp_path / setup_path msg = "Columns index has to be unique for fixed format" with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", format="fixed") + df.to_hdf(path, key="df", format="fixed") - df.to_hdf(path, "df", format="table") + df.to_hdf(path, key="df", format="table") other = read_hdf(path, "df") tm.assert_frame_equal(df, other) @@ -911,7 +1052,7 @@ path = tmp_path / setup_path df.to_hdf( path, - "df", + key="df", mode="a", append=True, data_columns=data_columns, @@ -925,38 +1066,36 @@ @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -def test_to_hdf_with_object_column_names(tmp_path, setup_path): +@pytest.mark.parametrize( + "columns", + [ + Index([0, 1], dtype=np.int64), + Index([0.0, 1.0], dtype=np.float64), + date_range("2020-01-01", periods=2), + timedelta_range("1 day", periods=2), + period_range("2020-01-01", periods=2, freq="D"), + ], +) +def test_to_hdf_with_object_column_names_should_fail(tmp_path, setup_path, columns): # GH9057 + df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)), columns=columns) + path = tmp_path / setup_path + msg = "cannot have non-object label DataIndexableCol" + with pytest.raises(ValueError, match=msg): + df.to_hdf(path, key="df", format="table", data_columns=True) - types_should_fail = [ - tm.makeIntIndex, - tm.makeFloatIndex, - tm.makeDateIndex, - tm.makeTimedeltaIndex, - tm.makePeriodIndex, - ] - types_should_run = [ - tm.makeStringIndex, - tm.makeCategoricalIndex, - ] - for index in types_should_fail: - df = DataFrame( - np.random.default_rng(2).standard_normal((10, 2)), columns=index(2) - ) - path = tmp_path / setup_path - msg = "cannot have non-object label DataIndexableCol" - with pytest.raises(ValueError, match=msg): - df.to_hdf(path, "df", format="table", 
data_columns=True) - - for index in types_should_run: - df = DataFrame( - np.random.default_rng(2).standard_normal((10, 2)), columns=index(2) - ) - path = tmp_path / setup_path - df.to_hdf(path, "df", format="table", data_columns=True) - result = read_hdf(path, "df", where=f"index = [{df.index[0]}]") - assert len(result) +@pytest.mark.parametrize("dtype", [None, "category"]) +def test_to_hdf_with_object_column_names_should_run(tmp_path, setup_path, dtype): + # GH9057 + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 2)), + columns=Index(["a", "b"], dtype=dtype), + ) + path = tmp_path / setup_path + df.to_hdf(path, key="df", format="table", data_columns=True) + result = read_hdf(path, "df", where=f"index = [{df.index[0]}]") + assert len(result) def test_hdfstore_strides(setup_path): @@ -975,6 +1114,6 @@ # # Test to make sure defaults are to not drop. # # Corresponding to Issue 9382 path = tmp_path / setup_path - df.to_hdf(path, "a") + df.to_hdf(path, key="a") result = read_hdf(path, "a") tm.assert_frame_equal(expected, result) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_subclass.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_subclass.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_subclass.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_subclass.py 2024-04-10 17:42:52.000000000 +0000 @@ -24,7 +24,7 @@ expected = DataFrame(data, dtype=np.intp) path = tmp_path / "temp.h5" - sdf.to_hdf(path, "df") + sdf.to_hdf(path, key="df") result = read_hdf(path, "df") tm.assert_frame_equal(result, expected) @@ -41,7 +41,7 @@ expected = Series(data, dtype=np.intp) path = tmp_path / "temp.h5" - sser.to_hdf(path, "ser") + sser.to_hdf(path, key="ser") result = read_hdf(path, "ser") tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_time_series.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_time_series.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_time_series.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_time_series.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,18 +5,23 @@ from pandas import ( DataFrame, + DatetimeIndex, Series, _testing as tm, + date_range, + period_range, ) from pandas.tests.io.pytables.common import ensure_clean_store pytestmark = pytest.mark.single_cpu -def test_store_datetime_fractional_secs(setup_path): +@pytest.mark.parametrize("unit", ["us", "ns"]) +def test_store_datetime_fractional_secs(setup_path, unit): + dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) + dti = DatetimeIndex([dt], dtype=f"M8[{unit}]") + series = Series([0], index=dti) with ensure_clean_store(setup_path) as store: - dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) - series = Series([0], [dt]) store["a"] = series assert store["a"].index[0] == dt @@ -24,7 +29,7 @@ @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_tseries_indices_series(setup_path): with ensure_clean_store(setup_path) as store: - idx = tm.makeDateIndex(10) + idx = date_range("2020-01-01", periods=10) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) store["a"] = ser result = store["a"] @@ -33,7 +38,7 @@ assert result.index.freq == ser.index.freq tm.assert_class_equal(result.index, ser.index, obj="series index") - idx = tm.makePeriodIndex(10) + idx = period_range("2020-01-01", periods=10, freq="D") ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) store["a"] = ser result = 
store["a"] @@ -46,7 +51,7 @@ @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_tseries_indices_frame(setup_path): with ensure_clean_store(setup_path) as store: - idx = tm.makeDateIndex(10) + idx = date_range("2020-01-01", periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx ) @@ -57,7 +62,7 @@ assert result.index.freq == df.index.freq tm.assert_class_equal(result.index, df.index, obj="dataframe index") - idx = tm.makePeriodIndex(10) + idx = period_range("2020-01-01", periods=10, freq="D") df = DataFrame(np.random.default_rng(2).standard_normal((len(idx), 3)), idx) store["a"] = df result = store["a"] diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_timezones.py pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_timezones.py --- pandas-2.1.4+dfsg/pandas/tests/io/pytables/test_timezones.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/pytables/test_timezones.py 2024-04-10 17:42:52.000000000 +0000 @@ -104,7 +104,7 @@ msg = ( r"invalid info for \[values_block_1\] for \[tz\], " - r"existing_value \[(dateutil/.*)?US/Eastern\] " + r"existing_value \[(dateutil/.*)?(US/Eastern|America/New_York)\] " r"conflicts with new value \[(dateutil/.*)?EET\]" ) with pytest.raises(ValueError, match=msg): @@ -131,7 +131,7 @@ def test_append_with_timezones_as_index(setup_path, gettz): # GH#4098 example - dti = date_range("2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern")) + dti = date_range("2000-1-1", periods=3, freq="h", tz=gettz("US/Eastern")) dti = dti._with_freq(None) # freq doesn't round-trip df = DataFrame({"A": Series(range(3), index=dti)}) @@ -148,16 +148,20 @@ tm.assert_frame_equal(result, df) -def test_roundtrip_tz_aware_index(setup_path): +def test_roundtrip_tz_aware_index(setup_path, unit): # GH 17618 - time = Timestamp("2000-01-01 01:00:00", tz="US/Eastern") - df = DataFrame(data=[0], index=[time]) + ts = Timestamp("2000-01-01 01:00:00", tz="US/Eastern") + dti = DatetimeIndex([ts]).as_unit(unit) + df = DataFrame(data=[0], index=dti) with ensure_clean_store(setup_path) as store: store.put("frame", df, format="fixed") recons = store["frame"] tm.assert_frame_equal(recons, df) - assert recons.index[0]._value == 946706400000000000 + + value = recons.index[0]._value + denom = {"ns": 1, "us": 1000, "ms": 10**6, "s": 10**9}[unit] + assert value == 946706400000000000 // denom def test_store_index_name_with_tz(setup_path): @@ -332,7 +336,7 @@ "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", - freq="H", + freq="h", ambiguous="infer", ) times = times._with_freq(None) # freq doesn't round-trip @@ -365,7 +369,7 @@ # Python 3. 
# # GH26443 - index = [Timestamp("2019-01-01T18:00").tz_localize("America/New_York")] + index = DatetimeIndex(["2019-01-01T18:00"], dtype="M8[ns, America/New_York]") expected = DataFrame({"data": 123}, index=index) with ensure_clean_store( datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/sas/test_sas7bdat.py pandas-2.2.2+dfsg/pandas/tests/io/sas/test_sas7bdat.py --- pandas-2.1.4+dfsg/pandas/tests/io/sas/test_sas7bdat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/sas/test_sas7bdat.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,16 +4,18 @@ import os from pathlib import Path -import dateutil.parser import numpy as np import pytest +from pandas.compat import IS64 from pandas.errors import EmptyDataError import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm +from pandas.io.sas.sas7bdat import SAS7BDATReader + @pytest.fixture def dirpath(datapath): @@ -27,9 +29,9 @@ df = pd.read_csv(fname) epoch = datetime(1960, 1, 1) t1 = pd.to_timedelta(df["Column4"], unit="d") - df["Column4"] = epoch + t1 + df["Column4"] = (epoch + t1).astype("M8[s]") t2 = pd.to_timedelta(df["Column12"], unit="d") - df["Column12"] = epoch + t2 + df["Column12"] = (epoch + t2).astype("M8[s]") for k in range(df.shape[1]): col = df.iloc[:, k] if col.dtype == np.int64: @@ -41,15 +43,15 @@ class TestSAS7BDAT: @pytest.mark.slow def test_from_file(self, dirpath, data_test_ix): - df0, test_ix = data_test_ix + expected, test_ix = data_test_ix for k in test_ix: fname = os.path.join(dirpath, f"test{k}.sas7bdat") df = pd.read_sas(fname, encoding="utf-8") - tm.assert_frame_equal(df, df0) + tm.assert_frame_equal(df, expected) @pytest.mark.slow def test_from_buffer(self, dirpath, data_test_ix): - df0, test_ix = data_test_ix + expected, test_ix = data_test_ix for k in test_ix: fname = os.path.join(dirpath, f"test{k}.sas7bdat") with open(fname, "rb") as f: @@ -59,37 +61,37 @@ buf, format="sas7bdat", iterator=True, encoding="utf-8" ) as rdr: df = rdr.read() - tm.assert_frame_equal(df, df0, check_exact=False) + tm.assert_frame_equal(df, expected) @pytest.mark.slow def test_from_iterator(self, dirpath, data_test_ix): - df0, test_ix = data_test_ix + expected, test_ix = data_test_ix for k in test_ix: fname = os.path.join(dirpath, f"test{k}.sas7bdat") with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr: df = rdr.read(2) - tm.assert_frame_equal(df, df0.iloc[0:2, :]) + tm.assert_frame_equal(df, expected.iloc[0:2, :]) df = rdr.read(3) - tm.assert_frame_equal(df, df0.iloc[2:5, :]) + tm.assert_frame_equal(df, expected.iloc[2:5, :]) @pytest.mark.slow def test_path_pathlib(self, dirpath, data_test_ix): - df0, test_ix = data_test_ix + expected, test_ix = data_test_ix for k in test_ix: fname = Path(os.path.join(dirpath, f"test{k}.sas7bdat")) df = pd.read_sas(fname, encoding="utf-8") - tm.assert_frame_equal(df, df0) + tm.assert_frame_equal(df, expected) @td.skip_if_no("py.path") @pytest.mark.slow def test_path_localpath(self, dirpath, data_test_ix): from py.path import local as LocalPath - df0, test_ix = data_test_ix + expected, test_ix = data_test_ix for k in test_ix: fname = LocalPath(os.path.join(dirpath, f"test{k}.sas7bdat")) df = pd.read_sas(fname, encoding="utf-8") - tm.assert_frame_equal(df, df0) + tm.assert_frame_equal(df, expected) @pytest.mark.slow @pytest.mark.parametrize("chunksize", (3, 5, 10, 11)) @@ -127,8 +129,6 @@ pass tm.assert_frame_equal(df1, df2) - from pandas.io.sas.sas7bdat import SAS7BDATReader - with 
contextlib.closing(SAS7BDATReader(fname, convert_header_text=False)) as rdr: df3 = rdr.read() for x, y in zip(df1.columns, df3.columns): @@ -157,6 +157,8 @@ df0 = pd.read_csv(fname, parse_dates=["MONTH"]) vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"] df0[vn] = df0[vn].astype(np.float64) + + df0["MONTH"] = df0["MONTH"].astype("M8[s]") tm.assert_frame_equal(df, df0) @@ -175,7 +177,7 @@ fname = datapath("io", "sas", "data", "airline.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) - tm.assert_frame_equal(df, df0, check_exact=False) + tm.assert_frame_equal(df, df0) def test_date_time(datapath): @@ -187,7 +189,23 @@ fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"] ) # GH 19732: Timestamps imported from sas will incur floating point errors + # See GH#56014 for discussion of the correct "expected" results + # We are really just testing that we are "close". This only seems to be + # an issue near the implementation bounds. + df[df.columns[3]] = df.iloc[:, 3].dt.round("us") + df0["Date1"] = df0["Date1"].astype("M8[s]") + df0["Date2"] = df0["Date2"].astype("M8[s]") + df0["DateTime"] = df0["DateTime"].astype("M8[ms]") + df0["Taiw"] = df0["Taiw"].astype("M8[s]") + + res = df0["DateTimeHi"].astype("M8[us]").dt.round("ms") + df0["DateTimeHi"] = res.astype("M8[ms]") + + if not IS64: + # No good reason for this, just what we get on the CI + df0.loc[0, "DateTimeHi"] += np.timedelta64(1, "ms") + df0.loc[[2, 3], "DateTimeHi"] -= np.timedelta64(1, "ms") tm.assert_frame_equal(df, df0) @@ -247,48 +265,42 @@ pd.read_sas(fname) -def round_datetime_to_ms(ts): - if isinstance(ts, datetime): - return ts.replace(microsecond=int(round(ts.microsecond, -3) / 1000) * 1000) - elif isinstance(ts, str): - _ts = dateutil.parser.parse(timestr=ts) - return _ts.replace(microsecond=int(round(_ts.microsecond, -3) / 1000) * 1000) - else: - return ts - - def test_max_sas_date(datapath): # GH 20927 # NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999 # but this is read as 29DEC9999:23:59:59.998993 by a buggy # sas7bdat module + # See also GH#56014 for discussion of the correct "expected" results. 
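The casts to "M8[s]" and "M8[ms]" align the expected frames with the non-nanosecond resolutions that read_sas now produces for dates and datetimes. A small illustration of those casts, assuming pandas 2.x where Series support second- and millisecond-resolution datetime64:

    import pandas as pd

    ser = pd.to_datetime(pd.Series(["2019-08-01 23:59:59.999"]))
    print(ser.dtype)               # datetime64[ns] by default
    ser_ms = ser.astype("M8[ms]")  # millisecond resolution, as in the expected frames
    print(ser_ms.dtype)            # datetime64[ms]

    dates = pd.to_datetime(pd.Series(["2019-08-01"])).astype("M8[s]")
    print(dates.dtype)             # datetime64[s]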
fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat") df = pd.read_sas(fname, encoding="iso-8859-1") - # SAS likes to left pad strings with spaces - lstrip before comparing - df = df.map(lambda x: x.lstrip() if isinstance(x, str) else x) - # GH 19732: Timestamps imported from sas will incur floating point errors - try: - df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") - except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: - df = df.map(round_datetime_to_ms) - except AttributeError: - df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms) - # if there are any date/times > pandas.Timestamp.max then ALL in that chunk - # are returned as datetime.datetime expected = pd.DataFrame( { "text": ["max", "normal"], "dt_as_float": [253717747199.999, 1880323199.999], - "dt_as_dt": [ - datetime(9999, 12, 29, 23, 59, 59, 999000), - datetime(2019, 8, 1, 23, 59, 59, 999000), - ], + "dt_as_dt": np.array( + [ + datetime(9999, 12, 29, 23, 59, 59, 999000), + datetime(2019, 8, 1, 23, 59, 59, 999000), + ], + dtype="M8[ms]", + ), "date_as_float": [2936547.0, 21762.0], - "date_as_date": [datetime(9999, 12, 29), datetime(2019, 8, 1)], + "date_as_date": np.array( + [ + datetime(9999, 12, 29), + datetime(2019, 8, 1), + ], + dtype="M8[s]", + ), }, columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"], ) + + if not IS64: + # No good reason for this, just what we get on the CI + expected.loc[:, "dt_as_dt"] -= np.timedelta64(1, "ms") + tm.assert_frame_equal(df, expected) @@ -301,15 +313,7 @@ fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat") results = [] for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1): - # SAS likes to left pad strings with spaces - lstrip before comparing - df = df.map(lambda x: x.lstrip() if isinstance(x, str) else x) # GH 19732: Timestamps imported from sas will incur floating point errors - try: - df["dt_as_dt"] = df["dt_as_dt"].dt.round("us") - except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime: - df = df.map(round_datetime_to_ms) - except AttributeError: - df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms) df.reset_index(inplace=True, drop=True) results.append(df) expected = [ @@ -317,9 +321,11 @@ { "text": ["max"], "dt_as_float": [253717747199.999], - "dt_as_dt": [datetime(9999, 12, 29, 23, 59, 59, 999000)], + "dt_as_dt": np.array( + [datetime(9999, 12, 29, 23, 59, 59, 999000)], dtype="M8[ms]" + ), "date_as_float": [2936547.0], - "date_as_date": [datetime(9999, 12, 29)], + "date_as_date": np.array([datetime(9999, 12, 29)], dtype="M8[s]"), }, columns=col_order, ), @@ -327,15 +333,20 @@ { "text": ["normal"], "dt_as_float": [1880323199.999], - "dt_as_dt": [np.datetime64("2019-08-01 23:59:59.999")], + "dt_as_dt": np.array(["2019-08-01 23:59:59.999"], dtype="M8[ms]"), "date_as_float": [21762.0], - "date_as_date": [np.datetime64("2019-08-01")], + "date_as_date": np.array(["2019-08-01"], dtype="M8[s]"), }, columns=col_order, ), ] - for result, expected in zip(results, expected): - tm.assert_frame_equal(result, expected) + if not IS64: + # No good reason for this, just what we get on the CI + expected[0].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms") + expected[1].loc[0, "dt_as_dt"] -= np.timedelta64(1, "ms") + + tm.assert_frame_equal(results[0], expected[0]) + tm.assert_frame_equal(results[1], expected[1]) def test_null_date(datapath): @@ -344,16 +355,25 @@ expected = pd.DataFrame( { - "datecol": [ - datetime(9999, 12, 29), - pd.NaT, - ], - "datetimecol": [ - datetime(9999, 12, 29, 23, 59, 59, 998993), - pd.NaT, - ], + 
"datecol": np.array( + [ + datetime(9999, 12, 29), + np.datetime64("NaT"), + ], + dtype="M8[s]", + ), + "datetimecol": np.array( + [ + datetime(9999, 12, 29, 23, 59, 59, 999000), + np.datetime64("NaT"), + ], + dtype="M8[ms]", + ), }, ) + if not IS64: + # No good reason for this, just what we get on the CI + expected.loc[0, "datetimecol"] -= np.timedelta64(1, "ms") tm.assert_frame_equal(df, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_clipboard.py pandas-2.2.2+dfsg/pandas/tests/io/test_clipboard.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_clipboard.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_clipboard.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,13 +1,8 @@ -import os from textwrap import dedent import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_mac, -) from pandas.errors import ( PyperclipException, PyperclipWindowsException, @@ -30,8 +25,7 @@ from pandas.io.clipboard import ( CheckedCall, _stringifyText, - clipboard_get, - clipboard_set, + init_qt_clipboard, ) @@ -70,32 +64,21 @@ {"a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"], "b": ["abc", "def"]} ) elif data_type == "string": - return tm.makeCustomDataframe( - 5, 3, c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None] + return DataFrame( + np.array([f"i-{i}" for i in range(15)]).reshape(5, 3), columns=list("abc") ) elif data_type == "long": max_rows = get_option("display.max_rows") - return tm.makeCustomDataframe( - max_rows + 1, - 3, - data_gen_f=lambda *args: np.random.default_rng(2).integers(2), - c_idx_type="s", - r_idx_type="i", - c_idx_names=[None], - r_idx_names=[None], + return DataFrame( + np.random.default_rng(2).integers(0, 10, size=(max_rows + 1, 3)), + columns=list("abc"), ) elif data_type == "nonascii": return DataFrame({"en": "in English".split(), "es": "en español".split()}) elif data_type == "colwidth": _cw = get_option("display.max_colwidth") + 1 - return tm.makeCustomDataframe( - 5, - 3, - data_gen_f=lambda *args: "x" * _cw, - c_idx_type="s", - r_idx_type="i", - c_idx_names=[None], - r_idx_names=[None], + return DataFrame( + np.array(["x" * _cw for _ in range(15)]).reshape(5, 3), columns=list("abc") ) elif data_type == "mixed": return DataFrame( @@ -106,24 +89,10 @@ } ) elif data_type == "float": - return tm.makeCustomDataframe( - 5, - 3, - data_gen_f=lambda r, c: float(r) + 0.01, - c_idx_type="s", - r_idx_type="i", - c_idx_names=[None], - r_idx_names=[None], - ) + return DataFrame(np.random.default_rng(2).random((5, 3)), columns=list("abc")) elif data_type == "int": - return tm.makeCustomDataframe( - 5, - 3, - data_gen_f=lambda *args: np.random.default_rng(2).integers(2), - c_idx_type="s", - r_idx_type="i", - c_idx_names=[None], - r_idx_names=[None], + return DataFrame( + np.random.default_rng(2).integers(0, 10, (5, 3)), columns=list("abc") ) else: raise ValueError @@ -205,60 +174,34 @@ @pytest.fixture -def mock_clipboard(monkeypatch, request): - """Fixture mocking clipboard IO. - - This mocks pandas.io.clipboard.clipboard_get and - pandas.io.clipboard.clipboard_set. - - This uses a local dict for storing data. The dictionary - key used is the test ID, available with ``request.node.name``. - - This returns the local dictionary, for direct manipulation by - tests. 
- """ - # our local clipboard for tests - _mock_data = {} - - def _mock_set(data): - _mock_data[request.node.name] = data - - def _mock_get(): - return _mock_data[request.node.name] - - monkeypatch.setattr("pandas.io.clipboard.clipboard_set", _mock_set) - monkeypatch.setattr("pandas.io.clipboard.clipboard_get", _mock_get) - - yield _mock_data - +def set_pyqt_clipboard(monkeypatch): + qt_cut, qt_paste = init_qt_clipboard() + with monkeypatch.context() as m: + m.setattr(pd.io.clipboard, "clipboard_set", qt_cut) + m.setattr(pd.io.clipboard, "clipboard_get", qt_paste) + yield -@pytest.mark.clipboard -def test_mock_clipboard(mock_clipboard): - import pandas.io.clipboard - pandas.io.clipboard.clipboard_set("abc") - assert "abc" in set(mock_clipboard.values()) - result = pandas.io.clipboard.clipboard_get() - assert result == "abc" +@pytest.fixture +def clipboard(qapp): + clip = qapp.clipboard() + yield clip + clip.clear() @pytest.mark.single_cpu @pytest.mark.clipboard -@pytest.mark.usefixtures("mock_clipboard") +@pytest.mark.usefixtures("set_pyqt_clipboard") +@pytest.mark.usefixtures("clipboard") class TestClipboard: - def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): - data.to_clipboard(excel=excel, sep=sep, encoding=encoding) - result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding) - tm.assert_frame_equal(data, result) - # Test that default arguments copy as tab delimited - def test_round_trip_frame(self, df): - self.check_round_trip_frame(df) - # Test that explicit delimiters are respected - @pytest.mark.parametrize("sep", ["\t", ",", "|"]) - def test_round_trip_frame_sep(self, df, sep): - self.check_round_trip_frame(df, sep=sep) + @pytest.mark.parametrize("sep", [None, "\t", ",", "|"]) + @pytest.mark.parametrize("encoding", [None, "UTF-8", "utf-8", "utf8"]) + def test_round_trip_frame_sep(self, df, sep, encoding): + df.to_clipboard(excel=None, sep=sep, encoding=encoding) + result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding) + tm.assert_frame_equal(df, result) # Test white space separator def test_round_trip_frame_string(self, df): @@ -286,22 +229,21 @@ # delimited and excel="True" @pytest.mark.parametrize("sep", ["\t", None, "default"]) @pytest.mark.parametrize("excel", [True, None, "default"]) - def test_clipboard_copy_tabs_default(self, sep, excel, df, request, mock_clipboard): + def test_clipboard_copy_tabs_default(self, sep, excel, df, clipboard): kwargs = build_kwargs(sep, excel) df.to_clipboard(**kwargs) - assert mock_clipboard[request.node.name] == df.to_csv(sep="\t") + assert clipboard.text() == df.to_csv(sep="\t") # Tests reading of white space separated tables @pytest.mark.parametrize("sep", [None, "default"]) - @pytest.mark.parametrize("excel", [False]) - def test_clipboard_copy_strings(self, sep, excel, df): - kwargs = build_kwargs(sep, excel) + def test_clipboard_copy_strings(self, sep, df): + kwargs = build_kwargs(sep, False) df.to_clipboard(**kwargs) result = read_clipboard(sep=r"\s+") assert result.to_string() == df.to_string() assert df.shape == result.shape - def test_read_clipboard_infer_excel(self, request, mock_clipboard): + def test_read_clipboard_infer_excel(self, clipboard): # gh-19010: avoid warnings clip_kwargs = {"engine": "python"} @@ -312,7 +254,7 @@ 4\tHarry Carney """.strip() ) - mock_clipboard[request.node.name] = text + clipboard.setText(text) df = read_clipboard(**clip_kwargs) # excel data is parsed correctly @@ -326,7 +268,7 @@ 3 4 """.strip() ) - mock_clipboard[request.node.name] = text + 
clipboard.setText(text) res = read_clipboard(**clip_kwargs) text = dedent( @@ -336,16 +278,16 @@ 3 4 """.strip() ) - mock_clipboard[request.node.name] = text + clipboard.setText(text) exp = read_clipboard(**clip_kwargs) tm.assert_frame_equal(res, exp) - def test_infer_excel_with_nulls(self, request, mock_clipboard): + def test_infer_excel_with_nulls(self, clipboard): # GH41108 text = "col1\tcol2\n1\tred\n\tblue\n2\tgreen" - mock_clipboard[request.node.name] = text + clipboard.setText(text) df = read_clipboard() df_expected = DataFrame( data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]} @@ -376,10 +318,10 @@ ), ], ) - def test_infer_excel_with_multiindex(self, request, mock_clipboard, multiindex): + def test_infer_excel_with_multiindex(self, clipboard, multiindex): # GH41108 - mock_clipboard[request.node.name] = multiindex[0] + clipboard.setText(multiindex[0]) df = read_clipboard() df_expected = DataFrame( data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]}, @@ -397,26 +339,17 @@ with pytest.raises(NotImplementedError, match=msg): read_clipboard(encoding="ascii") - @pytest.mark.parametrize("enc", ["UTF-8", "utf-8", "utf8"]) - def test_round_trip_valid_encodings(self, enc, df): - self.check_round_trip_frame(df, encoding=enc) - - @pytest.mark.single_cpu @pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑`...", "abcd..."]) - @pytest.mark.xfail( - (os.environ.get("DISPLAY") is None and not is_platform_mac()) - or is_ci_environment(), - reason="Cannot pass if a headless system is not put in place with Xvfb", - strict=not is_ci_environment(), # Flaky failures in the CI - ) def test_raw_roundtrip(self, data): # PR #25040 wide unicode wasn't copied correctly on PY3 on windows - clipboard_set(data) - assert data == clipboard_get() + df = DataFrame({"data": [data]}) + df.to_clipboard() + result = read_clipboard() + tm.assert_frame_equal(df, result) @pytest.mark.parametrize("engine", ["c", "python"]) def test_read_clipboard_dtype_backend( - self, request, mock_clipboard, string_storage, dtype_backend, engine + self, clipboard, string_storage, dtype_backend, engine ): # GH#50502 if string_storage == "pyarrow" or dtype_backend == "pyarrow": @@ -426,6 +359,13 @@ string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow" and engine != "c": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["x", "y"])) + string_array_na = ArrowExtensionArray(pa.array(["x", None])) + else: string_array = ArrowStringArray(pa.array(["x", "y"])) string_array_na = ArrowStringArray(pa.array(["x", None])) @@ -433,7 +373,7 @@ text = """a,b,c,d,e,f,g,h,i x,1,4.0,x,2,4.0,,True,False y,2,5.0,,,,,False,""" - mock_clipboard[request.node.name] = text + clipboard.setText(text) with pd.option_context("mode.string_storage", string_storage): result = read_clipboard(sep=",", dtype_backend=dtype_backend, engine=engine) @@ -471,3 +411,13 @@ ) with pytest.raises(ValueError, match=msg): read_clipboard(dtype_backend="numpy") + + def test_to_clipboard_pos_args_deprecation(self): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_clipboard " + r"will be keyword-only." 
+ ) + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_clipboard(True, None) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_common.py pandas-2.2.2+dfsg/pandas/tests/io/test_common.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_common.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_common.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,6 +15,7 @@ import pickle import tempfile +import numpy as np import pytest from pandas.compat import is_platform_windows @@ -25,6 +26,10 @@ import pandas.io.common as icom +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + class CustomFSPath: """For testing fspath on unknown objects""" @@ -436,7 +441,11 @@ def test_unknown_engine(self): with tm.ensure_clean() as path: - df = tm.makeDataFrame() + df = pd.DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) df.to_csv(path) with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") @@ -448,7 +457,11 @@ GH 35058 """ with tm.ensure_clean() as path: - df = tm.makeDataFrame() + df = pd.DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) df.to_csv(path, mode="w+b") tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) @@ -462,7 +475,11 @@ GH 35681 """ - df = tm.makeDataFrame() + df = pd.DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) with tm.ensure_clean() as path: with tm.assert_produces_warning(UnicodeWarning): df.to_csv(path, compression=compression_, encoding=encoding) @@ -492,7 +509,11 @@ @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): # GH39247 - expected = tm.makeDataFrame() + expected = pd.DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) with tm.ensure_clean() as path: with codecs.open(path, mode="w", encoding=encoding) as handle: getattr(expected, f"to_{format}")(handle) @@ -506,7 +527,11 @@ def test_codecs_get_writer_reader(): # GH39247 - expected = tm.makeDataFrame() + expected = pd.DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) with tm.ensure_clean() as path: with open(path, "wb") as handle: with codecs.getwriter("utf-8")(handle) as encoded: @@ -528,7 +553,11 @@ # GH39247; this test makes sure that if a user provides mode="*t" or "*b", # it is used. 
In the case of this test it leads to an error as intentionally the # wrong mode is requested - expected = tm.makeDataFrame() + expected = pd.DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) with io_class() as buffer: with pytest.raises(TypeError, match=msg): expected.to_csv(buffer, mode=f"w{mode}") diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_compression.py pandas-2.2.2+dfsg/pandas/tests/io/test_compression.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_compression.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_compression.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,7 @@ import time import zipfile +import numpy as np import pytest from pandas.compat import is_platform_windows @@ -142,7 +143,11 @@ GH22555 """ - df = tm.makeDataFrame() + df = pd.DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) # with a file with tm.ensure_clean() as path: @@ -170,7 +175,11 @@ GH 28103 """ - df = tm.makeDataFrame() + df = pd.DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) compression_options = {"method": "gzip", "mtime": 1} # test for filename @@ -189,7 +198,11 @@ GH 28103 """ - df = tm.makeDataFrame() + df = pd.DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) compression_options = {"method": "gzip", "mtime": 1} # test for file object diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_feather.py pandas-2.2.2+dfsg/pandas/tests/io/test_feather.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_feather.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_feather.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,7 +11,11 @@ from pandas.io.feather_format import read_feather, to_feather # isort:skip -pyarrow = pytest.importorskip("pyarrow") +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + +pa = pytest.importorskip("pyarrow") @pytest.mark.single_cpu @@ -128,17 +132,29 @@ self.check_round_trip(df, use_threads=False) def test_path_pathlib(self): - df = tm.makeDataFrame().reset_index() + df = pd.DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ).reset_index() result = tm.round_trip_pathlib(df.to_feather, read_feather) tm.assert_frame_equal(df, result) def test_path_localpath(self): - df = tm.makeDataFrame().reset_index() + df = pd.DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ).reset_index() result = tm.round_trip_localpath(df.to_feather, read_feather) tm.assert_frame_equal(df, result) def test_passthrough_keywords(self): - df = tm.makeDataFrame().reset_index() + df = pd.DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ).reset_index() self.check_round_trip(df, write_kwargs={"version": 1}) @pytest.mark.network @@ -153,7 +169,6 @@ def test_read_feather_dtype_backend(self, string_storage, 
dtype_backend): # GH#50765 - pa = pytest.importorskip("pyarrow") df = pd.DataFrame( { "a": pd.Series([1, np.nan, 3], dtype="Int64"), @@ -171,6 +186,12 @@ string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) + else: string_array = ArrowStringArray(pa.array(["a", "b", "c"])) string_array_na = ArrowStringArray(pa.array(["a", "b", None])) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_fsspec.py pandas-2.2.2+dfsg/pandas/tests/io/test_fsspec.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_fsspec.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_fsspec.py 2024-04-10 17:42:52.000000000 +0000 @@ -18,6 +18,32 @@ import pandas._testing as tm from pandas.util import _test_decorators as td +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + +@pytest.fixture +def fsspectest(): + pytest.importorskip("fsspec") + from fsspec import register_implementation + from fsspec.implementations.memory import MemoryFileSystem + from fsspec.registry import _registry as registry + + class TestMemoryFS(MemoryFileSystem): + protocol = "testmem" + test = [None] + + def __init__(self, **kwargs) -> None: + self.test[0] = kwargs.pop("test", None) + super().__init__(**kwargs) + + register_implementation("testmem", TestMemoryFS, clobber=True) + yield TestMemoryFS() + registry.pop("testmem", None) + TestMemoryFS.test[0] = None + TestMemoryFS.store.clear() + @pytest.fixture def df1(): diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_gbq.py pandas-2.2.2+dfsg/pandas/tests/io/test_gbq.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_gbq.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_gbq.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,14 @@ +import pandas as pd +import pandas._testing as tm + + +def test_read_gbq_deprecated(): + with tm.assert_produces_warning(FutureWarning): + with tm.external_error_raised(Exception): + pd.read_gbq("fake") + + +def test_to_gbq_deprecated(): + with tm.assert_produces_warning(FutureWarning): + with tm.external_error_raised(Exception): + pd.DataFrame(range(1)).to_gbq("fake") diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_gcs.py pandas-2.2.2+dfsg/pandas/tests/io/test_gcs.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_gcs.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_gcs.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,7 @@ from pandas import ( DataFrame, + Index, date_range, read_csv, read_excel, @@ -18,6 +19,10 @@ import pandas._testing as tm from pandas.util import _test_decorators as td +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + @pytest.fixture def gcs_buffer(): @@ -141,7 +146,11 @@ GH 35677 (to_csv, compression), GH 26124 (to_csv, encoding), and GH 32392 (read_csv, encoding) """ - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) # reference of compressed and encoded file compression = {"method": compression_only} diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_html.py pandas-2.2.2+dfsg/pandas/tests/io/test_html.py --- 
pandas-2.1.4+dfsg/pandas/tests/io/test_html.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_html.py 2024-04-10 17:42:52.000000000 +0000 @@ -99,15 +99,18 @@ assert_framelist_equal(dfs_lxml, dfs_bs4) -@pytest.mark.parametrize( - "flavor", - [ +@pytest.fixture( + params=[ pytest.param("bs4", marks=[td.skip_if_no("bs4"), td.skip_if_no("html5lib")]), pytest.param("lxml", marks=td.skip_if_no("lxml")), ], ) +def flavor_read_html(request): + return partial(read_html, flavor=request.param) + + class TestReadHtml: - def test_literal_html_deprecation(self): + def test_literal_html_deprecation(self, flavor_read_html): # GH 53785 msg = ( "Passing literal html to 'read_html' is deprecated and " @@ -116,7 +119,7 @@ ) with tm.assert_produces_warning(FutureWarning, match=msg): - self.read_html( + flavor_read_html( """ @@ -147,30 +150,22 @@ def banklist_data(self, datapath): return datapath("io", "data", "html", "banklist.html") - @pytest.fixture(autouse=True) - def set_defaults(self, flavor): - self.read_html = partial(read_html, flavor=flavor) - yield - - def test_to_html_compat(self): + def test_to_html_compat(self, flavor_read_html): df = ( - tm.makeCustomDataframe( - 4, - 3, - data_gen_f=lambda *args: np.random.default_rng(2).random(), - c_idx_names=False, - r_idx_names=False, + DataFrame( + np.random.default_rng(2).random((4, 3)), + columns=pd.Index(list("abc"), dtype=object), ) # pylint: disable-next=consider-using-f-string .map("{:.3f}".format).astype(float) ) out = df.to_html() - res = self.read_html(StringIO(out), attrs={"class": "dataframe"}, index_col=0)[ - 0 - ] + res = flavor_read_html( + StringIO(out), attrs={"class": "dataframe"}, index_col=0 + )[0] tm.assert_frame_equal(res, df) - def test_dtype_backend(self, string_storage, dtype_backend): + def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html): # GH#50286 df = DataFrame( { @@ -188,7 +183,12 @@ if string_storage == "python": string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["a", "b", "c"])) @@ -196,7 +196,7 @@ out = df.to_html(index=False) with pd.option_context("mode.string_storage", string_storage): - result = self.read_html(StringIO(out), dtype_backend=dtype_backend)[0] + result = flavor_read_html(StringIO(out), dtype_backend=dtype_backend)[0] expected = DataFrame( { @@ -227,16 +227,16 @@ @pytest.mark.network @pytest.mark.single_cpu - def test_banklist_url(self, httpserver, banklist_data): + def test_banklist_url(self, httpserver, banklist_data, flavor_read_html): with open(banklist_data, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) - df1 = self.read_html( + df1 = flavor_read_html( # lxml cannot find attrs leave out for now httpserver.url, match="First Federal Bank of Florida", # attrs={"class": "dataTable"} ) # lxml cannot find attrs leave out for now - df2 = self.read_html( + df2 = flavor_read_html( httpserver.url, match="Metcalf Bank", ) # attrs={"class": "dataTable"}) @@ -245,165 +245,169 @@ @pytest.mark.network @pytest.mark.single_cpu - def test_spam_url(self, httpserver, spam_data): + def test_spam_url(self, httpserver, 
spam_data, flavor_read_html): with open(spam_data, encoding="utf-8") as f: httpserver.serve_content(content=f.read()) - df1 = self.read_html(httpserver.url, match=".*Water.*") - df2 = self.read_html(httpserver.url, match="Unit") + df1 = flavor_read_html(httpserver.url, match=".*Water.*") + df2 = flavor_read_html(httpserver.url, match="Unit") assert_framelist_equal(df1, df2) @pytest.mark.slow - def test_banklist(self, banklist_data): - df1 = self.read_html(banklist_data, match=".*Florida.*", attrs={"id": "table"}) - df2 = self.read_html(banklist_data, match="Metcalf Bank", attrs={"id": "table"}) + def test_banklist(self, banklist_data, flavor_read_html): + df1 = flavor_read_html( + banklist_data, match=".*Florida.*", attrs={"id": "table"} + ) + df2 = flavor_read_html( + banklist_data, match="Metcalf Bank", attrs={"id": "table"} + ) assert_framelist_equal(df1, df2) - def test_spam(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*") - df2 = self.read_html(spam_data, match="Unit") + def test_spam(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*") + df2 = flavor_read_html(spam_data, match="Unit") assert_framelist_equal(df1, df2) assert df1[0].iloc[0, 0] == "Proximates" assert df1[0].columns[0] == "Nutrient" - def test_spam_no_match(self, spam_data): - dfs = self.read_html(spam_data) + def test_spam_no_match(self, spam_data, flavor_read_html): + dfs = flavor_read_html(spam_data) for df in dfs: assert isinstance(df, DataFrame) - def test_banklist_no_match(self, banklist_data): - dfs = self.read_html(banklist_data, attrs={"id": "table"}) + def test_banklist_no_match(self, banklist_data, flavor_read_html): + dfs = flavor_read_html(banklist_data, attrs={"id": "table"}) for df in dfs: assert isinstance(df, DataFrame) - def test_spam_header(self, spam_data): - df = self.read_html(spam_data, match=".*Water.*", header=2)[0] + def test_spam_header(self, spam_data, flavor_read_html): + df = flavor_read_html(spam_data, match=".*Water.*", header=2)[0] assert df.columns[0] == "Proximates" assert not df.empty - def test_skiprows_int(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=1) - df2 = self.read_html(spam_data, match="Unit", skiprows=1) + def test_skiprows_int(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=1) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=1) assert_framelist_equal(df1, df2) - def test_skiprows_range(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=range(2)) - df2 = self.read_html(spam_data, match="Unit", skiprows=range(2)) + def test_skiprows_range(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=range(2)) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=range(2)) assert_framelist_equal(df1, df2) - def test_skiprows_list(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=[1, 2]) - df2 = self.read_html(spam_data, match="Unit", skiprows=[2, 1]) + def test_skiprows_list(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=[1, 2]) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=[2, 1]) assert_framelist_equal(df1, df2) - def test_skiprows_set(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows={1, 2}) - df2 = self.read_html(spam_data, match="Unit", skiprows={2, 1}) + def test_skiprows_set(self, spam_data, flavor_read_html): + df1 = 
flavor_read_html(spam_data, match=".*Water.*", skiprows={1, 2}) + df2 = flavor_read_html(spam_data, match="Unit", skiprows={2, 1}) assert_framelist_equal(df1, df2) - def test_skiprows_slice(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=1) - df2 = self.read_html(spam_data, match="Unit", skiprows=1) + def test_skiprows_slice(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=1) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=1) assert_framelist_equal(df1, df2) - def test_skiprows_slice_short(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=slice(2)) - df2 = self.read_html(spam_data, match="Unit", skiprows=slice(2)) + def test_skiprows_slice_short(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=slice(2)) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=slice(2)) assert_framelist_equal(df1, df2) - def test_skiprows_slice_long(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=slice(2, 5)) - df2 = self.read_html(spam_data, match="Unit", skiprows=slice(4, 1, -1)) + def test_skiprows_slice_long(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=slice(2, 5)) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=slice(4, 1, -1)) assert_framelist_equal(df1, df2) - def test_skiprows_ndarray(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", skiprows=np.arange(2)) - df2 = self.read_html(spam_data, match="Unit", skiprows=np.arange(2)) + def test_skiprows_ndarray(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=np.arange(2)) + df2 = flavor_read_html(spam_data, match="Unit", skiprows=np.arange(2)) assert_framelist_equal(df1, df2) - def test_skiprows_invalid(self, spam_data): + def test_skiprows_invalid(self, spam_data, flavor_read_html): with pytest.raises(TypeError, match=("is not a valid type for skipping rows")): - self.read_html(spam_data, match=".*Water.*", skiprows="asdf") + flavor_read_html(spam_data, match=".*Water.*", skiprows="asdf") - def test_index(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", index_col=0) - df2 = self.read_html(spam_data, match="Unit", index_col=0) + def test_index(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", index_col=0) + df2 = flavor_read_html(spam_data, match="Unit", index_col=0) assert_framelist_equal(df1, df2) - def test_header_and_index_no_types(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", header=1, index_col=0) - df2 = self.read_html(spam_data, match="Unit", header=1, index_col=0) + def test_header_and_index_no_types(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", header=1, index_col=0) + df2 = flavor_read_html(spam_data, match="Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) - def test_header_and_index_with_types(self, spam_data): - df1 = self.read_html(spam_data, match=".*Water.*", header=1, index_col=0) - df2 = self.read_html(spam_data, match="Unit", header=1, index_col=0) + def test_header_and_index_with_types(self, spam_data, flavor_read_html): + df1 = flavor_read_html(spam_data, match=".*Water.*", header=1, index_col=0) + df2 = flavor_read_html(spam_data, match="Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) - def test_infer_types(self, spam_data): + def 
test_infer_types(self, spam_data, flavor_read_html): # 10892 infer_types removed - df1 = self.read_html(spam_data, match=".*Water.*", index_col=0) - df2 = self.read_html(spam_data, match="Unit", index_col=0) + df1 = flavor_read_html(spam_data, match=".*Water.*", index_col=0) + df2 = flavor_read_html(spam_data, match="Unit", index_col=0) assert_framelist_equal(df1, df2) - def test_string_io(self, spam_data): + def test_string_io(self, spam_data, flavor_read_html): with open(spam_data, encoding="UTF-8") as f: data1 = StringIO(f.read()) with open(spam_data, encoding="UTF-8") as f: data2 = StringIO(f.read()) - df1 = self.read_html(data1, match=".*Water.*") - df2 = self.read_html(data2, match="Unit") + df1 = flavor_read_html(data1, match=".*Water.*") + df2 = flavor_read_html(data2, match="Unit") assert_framelist_equal(df1, df2) - def test_string(self, spam_data): + def test_string(self, spam_data, flavor_read_html): with open(spam_data, encoding="UTF-8") as f: data = f.read() - df1 = self.read_html(StringIO(data), match=".*Water.*") - df2 = self.read_html(StringIO(data), match="Unit") + df1 = flavor_read_html(StringIO(data), match=".*Water.*") + df2 = flavor_read_html(StringIO(data), match="Unit") assert_framelist_equal(df1, df2) - def test_file_like(self, spam_data): + def test_file_like(self, spam_data, flavor_read_html): with open(spam_data, encoding="UTF-8") as f: - df1 = self.read_html(f, match=".*Water.*") + df1 = flavor_read_html(f, match=".*Water.*") with open(spam_data, encoding="UTF-8") as f: - df2 = self.read_html(f, match="Unit") + df2 = flavor_read_html(f, match="Unit") assert_framelist_equal(df1, df2) @pytest.mark.network @pytest.mark.single_cpu - def test_bad_url_protocol(self, httpserver): + def test_bad_url_protocol(self, httpserver, flavor_read_html): httpserver.serve_content("urlopen error unknown url type: git", code=404) with pytest.raises(URLError, match="urlopen error unknown url type: git"): - self.read_html("git://github.com", match=".*Water.*") + flavor_read_html("git://github.com", match=".*Water.*") @pytest.mark.slow @pytest.mark.network @pytest.mark.single_cpu - def test_invalid_url(self, httpserver): + def test_invalid_url(self, httpserver, flavor_read_html): httpserver.serve_content("Name or service not known", code=404) with pytest.raises((URLError, ValueError), match="HTTP Error 404: NOT FOUND"): - self.read_html(httpserver.url, match=".*Water.*") + flavor_read_html(httpserver.url, match=".*Water.*") @pytest.mark.slow - def test_file_url(self, banklist_data): + def test_file_url(self, banklist_data, flavor_read_html): url = banklist_data - dfs = self.read_html( + dfs = flavor_read_html( file_path_to_url(os.path.abspath(url)), match="First", attrs={"id": "table"} ) assert isinstance(dfs, list) @@ -411,54 +415,78 @@ assert isinstance(df, DataFrame) @pytest.mark.slow - def test_invalid_table_attrs(self, banklist_data): + def test_invalid_table_attrs(self, banklist_data, flavor_read_html): url = banklist_data with pytest.raises(ValueError, match="No tables found"): - self.read_html( + flavor_read_html( url, match="First Federal Bank of Florida", attrs={"id": "tasdfable"} ) - def _bank_data(self, path, **kwargs): - return self.read_html(path, match="Metcalf", attrs={"id": "table"}, **kwargs) - @pytest.mark.slow - def test_multiindex_header(self, banklist_data): - df = self._bank_data(banklist_data, header=[0, 1])[0] + def test_multiindex_header(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, match="Metcalf", attrs={"id": "table"}, 
header=[0, 1] + )[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_multiindex_index(self, banklist_data): - df = self._bank_data(banklist_data, index_col=[0, 1])[0] + def test_multiindex_index(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, match="Metcalf", attrs={"id": "table"}, index_col=[0, 1] + )[0] assert isinstance(df.index, MultiIndex) @pytest.mark.slow - def test_multiindex_header_index(self, banklist_data): - df = self._bank_data(banklist_data, header=[0, 1], index_col=[0, 1])[0] + def test_multiindex_header_index(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, + match="Metcalf", + attrs={"id": "table"}, + header=[0, 1], + index_col=[0, 1], + )[0] assert isinstance(df.columns, MultiIndex) assert isinstance(df.index, MultiIndex) @pytest.mark.slow - def test_multiindex_header_skiprows_tuples(self, banklist_data): - df = self._bank_data(banklist_data, header=[0, 1], skiprows=1)[0] + def test_multiindex_header_skiprows_tuples(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, + match="Metcalf", + attrs={"id": "table"}, + header=[0, 1], + skiprows=1, + )[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_multiindex_header_skiprows(self, banklist_data): - df = self._bank_data(banklist_data, header=[0, 1], skiprows=1)[0] + def test_multiindex_header_skiprows(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, + match="Metcalf", + attrs={"id": "table"}, + header=[0, 1], + skiprows=1, + )[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_multiindex_header_index_skiprows(self, banklist_data): - df = self._bank_data( - banklist_data, header=[0, 1], index_col=[0, 1], skiprows=1 + def test_multiindex_header_index_skiprows(self, banklist_data, flavor_read_html): + df = flavor_read_html( + banklist_data, + match="Metcalf", + attrs={"id": "table"}, + header=[0, 1], + index_col=[0, 1], + skiprows=1, )[0] assert isinstance(df.index, MultiIndex) assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_regex_idempotency(self, banklist_data): + def test_regex_idempotency(self, banklist_data, flavor_read_html): url = banklist_data - dfs = self.read_html( + dfs = flavor_read_html( file_path_to_url(os.path.abspath(url)), match=re.compile(re.compile("Florida")), attrs={"id": "table"}, @@ -467,10 +495,10 @@ for df in dfs: assert isinstance(df, DataFrame) - def test_negative_skiprows(self, spam_data): + def test_negative_skiprows(self, spam_data, flavor_read_html): msg = r"\(you passed a negative value\)" with pytest.raises(ValueError, match=msg): - self.read_html(spam_data, match="Water", skiprows=-1) + flavor_read_html(spam_data, match="Water", skiprows=-1) @pytest.fixture def python_docs(self): @@ -523,20 +551,20 @@ @pytest.mark.network @pytest.mark.single_cpu - def test_multiple_matches(self, python_docs, httpserver): + def test_multiple_matches(self, python_docs, httpserver, flavor_read_html): httpserver.serve_content(content=python_docs) - dfs = self.read_html(httpserver.url, match="Python") + dfs = flavor_read_html(httpserver.url, match="Python") assert len(dfs) > 1 @pytest.mark.network @pytest.mark.single_cpu - def test_python_docs_table(self, python_docs, httpserver): + def test_python_docs_table(self, python_docs, httpserver, flavor_read_html): httpserver.serve_content(content=python_docs) - dfs = self.read_html(httpserver.url, match="Python") + dfs = flavor_read_html(httpserver.url, 
match="Python") zz = [df.iloc[0, 0][0:4] for df in dfs] assert sorted(zz) == ["Pyth", "What"] - def test_empty_tables(self): + def test_empty_tables(self, flavor_read_html): """ Make sure that read_html ignores empty tables. """ @@ -560,13 +588,13 @@
""" - result = self.read_html(StringIO(html)) + result = flavor_read_html(StringIO(html)) assert len(result) == 1 - def test_multiple_tbody(self): + def test_multiple_tbody(self, flavor_read_html): # GH-20690 # Read all tbody tags within a single table. - result = self.read_html( + result = flavor_read_html( StringIO( """ @@ -595,12 +623,12 @@ tm.assert_frame_equal(result, expected) - def test_header_and_one_column(self): + def test_header_and_one_column(self, flavor_read_html): """ Don't fail with bs4 when there is a header and only one column as described in issue #9178 """ - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -621,11 +649,11 @@ tm.assert_frame_equal(result, expected) - def test_thead_without_tr(self): + def test_thead_without_tr(self, flavor_read_html): """ Ensure parser adds within on malformed HTML. """ - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -653,7 +681,7 @@ tm.assert_frame_equal(result, expected) - def test_tfoot_read(self): + def test_tfoot_read(self, flavor_read_html): """ Make sure that read_html reads tfoot, containing td or th. Ignores empty tfoot @@ -685,16 +713,16 @@ data1 = data_template.format(footer="") data2 = data_template.format(footer="") - result1 = self.read_html(StringIO(data1))[0] - result2 = self.read_html(StringIO(data2))[0] + result1 = flavor_read_html(StringIO(data1))[0] + result2 = flavor_read_html(StringIO(data2))[0] tm.assert_frame_equal(result1, expected1) tm.assert_frame_equal(result2, expected2) - def test_parse_header_of_non_string_column(self): + def test_parse_header_of_non_string_column(self, flavor_read_html): # GH5048: if header is specified explicitly, an int column should be # parsed as int while its header is parsed as str - result = self.read_html( + result = flavor_read_html( StringIO( """
<tfoot><tr><td>footA</td><th>footB</th></tr></tfoot>
@@ -717,7 +745,7 @@ tm.assert_frame_equal(result, expected) @pytest.mark.slow - def test_banklist_header(self, banklist_data, datapath): + def test_banklist_header(self, banklist_data, datapath, flavor_read_html): from pandas.io.html import _remove_whitespace def try_remove_ws(x): @@ -726,7 +754,7 @@ except AttributeError: return x - df = self.read_html(banklist_data, match="Metcalf", attrs={"id": "table"})[0] + df = flavor_read_html(banklist_data, match="Metcalf", attrs={"id": "table"})[0] ground_truth = read_csv( datapath("io", "data", "csv", "banklist.csv"), converters={"Updated Date": Timestamp, "Closing Date": Timestamp}, @@ -765,19 +793,19 @@ tm.assert_frame_equal(converted, gtnew) @pytest.mark.slow - def test_gold_canyon(self, banklist_data): + def test_gold_canyon(self, banklist_data, flavor_read_html): gc = "Gold Canyon" with open(banklist_data, encoding="utf-8") as f: raw_text = f.read() assert gc in raw_text - df = self.read_html(banklist_data, match="Gold Canyon", attrs={"id": "table"})[ - 0 - ] + df = flavor_read_html( + banklist_data, match="Gold Canyon", attrs={"id": "table"} + )[0] assert gc in df.to_string() - def test_different_number_of_cols(self): - expected = self.read_html( + def test_different_number_of_cols(self, flavor_read_html): + expected = flavor_read_html( StringIO( """
@@ -813,7 +841,7 @@ index_col=0, )[0] - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -848,9 +876,9 @@ tm.assert_frame_equal(result, expected) - def test_colspan_rowspan_1(self): + def test_colspan_rowspan_1(self, flavor_read_html): # GH17054 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -873,7 +901,7 @@ tm.assert_frame_equal(result, expected) - def test_colspan_rowspan_copy_values(self): + def test_colspan_rowspan_copy_values(self, flavor_read_html): # GH17054 # In ASCII, with lowercase letters being copies: @@ -881,7 +909,7 @@ # X x Y Z W # A B b z C - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -908,7 +936,7 @@ tm.assert_frame_equal(result, expected) - def test_colspan_rowspan_both_not_1(self): + def test_colspan_rowspan_both_not_1(self, flavor_read_html): # GH17054 # In ASCII, with lowercase letters being copies: @@ -916,7 +944,7 @@ # A B b b C # a b b b D - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -940,7 +968,7 @@ tm.assert_frame_equal(result, expected) - def test_rowspan_at_end_of_row(self): + def test_rowspan_at_end_of_row(self, flavor_read_html): # GH17054 # In ASCII, with lowercase letters being copies: @@ -948,7 +976,7 @@ # A B # C b - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -969,10 +997,10 @@ tm.assert_frame_equal(result, expected) - def test_rowspan_only_rows(self): + def test_rowspan_only_rows(self, flavor_read_html): # GH17054 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -990,9 +1018,9 @@ tm.assert_frame_equal(result, expected) - def test_header_inferred_from_rows_with_only_th(self): + def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): # GH17054 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -1018,15 +1046,15 @@ tm.assert_frame_equal(result, expected) - def test_parse_dates_list(self): + def test_parse_dates_list(self, flavor_read_html): df = DataFrame({"date": date_range("1/1/2001", periods=10)}) expected = df.to_html() - res = self.read_html(StringIO(expected), parse_dates=[1], index_col=0) + res = flavor_read_html(StringIO(expected), parse_dates=[1], index_col=0) tm.assert_frame_equal(df, res[0]) - res = self.read_html(StringIO(expected), parse_dates=["date"], index_col=0) + res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0) tm.assert_frame_equal(df, res[0]) - def test_parse_dates_combine(self): + def test_parse_dates_combine(self, flavor_read_html): raw_dates = Series(date_range("1/1/2001", periods=10)) df = DataFrame( { @@ -1034,32 +1062,32 @@ "time": raw_dates.map(lambda x: str(x.time())), } ) - res = self.read_html( + res = flavor_read_html( StringIO(df.to_html()), parse_dates={"datetime": [1, 2]}, index_col=1 ) newdf = DataFrame({"datetime": raw_dates}) tm.assert_frame_equal(newdf, res[0]) - def test_wikipedia_states_table(self, datapath): + def test_wikipedia_states_table(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") assert os.path.isfile(data), f"{repr(data)} is not a file" assert os.path.getsize(data), f"{repr(data)} is an empty file" - result = self.read_html(data, match="Arizona", header=1)[0] + result = flavor_read_html(data, match="Arizona", header=1)[0] assert result.shape == (60, 12) assert "Unnamed" in result.columns[-1] assert result["sq mi"].dtype == np.dtype("float64") assert np.allclose(result.loc[0, "sq mi"], 665384.04) - def test_wikipedia_states_multiindex(self, datapath): + def test_wikipedia_states_multiindex(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") - result = self.read_html(data, match="Arizona", index_col=0)[0] + result = flavor_read_html(data, match="Arizona", index_col=0)[0] assert result.shape == (60, 11) assert "Unnamed" in result.columns[-1][1] assert result.columns.nlevels == 2 assert np.allclose(result.loc["Alaska", ("Total area[2]", "sq mi")], 665384.04) - def test_parser_error_on_empty_header_row(self): - result = self.read_html( + def test_parser_error_on_empty_header_row(self, flavor_read_html): + result = flavor_read_html( StringIO( """
@@ -1083,9 +1111,9 @@ ) tm.assert_frame_equal(result[0], expected) - def test_decimal_rows(self): + def test_decimal_rows(self, flavor_read_html): # GH 12907 - result = self.read_html( + result = flavor_read_html( StringIO( """ @@ -1113,7 +1141,7 @@ tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("arg", [True, False]) - def test_bool_header_arg(self, spam_data, arg): + def test_bool_header_arg(self, spam_data, arg, flavor_read_html): # GH 6114 msg = re.escape( "Passing a bool to header is invalid. Use header=None for no header or " @@ -1121,11 +1149,11 @@ "column names" ) with pytest.raises(TypeError, match=msg): - self.read_html(spam_data, header=arg) + flavor_read_html(spam_data, header=arg) - def test_converters(self): + def test_converters(self, flavor_read_html): # GH 13461 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -1150,9 +1178,9 @@ tm.assert_frame_equal(result, expected) - def test_na_values(self): + def test_na_values(self, flavor_read_html): # GH 13461 - result = self.read_html( + result = flavor_read_html( StringIO( """
@@ -1177,7 +1205,7 @@ tm.assert_frame_equal(result, expected) - def test_keep_default_na(self): + def test_keep_default_na(self, flavor_read_html): html_data = """
@@ -1195,15 +1223,15 @@
""" expected_df = DataFrame({"a": ["N/A", "NA"]}) - html_df = self.read_html(StringIO(html_data), keep_default_na=False)[0] + html_df = flavor_read_html(StringIO(html_data), keep_default_na=False)[0] tm.assert_frame_equal(expected_df, html_df) expected_df = DataFrame({"a": [np.nan, np.nan]}) - html_df = self.read_html(StringIO(html_data), keep_default_na=True)[0] + html_df = flavor_read_html(StringIO(html_data), keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) - def test_preserve_empty_rows(self): - result = self.read_html( + def test_preserve_empty_rows(self, flavor_read_html): + result = flavor_read_html( StringIO( """ @@ -1228,8 +1256,8 @@ tm.assert_frame_equal(result, expected) - def test_ignore_empty_rows_when_inferring_header(self): - result = self.read_html( + def test_ignore_empty_rows_when_inferring_header(self, flavor_read_html): + result = flavor_read_html( StringIO( """
@@ -1251,7 +1279,7 @@ tm.assert_frame_equal(result, expected) - def test_multiple_header_rows(self): + def test_multiple_header_rows(self, flavor_read_html): # Issue #13434 expected_df = DataFrame( data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")] @@ -1261,20 +1289,20 @@ ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"], ] html = expected_df.to_html(index=False) - html_df = self.read_html(StringIO(html))[0] + html_df = flavor_read_html(StringIO(html))[0] tm.assert_frame_equal(expected_df, html_df) - def test_works_on_valid_markup(self, datapath): + def test_works_on_valid_markup(self, datapath, flavor_read_html): filename = datapath("io", "data", "html", "valid_markup.html") - dfs = self.read_html(filename, index_col=0) + dfs = flavor_read_html(filename, index_col=0) assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow - def test_fallback_success(self, datapath): + def test_fallback_success(self, datapath, flavor_read_html): banklist_data = datapath("io", "data", "html", "banklist.html") - self.read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"]) + flavor_read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"]) def test_to_html_timestamp(self): rng = date_range("2000-01-01", periods=10) @@ -1309,7 +1337,7 @@ (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"])), ], ) - def test_displayed_only(self, displayed_only, exp0, exp1): + def test_displayed_only(self, displayed_only, exp0, exp1, flavor_read_html): # GH 20027 data = """ @@ -1331,7 +1359,7 @@ """ - dfs = self.read_html(StringIO(data), displayed_only=displayed_only) + dfs = flavor_read_html(StringIO(data), displayed_only=displayed_only) tm.assert_frame_equal(dfs[0], exp0) if exp1 is not None: @@ -1340,7 +1368,7 @@ assert len(dfs) == 1 # Should not parse hidden table @pytest.mark.parametrize("displayed_only", [True, False]) - def test_displayed_only_with_many_elements(self, displayed_only): + def test_displayed_only_with_many_elements(self, displayed_only, flavor_read_html): html_table = """
@@ -1357,7 +1385,9 @@
""" - result = read_html(StringIO(html_table), displayed_only=displayed_only)[0] + result = flavor_read_html(StringIO(html_table), displayed_only=displayed_only)[ + 0 + ] expected = DataFrame({"A": [1, 4], "B": [2, 5]}) tm.assert_frame_equal(result, expected) @@ -1365,23 +1395,23 @@ "ignore:You provided Unicode markup but also provided a value for " "from_encoding.*:UserWarning" ) - def test_encode(self, html_encoding_file): + def test_encode(self, html_encoding_file, flavor_read_html): base_path = os.path.basename(html_encoding_file) root = os.path.splitext(base_path)[0] _, encoding = root.split("_") try: with open(html_encoding_file, "rb") as fobj: - from_string = self.read_html( + from_string = flavor_read_html( fobj.read(), encoding=encoding, index_col=0 ).pop() with open(html_encoding_file, "rb") as fobj: - from_file_like = self.read_html( + from_file_like = flavor_read_html( BytesIO(fobj.read()), encoding=encoding, index_col=0 ).pop() - from_filename = self.read_html( + from_filename = flavor_read_html( html_encoding_file, encoding=encoding, index_col=0 ).pop() tm.assert_frame_equal(from_string, from_file_like) @@ -1393,10 +1423,10 @@ pytest.skip() raise - def test_parse_failure_unseekable(self): + def test_parse_failure_unseekable(self, flavor_read_html): # Issue #17975 - if self.read_html.keywords.get("flavor") == "lxml": + if flavor_read_html.keywords.get("flavor") == "lxml": pytest.skip("Not applicable for lxml") class UnseekableStringIO(StringIO): @@ -1408,12 +1438,12 @@
<table><tr><td>spam<foobr />eggs</td></tr></table>
""" ) - assert self.read_html(bad) + assert flavor_read_html(bad) with pytest.raises(ValueError, match="passed a non-rewindable file object"): - self.read_html(bad) + flavor_read_html(bad) - def test_parse_failure_rewinds(self): + def test_parse_failure_rewinds(self, flavor_read_html): # Issue #17975 class MockFile: @@ -1444,12 +1474,12 @@ good = MockFile("
<table><tr><td>spam<br />eggs</td></tr></table>
") bad = MockFile("
<table><tr><td>spam<foobr />eggs</td></tr></table>
") - assert self.read_html(good) - assert self.read_html(bad) + assert flavor_read_html(good) + assert flavor_read_html(bad) @pytest.mark.slow @pytest.mark.single_cpu - def test_importcheck_thread_safety(self, datapath): + def test_importcheck_thread_safety(self, datapath, flavor_read_html): # see gh-16928 class ErrorThread(threading.Thread): @@ -1462,8 +1492,8 @@ self.err = None filename = datapath("io", "data", "html", "valid_markup.html") - helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) - helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) + helper_thread1 = ErrorThread(target=flavor_read_html, args=(filename,)) + helper_thread2 = ErrorThread(target=flavor_read_html, args=(filename,)) helper_thread1.start() helper_thread2.start() @@ -1472,17 +1502,17 @@ pass assert None is helper_thread1.err is helper_thread2.err - def test_parse_path_object(self, datapath): + def test_parse_path_object(self, datapath, flavor_read_html): # GH 37705 file_path_string = datapath("io", "data", "html", "spam.html") file_path = Path(file_path_string) - df1 = self.read_html(file_path_string)[0] - df2 = self.read_html(file_path)[0] + df1 = flavor_read_html(file_path_string)[0] + df2 = flavor_read_html(file_path)[0] tm.assert_frame_equal(df1, df2) - def test_parse_br_as_space(self): + def test_parse_br_as_space(self, flavor_read_html): # GH 29528: pd.read_html() convert
to space - result = self.read_html( + result = flavor_read_html( StringIO( """ @@ -1502,7 +1532,7 @@ tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"]) - def test_extract_links(self, arg): + def test_extract_links(self, arg, flavor_read_html): gh_13141_data = """
@@ -1565,7 +1595,7 @@ elif arg == "header": head_exp = gh_13141_expected["head_extract"] - result = self.read_html(StringIO(gh_13141_data), extract_links=arg)[0] + result = flavor_read_html(StringIO(gh_13141_data), extract_links=arg)[0] expected = DataFrame([data_exp, foot_exp], columns=head_exp) expected = expected.fillna(np.nan) tm.assert_frame_equal(result, expected) @@ -1578,7 +1608,7 @@ with pytest.raises(ValueError, match=msg): read_html(spam_data, extract_links="incorrect") - def test_extract_links_all_no_header(self): + def test_extract_links_all_no_header(self, flavor_read_html): # GH 48316 data = """
@@ -1589,7 +1619,7 @@
""" - result = self.read_html(StringIO(data), extract_links="all")[0] + result = flavor_read_html(StringIO(data), extract_links="all")[0] expected = DataFrame([[("Google.com", "https://google.com")]]) tm.assert_frame_equal(result, expected) @@ -1601,7 +1631,7 @@ with pytest.raises(ValueError, match=msg): read_html("test", dtype_backend="numpy") - def test_style_tag(self): + def test_style_tag(self, flavor_read_html): # GH 48316 data = """ @@ -1622,6 +1652,6 @@
""" - result = self.read_html(StringIO(data))[0] + result = flavor_read_html(StringIO(data))[0] expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_http_headers.py pandas-2.2.2+dfsg/pandas/tests/io/test_http_headers.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_http_headers.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_http_headers.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,172 @@ +""" +Tests for the pandas custom headers in http(s) requests +""" +from functools import partial +import gzip +from io import BytesIO + +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm + +pytestmark = [ + pytest.mark.single_cpu, + pytest.mark.network, + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), +] + + +def gzip_bytes(response_bytes): + with BytesIO() as bio: + with gzip.GzipFile(fileobj=bio, mode="w") as zipper: + zipper.write(response_bytes) + return bio.getvalue() + + +def csv_responder(df): + return df.to_csv(index=False).encode("utf-8") + + +def gz_csv_responder(df): + return gzip_bytes(csv_responder(df)) + + +def json_responder(df): + return df.to_json().encode("utf-8") + + +def gz_json_responder(df): + return gzip_bytes(json_responder(df)) + + +def html_responder(df): + return df.to_html(index=False).encode("utf-8") + + +def parquetpyarrow_reponder(df): + return df.to_parquet(index=False, engine="pyarrow") + + +def parquetfastparquet_responder(df): + # the fastparquet engine doesn't like to write to a buffer + # it can do it via the open_with function being set appropriately + # however it automatically calls the close method and wipes the buffer + # so just overwrite that attribute on this instance to not do that + + # protected by an importorskip in the respective test + import fsspec + + df.to_parquet( + "memory://fastparquet_user_agent.parquet", + index=False, + engine="fastparquet", + compression=None, + ) + with fsspec.open("memory://fastparquet_user_agent.parquet", "rb") as f: + return f.read() + + +def pickle_respnder(df): + with BytesIO() as bio: + df.to_pickle(bio) + return bio.getvalue() + + +def stata_responder(df): + with BytesIO() as bio: + df.to_stata(bio, write_index=False) + return bio.getvalue() + + +@pytest.mark.parametrize( + "responder, read_method", + [ + (csv_responder, pd.read_csv), + (json_responder, pd.read_json), + ( + html_responder, + lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0], + ), + pytest.param( + parquetpyarrow_reponder, + partial(pd.read_parquet, engine="pyarrow"), + marks=td.skip_if_no("pyarrow"), + ), + pytest.param( + parquetfastparquet_responder, + partial(pd.read_parquet, engine="fastparquet"), + # TODO(ArrayManager) fastparquet + marks=[ + td.skip_if_no("fastparquet"), + td.skip_if_no("fsspec"), + td.skip_array_manager_not_yet_implemented, + ], + ), + (pickle_respnder, pd.read_pickle), + (stata_responder, pd.read_stata), + (gz_csv_responder, pd.read_csv), + (gz_json_responder, pd.read_json), + ], +) +@pytest.mark.parametrize( + "storage_options", + [ + None, + {"User-Agent": "foo"}, + {"User-Agent": "foo", "Auth": "bar"}, + ], +) +def test_request_headers(responder, read_method, httpserver, storage_options): + expected = pd.DataFrame({"a": ["b"]}) + default_headers = ["Accept-Encoding", "Host", "Connection", "User-Agent"] + if "gz" in responder.__name__: + extra = 
{"Content-Encoding": "gzip"} + if storage_options is None: + storage_options = extra + else: + storage_options |= extra + else: + extra = None + expected_headers = set(default_headers).union( + storage_options.keys() if storage_options else [] + ) + httpserver.serve_content(content=responder(expected), headers=extra) + result = read_method(httpserver.url, storage_options=storage_options) + tm.assert_frame_equal(result, expected) + + request_headers = dict(httpserver.requests[0].headers) + for header in expected_headers: + exp = request_headers.pop(header) + if storage_options and header in storage_options: + assert exp == storage_options[header] + # No extra headers added + assert not request_headers + + +@pytest.mark.parametrize( + "engine", + [ + "pyarrow", + "fastparquet", + ], +) +def test_to_parquet_to_disk_with_storage_options(engine): + headers = { + "User-Agent": "custom", + "Auth": "other_custom", + } + + pytest.importorskip(engine) + + true_df = pd.DataFrame({"column_name": ["column_value"]}) + msg = ( + "storage_options passed with file object or non-fsspec file path|" + "storage_options passed with buffer, or non-supported URL" + ) + with pytest.raises(ValueError, match=msg): + true_df.to_parquet("/tmp/junk.parquet", storage_options=headers, engine=engine) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_orc.py pandas-2.2.2+dfsg/pandas/tests/io/test_orc.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_orc.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_orc.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,6 +17,10 @@ import pyarrow as pa +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + @pytest.fixture def dirpath(datapath): diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_parquet.py pandas-2.2.2+dfsg/pandas/tests/io/test_parquet.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_parquet.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_parquet.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,15 +8,11 @@ import numpy as np import pytest -from pandas._config import ( - get_option, - using_copy_on_write, -) +from pandas._config import using_copy_on_write +from pandas._config.config import _get_option from pandas.compat import is_platform_windows from pandas.compat.pyarrow import ( - pa_version_under7p0, - pa_version_under8p0, pa_version_under11p0, pa_version_under13p0, pa_version_under15p0, @@ -51,9 +47,12 @@ # TODO(ArrayManager) fastparquet relies on BlockManager internals -pytestmark = pytest.mark.filterwarnings( - "ignore:DataFrame._data is deprecated:FutureWarning" -) +pytestmark = [ + pytest.mark.filterwarnings("ignore:DataFrame._data is deprecated:FutureWarning"), + pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" + ), +] # setup engines & skips @@ -62,7 +61,8 @@ pytest.param( "fastparquet", marks=pytest.mark.skipif( - not _HAVE_FASTPARQUET or get_option("mode.data_manager") == "array", + not _HAVE_FASTPARQUET + or _get_option("mode.data_manager", silent=True) == "array", reason="fastparquet is not installed or ArrayManager is used", ), ), @@ -89,7 +89,7 @@ def fp(): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") - elif get_option("mode.data_manager") == "array": + elif _get_option("mode.data_manager", silent=True) == "array": pytest.skip("ArrayManager is not supported with fastparquet") return "fastparquet" @@ -232,17 +232,10 @@ expected: iterable of str Expected partition names. 
""" - if pa_version_under7p0: - import pyarrow.parquet as pq - - dataset = pq.ParquetDataset(path, validate_schema=False) - assert len(dataset.partitions.partition_names) == len(expected) - assert dataset.partitions.partition_names == set(expected) - else: - import pyarrow.dataset as ds + import pyarrow.dataset as ds - dataset = ds.dataset(path, partitioning="hive") - assert dataset.partitioning.schema.names == expected + dataset = ds.dataset(path, partitioning="hive") + assert dataset.partitioning.schema.names == expected def test_invalid_engine(df_compat): @@ -362,6 +355,23 @@ tm.assert_frame_equal(result, df[["a", "d"]]) +def test_parquet_pos_args_deprecation(engine): + # GH-54229 + df = pd.DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_parquet except for the " + r"argument 'path' will be keyword-only." + ) + with tm.ensure_clean() as path: + with tm.assert_produces_warning( + FutureWarning, + match=msg, + check_stacklevel=False, + raise_on_extra_warnings=False, + ): + df.to_parquet(path, engine) + + class Base: def check_error_on_write(self, df, engine, exc, err_msg): # check that we are raising the exception on writing @@ -441,7 +451,7 @@ def test_write_index(self, engine, using_copy_on_write, request): check_names = engine != "fastparquet" if using_copy_on_write and engine == "fastparquet": - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="fastparquet write into index") ) @@ -604,9 +614,8 @@ else: check_round_trip(df, engine) - @pytest.mark.skipif(pa_version_under7p0, reason="minimum pyarrow not installed") def test_dtype_backend(self, engine, request): - import pyarrow.parquet as pq + pq = pytest.importorskip("pyarrow.parquet") if engine == "fastparquet": # We are manually disabling fastparquet's @@ -614,7 +623,7 @@ mark = pytest.mark.xfail( reason="Fastparquet nullable dtype support is disabled" ) - request.node.add_marker(mark) + request.applymarker(mark) table = pyarrow.table( { @@ -716,21 +725,15 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): # GH 37105 - msg = "Mismatched null-like values nan and None found" - warn = None - if using_copy_on_write(): - warn = FutureWarning - buf_bytes = df_full.to_parquet(engine=pa) assert isinstance(buf_bytes, bytes) buf_stream = BytesIO(buf_bytes) res = read_parquet(buf_stream) - expected = df_full.copy(deep=False) + expected = df_full.copy() expected.loc[1, "string_with_nan"] = None - with tm.assert_produces_warning(warn, match=msg): - tm.assert_frame_equal(df_full, res) + tm.assert_frame_equal(res, expected) def test_duplicate_columns(self, pa): # not currently able to handle duplicate columns @@ -739,10 +742,7 @@ def test_timedelta(self, pa): df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)}) - if pa_version_under8p0: - self.check_external_error_on_write(df, pa, NotImplementedError) - else: - check_round_trip(df, pa) + check_round_trip(df, pa) def test_unsupported(self, pa): # mixed python objects @@ -968,19 +968,13 @@ # with version 2.6, pyarrow defaults to writing the nanoseconds, so # this should work without error # Note in previous pyarrows(<7.0.0), only the pseudo-version 2.0 was available - if not pa_version_under7p0: - ver = "2.6" - else: - ver = "2.0" - df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)}) + ver = "2.6" + df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1ns", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": ver}) def test_timezone_aware_index(self, request, 
pa, timezone_aware_date_list): - if ( - not pa_version_under7p0 - and timezone_aware_date_list.tzinfo != datetime.timezone.utc - ): - request.node.add_marker( + if timezone_aware_date_list.tzinfo != datetime.timezone.utc: + request.applymarker( pytest.mark.xfail( reason="temporary skip this test until it is properly resolved: " "https://github.com/pandas-dev/pandas/issues/37286" @@ -1003,12 +997,10 @@ def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 pytest.importorskip("pyarrow") - df = pd.DataFrame({"a": list(range(0, 3))}) + df = pd.DataFrame({"a": list(range(3))}) with tm.ensure_clean() as path: - df.to_parquet(path, pa) - result = read_parquet( - path, pa, filters=[("a", "==", 0)], use_legacy_dataset=False - ) + df.to_parquet(path, engine=pa) + result = read_parquet(path, pa, filters=[("a", "==", 0)]) assert len(result) == 1 def test_read_parquet_manager(self, pa, using_array_manager): @@ -1018,7 +1010,7 @@ ) with tm.ensure_clean() as path: - df.to_parquet(path, pa) + df.to_parquet(path, engine=pa) result = read_parquet(path, pa) if using_array_manager: assert isinstance(result._mgr, pd.core.internals.ArrayManager) @@ -1222,10 +1214,10 @@ check_round_trip(df, fp) def test_filter_row_groups(self, fp): - d = {"a": list(range(0, 3))} + d = {"a": list(range(3))} df = pd.DataFrame(d) with tm.ensure_clean() as path: - df.to_parquet(path, fp, compression=None, row_group_offsets=1) + df.to_parquet(path, engine=fp, compression=None, row_group_offsets=1) result = read_parquet(path, fp, filters=[("a", "==", 0)]) assert len(result) == 1 diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_pickle.py pandas-2.2.2+dfsg/pandas/tests/io/test_pickle.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_pickle.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_pickle.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,6 +10,8 @@ 3. Move the created pickle to "data/legacy_pickle/" directory. 
""" +from __future__ import annotations + from array import array import bz2 import datetime @@ -22,6 +24,7 @@ import pickle import shutil import tarfile +from typing import Any import uuid import zipfile @@ -38,6 +41,7 @@ import pandas as pd from pandas import ( + DataFrame, Index, Series, period_range, @@ -52,12 +56,6 @@ ) -@pytest.fixture -def current_pickle_data(): - # our current version pickle data - return create_pickle_data() - - # --------------------- # comparison functions # --------------------- @@ -173,6 +171,15 @@ return pickle.load(fh) +def flatten(data: dict) -> list[tuple[str, Any]]: + """Flatten create_pickle_data""" + return [ + (typ, example) + for typ, examples in data.items() + for example in examples.values() + ] + + @pytest.mark.parametrize( "pickle_writer", [ @@ -190,39 +197,45 @@ ], ) @pytest.mark.parametrize("writer", [pd.to_pickle, python_pickler]) -def test_round_trip_current(current_pickle_data, pickle_writer, writer): - data = current_pickle_data - for typ, dv in data.items(): - for dt, expected in dv.items(): - with tm.ensure_clean() as path: - # test writing with each pickler - pickle_writer(expected, path) - - # test reading with each unpickler - result = pd.read_pickle(path) - compare_element(result, expected, typ) - - result = python_unpickler(path) - compare_element(result, expected, typ) - - # and the same for file objects (GH 35679) - with open(path, mode="wb") as handle: - writer(expected, path) - handle.seek(0) # shouldn't close file handle - with open(path, mode="rb") as handle: - result = pd.read_pickle(handle) - handle.seek(0) # shouldn't close file handle - compare_element(result, expected, typ) +@pytest.mark.parametrize("typ, expected", flatten(create_pickle_data())) +def test_round_trip_current(typ, expected, pickle_writer, writer): + with tm.ensure_clean() as path: + # test writing with each pickler + pickle_writer(expected, path) + + # test reading with each unpickler + result = pd.read_pickle(path) + compare_element(result, expected, typ) + + result = python_unpickler(path) + compare_element(result, expected, typ) + + # and the same for file objects (GH 35679) + with open(path, mode="wb") as handle: + writer(expected, path) + handle.seek(0) # shouldn't close file handle + with open(path, mode="rb") as handle: + result = pd.read_pickle(handle) + handle.seek(0) # shouldn't close file handle + compare_element(result, expected, typ) def test_pickle_path_pathlib(): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) result = tm.round_trip_pathlib(df.to_pickle, pd.read_pickle) tm.assert_frame_equal(df, result) def test_pickle_path_localpath(): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) result = tm.round_trip_localpath(df.to_pickle, pd.read_pickle) tm.assert_frame_equal(df, result) @@ -276,7 +289,11 @@ path2 = base + ".raw" with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) # write to compressed file df.to_pickle(p1, compression=compression) @@ -295,7 +312,11 @@ def test_write_explicit_bad(self, compression, get_random_path): with pytest.raises(ValueError, 
match="Unrecognized compression type"): with tm.ensure_clean(get_random_path) as path: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) df.to_pickle(path, compression=compression) def test_write_infer(self, compression_ext, get_random_path): @@ -305,7 +326,11 @@ compression = self._extension_to_compression.get(compression_ext.lower()) with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) # write to compressed file by inferred compression method df.to_pickle(p1) @@ -326,7 +351,11 @@ path2 = base + ".compressed" with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) # write to uncompressed file df.to_pickle(p1, compression=None) @@ -345,7 +374,11 @@ compression = self._extension_to_compression.get(compression_ext.lower()) with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) # write to uncompressed file df.to_pickle(p1, compression=None) @@ -367,7 +400,11 @@ @pytest.mark.parametrize("protocol", [-1, 0, 1, 2]) def test_read(self, protocol, get_random_path): with tm.ensure_clean(get_random_path) as path: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) df.to_pickle(path, protocol=protocol) df2 = pd.read_pickle(path) tm.assert_frame_equal(df, df2) @@ -400,7 +437,11 @@ def test_pickle_buffer_roundtrip(): with tm.ensure_clean() as path: - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) with open(path, "wb") as fh: df.to_pickle(fh) with open(path, "rb") as fh: @@ -446,7 +487,11 @@ def mock_urlopen_read(*args, **kwargs): return MockReadResponse(path) - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) python_pickler(df, path) monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read) result = pd.read_pickle(mockurl) @@ -457,7 +502,11 @@ pytest.importorskip("fsspec") with tm.ensure_clean(): mockurl = "memory://mockfile" - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) df.to_pickle(mockurl) result = pd.read_pickle(mockurl) tm.assert_frame_equal(df, result) @@ -483,7 +532,11 @@ GH 26237, GH 29054, and GH 29570 """ - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) # reference for compression with tm.ensure_clean() as path: @@ -526,14 +579,14 @@ 
prng = period_range("1/1/2011", "1/1/2012", freq="M") ts = Series(np.random.default_rng(2).standard_normal(len(prng)), prng) new_ts = tm.round_trip_pickle(ts) - assert new_ts.index.freq == "M" + assert new_ts.index.freqstr == "M" @pytest.mark.parametrize( "name", [777, 777.0, "name", datetime.datetime(2001, 11, 11), (1, 2)] ) def test_pickle_preserve_name(name): - unpickled = tm.round_trip_pickle(tm.makeTimeSeries(name=name)) + unpickled = tm.round_trip_pickle(Series(np.arange(10, dtype=np.float64), name=name)) assert unpickled.name == name @@ -563,7 +616,7 @@ @pytest.mark.parametrize("protocol", [pickle.DEFAULT_PROTOCOL, pickle.HIGHEST_PROTOCOL]) def test_pickle_big_dataframe_compression(protocol, compression): # GH#39002 - df = pd.DataFrame(range(100000)) + df = DataFrame(range(100000)) result = tm.round_trip_pathlib( partial(df.to_pickle, protocol=protocol, compression=compression), partial(pd.read_pickle, compression=compression), @@ -583,5 +636,17 @@ with open(path, "rb") as fd: df = pickle.load(fd) - expected = pd.DataFrame(index=[], columns=[]) + expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(df, expected) + + +def test_pickle_pos_args_deprecation(): + # GH-54229 + df = DataFrame({"a": [1, 2, 3]}) + msg = ( + r"Starting with pandas version 3.0 all arguments of to_pickle except for the " + r"argument 'path' will be keyword-only." + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + buffer = io.BytesIO() + df.to_pickle(buffer, "infer") diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_s3.py pandas-2.2.2+dfsg/pandas/tests/io/test_s3.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_s3.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_s3.py 2024-04-10 17:42:52.000000000 +0000 @@ -30,15 +30,10 @@ @pytest.mark.single_cpu -def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, monkeypatch, s3so): +def test_read_with_creds_from_pub_bucket(s3_public_bucket_with_data, s3so): # Ensure we can read from a public bucket with credentials # GH 34626 - - # temporary workaround as moto fails for botocore >= 1.11 otherwise, - # see https://github.com/spulec/moto/issues/1924 & 1952 pytest.importorskip("s3fs") - monkeypatch.setenv("AWS_ACCESS_KEY_ID", "foobar_key") - monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") df = read_csv( f"s3://{s3_public_bucket_with_data.name}/tips.csv", nrows=5, diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_spss.py pandas-2.2.2+dfsg/pandas/tests/io/test_spss.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_spss.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_spss.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,3 +1,4 @@ +import datetime from pathlib import Path import numpy as np @@ -5,6 +6,7 @@ import pandas as pd import pandas._testing as tm +from pandas.util.version import Version pyreadstat = pytest.importorskip("pyreadstat") @@ -12,9 +14,11 @@ # TODO(CoW) - detection of chained assignment in cython # https://github.com/pandas-dev/pandas/issues/51315 @pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError") +@pytest.mark.filterwarnings("ignore:ChainedAssignmentError:FutureWarning") @pytest.mark.parametrize("path_klass", [lambda p: p, Path]) def test_spss_labelled_num(path_klass, datapath): # test file from the Haven project (https://haven.tidyverse.org/) + # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT fname = path_klass(datapath("io", "data", "spss", "labelled-num.sav")) df = pd.read_spss(fname, convert_categoricals=True) @@ -28,8 
+32,10 @@ @pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError") +@pytest.mark.filterwarnings("ignore:ChainedAssignmentError:FutureWarning") def test_spss_labelled_num_na(datapath): # test file from the Haven project (https://haven.tidyverse.org/) + # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT fname = datapath("io", "data", "spss", "labelled-num-na.sav") df = pd.read_spss(fname, convert_categoricals=True) @@ -43,8 +49,10 @@ @pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError") +@pytest.mark.filterwarnings("ignore:ChainedAssignmentError:FutureWarning") def test_spss_labelled_str(datapath): # test file from the Haven project (https://haven.tidyverse.org/) + # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT fname = datapath("io", "data", "spss", "labelled-str.sav") df = pd.read_spss(fname, convert_categoricals=True) @@ -58,8 +66,10 @@ @pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError") +@pytest.mark.filterwarnings("ignore:ChainedAssignmentError:FutureWarning") def test_spss_umlauts(datapath): # test file from the Haven project (https://haven.tidyverse.org/) + # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT fname = datapath("io", "data", "spss", "umlauts.sav") df = pd.read_spss(fname, convert_categoricals=True) @@ -84,6 +94,7 @@ def test_spss_umlauts_dtype_backend(datapath, dtype_backend): # test file from the Haven project (https://haven.tidyverse.org/) + # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT fname = datapath("io", "data", "spss", "umlauts.sav") df = pd.read_spss(fname, convert_categoricals=False, dtype_backend=dtype_backend) @@ -111,3 +122,43 @@ ) with pytest.raises(ValueError, match=msg): pd.read_spss("test", dtype_backend="numpy") + + +@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError") +@pytest.mark.filterwarnings("ignore:ChainedAssignmentError:FutureWarning") +def test_spss_metadata(datapath): + # GH 54264 + fname = datapath("io", "data", "spss", "labelled-num.sav") + + df = pd.read_spss(fname) + metadata = { + "column_names": ["VAR00002"], + "column_labels": [None], + "column_names_to_labels": {"VAR00002": None}, + "file_encoding": "UTF-8", + "number_columns": 1, + "number_rows": 1, + "variable_value_labels": {"VAR00002": {1.0: "This is one"}}, + "value_labels": {"labels0": {1.0: "This is one"}}, + "variable_to_label": {"VAR00002": "labels0"}, + "notes": [], + "original_variable_types": {"VAR00002": "F8.0"}, + "readstat_variable_types": {"VAR00002": "double"}, + "table_name": None, + "missing_ranges": {}, + "missing_user_values": {}, + "variable_storage_width": {"VAR00002": 8}, + "variable_display_width": {"VAR00002": 8}, + "variable_alignment": {"VAR00002": "unknown"}, + "variable_measure": {"VAR00002": "unknown"}, + "file_label": None, + "file_format": "sav/zsav", + } + if Version(pyreadstat.__version__) >= Version("1.2.4"): + metadata.update( + { + "creation_time": datetime.datetime(2015, 2, 6, 14, 33, 36), + "modification_time": datetime.datetime(2015, 2, 6, 14, 33, 36), + } + ) + assert df.attrs == metadata diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_sql.py pandas-2.2.2+dfsg/pandas/tests/io/test_sql.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_sql.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_sql.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,20 +1,3 @@ -"""SQL io tests - -The SQL tests are broken down in different classes: - -- `PandasSQLTest`: base class with common methods for all test classes -- Tests for 
the public API (only tests with sqlite3) - - `_TestSQLApi` base class - - `TestSQLApi`: test the public API with sqlalchemy engine - - `TestSQLiteFallbackApi`: test the public API with a sqlite DBAPI - connection -- Tests for the different SQL flavors (flavor specific type conversions) - - Tests for the sqlalchemy mode: `_TestSQLAlchemy` is the base class with - common methods. The different tested flavors (sqlite3, MySQL, - PostgreSQL) derive from the base class - - Tests for the fallback mode (`TestSQLiteFallback`) - -""" from __future__ import annotations import contextlib @@ -29,18 +12,23 @@ from io import StringIO from pathlib import Path import sqlite3 +from typing import TYPE_CHECKING import uuid import numpy as np import pytest from pandas._libs import lib +from pandas.compat import ( + pa_version_under13p0, + pa_version_under14p1, +) +from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td import pandas as pd from pandas import ( DataFrame, - DatetimeTZDtype, Index, MultiIndex, Series, @@ -69,12 +57,13 @@ read_sql_table, ) -try: +if TYPE_CHECKING: import sqlalchemy - SQLALCHEMY_INSTALLED = True -except ImportError: - SQLALCHEMY_INSTALLED = False + +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) @pytest.fixture @@ -106,17 +95,18 @@ } -def iris_table_metadata(dialect: str): +def iris_table_metadata(): + import sqlalchemy from sqlalchemy import ( - REAL, Column, + Double, Float, MetaData, String, Table, ) - dtype = Float if dialect == "postgresql" else REAL + dtype = Double if Version(sqlalchemy.__version__) >= Version("2.0.0") else Float metadata = MetaData() iris = Table( "iris", @@ -130,8 +120,7 @@ return iris -def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): - cur = conn.cursor() +def create_and_load_iris_sqlite3(conn, iris_file: Path): stmt = """CREATE TABLE iris ( "SepalLength" REAL, "SepalWidth" REAL, @@ -139,36 +128,77 @@ "PetalWidth" REAL, "Name" TEXT )""" + + cur = conn.cursor() cur.execute(stmt) with iris_file.open(newline=None, encoding="utf-8") as csvfile: reader = csv.reader(csvfile) next(reader) stmt = "INSERT INTO iris VALUES(?, ?, ?, ?, ?)" - cur.executemany(stmt, reader) + # ADBC requires explicit types - no implicit str -> float conversion + records = [] + records = [ + ( + float(row[0]), + float(row[1]), + float(row[2]), + float(row[3]), + row[4], + ) + for row in reader + ] + + cur.executemany(stmt, records) + cur.close() + + conn.commit() -def create_and_load_iris(conn, iris_file: Path, dialect: str): +def create_and_load_iris_postgresql(conn, iris_file: Path): + stmt = """CREATE TABLE iris ( + "SepalLength" DOUBLE PRECISION, + "SepalWidth" DOUBLE PRECISION, + "PetalLength" DOUBLE PRECISION, + "PetalWidth" DOUBLE PRECISION, + "Name" TEXT + )""" + with conn.cursor() as cur: + cur.execute(stmt) + with iris_file.open(newline=None, encoding="utf-8") as csvfile: + reader = csv.reader(csvfile) + next(reader) + stmt = "INSERT INTO iris VALUES($1, $2, $3, $4, $5)" + # ADBC requires explicit types - no implicit str -> float conversion + records = [ + ( + float(row[0]), + float(row[1]), + float(row[2]), + float(row[3]), + row[4], + ) + for row in reader + ] + + cur.executemany(stmt, records) + + conn.commit() + + +def create_and_load_iris(conn, iris_file: Path): from sqlalchemy import insert - from sqlalchemy.engine import Engine - iris = iris_table_metadata(dialect) + iris = iris_table_metadata() with iris_file.open(newline=None, 
encoding="utf-8") as csvfile: reader = csv.reader(csvfile) header = next(reader) params = [dict(zip(header, row)) for row in reader] stmt = insert(iris).values(params) - if isinstance(conn, Engine): - with conn.connect() as conn: - with conn.begin(): - iris.drop(conn, checkfirst=True) - iris.create(bind=conn) - conn.execute(stmt) - else: - with conn.begin(): - iris.drop(conn, checkfirst=True) - iris.create(bind=conn) - conn.execute(stmt) + with conn.begin() as con: + iris.drop(con, checkfirst=True) + iris.create(bind=con) + con.execute(stmt) def create_and_load_iris_view(conn): @@ -177,17 +207,17 @@ cur = conn.cursor() cur.execute(stmt) else: - from sqlalchemy import text - from sqlalchemy.engine import Engine - - stmt = text(stmt) - if isinstance(conn, Engine): - with conn.connect() as conn: - with conn.begin(): - conn.execute(stmt) + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(stmt) + conn.commit() else: - with conn.begin(): - conn.execute(stmt) + from sqlalchemy import text + + stmt = text(stmt) + with conn.begin() as con: + con.execute(stmt) def types_table_metadata(dialect: str): @@ -218,13 +248,10 @@ Column("IntColWithNull", Integer), Column("BoolColWithNull", bool_type), ) - if dialect == "postgresql": - types.append_column(Column("DateColWithTz", DateTime(timezone=True))) return types -def create_and_load_types_sqlite3(conn: sqlite3.Connection, types_data: list[dict]): - cur = conn.cursor() +def create_and_load_types_sqlite3(conn, types_data: list[dict]): stmt = """CREATE TABLE types ( "TextCol" TEXT, "DateCol" TEXT, @@ -236,13 +263,47 @@ "IntColWithNull" INTEGER, "BoolColWithNull" INTEGER )""" - cur.execute(stmt) - stmt = """ - INSERT INTO types - VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) - """ - cur.executemany(stmt, types_data) + ins_stmt = """ + INSERT INTO types + VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """ + + if isinstance(conn, sqlite3.Connection): + cur = conn.cursor() + cur.execute(stmt) + cur.executemany(ins_stmt, types_data) + else: + with conn.cursor() as cur: + cur.execute(stmt) + cur.executemany(ins_stmt, types_data) + + conn.commit() + + +def create_and_load_types_postgresql(conn, types_data: list[dict]): + with conn.cursor() as cur: + stmt = """CREATE TABLE types ( + "TextCol" TEXT, + "DateCol" TIMESTAMP, + "IntDateCol" INTEGER, + "IntDateOnlyCol" INTEGER, + "FloatCol" DOUBLE PRECISION, + "IntCol" INTEGER, + "BoolCol" BOOLEAN, + "IntColWithNull" INTEGER, + "BoolColWithNull" BOOLEAN + )""" + cur.execute(stmt) + + stmt = """ + INSERT INTO types + VALUES($1, $2::timestamp, $3, $4, $5, $6, $7, $8, $9) + """ + + cur.executemany(stmt, types_data) + + conn.commit() def create_and_load_types(conn, types_data: list[dict], dialect: str): @@ -265,19 +326,71 @@ conn.execute(stmt) +def create_and_load_postgres_datetz(conn): + from sqlalchemy import ( + Column, + DateTime, + MetaData, + Table, + insert, + ) + from sqlalchemy.engine import Engine + + metadata = MetaData() + datetz = Table("datetz", metadata, Column("DateColWithTz", DateTime(timezone=True))) + datetz_data = [ + { + "DateColWithTz": "2000-01-01 00:00:00-08:00", + }, + { + "DateColWithTz": "2000-06-01 00:00:00-07:00", + }, + ] + stmt = insert(datetz).values(datetz_data) + if isinstance(conn, Engine): + with conn.connect() as conn: + with conn.begin(): + datetz.drop(conn, checkfirst=True) + datetz.create(bind=conn) + conn.execute(stmt) + else: + with conn.begin(): + datetz.drop(conn, checkfirst=True) + datetz.create(bind=conn) + conn.execute(stmt) + + # "2000-01-01 00:00:00-08:00" should convert to + # "2000-01-01 08:00:00" + # "2000-06-01 00:00:00-07:00" should convert to + # "2000-06-01 07:00:00" + # GH 6415 + expected_data = [ + Timestamp("2000-01-01 08:00:00", tz="UTC"), + Timestamp("2000-06-01 07:00:00", tz="UTC"), + ] + return Series(expected_data, name="DateColWithTz") + + def check_iris_frame(frame: DataFrame): pytype = frame.dtypes.iloc[0].type row = frame.iloc[0] assert issubclass(pytype, np.floating) - tm.equalContents(row.values, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) + tm.assert_series_equal( + row, Series([5.1, 3.5, 1.4, 0.2, "Iris-setosa"], index=frame.columns, name=0) + ) assert frame.shape in ((150, 5), (8, 5)) def count_rows(conn, table_name: str): stmt = f"SELECT count(*) AS count_1 FROM {table_name}" + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") if isinstance(conn, sqlite3.Connection): cur = conn.cursor() return cur.execute(stmt).fetchone()[0] + elif adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(stmt) + return cur.fetchone()[0] else: from sqlalchemy import create_engine from sqlalchemy.engine import Engine @@ -315,7 +428,6 @@ "BoolCol": False, "IntColWithNull": 1, "BoolColWithNull": False, - "DateColWithTz": "2000-01-01 00:00:00-08:00", }, { "TextCol": "first", @@ -327,7 +439,6 @@ "BoolCol": False, "IntColWithNull": None, "BoolColWithNull": None, - "DateColWithTz": "2000-06-01 00:00:00-07:00", }, ] @@ -397,8 +508,99 @@ return DataFrame(data, columns=columns) +def get_all_views(conn): + if isinstance(conn, sqlite3.Connection): + c = conn.execute("SELECT name FROM sqlite_master WHERE type='view'") + return [view[0] for view in c.fetchall()] + else: + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + results = [] + info = 
conn.adbc_get_objects().read_all().to_pylist() + for catalog in info: + catalog["catalog_name"] + for schema in catalog["catalog_db_schemas"]: + schema["db_schema_name"] + for table in schema["db_schema_tables"]: + if table["table_type"] == "view": + view_name = table["table_name"] + results.append(view_name) + + return results + else: + from sqlalchemy import inspect + + return inspect(conn).get_view_names() + + +def get_all_tables(conn): + if isinstance(conn, sqlite3.Connection): + c = conn.execute("SELECT name FROM sqlite_master WHERE type='table'") + return [table[0] for table in c.fetchall()] + else: + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + + if adbc and isinstance(conn, adbc.Connection): + results = [] + info = conn.adbc_get_objects().read_all().to_pylist() + for catalog in info: + for schema in catalog["catalog_db_schemas"]: + for table in schema["db_schema_tables"]: + if table["table_type"] == "table": + table_name = table["table_name"] + results.append(table_name) + + return results + else: + from sqlalchemy import inspect + + return inspect(conn).get_table_names() + + +def drop_table( + table_name: str, + conn: sqlite3.Connection | sqlalchemy.engine.Engine | sqlalchemy.engine.Connection, +): + if isinstance(conn, sqlite3.Connection): + conn.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}") + conn.commit() + + else: + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(f'DROP TABLE IF EXISTS "{table_name}"') + else: + with conn.begin() as con: + with sql.SQLDatabase(con) as db: + db.drop_table(table_name) + + +def drop_view( + view_name: str, + conn: sqlite3.Connection | sqlalchemy.engine.Engine | sqlalchemy.engine.Connection, +): + import sqlalchemy + + if isinstance(conn, sqlite3.Connection): + conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}") + conn.commit() + else: + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(f'DROP VIEW IF EXISTS "{view_name}"') + else: + quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( + view_name + ) + stmt = sqlalchemy.text(f"DROP VIEW IF EXISTS {quoted_view}") + with conn.begin() as con: + con.execute(stmt) # type: ignore[union-attr] + + @pytest.fixture -def mysql_pymysql_engine(iris_path, types_data): +def mysql_pymysql_engine(): sqlalchemy = pytest.importorskip("sqlalchemy") pymysql = pytest.importorskip("pymysql") engine = sqlalchemy.create_engine( @@ -406,59 +608,148 @@ connect_args={"client_flag": pymysql.constants.CLIENT.MULTI_STATEMENTS}, poolclass=sqlalchemy.pool.NullPool, ) - insp = sqlalchemy.inspect(engine) - if not insp.has_table("iris"): - create_and_load_iris(engine, iris_path, "mysql") - if not insp.has_table("types"): - for entry in types_data: - entry.pop("DateColWithTz") - create_and_load_types(engine, types_data, "mysql") yield engine - with engine.connect() as conn: - with conn.begin(): - stmt = sqlalchemy.text("DROP TABLE IF EXISTS test_frame;") - conn.execute(stmt) + for view in get_all_views(engine): + drop_view(view, engine) + for tbl in get_all_tables(engine): + drop_table(tbl, engine) engine.dispose() @pytest.fixture +def mysql_pymysql_engine_iris(mysql_pymysql_engine, iris_path): + create_and_load_iris(mysql_pymysql_engine, iris_path) + 
create_and_load_iris_view(mysql_pymysql_engine) + yield mysql_pymysql_engine + + +@pytest.fixture +def mysql_pymysql_engine_types(mysql_pymysql_engine, types_data): + create_and_load_types(mysql_pymysql_engine, types_data, "mysql") + yield mysql_pymysql_engine + + +@pytest.fixture def mysql_pymysql_conn(mysql_pymysql_engine): with mysql_pymysql_engine.connect() as conn: yield conn @pytest.fixture -def postgresql_psycopg2_engine(iris_path, types_data): +def mysql_pymysql_conn_iris(mysql_pymysql_engine_iris): + with mysql_pymysql_engine_iris.connect() as conn: + yield conn + + +@pytest.fixture +def mysql_pymysql_conn_types(mysql_pymysql_engine_types): + with mysql_pymysql_engine_types.connect() as conn: + yield conn + + +@pytest.fixture +def postgresql_psycopg2_engine(): sqlalchemy = pytest.importorskip("sqlalchemy") pytest.importorskip("psycopg2") engine = sqlalchemy.create_engine( "postgresql+psycopg2://postgres:postgres@localhost:5432/pandas", poolclass=sqlalchemy.pool.NullPool, ) - insp = sqlalchemy.inspect(engine) - if not insp.has_table("iris"): - create_and_load_iris(engine, iris_path, "postgresql") - if not insp.has_table("types"): - create_and_load_types(engine, types_data, "postgresql") yield engine - with engine.connect() as conn: - with conn.begin(): - stmt = sqlalchemy.text("DROP TABLE IF EXISTS test_frame;") - conn.execute(stmt) + for view in get_all_views(engine): + drop_view(view, engine) + for tbl in get_all_tables(engine): + drop_table(tbl, engine) engine.dispose() @pytest.fixture +def postgresql_psycopg2_engine_iris(postgresql_psycopg2_engine, iris_path): + create_and_load_iris(postgresql_psycopg2_engine, iris_path) + create_and_load_iris_view(postgresql_psycopg2_engine) + yield postgresql_psycopg2_engine + + +@pytest.fixture +def postgresql_psycopg2_engine_types(postgresql_psycopg2_engine, types_data): + create_and_load_types(postgresql_psycopg2_engine, types_data, "postgres") + yield postgresql_psycopg2_engine + + +@pytest.fixture def postgresql_psycopg2_conn(postgresql_psycopg2_engine): with postgresql_psycopg2_engine.connect() as conn: yield conn @pytest.fixture +def postgresql_adbc_conn(): + pytest.importorskip("adbc_driver_postgresql") + from adbc_driver_postgresql import dbapi + + uri = "postgresql://postgres:postgres@localhost:5432/pandas" + with dbapi.connect(uri) as conn: + yield conn + for view in get_all_views(conn): + drop_view(view, conn) + for tbl in get_all_tables(conn): + drop_table(tbl, conn) + conn.commit() + + +@pytest.fixture +def postgresql_adbc_iris(postgresql_adbc_conn, iris_path): + import adbc_driver_manager as mgr + + conn = postgresql_adbc_conn + + try: + conn.adbc_get_table_schema("iris") + except mgr.ProgrammingError: + conn.rollback() + create_and_load_iris_postgresql(conn, iris_path) + try: + conn.adbc_get_table_schema("iris_view") + except mgr.ProgrammingError: # note arrow-adbc issue 1022 + conn.rollback() + create_and_load_iris_view(conn) + yield conn + + +@pytest.fixture +def postgresql_adbc_types(postgresql_adbc_conn, types_data): + import adbc_driver_manager as mgr + + conn = postgresql_adbc_conn + + try: + conn.adbc_get_table_schema("types") + except mgr.ProgrammingError: + conn.rollback() + new_data = [tuple(entry.values()) for entry in types_data] + + create_and_load_types_postgresql(conn, new_data) + + yield conn + + +@pytest.fixture +def postgresql_psycopg2_conn_iris(postgresql_psycopg2_engine_iris): + with postgresql_psycopg2_engine_iris.connect() as conn: + yield conn + + +@pytest.fixture +def 
postgresql_psycopg2_conn_types(postgresql_psycopg2_engine_types): + with postgresql_psycopg2_engine_types.connect() as conn: + yield conn + + +@pytest.fixture def sqlite_str(): pytest.importorskip("sqlalchemy") with tm.ensure_clean() as name: - yield "sqlite:///" + name + yield f"sqlite:///{name}" @pytest.fixture @@ -466,6 +757,10 @@ sqlalchemy = pytest.importorskip("sqlalchemy") engine = sqlalchemy.create_engine(sqlite_str, poolclass=sqlalchemy.pool.NullPool) yield engine + for view in get_all_views(engine): + drop_view(view, engine) + for tbl in get_all_tables(engine): + drop_table(tbl, engine) engine.dispose() @@ -476,27 +771,106 @@ @pytest.fixture -def sqlite_iris_str(sqlite_str, iris_path): +def sqlite_str_iris(sqlite_str, iris_path): + sqlalchemy = pytest.importorskip("sqlalchemy") + engine = sqlalchemy.create_engine(sqlite_str) + create_and_load_iris(engine, iris_path) + create_and_load_iris_view(engine) + engine.dispose() + return sqlite_str + + +@pytest.fixture +def sqlite_engine_iris(sqlite_engine, iris_path): + create_and_load_iris(sqlite_engine, iris_path) + create_and_load_iris_view(sqlite_engine) + yield sqlite_engine + + +@pytest.fixture +def sqlite_conn_iris(sqlite_engine_iris): + with sqlite_engine_iris.connect() as conn: + yield conn + + +@pytest.fixture +def sqlite_str_types(sqlite_str, types_data): sqlalchemy = pytest.importorskip("sqlalchemy") engine = sqlalchemy.create_engine(sqlite_str) - create_and_load_iris(engine, iris_path, "sqlite") + create_and_load_types(engine, types_data, "sqlite") engine.dispose() return sqlite_str @pytest.fixture -def sqlite_iris_engine(sqlite_engine, iris_path): - create_and_load_iris(sqlite_engine, iris_path, "sqlite") - return sqlite_engine +def sqlite_engine_types(sqlite_engine, types_data): + create_and_load_types(sqlite_engine, types_data, "sqlite") + yield sqlite_engine @pytest.fixture -def sqlite_iris_conn(sqlite_iris_engine): - with sqlite_iris_engine.connect() as conn: +def sqlite_conn_types(sqlite_engine_types): + with sqlite_engine_types.connect() as conn: yield conn @pytest.fixture +def sqlite_adbc_conn(): + pytest.importorskip("adbc_driver_sqlite") + from adbc_driver_sqlite import dbapi + + with tm.ensure_clean() as name: + uri = f"file:{name}" + with dbapi.connect(uri) as conn: + yield conn + for view in get_all_views(conn): + drop_view(view, conn) + for tbl in get_all_tables(conn): + drop_table(tbl, conn) + conn.commit() + + +@pytest.fixture +def sqlite_adbc_iris(sqlite_adbc_conn, iris_path): + import adbc_driver_manager as mgr + + conn = sqlite_adbc_conn + try: + conn.adbc_get_table_schema("iris") + except mgr.ProgrammingError: + conn.rollback() + create_and_load_iris_sqlite3(conn, iris_path) + try: + conn.adbc_get_table_schema("iris_view") + except mgr.ProgrammingError: + conn.rollback() + create_and_load_iris_view(conn) + yield conn + + +@pytest.fixture +def sqlite_adbc_types(sqlite_adbc_conn, types_data): + import adbc_driver_manager as mgr + + conn = sqlite_adbc_conn + try: + conn.adbc_get_table_schema("types") + except mgr.ProgrammingError: + conn.rollback() + new_data = [] + for entry in types_data: + entry["BoolCol"] = int(entry["BoolCol"]) + if entry["BoolColWithNull"] is not None: + entry["BoolColWithNull"] = int(entry["BoolColWithNull"]) + new_data.append(tuple(entry.values())) + + create_and_load_types_sqlite3(conn, new_data) + conn.commit() + + yield conn + + +@pytest.fixture def sqlite_buildin(): with contextlib.closing(sqlite3.connect(":memory:")) as closing_conn: with closing_conn as conn: @@ -506,18 +880,45 
@@ @pytest.fixture def sqlite_buildin_iris(sqlite_buildin, iris_path): create_and_load_iris_sqlite3(sqlite_buildin, iris_path) - return sqlite_buildin + create_and_load_iris_view(sqlite_buildin) + yield sqlite_buildin + + +@pytest.fixture +def sqlite_buildin_types(sqlite_buildin, types_data): + types_data = [tuple(entry.values()) for entry in types_data] + create_and_load_types_sqlite3(sqlite_buildin, types_data) + yield sqlite_buildin mysql_connectable = [ - "mysql_pymysql_engine", - "mysql_pymysql_conn", + pytest.param("mysql_pymysql_engine", marks=pytest.mark.db), + pytest.param("mysql_pymysql_conn", marks=pytest.mark.db), ] +mysql_connectable_iris = [ + pytest.param("mysql_pymysql_engine_iris", marks=pytest.mark.db), + pytest.param("mysql_pymysql_conn_iris", marks=pytest.mark.db), +] + +mysql_connectable_types = [ + pytest.param("mysql_pymysql_engine_types", marks=pytest.mark.db), + pytest.param("mysql_pymysql_conn_types", marks=pytest.mark.db), +] postgresql_connectable = [ - "postgresql_psycopg2_engine", - "postgresql_psycopg2_conn", + pytest.param("postgresql_psycopg2_engine", marks=pytest.mark.db), + pytest.param("postgresql_psycopg2_conn", marks=pytest.mark.db), +] + +postgresql_connectable_iris = [ + pytest.param("postgresql_psycopg2_engine_iris", marks=pytest.mark.db), + pytest.param("postgresql_psycopg2_conn_iris", marks=pytest.mark.db), +] + +postgresql_connectable_types = [ + pytest.param("postgresql_psycopg2_engine_types", marks=pytest.mark.db), + pytest.param("postgresql_psycopg2_conn_types", marks=pytest.mark.db), ] sqlite_connectable = [ @@ -526,24 +927,55 @@ "sqlite_str", ] -sqlite_iris_connectable = [ - "sqlite_iris_engine", - "sqlite_iris_conn", - "sqlite_iris_str", +sqlite_connectable_iris = [ + "sqlite_engine_iris", + "sqlite_conn_iris", + "sqlite_str_iris", +] + +sqlite_connectable_types = [ + "sqlite_engine_types", + "sqlite_conn_types", + "sqlite_str_types", ] sqlalchemy_connectable = mysql_connectable + postgresql_connectable + sqlite_connectable sqlalchemy_connectable_iris = ( - mysql_connectable + postgresql_connectable + sqlite_iris_connectable + mysql_connectable_iris + postgresql_connectable_iris + sqlite_connectable_iris ) -all_connectable = sqlalchemy_connectable + ["sqlite_buildin"] +sqlalchemy_connectable_types = ( + mysql_connectable_types + postgresql_connectable_types + sqlite_connectable_types +) -all_connectable_iris = sqlalchemy_connectable_iris + ["sqlite_buildin_iris"] +adbc_connectable = [ + "sqlite_adbc_conn", + pytest.param("postgresql_adbc_conn", marks=pytest.mark.db), +] + +adbc_connectable_iris = [ + pytest.param("postgresql_adbc_iris", marks=pytest.mark.db), + pytest.param("sqlite_adbc_iris", marks=pytest.mark.db), +] + +adbc_connectable_types = [ + pytest.param("postgresql_adbc_types", marks=pytest.mark.db), + pytest.param("sqlite_adbc_types", marks=pytest.mark.db), +] + + +all_connectable = sqlalchemy_connectable + ["sqlite_buildin"] + adbc_connectable + +all_connectable_iris = ( + sqlalchemy_connectable_iris + ["sqlite_buildin_iris"] + adbc_connectable_iris +) + +all_connectable_types = ( + sqlalchemy_connectable_types + ["sqlite_buildin_types"] + adbc_connectable_types +) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql(conn, test_frame1, request): # GH 51086 if conn is sqlite_engine @@ -551,7 +983,21 @@ test_frame1.to_sql(name="test", con=conn, if_exists="append", index=False) -@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_dataframe_to_sql_empty(conn, 
test_frame1, request): + if conn == "postgresql_adbc_conn": + request.node.add_marker( + pytest.mark.xfail( + reason="postgres ADBC driver cannot insert index with null type", + strict=True, + ) + ) + # GH 51086 if conn is sqlite_engine + conn = request.getfixturevalue(conn) + empty_df = test_frame1.iloc[:0] + empty_df.to_sql(name="test", con=conn, if_exists="append", index=False) + + @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_arrow_dtypes(conn, request): # GH 52046 @@ -567,12 +1013,25 @@ "string": pd.array(["a"], dtype="string[pyarrow]"), } ) + + if "adbc" in conn: + if conn == "sqlite_adbc_conn": + df = df.drop(columns=["timedelta"]) + if pa_version_under14p1: + exp_warning = DeprecationWarning + msg = "is_sparse is deprecated" + else: + exp_warning = None + msg = "" + else: + exp_warning = UserWarning + msg = "the 'timedelta'" + conn = request.getfixturevalue(conn) - with tm.assert_produces_warning(UserWarning, match="the 'timedelta'"): + with tm.assert_produces_warning(exp_warning, match=msg, check_stacklevel=False): df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): # GH 52046 @@ -588,10 +1047,16 @@ df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("method", [None, "multi"]) def test_to_sql(conn, method, test_frame1, request): + if method == "multi" and "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'method' not implemented for ADBC drivers", strict=True + ) + ) + conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: pandasSQL.to_sql(test_frame1, "test_frame", method=method) @@ -599,7 +1064,6 @@ assert count_rows(conn, "test_frame") == len(test_frame1) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("mode, num_row_coef", [("replace", 1), ("append", 2)]) def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): @@ -611,7 +1075,6 @@ assert count_rows(conn, "test_frame") == num_row_coef * len(test_frame1) -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_to_sql_exist_fail(conn, test_frame1, request): conn = request.getfixturevalue(conn) @@ -624,7 +1087,6 @@ pandasSQL.to_sql(test_frame1, "test_frame", if_exists="fail") -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query(conn, request): conn = request.getfixturevalue(conn) @@ -637,9 +1099,15 @@ assert "SepalWidth" in iris_frame.columns -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query_chunksize(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'chunksize' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) iris_frame = concat(read_sql_query("SELECT * FROM iris", conn, chunksize=7)) check_iris_frame(iris_frame) @@ -650,9 +1118,15 @@ assert "SepalWidth" in iris_frame.columns -@pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_query_expression_with_parameter(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'chunksize' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) 
from sqlalchemy import ( MetaData, @@ -672,9 +1146,16 @@ autoload_con.dispose() -@pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query_string_with_parameter(conn, request, sql_strings): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'chunksize' not implemented for ADBC drivers", + strict=True, + ) + ) + for db, query in sql_strings["read_parameters"].items(): if db in conn: break @@ -685,7 +1166,6 @@ check_iris_frame(iris_frame) -@pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_table(conn, request): # GH 51015 if conn = sqlite_iris_str @@ -696,9 +1176,12 @@ check_iris_frame(iris_frame) -@pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_table_chunksize(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) iris_frame = concat(read_sql_table("iris", conn, chunksize=7)) check_iris_frame(iris_frame) @@ -706,7 +1189,6 @@ check_iris_frame(iris_frame) -@pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable) def test_to_sql_callable(conn, test_frame1, request): conn = request.getfixturevalue(conn) @@ -725,26 +1207,38 @@ assert count_rows(conn, "test_frame") == len(test_frame1) -@pytest.mark.db -@pytest.mark.parametrize("conn", mysql_connectable) +@pytest.mark.parametrize("conn", all_connectable_types) def test_default_type_conversion(conn, request): + conn_name = conn + if conn_name == "sqlite_buildin_types": + request.applymarker( + pytest.mark.xfail( + reason="sqlite_buildin connection does not implement read_sql_table" + ) + ) + conn = request.getfixturevalue(conn) df = sql.read_sql_table("types", conn) assert issubclass(df.FloatCol.dtype.type, np.floating) assert issubclass(df.IntCol.dtype.type, np.integer) - # MySQL has no real BOOL type (it's an alias for TINYINT) - assert issubclass(df.BoolCol.dtype.type, np.integer) + # MySQL/sqlite has no real BOOL type + if "postgresql" in conn_name: + assert issubclass(df.BoolCol.dtype.type, np.bool_) + else: + assert issubclass(df.BoolCol.dtype.type, np.integer) # Int column with NA values stays as float assert issubclass(df.IntColWithNull.dtype.type, np.floating) # Bool column with NA = int column with NA values => becomes float - assert issubclass(df.BoolColWithNull.dtype.type, np.floating) + if "postgresql" in conn_name: + assert issubclass(df.BoolColWithNull.dtype.type, object) + else: + assert issubclass(df.BoolColWithNull.dtype.type, np.floating) -@pytest.mark.db @pytest.mark.parametrize("conn", mysql_connectable) def test_read_procedure(conn, request): conn = request.getfixturevalue(conn) @@ -782,7 +1276,6 @@ tm.assert_frame_equal(df, res2) -@pytest.mark.db @pytest.mark.parametrize("conn", postgresql_connectable) @pytest.mark.parametrize("expected_count", [2, "Success!"]) def test_copy_from_callable_insertion_method(conn, expected_count, request): @@ -822,7 +1315,6 @@ tm.assert_frame_equal(result, expected) -@pytest.mark.db @pytest.mark.parametrize("conn", postgresql_connectable) def test_insertion_method_on_conflict_do_nothing(conn, request): # GH 15988: Example in to_sql docstring @@ -881,7 +1373,30 @@ pandasSQL.drop_table("test_insert_conflict") -@pytest.mark.db +@pytest.mark.parametrize("conn", all_connectable) +def test_to_sql_on_public_schema(conn, request): + if "sqlite" in conn or "mysql" in conn: + request.applymarker( + 
pytest.mark.xfail( + reason="test for public schema only specific to postgresql" + ) + ) + + conn = request.getfixturevalue(conn) + + test_data = DataFrame([[1, 2.1, "a"], [2, 3.1, "b"]], columns=list("abc")) + test_data.to_sql( + name="test_public_schema", + con=conn, + if_exists="append", + index=False, + schema="public", + ) + + df_out = sql.read_sql_table("test_public_schema", conn, schema="public") + tm.assert_frame_equal(test_data, df_out) + + @pytest.mark.parametrize("conn", mysql_connectable) def test_insertion_method_on_conflict_update(conn, request): # GH 14553: Example in to_sql docstring @@ -935,7 +1450,6 @@ pandasSQL.drop_table("test_insert_conflict") -@pytest.mark.db @pytest.mark.parametrize("conn", postgresql_connectable) def test_read_view_postgres(conn, request): # GH 52969 @@ -997,1767 +1511,2175 @@ tm.assert_frame_equal(result, expected) -def test_execute_typeerror(sqlite_iris_engine): +def test_execute_typeerror(sqlite_engine_iris): with pytest.raises(TypeError, match="pandas.io.sql.execute requires a connection"): with tm.assert_produces_warning( FutureWarning, match="`pandas.io.sql.execute` is deprecated and " "will be removed in the future version.", ): - sql.execute("select * from iris", sqlite_iris_engine) + sql.execute("select * from iris", sqlite_engine_iris) -def test_execute_deprecated(sqlite_buildin_iris): +def test_execute_deprecated(sqlite_conn_iris): # GH50185 with tm.assert_produces_warning( FutureWarning, match="`pandas.io.sql.execute` is deprecated and " "will be removed in the future version.", ): - sql.execute("select * from iris", sqlite_buildin_iris) - + sql.execute("select * from iris", sqlite_conn_iris) -class MixInBase: - def teardown_method(self): - # if setup fails, there may not be a connection to close. - if hasattr(self, "conn"): - self.conn.close() - # use a fresh connection to ensure we can drop all tables. 
- try: - conn = self.connect() - except (sqlalchemy.exc.OperationalError, sqlite3.OperationalError): - pass - else: - with conn: - for view in self._get_all_views(conn): - self.drop_view(view, conn) - for tbl in self._get_all_tables(conn): - self.drop_table(tbl, conn) +def flavor(conn_name): + if "postgresql" in conn_name: + return "postgresql" + elif "sqlite" in conn_name: + return "sqlite" + elif "mysql" in conn_name: + return "mysql" -class SQLiteMixIn(MixInBase): - def connect(self): - return sqlite3.connect(":memory:") + raise ValueError(f"unsupported connection: {conn_name}") - def drop_table(self, table_name, conn): - conn.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}") - conn.commit() - def _get_all_tables(self, conn): - c = conn.execute("SELECT name FROM sqlite_master WHERE type='table'") - return [table[0] for table in c.fetchall()] +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_read_sql_iris_parameter(conn, request, sql_strings): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'params' not implemented for ADBC drivers", + strict=True, + ) + ) + conn_name = conn + conn = request.getfixturevalue(conn) + query = sql_strings["read_parameters"][flavor(conn_name)] + params = ("Iris-setosa", 5.1) + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=params) + check_iris_frame(iris_frame) - def drop_view(self, view_name, conn): - conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}") - conn.commit() - def _get_all_views(self, conn): - c = conn.execute("SELECT name FROM sqlite_master WHERE type='view'") - return [view[0] for view in c.fetchall()] +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_read_sql_iris_named_parameter(conn, request, sql_strings): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'params' not implemented for ADBC drivers", + strict=True, + ) + ) + conn_name = conn + conn = request.getfixturevalue(conn) + query = sql_strings["read_named_parameters"][flavor(conn_name)] + params = {"name": "Iris-setosa", "length": 5.1} + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=params) + check_iris_frame(iris_frame) -class SQLAlchemyMixIn(MixInBase): - @classmethod - def teardown_class(cls): - cls.engine.dispose() - - def connect(self): - return self.engine.connect() - - def drop_table(self, table_name, conn): - if conn.in_transaction(): - conn.get_transaction().rollback() - with conn.begin(): - sql.SQLDatabase(conn).drop_table(table_name) - def _get_all_tables(self, conn): - from sqlalchemy import inspect +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_read_sql_iris_no_parameter_with_percent(conn, request, sql_strings): + if "mysql" in conn or ("postgresql" in conn and "adbc" not in conn): + request.applymarker(pytest.mark.xfail(reason="broken test")) - return inspect(conn).get_table_names() + conn_name = conn + conn = request.getfixturevalue(conn) - def drop_view(self, view_name, conn): - quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( - view_name - ) - if conn.in_transaction(): - conn.get_transaction().rollback() - with conn.begin(): - conn.exec_driver_sql(f"DROP VIEW IF EXISTS {quoted_view}") + query = sql_strings["read_no_parameters_with_percent"][flavor(conn_name)] + with pandasSQL_builder(conn) as pandasSQL: + with 
pandasSQL.run_transaction(): + iris_frame = pandasSQL.read_query(query, params=None) + check_iris_frame(iris_frame) - def _get_all_views(self, conn): - from sqlalchemy import inspect - return inspect(conn).get_view_names() +# ----------------------------------------------------------------------------- +# -- Testing the public API -class PandasSQLTest: - """ - Base class with common private methods for SQLAlchemy and fallback cases. +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_read_sql_view(conn, request): + conn = request.getfixturevalue(conn) + iris_frame = sql.read_sql_query("SELECT * FROM iris_view", conn) + check_iris_frame(iris_frame) - """ - def load_iris_data(self, iris_path): - self.drop_table("iris", self.conn) - if isinstance(self.conn, sqlite3.Connection): - create_and_load_iris_sqlite3(self.conn, iris_path) - else: - create_and_load_iris(self.conn, iris_path, self.flavor) +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_read_sql_with_chunksize_no_result(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) + conn = request.getfixturevalue(conn) + query = 'SELECT * FROM iris_view WHERE "SepalLength" < 0.0' + with_batch = sql.read_sql_query(query, conn, chunksize=5) + without_batch = sql.read_sql_query(query, conn) + tm.assert_frame_equal(concat(with_batch), without_batch) - def load_types_data(self, types_data): - if self.flavor != "postgresql": - for entry in types_data: - entry.pop("DateColWithTz") - if isinstance(self.conn, sqlite3.Connection): - types_data = [tuple(entry.values()) for entry in types_data] - create_and_load_types_sqlite3(self.conn, types_data) - else: - create_and_load_types(self.conn, types_data, self.flavor) - def _read_sql_iris_parameter(self, sql_strings): - query = sql_strings["read_parameters"][self.flavor] - params = ("Iris-setosa", 5.1) - iris_frame = self.pandasSQL.read_query(query, params=params) - check_iris_frame(iris_frame) - - def _read_sql_iris_named_parameter(self, sql_strings): - query = sql_strings["read_named_parameters"][self.flavor] - params = {"name": "Iris-setosa", "length": 5.1} - iris_frame = self.pandasSQL.read_query(query, params=params) - check_iris_frame(iris_frame) - - def _read_sql_iris_no_parameter_with_percent(self, sql_strings): - query = sql_strings["read_no_parameters_with_percent"][self.flavor] - iris_frame = self.pandasSQL.read_query(query, params=None) - check_iris_frame(iris_frame) - - def _to_sql_empty(self, test_frame1): - self.drop_table("test_frame1", self.conn) - assert self.pandasSQL.to_sql(test_frame1.iloc[:0], "test_frame1") == 0 - - def _to_sql_with_sql_engine(self, test_frame1, engine="auto", **engine_kwargs): - """`to_sql` with the `engine` param""" - # mostly copied from this class's `_to_sql()` method - self.drop_table("test_frame1", self.conn) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame1", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame1") - assert ( - self.pandasSQL.to_sql( - test_frame1, "test_frame1", engine=engine, **engine_kwargs - ) - == 4 - ) - assert self.pandasSQL.has_table("test_frame1") + sql.to_sql(test_frame1, "test_frame1", conn) + assert sql.has_table("test_frame1", conn) - num_entries = len(test_frame1) - num_rows = count_rows(self.conn, "test_frame1") - assert num_rows == 
num_entries - # Nuke table - self.drop_table("test_frame1", self.conn) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_fail(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame2", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame2") + + sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail") + assert sql.has_table("test_frame2", conn) + + msg = "Table 'test_frame2' already exists" + with pytest.raises(ValueError, match=msg): + sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail") - def _roundtrip(self, test_frame1): - self.drop_table("test_frame_roundtrip", self.conn) - assert self.pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 - result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") - result.set_index("level_0", inplace=True) - # result.index.astype(int) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_replace(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame3", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame3") + + sql.to_sql(test_frame1, "test_frame3", conn, if_exists="fail") + # Add to table again + sql.to_sql(test_frame1, "test_frame3", conn, if_exists="replace") + assert sql.has_table("test_frame3", conn) - result.index.name = None + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame3") - tm.assert_frame_equal(result, test_frame1) + assert num_rows == num_entries - def _execute_sql(self): - # drop_sql = "DROP TABLE IF EXISTS test" # should already be done - iris_results = self.pandasSQL.execute("SELECT * FROM iris") - row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) - def _to_sql_save_index(self): - df = DataFrame.from_records( - [(1, 2.1, "line1"), (2, 1.5, "line2")], columns=["A", "B", "C"], index=["A"] - ) - assert self.pandasSQL.to_sql(df, "test_to_sql_saves_index") == 2 - ix_cols = self._get_index_columns("test_to_sql_saves_index") - assert ix_cols == [["A"]] +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_append(conn, request, test_frame1): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame4", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame4") - def _transaction_test(self): - with self.pandasSQL.run_transaction() as trans: - stmt = "CREATE TABLE test_trans (A INT, B TEXT)" - if isinstance(self.pandasSQL, SQLiteDatabase): - trans.execute(stmt) - else: - from sqlalchemy import text + assert sql.to_sql(test_frame1, "test_frame4", conn, if_exists="fail") == 4 - stmt = text(stmt) - trans.execute(stmt) + # Add to table again + assert sql.to_sql(test_frame1, "test_frame4", conn, if_exists="append") == 4 + assert sql.has_table("test_frame4", conn) - class DummyException(Exception): - pass + num_entries = 2 * len(test_frame1) + num_rows = count_rows(conn, "test_frame4") - # Make sure when transaction is rolled back, no rows get inserted - ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')" - if isinstance(self.pandasSQL, SQLDatabase): - from sqlalchemy import text + assert num_rows == num_entries - ins_sql = text(ins_sql) - try: - with self.pandasSQL.run_transaction() as trans: - trans.execute(ins_sql) - raise DummyException("error") - except DummyException: - # ignore raised exception - pass - res = 
self.pandasSQL.read_query("SELECT * FROM test_trans") - assert len(res) == 0 - # Make sure when transaction is committed, rows do get inserted - with self.pandasSQL.run_transaction() as trans: - trans.execute(ins_sql) - res2 = self.pandasSQL.read_query("SELECT * FROM test_trans") - assert len(res2) == 1 +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_type_mapping(conn, request, test_frame3): + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame5", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame5") + sql.to_sql(test_frame3, "test_frame5", conn, index=False) + result = sql.read_sql("SELECT * FROM test_frame5", conn) -# ----------------------------------------------------------------------------- -# -- Testing the public API + tm.assert_frame_equal(test_frame3, result) -class _TestSQLApi(PandasSQLTest): - """ - Base class to test the public API. +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_series(conn, request): + conn = request.getfixturevalue(conn) + if sql.has_table("test_series", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_series") + + s = Series(np.arange(5, dtype="int64"), name="series") + sql.to_sql(s, "test_series", conn, index=False) + s2 = sql.read_sql_query("SELECT * FROM test_series", conn) + tm.assert_frame_equal(s.to_frame(), s2) - From this two classes are derived to run these tests for both the - sqlalchemy mode (`TestSQLApi`) and the fallback mode - (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific - tests for the different sql flavours are included in `_TestSQLAlchemy`. - - Notes: - flavor can always be passed even in SQLAlchemy mode, - should be correctly ignored. - we don't use drop_table because that isn't part of the public api +@pytest.mark.parametrize("conn", all_connectable) +def test_api_roundtrip(conn, request, test_frame1): + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame_roundtrip", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame_roundtrip") + + sql.to_sql(test_frame1, "test_frame_roundtrip", con=conn) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn) + + # HACK! 
+ if "adbc" in conn_name: + result = result.rename(columns={"__index_level_0__": "level_0"}) + result.index = test_frame1.index + result.set_index("level_0", inplace=True) + result.index.astype(int) + result.index.name = None + tm.assert_frame_equal(result, test_frame1) - """ - flavor = "sqlite" - mode: str +@pytest.mark.parametrize("conn", all_connectable) +def test_api_roundtrip_chunksize(conn, request, test_frame1): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) + conn = request.getfixturevalue(conn) + if sql.has_table("test_frame_roundtrip", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_frame_roundtrip") + + sql.to_sql( + test_frame1, + "test_frame_roundtrip", + con=conn, + index=False, + chunksize=2, + ) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn) + tm.assert_frame_equal(result, test_frame1) - @pytest.fixture(autouse=True) - def setup_method(self, iris_path, types_data): - self.conn = self.connect() - self.load_iris_data(iris_path) - self.load_types_data(types_data) - self.load_test_data_and_sql() - - def load_test_data_and_sql(self): - create_and_load_iris_view(self.conn) - - def test_read_sql_view(self): - iris_frame = sql.read_sql_query("SELECT * FROM iris_view", self.conn) - check_iris_frame(iris_frame) - - def test_read_sql_with_chunksize_no_result(self): - query = "SELECT * FROM iris_view WHERE SepalLength < 0.0" - with_batch = sql.read_sql_query(query, self.conn, chunksize=5) - without_batch = sql.read_sql_query(query, self.conn) - tm.assert_frame_equal(concat(with_batch), without_batch) - - def test_to_sql(self, test_frame1): - sql.to_sql(test_frame1, "test_frame1", self.conn) - assert sql.has_table("test_frame1", self.conn) - - def test_to_sql_fail(self, test_frame1): - sql.to_sql(test_frame1, "test_frame2", self.conn, if_exists="fail") - assert sql.has_table("test_frame2", self.conn) - msg = "Table 'test_frame2' already exists" - with pytest.raises(ValueError, match=msg): - sql.to_sql(test_frame1, "test_frame2", self.conn, if_exists="fail") +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_api_execute_sql(conn, request): + # drop_sql = "DROP TABLE IF EXISTS test" # should already be done + conn = request.getfixturevalue(conn) + with sql.pandasSQL_builder(conn) as pandas_sql: + iris_results = pandas_sql.execute("SELECT * FROM iris") + row = iris_results.fetchone() + iris_results.close() + assert list(row) == [5.1, 3.5, 1.4, 0.2, "Iris-setosa"] - def test_to_sql_replace(self, test_frame1): - sql.to_sql(test_frame1, "test_frame3", self.conn, if_exists="fail") - # Add to table again - sql.to_sql(test_frame1, "test_frame3", self.conn, if_exists="replace") - assert sql.has_table("test_frame3", self.conn) - num_entries = len(test_frame1) - num_rows = count_rows(self.conn, "test_frame3") +@pytest.mark.parametrize("conn", all_connectable_types) +def test_api_date_parsing(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + # Test date parsing in read_sql + # No Parsing + df = sql.read_sql_query("SELECT * FROM types", conn) + if not ("mysql" in conn_name or "postgres" in conn_name): + assert not issubclass(df.DateCol.dtype.type, np.datetime64) - assert num_rows == num_entries + df = sql.read_sql_query("SELECT * FROM types", conn, parse_dates=["DateCol"]) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + Timestamp(2000, 1, 3, 0, 0, 0), + 
Timestamp(2000, 1, 4, 0, 0, 0), + ] - def test_to_sql_append(self, test_frame1): - assert sql.to_sql(test_frame1, "test_frame4", self.conn, if_exists="fail") == 4 + df = sql.read_sql_query( + "SELECT * FROM types", + conn, + parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}, + ) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + Timestamp(2000, 1, 3, 0, 0, 0), + Timestamp(2000, 1, 4, 0, 0, 0), + ] - # Add to table again - assert ( - sql.to_sql(test_frame1, "test_frame4", self.conn, if_exists="append") == 4 - ) - assert sql.has_table("test_frame4", self.conn) + df = sql.read_sql_query("SELECT * FROM types", conn, parse_dates=["IntDateCol"]) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + Timestamp(1986, 12, 25, 0, 0, 0), + Timestamp(2013, 1, 1, 0, 0, 0), + ] - num_entries = 2 * len(test_frame1) - num_rows = count_rows(self.conn, "test_frame4") + df = sql.read_sql_query( + "SELECT * FROM types", conn, parse_dates={"IntDateCol": "s"} + ) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + Timestamp(1986, 12, 25, 0, 0, 0), + Timestamp(2013, 1, 1, 0, 0, 0), + ] - assert num_rows == num_entries + df = sql.read_sql_query( + "SELECT * FROM types", + conn, + parse_dates={"IntDateOnlyCol": "%Y%m%d"}, + ) + assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) + assert df.IntDateOnlyCol.tolist() == [ + Timestamp("2010-10-10"), + Timestamp("2010-12-12"), + ] - def test_to_sql_type_mapping(self, test_frame3): - sql.to_sql(test_frame3, "test_frame5", self.conn, index=False) - result = sql.read_sql("SELECT * FROM test_frame5", self.conn) - - tm.assert_frame_equal(test_frame3, result) - - def test_to_sql_series(self): - s = Series(np.arange(5, dtype="int64"), name="series") - sql.to_sql(s, "test_series", self.conn, index=False) - s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn) - tm.assert_frame_equal(s.to_frame(), s2) - - def test_roundtrip(self, test_frame1): - sql.to_sql(test_frame1, "test_frame_roundtrip", con=self.conn) - result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) - - # HACK! 
- result.index = test_frame1.index - result.set_index("level_0", inplace=True) - result.index.astype(int) - result.index.name = None - tm.assert_frame_equal(result, test_frame1) - def test_roundtrip_chunksize(self, test_frame1): - sql.to_sql( - test_frame1, - "test_frame_roundtrip", - con=self.conn, - index=False, - chunksize=2, +@pytest.mark.parametrize("conn", all_connectable_types) +@pytest.mark.parametrize("error", ["ignore", "raise", "coerce"]) +@pytest.mark.parametrize( + "read_sql, text, mode", + [ + (sql.read_sql, "SELECT * FROM types", ("sqlalchemy", "fallback")), + (sql.read_sql, "types", ("sqlalchemy")), + ( + sql.read_sql_query, + "SELECT * FROM types", + ("sqlalchemy", "fallback"), + ), + (sql.read_sql_table, "types", ("sqlalchemy")), + ], +) +def test_api_custom_dateparsing_error( + conn, request, read_sql, text, mode, error, types_data_frame +): + conn_name = conn + conn = request.getfixturevalue(conn) + if text == "types" and conn_name == "sqlite_buildin_types": + request.applymarker( + pytest.mark.xfail(reason="failing combination of arguments") ) - result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) - tm.assert_frame_equal(result, test_frame1) - def test_execute_sql(self): - # drop_sql = "DROP TABLE IF EXISTS test" # should already be done - with sql.pandasSQL_builder(self.conn) as pandas_sql: - iris_results = pandas_sql.execute("SELECT * FROM iris") - row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) - - def test_date_parsing(self): - # Test date parsing in read_sql - # No Parsing - df = sql.read_sql_query("SELECT * FROM types", self.conn) - assert not issubclass(df.DateCol.dtype.type, np.datetime64) + expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) - df = sql.read_sql_query( - "SELECT * FROM types", self.conn, parse_dates=["DateCol"] - ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - assert df.DateCol.tolist() == [ - Timestamp(2000, 1, 3, 0, 0, 0), - Timestamp(2000, 1, 4, 0, 0, 0), - ] + result = read_sql( + text, + con=conn, + parse_dates={ + "DateCol": {"errors": error}, + }, + ) + if "postgres" in conn_name: + # TODO: clean up types_data_frame fixture + result["BoolCol"] = result["BoolCol"].astype(int) + result["BoolColWithNull"] = result["BoolColWithNull"].astype(float) - df = sql.read_sql_query( - "SELECT * FROM types", - self.conn, - parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}, + if conn_name == "postgresql_adbc_types": + expected = expected.astype( + { + "IntDateCol": "int32", + "IntDateOnlyCol": "int32", + "IntCol": "int32", + } ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - assert df.DateCol.tolist() == [ - Timestamp(2000, 1, 3, 0, 0, 0), - Timestamp(2000, 1, 4, 0, 0, 0), - ] - df = sql.read_sql_query( - "SELECT * FROM types", self.conn, parse_dates=["IntDateCol"] - ) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - assert df.IntDateCol.tolist() == [ - Timestamp(1986, 12, 25, 0, 0, 0), - Timestamp(2013, 1, 1, 0, 0, 0), - ] + if not pa_version_under13p0: + # TODO: is this astype safe? 
+ expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") - df = sql.read_sql_query( - "SELECT * FROM types", self.conn, parse_dates={"IntDateCol": "s"} - ) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - assert df.IntDateCol.tolist() == [ - Timestamp(1986, 12, 25, 0, 0, 0), - Timestamp(2013, 1, 1, 0, 0, 0), - ] + tm.assert_frame_equal(result, expected) - df = sql.read_sql_query( - "SELECT * FROM types", - self.conn, - parse_dates={"IntDateOnlyCol": "%Y%m%d"}, - ) - assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) - assert df.IntDateOnlyCol.tolist() == [ - Timestamp("2010-10-10"), - Timestamp("2010-12-12"), - ] - @pytest.mark.parametrize("error", ["ignore", "raise", "coerce"]) - @pytest.mark.parametrize( - "read_sql, text, mode", - [ - (sql.read_sql, "SELECT * FROM types", ("sqlalchemy", "fallback")), - (sql.read_sql, "types", ("sqlalchemy")), - ( - sql.read_sql_query, - "SELECT * FROM types", - ("sqlalchemy", "fallback"), - ), - (sql.read_sql_table, "types", ("sqlalchemy")), - ], +@pytest.mark.parametrize("conn", all_connectable_types) +def test_api_date_and_index(conn, request): + # Test case where same column appears in parse_date and index_col + conn = request.getfixturevalue(conn) + df = sql.read_sql_query( + "SELECT * FROM types", + conn, + index_col="DateCol", + parse_dates=["DateCol", "IntDateCol"], ) - def test_custom_dateparsing_error( - self, read_sql, text, mode, error, types_data_frame - ): - if self.mode in mode: - expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) - result = read_sql( - text, - con=self.conn, - parse_dates={ - "DateCol": {"errors": error}, - }, - ) + assert issubclass(df.index.dtype.type, np.datetime64) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - tm.assert_frame_equal(result, expected) - def test_date_and_index(self): - # Test case where same column appears in parse_date and index_col +@pytest.mark.parametrize("conn", all_connectable) +def test_api_timedelta(conn, request): + # see #6921 + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("test_timedelta", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_timedelta") + + df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() + + if conn_name == "sqlite_adbc_conn": + request.node.add_marker( + pytest.mark.xfail( + reason="sqlite ADBC driver doesn't implement timedelta", + ) + ) - df = sql.read_sql_query( - "SELECT * FROM types", - self.conn, - index_col="DateCol", - parse_dates=["DateCol", "IntDateCol"], + if "adbc" in conn_name: + if pa_version_under14p1: + exp_warning = DeprecationWarning + else: + exp_warning = None + else: + exp_warning = UserWarning + + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + result_count = df.to_sql(name="test_timedelta", con=conn) + assert result_count == 2 + result = sql.read_sql_query("SELECT * FROM test_timedelta", conn) + + if conn_name == "postgresql_adbc_conn": + # TODO: Postgres stores an INTERVAL, which ADBC reads as a Month-Day-Nano + # Interval; the default pandas type mapper maps this to a DateOffset + # but maybe we should try and restore the timedelta here? 
+ expected = Series( + [ + pd.DateOffset(months=0, days=0, microseconds=1000000, nanoseconds=0), + pd.DateOffset(months=0, days=0, microseconds=3000000, nanoseconds=0), + ], + name="foo", ) + else: + expected = df["foo"].astype("int64") + tm.assert_series_equal(result["foo"], expected) - assert issubclass(df.index.dtype.type, np.datetime64) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - def test_timedelta(self): - # see #6921 - df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() - with tm.assert_produces_warning(UserWarning): - result_count = df.to_sql(name="test_timedelta", con=self.conn) - assert result_count == 2 - result = sql.read_sql_query("SELECT * FROM test_timedelta", self.conn) - tm.assert_series_equal(result["foo"], df["foo"].view("int64")) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_complex_raises(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + df = DataFrame({"a": [1 + 1j, 2j]}) - def test_complex_raises(self): - df = DataFrame({"a": [1 + 1j, 2j]}) + if "adbc" in conn_name: + msg = "datatypes not supported" + else: msg = "Complex datatypes not supported" - with pytest.raises(ValueError, match=msg): - assert df.to_sql("test_complex", con=self.conn) is None + with pytest.raises(ValueError, match=msg): + assert df.to_sql("test_complex", con=conn) is None - @pytest.mark.parametrize( - "index_name,index_label,expected", - [ - # no index name, defaults to 'index' - (None, None, "index"), - # specifying index_label - (None, "other_label", "other_label"), - # using the index name - ("index_name", None, "index_name"), - # has index name, but specifying index_label - ("index_name", "other_label", "other_label"), - # index name is integer - (0, None, "0"), - # index name is None but index label is integer - (None, 0, "0"), - ], - ) - def test_to_sql_index_label(self, index_name, index_label, expected): - temp_frame = DataFrame({"col1": range(4)}) - temp_frame.index.name = index_name - query = "SELECT * FROM test_index_label" - sql.to_sql(temp_frame, "test_index_label", self.conn, index_label=index_label) - frame = sql.read_sql_query(query, self.conn) - assert frame.columns[0] == expected - - def test_to_sql_index_label_multiindex(self): - expected_row_count = 4 - temp_frame = DataFrame( - {"col1": range(4)}, - index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]), - ) - - # no index name, defaults to 'level_0' and 'level_1' - result = sql.to_sql(temp_frame, "test_index_label", self.conn) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[0] == "level_0" - assert frame.columns[1] == "level_1" +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize( + "index_name,index_label,expected", + [ + # no index name, defaults to 'index' + (None, None, "index"), # specifying index_label - result = sql.to_sql( - temp_frame, - "test_index_label", - self.conn, - if_exists="replace", - index_label=["A", "B"], + (None, "other_label", "other_label"), + # using the index name + ("index_name", None, "index_name"), + # has index name, but specifying index_label + ("index_name", "other_label", "other_label"), + # index name is integer + (0, None, "0"), + # index name is None but index label is integer + (None, 0, "0"), + ], +) +def test_api_to_sql_index_label(conn, request, index_name, index_label, expected): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="index_label argument 
NotImplemented with ADBC") ) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[:2].tolist() == ["A", "B"] + conn = request.getfixturevalue(conn) + if sql.has_table("test_index_label", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_index_label") + + temp_frame = DataFrame({"col1": range(4)}) + temp_frame.index.name = index_name + query = "SELECT * FROM test_index_label" + sql.to_sql(temp_frame, "test_index_label", conn, index_label=index_label) + frame = sql.read_sql_query(query, conn) + assert frame.columns[0] == expected - # using the index name - temp_frame.index.names = ["A", "B"] - result = sql.to_sql( - temp_frame, "test_index_label", self.conn, if_exists="replace" - ) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[:2].tolist() == ["A", "B"] - # has index name, but specifying index_label - result = sql.to_sql( +@pytest.mark.parametrize("conn", all_connectable) +def test_api_to_sql_index_label_multiindex(conn, request): + conn_name = conn + if "mysql" in conn_name: + request.applymarker( + pytest.mark.xfail( + reason="MySQL can fail using TEXT without length as key", strict=False + ) + ) + elif "adbc" in conn_name: + request.node.add_marker( + pytest.mark.xfail(reason="index_label argument NotImplemented with ADBC") + ) + + conn = request.getfixturevalue(conn) + if sql.has_table("test_index_label", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_index_label") + + expected_row_count = 4 + temp_frame = DataFrame( + {"col1": range(4)}, + index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]), + ) + + # no index name, defaults to 'level_0' and 'level_1' + result = sql.to_sql(temp_frame, "test_index_label", conn) + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[0] == "level_0" + assert frame.columns[1] == "level_1" + + # specifying index_label + result = sql.to_sql( + temp_frame, + "test_index_label", + conn, + if_exists="replace", + index_label=["A", "B"], + ) + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[:2].tolist() == ["A", "B"] + + # using the index name + temp_frame.index.names = ["A", "B"] + result = sql.to_sql(temp_frame, "test_index_label", conn, if_exists="replace") + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[:2].tolist() == ["A", "B"] + + # has index name, but specifying index_label + result = sql.to_sql( + temp_frame, + "test_index_label", + conn, + if_exists="replace", + index_label=["C", "D"], + ) + assert result == expected_row_count + frame = sql.read_sql_query("SELECT * FROM test_index_label", conn) + assert frame.columns[:2].tolist() == ["C", "D"] + + msg = "Length of 'index_label' should match number of levels, which is 2" + with pytest.raises(ValueError, match=msg): + sql.to_sql( temp_frame, "test_index_label", - self.conn, + conn, if_exists="replace", - index_label=["C", "D"], + index_label="C", ) - assert result == expected_row_count - frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) - assert frame.columns[:2].tolist() == ["C", "D"] - msg = "Length of 'index_label' should match number of levels, which is 2" - with 
pytest.raises(ValueError, match=msg): - sql.to_sql( - temp_frame, - "test_index_label", - self.conn, - if_exists="replace", - index_label="C", - ) - def test_multiindex_roundtrip(self): - df = DataFrame.from_records( - [(1, 2.1, "line1"), (2, 1.5, "line2")], - columns=["A", "B", "C"], - index=["A", "B"], +@pytest.mark.parametrize("conn", all_connectable) +def test_api_multiindex_roundtrip(conn, request): + conn = request.getfixturevalue(conn) + if sql.has_table("test_multiindex_roundtrip", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_multiindex_roundtrip") + + df = DataFrame.from_records( + [(1, 2.1, "line1"), (2, 1.5, "line2")], + columns=["A", "B", "C"], + index=["A", "B"], + ) + + df.to_sql(name="test_multiindex_roundtrip", con=conn) + result = sql.read_sql_query( + "SELECT * FROM test_multiindex_roundtrip", conn, index_col=["A", "B"] + ) + tm.assert_frame_equal(df, result, check_index_type=True) + + +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize( + "dtype", + [ + None, + int, + float, + {"A": int, "B": float}, + ], +) +def test_api_dtype_argument(conn, request, dtype): + # GH10285 Add dtype argument to read_sql_query + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("test_dtype_argument", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_dtype_argument") + + df = DataFrame([[1.2, 3.4], [5.6, 7.8]], columns=["A", "B"]) + assert df.to_sql(name="test_dtype_argument", con=conn) == 2 + + expected = df.astype(dtype) + + if "postgres" in conn_name: + query = 'SELECT "A", "B" FROM test_dtype_argument' + else: + query = "SELECT A, B FROM test_dtype_argument" + result = sql.read_sql_query(query, con=conn, dtype=dtype) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("conn", all_connectable) +def test_api_integer_col_names(conn, request): + conn = request.getfixturevalue(conn) + df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) + sql.to_sql(df, "test_frame_integer_col_names", conn, if_exists="replace") + + +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema(conn, request, test_frame1): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) ) + conn = request.getfixturevalue(conn) + create_sql = sql.get_schema(test_frame1, "test", con=conn) + assert "CREATE" in create_sql + - df.to_sql(name="test_multiindex_roundtrip", con=self.conn) - result = sql.read_sql_query( - "SELECT * FROM test_multiindex_roundtrip", self.conn, index_col=["A", "B"] +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema_with_schema(conn, request, test_frame1): + # GH28486 + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) ) - tm.assert_frame_equal(df, result, check_index_type=True) + conn = request.getfixturevalue(conn) + create_sql = sql.get_schema(test_frame1, "test", con=conn, schema="pypi") + assert "CREATE TABLE pypi." 
in create_sql - @pytest.mark.parametrize( - "dtype", - [ - None, - int, - float, - {"A": int, "B": float}, - ], - ) - def test_dtype_argument(self, dtype): - # GH10285 Add dtype argument to read_sql_query - df = DataFrame([[1.2, 3.4], [5.6, 7.8]], columns=["A", "B"]) - assert df.to_sql(name="test_dtype_argument", con=self.conn) == 2 - - expected = df.astype(dtype) - result = sql.read_sql_query( - "SELECT A, B FROM test_dtype_argument", con=self.conn, dtype=dtype + +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema_dtypes(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) ) + conn_name = conn + conn = request.getfixturevalue(conn) + float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) - tm.assert_frame_equal(result, expected) + if conn_name == "sqlite_buildin": + dtype = "INTEGER" + else: + from sqlalchemy import Integer - def test_integer_col_names(self): - df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) - sql.to_sql(df, "test_frame_integer_col_names", self.conn, if_exists="replace") - - def test_get_schema(self, test_frame1): - create_sql = sql.get_schema(test_frame1, "test", con=self.conn) - assert "CREATE" in create_sql - - def test_get_schema_with_schema(self, test_frame1): - # GH28486 - create_sql = sql.get_schema(test_frame1, "test", con=self.conn, schema="pypi") - assert "CREATE TABLE pypi." in create_sql - - def test_get_schema_dtypes(self): - if self.mode == "sqlalchemy": - from sqlalchemy import Integer + dtype = Integer + create_sql = sql.get_schema(float_frame, "test", con=conn, dtype={"b": dtype}) + assert "CREATE" in create_sql + assert "INTEGER" in create_sql - dtype = Integer - else: - dtype = "INTEGER" - float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) - create_sql = sql.get_schema( - float_frame, "test", con=self.conn, dtype={"b": dtype} - ) - assert "CREATE" in create_sql - assert "INTEGER" in create_sql - - def test_get_schema_keys(self, test_frame1): - frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) - create_sql = sql.get_schema(frame, "test", con=self.conn, keys="Col1") +@pytest.mark.parametrize("conn", all_connectable) +def test_api_get_schema_keys(conn, request, test_frame1): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) + ) + conn_name = conn + conn = request.getfixturevalue(conn) + frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) + create_sql = sql.get_schema(frame, "test", con=conn, keys="Col1") + + if "mysql" in conn_name: + constraint_sentence = "CONSTRAINT test_pk PRIMARY KEY (`Col1`)" + else: constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' - assert constraint_sentence in create_sql + assert constraint_sentence in create_sql - # multiple columns as key (GH10385) - create_sql = sql.get_schema(test_frame1, "test", con=self.conn, keys=["A", "B"]) + # multiple columns as key (GH10385) + create_sql = sql.get_schema(test_frame1, "test", con=conn, keys=["A", "B"]) + if "mysql" in conn_name: + constraint_sentence = "CONSTRAINT test_pk PRIMARY KEY (`A`, `B`)" + else: constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")' - assert constraint_sentence in create_sql + assert constraint_sentence in create_sql - def test_chunksize_read(self): - df = DataFrame( - np.random.default_rng(2).standard_normal((22, 5)), columns=list("abcde") + +@pytest.mark.parametrize("conn", 
all_connectable) +def test_api_chunksize_read(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") ) - df.to_sql(name="test_chunksize", con=self.conn, index=False) + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("test_chunksize", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_chunksize") + + df = DataFrame( + np.random.default_rng(2).standard_normal((22, 5)), columns=list("abcde") + ) + df.to_sql(name="test_chunksize", con=conn, index=False) - # reading the query in one time - res1 = sql.read_sql_query("select * from test_chunksize", self.conn) + # reading the query in one time + res1 = sql.read_sql_query("select * from test_chunksize", conn) - # reading the query in chunks with read_sql_query - res2 = DataFrame() + # reading the query in chunks with read_sql_query + res2 = DataFrame() + i = 0 + sizes = [5, 5, 5, 5, 2] + + for chunk in sql.read_sql_query("select * from test_chunksize", conn, chunksize=5): + res2 = concat([res2, chunk], ignore_index=True) + assert len(chunk) == sizes[i] + i += 1 + + tm.assert_frame_equal(res1, res2) + + # reading the query in chunks with read_sql_query + if conn_name == "sqlite_buildin": + with pytest.raises(NotImplementedError, match=""): + sql.read_sql_table("test_chunksize", conn, chunksize=5) + else: + res3 = DataFrame() i = 0 sizes = [5, 5, 5, 5, 2] - for chunk in sql.read_sql_query( - "select * from test_chunksize", self.conn, chunksize=5 - ): - res2 = concat([res2, chunk], ignore_index=True) + for chunk in sql.read_sql_table("test_chunksize", conn, chunksize=5): + res3 = concat([res3, chunk], ignore_index=True) assert len(chunk) == sizes[i] i += 1 - tm.assert_frame_equal(res1, res2) + tm.assert_frame_equal(res1, res3) + + +@pytest.mark.parametrize("conn", all_connectable) +def test_api_categorical(conn, request): + if conn == "postgresql_adbc_conn": + adbc = import_optional_dependency("adbc_driver_postgresql", errors="ignore") + if adbc is not None and Version(adbc.__version__) < Version("0.9.0"): + request.node.add_marker( + pytest.mark.xfail( + reason="categorical dtype not implemented for ADBC postgres driver", + strict=True, + ) + ) + # GH8624 + # test that categorical gets written correctly as dense column + conn = request.getfixturevalue(conn) + if sql.has_table("test_categorical", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_categorical") + + df = DataFrame( + { + "person_id": [1, 2, 3], + "person_name": ["John P. Doe", "Jane Dove", "John P. Doe"], + } + ) + df2 = df.copy() + df2["person_name"] = df2["person_name"].astype("category") + + df2.to_sql(name="test_categorical", con=conn, index=False) + res = sql.read_sql_query("SELECT * FROM test_categorical", conn) + + tm.assert_frame_equal(res, df) - # reading the query in chunks with read_sql_query - if self.mode == "sqlalchemy": - res3 = DataFrame() - i = 0 - sizes = [5, 5, 5, 5, 2] - - for chunk in sql.read_sql_table("test_chunksize", self.conn, chunksize=5): - res3 = concat([res3, chunk], ignore_index=True) - assert len(chunk) == sizes[i] - i += 1 - - tm.assert_frame_equal(res1, res3) - - def test_categorical(self): - # GH8624 - # test that categorical gets written correctly as dense column - df = DataFrame( - { - "person_id": [1, 2, 3], - "person_name": ["John P. Doe", "Jane Dove", "John P. 
Doe"], - } - ) - df2 = df.copy() - df2["person_name"] = df2["person_name"].astype("category") - df2.to_sql(name="test_categorical", con=self.conn, index=False) - res = sql.read_sql_query("SELECT * FROM test_categorical", self.conn) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_unicode_column_name(conn, request): + # GH 11431 + conn = request.getfixturevalue(conn) + if sql.has_table("test_unicode", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_unicode") + + df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"]) + df.to_sql(name="test_unicode", con=conn, index=False) - tm.assert_frame_equal(res, df) - def test_unicode_column_name(self): - # GH 11431 - df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"]) - df.to_sql(name="test_unicode", con=self.conn, index=False) +@pytest.mark.parametrize("conn", all_connectable) +def test_api_escaped_table_name(conn, request): + # GH 13206 + conn_name = conn + conn = request.getfixturevalue(conn) + if sql.has_table("d1187b08-4943-4c8d-a7f6", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("d1187b08-4943-4c8d-a7f6") - def test_escaped_table_name(self): - # GH 13206 - df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) - df.to_sql(name="d1187b08-4943-4c8d-a7f6", con=self.conn, index=False) + df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) + df.to_sql(name="d1187b08-4943-4c8d-a7f6", con=conn, index=False) - res = sql.read_sql_query("SELECT * FROM `d1187b08-4943-4c8d-a7f6`", self.conn) + if "postgres" in conn_name: + query = 'SELECT * FROM "d1187b08-4943-4c8d-a7f6"' + else: + query = "SELECT * FROM `d1187b08-4943-4c8d-a7f6`" + res = sql.read_sql_query(query, conn) - tm.assert_frame_equal(res, df) + tm.assert_frame_equal(res, df) - def test_read_sql_duplicate_columns(self): - # GH#53117 - df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1}) - df.to_sql(name="test_table", con=self.conn, index=False) - result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", self.conn) - expected = DataFrame( - [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]], - columns=["a", "b", "a", "c"], +@pytest.mark.parametrize("conn", all_connectable) +def test_api_read_sql_duplicate_columns(conn, request): + # GH#53117 + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="pyarrow->pandas throws ValueError", strict=True) ) - tm.assert_frame_equal(result, expected) + conn = request.getfixturevalue(conn) + if sql.has_table("test_table", conn): + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("test_table") + + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1}) + df.to_sql(name="test_table", con=conn, index=False) + + result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table", conn) + expected = DataFrame( + [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]], + columns=["a", "b", "a", "c"], + ) + tm.assert_frame_equal(result, expected) -@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed") -class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): - """ - Test the public API as it would be used directly +@pytest.mark.parametrize("conn", all_connectable) +def test_read_table_columns(conn, request, test_frame1): + # test columns argument in read_table + conn_name = conn + if conn_name == "sqlite_buildin": + request.applymarker(pytest.mark.xfail(reason="Not Implemented")) - Tests for `read_sql_table` are included here, as this is specific for 
the - sqlalchemy mode. + conn = request.getfixturevalue(conn) + sql.to_sql(test_frame1, "test_frame", conn) - """ + cols = ["A", "B"] - flavor = "sqlite" - mode = "sqlalchemy" + result = sql.read_sql_table("test_frame", conn, columns=cols) + assert result.columns.tolist() == cols - @classmethod - def setup_class(cls): - cls.engine = sqlalchemy.create_engine("sqlite:///:memory:") - def test_read_table_columns(self, test_frame1): - # test columns argument in read_table - sql.to_sql(test_frame1, "test_frame", self.conn) +@pytest.mark.parametrize("conn", all_connectable) +def test_read_table_index_col(conn, request, test_frame1): + # test columns argument in read_table + conn_name = conn + if conn_name == "sqlite_buildin": + request.applymarker(pytest.mark.xfail(reason="Not Implemented")) - cols = ["A", "B"] - result = sql.read_sql_table("test_frame", self.conn, columns=cols) - assert result.columns.tolist() == cols + conn = request.getfixturevalue(conn) + sql.to_sql(test_frame1, "test_frame", conn) - def test_read_table_index_col(self, test_frame1): - # test columns argument in read_table - sql.to_sql(test_frame1, "test_frame", self.conn) + result = sql.read_sql_table("test_frame", conn, index_col="index") + assert result.index.names == ["index"] - result = sql.read_sql_table("test_frame", self.conn, index_col="index") - assert result.index.names == ["index"] + result = sql.read_sql_table("test_frame", conn, index_col=["A", "B"]) + assert result.index.names == ["A", "B"] - result = sql.read_sql_table("test_frame", self.conn, index_col=["A", "B"]) - assert result.index.names == ["A", "B"] + result = sql.read_sql_table( + "test_frame", conn, index_col=["A", "B"], columns=["C", "D"] + ) + assert result.index.names == ["A", "B"] + assert result.columns.tolist() == ["C", "D"] - result = sql.read_sql_table( - "test_frame", self.conn, index_col=["A", "B"], columns=["C", "D"] + +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_read_sql_delegate(conn, request): + if conn == "sqlite_buildin_iris": + request.applymarker( + pytest.mark.xfail( + reason="sqlite_buildin connection does not implement read_sql_table" + ) ) - assert result.index.names == ["A", "B"] - assert result.columns.tolist() == ["C", "D"] - def test_read_sql_delegate(self): - iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) - iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) - tm.assert_frame_equal(iris_frame1, iris_frame2) + conn = request.getfixturevalue(conn) + iris_frame1 = sql.read_sql_query("SELECT * FROM iris", conn) + iris_frame2 = sql.read_sql("SELECT * FROM iris", conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) - iris_frame1 = sql.read_sql_table("iris", self.conn) - iris_frame2 = sql.read_sql("iris", self.conn) - tm.assert_frame_equal(iris_frame1, iris_frame2) + iris_frame1 = sql.read_sql_table("iris", conn) + iris_frame2 = sql.read_sql("iris", conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) - def test_not_reflect_all_tables(self): - from sqlalchemy import text - from sqlalchemy.engine import Engine - # create invalid table - query_list = [ - text("CREATE TABLE invalid (x INTEGER, y UNKNOWN);"), - text("CREATE TABLE other_table (x INTEGER, y INTEGER);"), - ] - for query in query_list: - if isinstance(self.conn, Engine): - with self.conn.connect() as conn: - with conn.begin(): - conn.execute(query) - else: - with self.conn.begin(): - self.conn.execute(query) +def test_not_reflect_all_tables(sqlite_conn): + conn = sqlite_conn + from sqlalchemy import text + from 
sqlalchemy.engine import Engine - with tm.assert_produces_warning(None): - sql.read_sql_table("other_table", self.conn) - sql.read_sql_query("SELECT * FROM other_table", self.conn) + # create invalid table + query_list = [ + text("CREATE TABLE invalid (x INTEGER, y UNKNOWN);"), + text("CREATE TABLE other_table (x INTEGER, y INTEGER);"), + ] - def test_warning_case_insensitive_table_name(self, test_frame1): - # see gh-7815 - with tm.assert_produces_warning( - UserWarning, - match=( - r"The provided table name 'TABLE1' is not found exactly as such in " - r"the database after writing the table, possibly due to case " - r"sensitivity issues. Consider using lower case table names." - ), - ): - sql.SQLDatabase(self.conn).check_case_sensitive("TABLE1", "") + for query in query_list: + if isinstance(conn, Engine): + with conn.connect() as conn: + with conn.begin(): + conn.execute(query) + else: + with conn.begin(): + conn.execute(query) - # Test that the warning is certainly NOT triggered in a normal case. - with tm.assert_produces_warning(None): - test_frame1.to_sql(name="CaseSensitive", con=self.conn) + with tm.assert_produces_warning(None): + sql.read_sql_table("other_table", conn) + sql.read_sql_query("SELECT * FROM other_table", conn) - def _get_index_columns(self, tbl_name): - from sqlalchemy.engine import reflection - insp = reflection.Inspector.from_engine(self.conn) - ixs = insp.get_indexes("test_index_saved") - ixs = [i["column_names"] for i in ixs] - return ixs +@pytest.mark.parametrize("conn", all_connectable) +def test_warning_case_insensitive_table_name(conn, request, test_frame1): + conn_name = conn + if conn_name == "sqlite_buildin" or "adbc" in conn_name: + request.applymarker(pytest.mark.xfail(reason="Does not raise warning")) + + conn = request.getfixturevalue(conn) + # see gh-7815 + with tm.assert_produces_warning( + UserWarning, + match=( + r"The provided table name 'TABLE1' is not found exactly as such in " + r"the database after writing the table, possibly due to case " + r"sensitivity issues. Consider using lower case table names." + ), + ): + with sql.SQLDatabase(conn) as db: + db.check_case_sensitive("TABLE1", "") - def test_sqlalchemy_type_mapping(self): - from sqlalchemy import TIMESTAMP + # Test that the warning is certainly NOT triggered in a normal case. 
+ with tm.assert_produces_warning(None): + test_frame1.to_sql(name="CaseSensitive", con=conn) - # Test Timestamp objects (no datetime64 because of timezone) (GH9085) - df = DataFrame( - {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} - ) - db = sql.SQLDatabase(self.conn) + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_sqlalchemy_type_mapping(conn, request): + conn = request.getfixturevalue(conn) + from sqlalchemy import TIMESTAMP + + # Test Timestamp objects (no datetime64 because of timezone) (GH9085) + df = DataFrame( + {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} + ) + with sql.SQLDatabase(conn) as db: table = sql.SQLTable("test_type", db, frame=df) # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones assert isinstance(table.table.c["time"].type, TIMESTAMP) - @pytest.mark.parametrize( - "integer, expected", - [ - ("int8", "SMALLINT"), - ("Int8", "SMALLINT"), - ("uint8", "SMALLINT"), - ("UInt8", "SMALLINT"), - ("int16", "SMALLINT"), - ("Int16", "SMALLINT"), - ("uint16", "INTEGER"), - ("UInt16", "INTEGER"), - ("int32", "INTEGER"), - ("Int32", "INTEGER"), - ("uint32", "BIGINT"), - ("UInt32", "BIGINT"), - ("int64", "BIGINT"), - ("Int64", "BIGINT"), - (int, "BIGINT" if np.dtype(int).name == "int64" else "INTEGER"), - ], - ) - def test_sqlalchemy_integer_mapping(self, integer, expected): - # GH35076 Map pandas integer to optimal SQLAlchemy integer type - df = DataFrame([0, 1], columns=["a"], dtype=integer) - db = sql.SQLDatabase(self.conn) + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +@pytest.mark.parametrize( + "integer, expected", + [ + ("int8", "SMALLINT"), + ("Int8", "SMALLINT"), + ("uint8", "SMALLINT"), + ("UInt8", "SMALLINT"), + ("int16", "SMALLINT"), + ("Int16", "SMALLINT"), + ("uint16", "INTEGER"), + ("UInt16", "INTEGER"), + ("int32", "INTEGER"), + ("Int32", "INTEGER"), + ("uint32", "BIGINT"), + ("UInt32", "BIGINT"), + ("int64", "BIGINT"), + ("Int64", "BIGINT"), + (int, "BIGINT" if np.dtype(int).name == "int64" else "INTEGER"), + ], +) +def test_sqlalchemy_integer_mapping(conn, request, integer, expected): + # GH35076 Map pandas integer to optimal SQLAlchemy integer type + conn = request.getfixturevalue(conn) + df = DataFrame([0, 1], columns=["a"], dtype=integer) + with sql.SQLDatabase(conn) as db: table = sql.SQLTable("test_type", db, frame=df) result = str(table.table.c.a.type) - assert result == expected + assert result == expected + - @pytest.mark.parametrize("integer", ["uint64", "UInt64"]) - def test_sqlalchemy_integer_overload_mapping(self, integer): - # GH35076 Map pandas integer to optimal SQLAlchemy integer type - df = DataFrame([0, 1], columns=["a"], dtype=integer) - db = sql.SQLDatabase(self.conn) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +@pytest.mark.parametrize("integer", ["uint64", "UInt64"]) +def test_sqlalchemy_integer_overload_mapping(conn, request, integer): + conn = request.getfixturevalue(conn) + # GH35076 Map pandas integer to optimal SQLAlchemy integer type + df = DataFrame([0, 1], columns=["a"], dtype=integer) + with sql.SQLDatabase(conn) as db: with pytest.raises( ValueError, match="Unsigned 64 bit integer datatype is not supported" ): sql.SQLTable("test_type", db, frame=df) - def test_database_uri_string(self, test_frame1): - # Test read_sql and .to_sql method with a database URI (GH10654) - # db_uri = 'sqlite:///:memory:' # raises - # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near - # "iris": syntax error [SQL: 
'iris'] - with tm.ensure_clean() as name: - db_uri = "sqlite:///" + name - table = "iris" - test_frame1.to_sql(name=table, con=db_uri, if_exists="replace", index=False) - test_frame2 = sql.read_sql(table, db_uri) - test_frame3 = sql.read_sql_table(table, db_uri) - query = "SELECT * FROM iris" - test_frame4 = sql.read_sql_query(query, db_uri) - tm.assert_frame_equal(test_frame1, test_frame2) - tm.assert_frame_equal(test_frame1, test_frame3) - tm.assert_frame_equal(test_frame1, test_frame4) - - @td.skip_if_installed("pg8000") - def test_pg8000_sqlalchemy_passthrough_error(self): - # using driver that will not be installed on CI to trigger error - # in sqlalchemy.create_engine -> test passing of this error to user - db_uri = "postgresql+pg8000://user:pass@host/dbname" - with pytest.raises(ImportError, match="pg8000"): - sql.read_sql("select * from table", db_uri) - def test_query_by_text_obj(self): - # WIP : GH10846 - from sqlalchemy import text +@pytest.mark.parametrize("conn", all_connectable) +def test_database_uri_string(conn, request, test_frame1): + pytest.importorskip("sqlalchemy") + conn = request.getfixturevalue(conn) + # Test read_sql and .to_sql method with a database URI (GH10654) + # db_uri = 'sqlite:///:memory:' # raises + # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near + # "iris": syntax error [SQL: 'iris'] + with tm.ensure_clean() as name: + db_uri = "sqlite:///" + name + table = "iris" + test_frame1.to_sql(name=table, con=db_uri, if_exists="replace", index=False) + test_frame2 = sql.read_sql(table, db_uri) + test_frame3 = sql.read_sql_table(table, db_uri) + query = "SELECT * FROM iris" + test_frame4 = sql.read_sql_query(query, db_uri) + tm.assert_frame_equal(test_frame1, test_frame2) + tm.assert_frame_equal(test_frame1, test_frame3) + tm.assert_frame_equal(test_frame1, test_frame4) + +@td.skip_if_installed("pg8000") +@pytest.mark.parametrize("conn", all_connectable) +def test_pg8000_sqlalchemy_passthrough_error(conn, request): + pytest.importorskip("sqlalchemy") + conn = request.getfixturevalue(conn) + # using driver that will not be installed on CI to trigger error + # in sqlalchemy.create_engine -> test passing of this error to user + db_uri = "postgresql+pg8000://user:pass@host/dbname" + with pytest.raises(ImportError, match="pg8000"): + sql.read_sql("select * from table", db_uri) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_query_by_text_obj(conn, request): + # WIP : GH10846 + conn_name = conn + conn = request.getfixturevalue(conn) + from sqlalchemy import text + + if "postgres" in conn_name: + name_text = text('select * from iris where "Name"=:name') + else: name_text = text("select * from iris where name=:name") - iris_df = sql.read_sql(name_text, self.conn, params={"name": "Iris-versicolor"}) - all_names = set(iris_df["Name"]) - assert all_names == {"Iris-versicolor"} + iris_df = sql.read_sql(name_text, conn, params={"name": "Iris-versicolor"}) + all_names = set(iris_df["Name"]) + assert all_names == {"Iris-versicolor"} - def test_query_by_select_obj(self): - # WIP : GH10846 - from sqlalchemy import ( - bindparam, - select, - ) - iris = iris_table_metadata(self.flavor) - name_select = select(iris).where(iris.c.Name == bindparam("name")) - iris_df = sql.read_sql(name_select, self.conn, params={"name": "Iris-setosa"}) - all_names = set(iris_df["Name"]) - assert all_names == {"Iris-setosa"} +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_query_by_select_obj(conn, request): + conn = 
request.getfixturevalue(conn) + # WIP : GH10846 + from sqlalchemy import ( + bindparam, + select, + ) - def test_column_with_percentage(self): - # GH 37157 - df = DataFrame({"A": [0, 1, 2], "%_variation": [3, 4, 5]}) - df.to_sql(name="test_column_percentage", con=self.conn, index=False) + iris = iris_table_metadata() + name_select = select(iris).where(iris.c.Name == bindparam("name")) + iris_df = sql.read_sql(name_select, conn, params={"name": "Iris-setosa"}) + all_names = set(iris_df["Name"]) + assert all_names == {"Iris-setosa"} - res = sql.read_sql_table("test_column_percentage", self.conn) - tm.assert_frame_equal(res, df) +@pytest.mark.parametrize("conn", all_connectable) +def test_column_with_percentage(conn, request): + # GH 37157 + conn_name = conn + if conn_name == "sqlite_buildin": + request.applymarker(pytest.mark.xfail(reason="Not Implemented")) + conn = request.getfixturevalue(conn) + df = DataFrame({"A": [0, 1, 2], "%_variation": [3, 4, 5]}) + df.to_sql(name="test_column_percentage", con=conn, index=False) -class TestSQLiteFallbackApi(SQLiteMixIn, _TestSQLApi): - """ - Test the public sqlite connection fallback API + res = sql.read_sql_table("test_column_percentage", conn) - """ + tm.assert_frame_equal(res, df) - flavor = "sqlite" - mode = "fallback" - def connect(self, database=":memory:"): - return sqlite3.connect(database) +def test_sql_open_close(test_frame3): + # Test if the IO in the database still work if the connection closed + # between the writing and reading (as in many real situations). - def test_sql_open_close(self, test_frame3): - # Test if the IO in the database still work if the connection closed - # between the writing and reading (as in many real situations). - - with tm.ensure_clean() as name: - with closing(self.connect(name)) as conn: - assert ( - sql.to_sql(test_frame3, "test_frame3_legacy", conn, index=False) - == 4 - ) + with tm.ensure_clean() as name: + with closing(sqlite3.connect(name)) as conn: + assert sql.to_sql(test_frame3, "test_frame3_legacy", conn, index=False) == 4 - with closing(self.connect(name)) as conn: - result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) + with closing(sqlite3.connect(name)) as conn: + result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) - tm.assert_frame_equal(test_frame3, result) + tm.assert_frame_equal(test_frame3, result) - @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") - def test_con_string_import_error(self): - conn = "mysql://root@localhost/pandas" - msg = "Using URI string without sqlalchemy installed" - with pytest.raises(ImportError, match=msg): - sql.read_sql("SELECT * FROM iris", conn) - - @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") - def test_con_unknown_dbapi2_class_does_not_error_without_sql_alchemy_installed( - self, - ): - class MockSqliteConnection: - def __init__(self, *args, **kwargs) -> None: - self.conn = sqlite3.Connection(*args, **kwargs) - - def __getattr__(self, name): - return getattr(self.conn, name) - - def close(self): - self.conn.close() - - with contextlib.closing(MockSqliteConnection(":memory:")) as conn: - with tm.assert_produces_warning(UserWarning): - sql.read_sql("SELECT 1", conn) - - def test_read_sql_delegate(self): - iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) - iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) - tm.assert_frame_equal(iris_frame1, iris_frame2) - - msg = "Execution failed on sql 'iris': near \"iris\": syntax error" - with 
pytest.raises(sql.DatabaseError, match=msg): - sql.read_sql("iris", self.conn) - - def test_get_schema2(self, test_frame1): - # without providing a connection object (available for backwards comp) - create_sql = sql.get_schema(test_frame1, "test") - assert "CREATE" in create_sql - - def _get_sqlite_column_type(self, schema, column): - for col in schema.split("\n"): - if col.split()[0].strip('"') == column: - return col.split()[1] - raise ValueError(f"Column {column} not found") - def test_sqlite_type_mapping(self): - # Test Timestamp objects (no datetime64 because of timezone) (GH9085) - df = DataFrame( - {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} - ) - db = sql.SQLiteDatabase(self.conn) - table = sql.SQLiteTable("test_type", db, frame=df) - schema = table.sql_schema() - assert self._get_sqlite_column_type(schema, "time") == "TIMESTAMP" +@td.skip_if_installed("sqlalchemy") +def test_con_string_import_error(): + conn = "mysql://root@localhost/pandas" + msg = "Using URI string without sqlalchemy installed" + with pytest.raises(ImportError, match=msg): + sql.read_sql("SELECT * FROM iris", conn) -# ----------------------------------------------------------------------------- -# -- Database flavor specific tests +@td.skip_if_installed("sqlalchemy") +def test_con_unknown_dbapi2_class_does_not_error_without_sql_alchemy_installed(): + class MockSqliteConnection: + def __init__(self, *args, **kwargs) -> None: + self.conn = sqlite3.Connection(*args, **kwargs) + def __getattr__(self, name): + return getattr(self.conn, name) -@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed") -class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): - """ - Base class for testing the sqlalchemy backend. + def close(self): + self.conn.close() - Subclasses for specific database types are created below. Tests that - deviate for each flavor are overwritten there. 
+ with contextlib.closing(MockSqliteConnection(":memory:")) as conn: + with tm.assert_produces_warning(UserWarning): + sql.read_sql("SELECT 1", conn) - """ - flavor: str +def test_sqlite_read_sql_delegate(sqlite_buildin_iris): + conn = sqlite_buildin_iris + iris_frame1 = sql.read_sql_query("SELECT * FROM iris", conn) + iris_frame2 = sql.read_sql("SELECT * FROM iris", conn) + tm.assert_frame_equal(iris_frame1, iris_frame2) - @classmethod - def setup_class(cls): - cls.setup_driver() - cls.setup_engine() + msg = "Execution failed on sql 'iris': near \"iris\": syntax error" + with pytest.raises(sql.DatabaseError, match=msg): + sql.read_sql("iris", conn) - @pytest.fixture(autouse=True) - def setup_method(self, iris_path, types_data): - try: - self.conn = self.engine.connect() - self.pandasSQL = sql.SQLDatabase(self.conn) - except sqlalchemy.exc.OperationalError: - pytest.skip(f"Can't connect to {self.flavor} server") - self.load_iris_data(iris_path) - self.load_types_data(types_data) - - @classmethod - def setup_driver(cls): - raise NotImplementedError() - - @classmethod - def setup_engine(cls): - raise NotImplementedError() - - def test_read_sql_parameter(self, sql_strings): - self._read_sql_iris_parameter(sql_strings) - def test_read_sql_named_parameter(self, sql_strings): - self._read_sql_iris_named_parameter(sql_strings) +def test_get_schema2(test_frame1): + # without providing a connection object (available for backwards comp) + create_sql = sql.get_schema(test_frame1, "test") + assert "CREATE" in create_sql - def test_to_sql_empty(self, test_frame1): - self._to_sql_empty(test_frame1) - def test_create_table(self): - from sqlalchemy import inspect +def test_sqlite_type_mapping(sqlite_buildin): + # Test Timestamp objects (no datetime64 because of timezone) (GH9085) + conn = sqlite_buildin + df = DataFrame( + {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} + ) + db = sql.SQLiteDatabase(conn) + table = sql.SQLiteTable("test_type", db, frame=df) + schema = table.sql_schema() + for col in schema.split("\n"): + if col.split()[0].strip('"') == "time": + assert col.split()[1] == "TIMESTAMP" - temp_conn = self.connect() - temp_frame = DataFrame( - {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} - ) - with sql.SQLDatabase(temp_conn, need_transaction=True) as pandasSQL: - assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 - insp = inspect(temp_conn) - assert insp.has_table("temp_frame") +# ----------------------------------------------------------------------------- +# -- Database flavor specific tests - # Cleanup - with sql.SQLDatabase(temp_conn, need_transaction=True) as pandasSQL: - pandasSQL.drop_table("temp_frame") - def test_drop_table(self): - from sqlalchemy import inspect +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_create_table(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") - temp_conn = self.connect() - temp_frame = DataFrame( - {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} - ) - pandasSQL = sql.SQLDatabase(temp_conn) + conn = request.getfixturevalue(conn) + + from sqlalchemy import inspect + + temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 - insp = inspect(temp_conn) - assert insp.has_table("temp_frame") + insp = inspect(conn) + assert insp.has_table("temp_frame") + # Cleanup + with sql.SQLDatabase(conn, 
need_transaction=True) as pandasSQL: pandasSQL.drop_table("temp_frame") + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_drop_table(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") + + conn = request.getfixturevalue(conn) + + from sqlalchemy import inspect + + temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) + with sql.SQLDatabase(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4 + + insp = inspect(conn) + assert insp.has_table("temp_frame") + + with pandasSQL.run_transaction(): + pandasSQL.drop_table("temp_frame") try: insp.clear_cache() # needed with SQLAlchemy 2.0, unavailable prior except AttributeError: pass assert not insp.has_table("temp_frame") - def test_roundtrip(self, test_frame1): - self._roundtrip(test_frame1) - def test_execute_sql(self): - self._execute_sql() +@pytest.mark.parametrize("conn", all_connectable) +def test_roundtrip(conn, request, test_frame1): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") - def test_read_table(self): - iris_frame = sql.read_sql_table("iris", con=self.conn) - check_iris_frame(iris_frame) + conn_name = conn + conn = request.getfixturevalue(conn) + pandasSQL = pandasSQL_builder(conn) + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 + result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") + + if "adbc" in conn_name: + result = result.rename(columns={"__index_level_0__": "level_0"}) + result.set_index("level_0", inplace=True) + # result.index.astype(int) - def test_read_table_columns(self): - iris_frame = sql.read_sql_table( - "iris", con=self.conn, columns=["SepalLength", "SepalLength"] - ) - tm.equalContents(iris_frame.columns.values, ["SepalLength", "SepalLength"]) + result.index.name = None - def test_read_table_absent_raises(self): - msg = "Table this_doesnt_exist not found" - with pytest.raises(ValueError, match=msg): - sql.read_sql_table("this_doesnt_exist", con=self.conn) + tm.assert_frame_equal(result, test_frame1) - def test_default_type_conversion(self): - df = sql.read_sql_table("types", self.conn) - assert issubclass(df.FloatCol.dtype.type, np.floating) - assert issubclass(df.IntCol.dtype.type, np.integer) - assert issubclass(df.BoolCol.dtype.type, np.bool_) +@pytest.mark.parametrize("conn", all_connectable_iris) +def test_execute_sql(conn, request): + conn = request.getfixturevalue(conn) + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + iris_results = pandasSQL.execute("SELECT * FROM iris") + row = iris_results.fetchone() + iris_results.close() + assert list(row) == [5.1, 3.5, 1.4, 0.2, "Iris-setosa"] - # Int column with NA values stays as float - assert issubclass(df.IntColWithNull.dtype.type, np.floating) - # Bool column with NA values becomes object - assert issubclass(df.BoolColWithNull.dtype.type, object) - def test_bigint(self): - # int64 should be converted to BigInteger, GH7433 - df = DataFrame(data={"i64": [2**62]}) - assert df.to_sql(name="test_bigint", con=self.conn, index=False) == 1 - result = sql.read_sql_table("test_bigint", self.conn) - - tm.assert_frame_equal(df, result) - - def test_default_date_load(self): - df = sql.read_sql_table("types", self.conn) - - # IMPORTANT - sqlite has no native date type, so shouldn't parse, but - # MySQL SHOULD be converted. 
- assert issubclass(df.DateCol.dtype.type, np.datetime64) - - def test_datetime_with_timezone(self, request): - # edge case that converts postgresql datetime with time zone types - # to datetime64[ns,psycopg2.tz.FixedOffsetTimezone..], which is ok - # but should be more natural, so coerce to datetime64[ns] for now - - def check(col): - # check that a column is either datetime64[ns] - # or datetime64[ns, UTC] - if lib.is_np_dtype(col.dtype, "M"): - # "2000-01-01 00:00:00-08:00" should convert to - # "2000-01-01 08:00:00" - assert col[0] == Timestamp("2000-01-01 08:00:00") - - # "2000-06-01 00:00:00-07:00" should convert to - # "2000-06-01 07:00:00" - assert col[1] == Timestamp("2000-06-01 07:00:00") - - elif isinstance(col.dtype, DatetimeTZDtype): - assert str(col.dt.tz) == "UTC" - - # "2000-01-01 00:00:00-08:00" should convert to - # "2000-01-01 08:00:00" - # "2000-06-01 00:00:00-07:00" should convert to - # "2000-06-01 07:00:00" - # GH 6415 - expected_data = [ - Timestamp("2000-01-01 08:00:00", tz="UTC"), - Timestamp("2000-06-01 07:00:00", tz="UTC"), - ] - expected = Series(expected_data, name=col.name) - tm.assert_series_equal(col, expected) +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_sqlalchemy_read_table(conn, request): + conn = request.getfixturevalue(conn) + iris_frame = sql.read_sql_table("iris", con=conn) + check_iris_frame(iris_frame) - else: - raise AssertionError( - f"DateCol loaded with incorrect type -> {col.dtype}" - ) - # GH11216 - df = read_sql_query("select * from types", self.conn) - if not hasattr(df, "DateColWithTz"): - request.node.add_marker( - pytest.mark.xfail(reason="no column with datetime with time zone") - ) +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_sqlalchemy_read_table_columns(conn, request): + conn = request.getfixturevalue(conn) + iris_frame = sql.read_sql_table( + "iris", con=conn, columns=["SepalLength", "SepalLength"] + ) + tm.assert_index_equal(iris_frame.columns, Index(["SepalLength", "SepalLength__1"])) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) +def test_read_table_absent_raises(conn, request): + conn = request.getfixturevalue(conn) + msg = "Table this_doesnt_exist not found" + with pytest.raises(ValueError, match=msg): + sql.read_sql_table("this_doesnt_exist", con=conn) - # this is parsed on Travis (linux), but not on macosx for some reason - # even with the same versions of psycopg2 & sqlalchemy, possibly a - # Postgresql server version difference - col = df.DateColWithTz - assert isinstance(col.dtype, DatetimeTZDtype) - df = read_sql_query( - "select * from types", self.conn, parse_dates=["DateColWithTz"] +@pytest.mark.parametrize("conn", sqlalchemy_connectable_types) +def test_sqlalchemy_default_type_conversion(conn, request): + conn_name = conn + if conn_name == "sqlite_str": + pytest.skip("types tables not created in sqlite_str fixture") + elif "mysql" in conn_name or "sqlite" in conn_name: + request.applymarker( + pytest.mark.xfail(reason="boolean dtype not inferred properly") ) - if not hasattr(df, "DateColWithTz"): - request.node.add_marker( - pytest.mark.xfail(reason="no column with datetime with time zone") - ) - col = df.DateColWithTz - assert isinstance(col.dtype, DatetimeTZDtype) - assert str(col.dt.tz) == "UTC" - check(df.DateColWithTz) - - df = concat( - list(read_sql_query("select * from types", self.conn, chunksize=1)), - ignore_index=True, - ) - col = df.DateColWithTz - assert isinstance(col.dtype, DatetimeTZDtype) - assert str(col.dt.tz) == "UTC" - 
expected = sql.read_sql_table("types", self.conn) - col = expected.DateColWithTz - assert isinstance(col.dtype, DatetimeTZDtype) - tm.assert_series_equal(df.DateColWithTz, expected.DateColWithTz) - - # xref #7139 - # this might or might not be converted depending on the postgres driver - df = sql.read_sql_table("types", self.conn) - check(df.DateColWithTz) - - def test_datetime_with_timezone_roundtrip(self): - # GH 9086 - # Write datetimetz data to a db and read it back - # For dbs that support timestamps with timezones, should get back UTC - # otherwise naive data should be returned - expected = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} - ) - assert expected.to_sql(name="test_datetime_tz", con=self.conn, index=False) == 3 - - if self.flavor == "postgresql": - # SQLAlchemy "timezones" (i.e. offsets) are coerced to UTC - expected["A"] = expected["A"].dt.tz_convert("UTC") - else: - # Otherwise, timestamps are returned as local, naive - expected["A"] = expected["A"].dt.tz_localize(None) - result = sql.read_sql_table("test_datetime_tz", self.conn) - tm.assert_frame_equal(result, expected) + conn = request.getfixturevalue(conn) + df = sql.read_sql_table("types", conn) - result = sql.read_sql_query("SELECT * FROM test_datetime_tz", self.conn) - if self.flavor == "sqlite": - # read_sql_query does not return datetime type like read_sql_table - assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"]) - tm.assert_frame_equal(result, expected) + assert issubclass(df.FloatCol.dtype.type, np.floating) + assert issubclass(df.IntCol.dtype.type, np.integer) + assert issubclass(df.BoolCol.dtype.type, np.bool_) - def test_out_of_bounds_datetime(self): - # GH 26761 - data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) - assert data.to_sql(name="test_datetime_obb", con=self.conn, index=False) == 1 - result = sql.read_sql_table("test_datetime_obb", self.conn) - expected = DataFrame([pd.NaT], columns=["date"]) - tm.assert_frame_equal(result, expected) + # Int column with NA values stays as float + assert issubclass(df.IntColWithNull.dtype.type, np.floating) + # Bool column with NA values becomes object + assert issubclass(df.BoolColWithNull.dtype.type, object) - def test_naive_datetimeindex_roundtrip(self): - # GH 23510 - # Ensure that a naive DatetimeIndex isn't converted to UTC - dates = date_range("2018-01-01", periods=5, freq="6H")._with_freq(None) - expected = DataFrame({"nums": range(5)}, index=dates) - assert ( - expected.to_sql(name="foo_table", con=self.conn, index_label="info_date") - == 5 - ) - result = sql.read_sql_table("foo_table", self.conn, index_col="info_date") - # result index with gain a name from a set_index operation; expected - tm.assert_frame_equal(result, expected, check_names=False) - def test_date_parsing(self): - # No Parsing - df = sql.read_sql_table("types", self.conn) - expected_type = object if self.flavor == "sqlite" else np.datetime64 - assert issubclass(df.DateCol.dtype.type, expected_type) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_bigint(conn, request): + # int64 should be converted to BigInteger, GH7433 + conn = request.getfixturevalue(conn) + df = DataFrame(data={"i64": [2**62]}) + assert df.to_sql(name="test_bigint", con=conn, index=False) == 1 + result = sql.read_sql_table("test_bigint", conn) + + tm.assert_frame_equal(df, result) - df = sql.read_sql_table("types", self.conn, parse_dates=["DateCol"]) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - df = 
sql.read_sql_table( - "types", self.conn, parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"} +@pytest.mark.parametrize("conn", sqlalchemy_connectable_types) +def test_default_date_load(conn, request): + conn_name = conn + if conn_name == "sqlite_str": + pytest.skip("types tables not created in sqlite_str fixture") + elif "sqlite" in conn_name: + request.applymarker( + pytest.mark.xfail(reason="sqlite does not read date properly") ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_table( - "types", - self.conn, - parse_dates={"DateCol": {"format": "%Y-%m-%d %H:%M:%S"}}, - ) - assert issubclass(df.DateCol.dtype.type, np.datetime64) + conn = request.getfixturevalue(conn) + df = sql.read_sql_table("types", conn) - df = sql.read_sql_table("types", self.conn, parse_dates=["IntDateCol"]) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_table("types", self.conn, parse_dates={"IntDateCol": "s"}) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - df = sql.read_sql_table( - "types", self.conn, parse_dates={"IntDateCol": {"unit": "s"}} - ) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) +@pytest.mark.parametrize("conn", postgresql_connectable) +@pytest.mark.parametrize("parse_dates", [None, ["DateColWithTz"]]) +def test_datetime_with_timezone_query(conn, request, parse_dates): + # edge case that converts postgresql datetime with time zone types + # to datetime64[ns,psycopg2.tz.FixedOffsetTimezone..], which is ok + # but should be more natural, so coerce to datetime64[ns] for now + conn = request.getfixturevalue(conn) + expected = create_and_load_postgres_datetz(conn) + + # GH11216 + df = read_sql_query("select * from datetz", conn, parse_dates=parse_dates) + col = df.DateColWithTz + tm.assert_series_equal(col, expected) + + +@pytest.mark.parametrize("conn", postgresql_connectable) +def test_datetime_with_timezone_query_chunksize(conn, request): + conn = request.getfixturevalue(conn) + expected = create_and_load_postgres_datetz(conn) + + df = concat( + list(read_sql_query("select * from datetz", conn, chunksize=1)), + ignore_index=True, + ) + col = df.DateColWithTz + tm.assert_series_equal(col, expected) + + +@pytest.mark.parametrize("conn", postgresql_connectable) +def test_datetime_with_timezone_table(conn, request): + conn = request.getfixturevalue(conn) + expected = create_and_load_postgres_datetz(conn) + result = sql.read_sql_table("datetz", conn) + tm.assert_frame_equal(result, expected.to_frame()) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime_with_timezone_roundtrip(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + # GH 9086 + # Write datetimetz data to a db and read it back + # For dbs that support timestamps with timezones, should get back UTC + # otherwise naive data should be returned + expected = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + ) + assert expected.to_sql(name="test_datetime_tz", con=conn, index=False) == 3 + + if "postgresql" in conn_name: + # SQLAlchemy "timezones" (i.e. 
offsets) are coerced to UTC + expected["A"] = expected["A"].dt.tz_convert("UTC") + else: + # Otherwise, timestamps are returned as local, naive + expected["A"] = expected["A"].dt.tz_localize(None) + + result = sql.read_sql_table("test_datetime_tz", conn) + tm.assert_frame_equal(result, expected) + + result = sql.read_sql_query("SELECT * FROM test_datetime_tz", conn) + if "sqlite" in conn_name: + # read_sql_query does not return datetime type like read_sql_table + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_out_of_bounds_datetime(conn, request): + # GH 26761 + conn = request.getfixturevalue(conn) + data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) + assert data.to_sql(name="test_datetime_obb", con=conn, index=False) == 1 + result = sql.read_sql_table("test_datetime_obb", conn) + expected = DataFrame([pd.NaT], columns=["date"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_naive_datetimeindex_roundtrip(conn, request): + # GH 23510 + # Ensure that a naive DatetimeIndex isn't converted to UTC + conn = request.getfixturevalue(conn) + dates = date_range("2018-01-01", periods=5, freq="6h")._with_freq(None) + expected = DataFrame({"nums": range(5)}, index=dates) + assert expected.to_sql(name="foo_table", con=conn, index_label="info_date") == 5 + result = sql.read_sql_table("foo_table", conn, index_col="info_date") + # result index with gain a name from a set_index operation; expected + tm.assert_frame_equal(result, expected, check_names=False) - def test_datetime(self): - df = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} - ) - assert df.to_sql(name="test_datetime", con=self.conn) == 3 - # with read_table -> type information from schema used - result = sql.read_sql_table("test_datetime", self.conn) - result = result.drop("index", axis=1) +@pytest.mark.parametrize("conn", sqlalchemy_connectable_types) +def test_date_parsing(conn, request): + # No Parsing + conn_name = conn + conn = request.getfixturevalue(conn) + df = sql.read_sql_table("types", conn) + expected_type = object if "sqlite" in conn_name else np.datetime64 + assert issubclass(df.DateCol.dtype.type, expected_type) + + df = sql.read_sql_table("types", conn, parse_dates=["DateCol"]) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table("types", conn, parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table( + "types", + conn, + parse_dates={"DateCol": {"format": "%Y-%m-%d %H:%M:%S"}}, + ) + assert issubclass(df.DateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table("types", conn, parse_dates=["IntDateCol"]) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table("types", conn, parse_dates={"IntDateCol": "s"}) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + + df = sql.read_sql_table("types", conn, parse_dates={"IntDateCol": {"unit": "s"}}) + assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + df = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} + ) + assert df.to_sql(name="test_datetime", con=conn) == 3 + + # with 
read_table -> type information from schema used + result = sql.read_sql_table("test_datetime", conn) + result = result.drop("index", axis=1) + tm.assert_frame_equal(result, df) + + # with read_sql -> no type information -> sqlite has no native + result = sql.read_sql_query("SELECT * FROM test_datetime", conn) + result = result.drop("index", axis=1) + if "sqlite" in conn_name: + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"]) + tm.assert_frame_equal(result, df) + else: tm.assert_frame_equal(result, df) - # with read_sql -> no type information -> sqlite has no native - result = sql.read_sql_query("SELECT * FROM test_datetime", self.conn) - result = result.drop("index", axis=1) - if self.flavor == "sqlite": - assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"]) - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) - def test_datetime_NaT(self): - df = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} - ) - df.loc[1, "A"] = np.nan - assert df.to_sql(name="test_datetime", con=self.conn, index=False) == 3 +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime_NaT(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + df = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} + ) + df.loc[1, "A"] = np.nan + assert df.to_sql(name="test_datetime", con=conn, index=False) == 3 - # with read_table -> type information from schema used - result = sql.read_sql_table("test_datetime", self.conn) + # with read_table -> type information from schema used + result = sql.read_sql_table("test_datetime", conn) + tm.assert_frame_equal(result, df) + + # with read_sql -> no type information -> sqlite has no native + result = sql.read_sql_query("SELECT * FROM test_datetime", conn) + if "sqlite" in conn_name: + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"], errors="coerce") + tm.assert_frame_equal(result, df) + else: tm.assert_frame_equal(result, df) - # with read_sql -> no type information -> sqlite has no native - result = sql.read_sql_query("SELECT * FROM test_datetime", self.conn) - if self.flavor == "sqlite": - assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"], errors="coerce") - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) - def test_datetime_date(self): - # test support for datetime.date - df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - assert df.to_sql(name="test_date", con=self.conn, index=False) == 2 - res = read_sql_table("test_date", self.conn) - result = res["a"] - expected = to_datetime(df["a"]) - # comes back as datetime64 - tm.assert_series_equal(result, expected) - - def test_datetime_time(self, sqlite_buildin): - # test support for datetime.time - df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) - assert df.to_sql(name="test_time", con=self.conn, index=False) == 2 - res = read_sql_table("test_time", self.conn) - tm.assert_frame_equal(res, df) - - # GH8341 - # first, use the fallback to have the sqlite adapter put in place - sqlite_conn = sqlite_buildin - assert sql.to_sql(df, "test_time2", sqlite_conn, index=False) == 2 - res = sql.read_sql_query("SELECT * FROM test_time2", sqlite_conn) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime_date(conn, request): + # test support for datetime.date + conn = request.getfixturevalue(conn) + df = 
DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) + assert df.to_sql(name="test_date", con=conn, index=False) == 2 + res = read_sql_table("test_date", conn) + result = res["a"] + expected = to_datetime(df["a"]) + # comes back as datetime64 + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_datetime_time(conn, request, sqlite_buildin): + # test support for datetime.time + conn_name = conn + conn = request.getfixturevalue(conn) + df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) + assert df.to_sql(name="test_time", con=conn, index=False) == 2 + res = read_sql_table("test_time", conn) + tm.assert_frame_equal(res, df) + + # GH8341 + # first, use the fallback to have the sqlite adapter put in place + sqlite_conn = sqlite_buildin + assert sql.to_sql(df, "test_time2", sqlite_conn, index=False) == 2 + res = sql.read_sql_query("SELECT * FROM test_time2", sqlite_conn) + ref = df.map(lambda _: _.strftime("%H:%M:%S.%f")) + tm.assert_frame_equal(ref, res) # check if adapter is in place + # then test if sqlalchemy is unaffected by the sqlite adapter + assert sql.to_sql(df, "test_time3", conn, index=False) == 2 + if "sqlite" in conn_name: + res = sql.read_sql_query("SELECT * FROM test_time3", conn) ref = df.map(lambda _: _.strftime("%H:%M:%S.%f")) - tm.assert_frame_equal(ref, res) # check if adapter is in place - # then test if sqlalchemy is unaffected by the sqlite adapter - assert sql.to_sql(df, "test_time3", self.conn, index=False) == 2 - if self.flavor == "sqlite": - res = sql.read_sql_query("SELECT * FROM test_time3", self.conn) - ref = df.map(lambda _: _.strftime("%H:%M:%S.%f")) - tm.assert_frame_equal(ref, res) - res = sql.read_sql_table("test_time3", self.conn) - tm.assert_frame_equal(df, res) - - def test_mixed_dtype_insert(self): - # see GH6509 - s1 = Series(2**25 + 1, dtype=np.int32) - s2 = Series(0.0, dtype=np.float32) - df = DataFrame({"s1": s1, "s2": s2}) - - # write and read again - assert df.to_sql(name="test_read_write", con=self.conn, index=False) == 1 - df2 = sql.read_sql_table("test_read_write", self.conn) - - tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) - - def test_nan_numeric(self): - # NaNs in numeric float column - df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) - assert df.to_sql(name="test_nan", con=self.conn, index=False) == 3 + tm.assert_frame_equal(ref, res) + res = sql.read_sql_table("test_time3", conn) + tm.assert_frame_equal(df, res) - # with read_table - result = sql.read_sql_table("test_nan", self.conn) - tm.assert_frame_equal(result, df) - # with read_sql - result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) - tm.assert_frame_equal(result, df) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_mixed_dtype_insert(conn, request): + # see GH6509 + conn = request.getfixturevalue(conn) + s1 = Series(2**25 + 1, dtype=np.int32) + s2 = Series(0.0, dtype=np.float32) + df = DataFrame({"s1": s1, "s2": s2}) + + # write and read again + assert df.to_sql(name="test_read_write", con=conn, index=False) == 1 + df2 = sql.read_sql_table("test_read_write", conn) - def test_nan_fullcolumn(self): - # full NaN column (numeric float column) - df = DataFrame({"A": [0, 1, 2], "B": [np.nan, np.nan, np.nan]}) - assert df.to_sql(name="test_nan", con=self.conn, index=False) == 3 + tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) - # with read_table - result = sql.read_sql_table("test_nan", self.conn) - 
tm.assert_frame_equal(result, df) - # with read_sql -> not type info from table -> stays None - df["B"] = df["B"].astype("object") - df["B"] = None - result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) - tm.assert_frame_equal(result, df) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_nan_numeric(conn, request): + # NaNs in numeric float column + conn = request.getfixturevalue(conn) + df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) + assert df.to_sql(name="test_nan", con=conn, index=False) == 3 - def test_nan_string(self): - # NaNs in string column - df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", np.nan]}) - assert df.to_sql(name="test_nan", con=self.conn, index=False) == 3 + # with read_table + result = sql.read_sql_table("test_nan", conn) + tm.assert_frame_equal(result, df) + + # with read_sql + result = sql.read_sql_query("SELECT * FROM test_nan", conn) + tm.assert_frame_equal(result, df) - # NaNs are coming back as None - df.loc[2, "B"] = None - # with read_table - result = sql.read_sql_table("test_nan", self.conn) - tm.assert_frame_equal(result, df) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_nan_fullcolumn(conn, request): + # full NaN column (numeric float column) + conn = request.getfixturevalue(conn) + df = DataFrame({"A": [0, 1, 2], "B": [np.nan, np.nan, np.nan]}) + assert df.to_sql(name="test_nan", con=conn, index=False) == 3 - # with read_sql - result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) - tm.assert_frame_equal(result, df) + # with read_table + result = sql.read_sql_table("test_nan", conn) + tm.assert_frame_equal(result, df) + + # with read_sql -> not type info from table -> stays None + df["B"] = df["B"].astype("object") + df["B"] = None + result = sql.read_sql_query("SELECT * FROM test_nan", conn) + tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_nan_string(conn, request): + # NaNs in string column + conn = request.getfixturevalue(conn) + df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", np.nan]}) + assert df.to_sql(name="test_nan", con=conn, index=False) == 3 + + # NaNs are coming back as None + df.loc[2, "B"] = None + + # with read_table + result = sql.read_sql_table("test_nan", conn) + tm.assert_frame_equal(result, df) + + # with read_sql + result = sql.read_sql_query("SELECT * FROM test_nan", conn) + tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("conn", all_connectable) +def test_to_sql_save_index(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="ADBC implementation does not create index", strict=True + ) + ) + conn_name = conn + conn = request.getfixturevalue(conn) + df = DataFrame.from_records( + [(1, 2.1, "line1"), (2, 1.5, "line2")], columns=["A", "B", "C"], index=["A"] + ) - def _get_index_columns(self, tbl_name): + tbl_name = "test_to_sql_saves_index" + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(df, tbl_name) == 2 + + if conn_name in {"sqlite_buildin", "sqlite_str"}: + ixs = sql.read_sql_query( + "SELECT * FROM sqlite_master WHERE type = 'index' " + f"AND tbl_name = '{tbl_name}'", + conn, + ) + ix_cols = [] + for ix_name in ixs.name: + ix_info = sql.read_sql_query(f"PRAGMA index_info({ix_name})", conn) + ix_cols.append(ix_info.name.tolist()) + else: from sqlalchemy import inspect - insp = inspect(self.conn) + insp = inspect(conn) ixs = insp.get_indexes(tbl_name) - ixs = [i["column_names"] for i in ixs] 
- return ixs + ix_cols = [i["column_names"] for i in ixs] + + assert ix_cols == [["A"]] - def test_to_sql_save_index(self): - self._to_sql_save_index() - def test_transactions(self): - self._transaction_test() +@pytest.mark.parametrize("conn", all_connectable) +def test_transactions(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) - def test_get_schema_create_table(self, test_frame3): - # Use a dataframe without a bool column, since MySQL converts bool to - # TINYINT (which read_sql_table returns as an int and causes a dtype - # mismatch) + stmt = "CREATE TABLE test_trans (A INT, B TEXT)" + if conn_name != "sqlite_buildin" and "adbc" not in conn_name: from sqlalchemy import text - from sqlalchemy.engine import Engine - tbl = "test_get_schema_create_table" - create_sql = sql.get_schema(test_frame3, tbl, con=self.conn) - blank_test_df = test_frame3.iloc[:0] - - self.drop_table(tbl, self.conn) - create_sql = text(create_sql) - if isinstance(self.conn, Engine): - with self.conn.connect() as conn: - with conn.begin(): - conn.execute(create_sql) - else: - with self.conn.begin(): - self.conn.execute(create_sql) - returned_df = sql.read_sql_table(tbl, self.conn) - tm.assert_frame_equal(returned_df, blank_test_df, check_index_type=False) - self.drop_table(tbl, self.conn) - - def test_dtype(self): - from sqlalchemy import ( - TEXT, - String, - ) - from sqlalchemy.schema import MetaData - - cols = ["A", "B"] - data = [(0.8, True), (0.9, None)] - df = DataFrame(data, columns=cols) - assert df.to_sql(name="dtype_test", con=self.conn) == 2 - assert df.to_sql(name="dtype_test2", con=self.conn, dtype={"B": TEXT}) == 2 - meta = MetaData() - meta.reflect(bind=self.conn) - sqltype = meta.tables["dtype_test2"].columns["B"].type - assert isinstance(sqltype, TEXT) - msg = "The type of B is not a SQLAlchemy type" - with pytest.raises(ValueError, match=msg): - df.to_sql(name="error", con=self.conn, dtype={"B": str}) + stmt = text(stmt) - # GH9083 - assert ( - df.to_sql(name="dtype_test3", con=self.conn, dtype={"B": String(10)}) == 2 + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction() as trans: + trans.execute(stmt) + + +@pytest.mark.parametrize("conn", all_connectable) +def test_transaction_rollback(conn, request): + conn_name = conn + conn = request.getfixturevalue(conn) + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction() as trans: + stmt = "CREATE TABLE test_trans (A INT, B TEXT)" + if "adbc" in conn_name or isinstance(pandasSQL, SQLiteDatabase): + trans.execute(stmt) + else: + from sqlalchemy import text + + stmt = text(stmt) + trans.execute(stmt) + + class DummyException(Exception): + pass + + # Make sure when transaction is rolled back, no rows get inserted + ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')" + if isinstance(pandasSQL, SQLDatabase): + from sqlalchemy import text + + ins_sql = text(ins_sql) + try: + with pandasSQL.run_transaction() as trans: + trans.execute(ins_sql) + raise DummyException("error") + except DummyException: + # ignore raised exception + pass + with pandasSQL.run_transaction(): + res = pandasSQL.read_query("SELECT * FROM test_trans") + assert len(res) == 0 + + # Make sure when transaction is committed, rows do get inserted + with pandasSQL.run_transaction() as trans: + trans.execute(ins_sql) + res2 = pandasSQL.read_query("SELECT * FROM test_trans") + assert len(res2) == 1 + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_get_schema_create_table(conn, request, 
test_frame3): + # Use a dataframe without a bool column, since MySQL converts bool to + # TINYINT (which read_sql_table returns as an int and causes a dtype + # mismatch) + if conn == "sqlite_str": + request.applymarker( + pytest.mark.xfail(reason="test does not support sqlite_str fixture") ) - meta.reflect(bind=self.conn) - sqltype = meta.tables["dtype_test3"].columns["B"].type - assert isinstance(sqltype, String) - assert sqltype.length == 10 - - # single dtype - assert df.to_sql(name="single_dtype_test", con=self.conn, dtype=TEXT) == 2 - meta.reflect(bind=self.conn) - sqltypea = meta.tables["single_dtype_test"].columns["A"].type - sqltypeb = meta.tables["single_dtype_test"].columns["B"].type - assert isinstance(sqltypea, TEXT) - assert isinstance(sqltypeb, TEXT) - - def test_notna_dtype(self): - from sqlalchemy import ( - Boolean, - DateTime, - Float, - Integer, - ) - from sqlalchemy.schema import MetaData - - cols = { - "Bool": Series([True, None]), - "Date": Series([datetime(2012, 5, 1), None]), - "Int": Series([1, None], dtype="object"), - "Float": Series([1.1, None]), + + conn = request.getfixturevalue(conn) + + from sqlalchemy import text + from sqlalchemy.engine import Engine + + tbl = "test_get_schema_create_table" + create_sql = sql.get_schema(test_frame3, tbl, con=conn) + blank_test_df = test_frame3.iloc[:0] + + create_sql = text(create_sql) + if isinstance(conn, Engine): + with conn.connect() as newcon: + with newcon.begin(): + newcon.execute(create_sql) + else: + conn.execute(create_sql) + returned_df = sql.read_sql_table(tbl, conn) + tm.assert_frame_equal(returned_df, blank_test_df, check_index_type=False) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_dtype(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") + + conn = request.getfixturevalue(conn) + + from sqlalchemy import ( + TEXT, + String, + ) + from sqlalchemy.schema import MetaData + + cols = ["A", "B"] + data = [(0.8, True), (0.9, None)] + df = DataFrame(data, columns=cols) + assert df.to_sql(name="dtype_test", con=conn) == 2 + assert df.to_sql(name="dtype_test2", con=conn, dtype={"B": TEXT}) == 2 + meta = MetaData() + meta.reflect(bind=conn) + sqltype = meta.tables["dtype_test2"].columns["B"].type + assert isinstance(sqltype, TEXT) + msg = "The type of B is not a SQLAlchemy type" + with pytest.raises(ValueError, match=msg): + df.to_sql(name="error", con=conn, dtype={"B": str}) + + # GH9083 + assert df.to_sql(name="dtype_test3", con=conn, dtype={"B": String(10)}) == 2 + meta.reflect(bind=conn) + sqltype = meta.tables["dtype_test3"].columns["B"].type + assert isinstance(sqltype, String) + assert sqltype.length == 10 + + # single dtype + assert df.to_sql(name="single_dtype_test", con=conn, dtype=TEXT) == 2 + meta.reflect(bind=conn) + sqltypea = meta.tables["single_dtype_test"].columns["A"].type + sqltypeb = meta.tables["single_dtype_test"].columns["B"].type + assert isinstance(sqltypea, TEXT) + assert isinstance(sqltypeb, TEXT) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_notna_dtype(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") + + conn_name = conn + conn = request.getfixturevalue(conn) + + from sqlalchemy import ( + Boolean, + DateTime, + Float, + Integer, + ) + from sqlalchemy.schema import MetaData + + cols = { + "Bool": Series([True, None]), + "Date": Series([datetime(2012, 5, 1), None]), + "Int": Series([1, None], dtype="object"), + "Float": Series([1.1, None]), + } + 
df = DataFrame(cols) + + tbl = "notna_dtype_test" + assert df.to_sql(name=tbl, con=conn) == 2 + _ = sql.read_sql_table(tbl, conn) + meta = MetaData() + meta.reflect(bind=conn) + my_type = Integer if "mysql" in conn_name else Boolean + col_dict = meta.tables[tbl].columns + assert isinstance(col_dict["Bool"].type, my_type) + assert isinstance(col_dict["Date"].type, DateTime) + assert isinstance(col_dict["Int"].type, Integer) + assert isinstance(col_dict["Float"].type, Float) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_double_precision(conn, request): + if conn == "sqlite_str": + pytest.skip("sqlite_str has no inspection system") + + conn = request.getfixturevalue(conn) + + from sqlalchemy import ( + BigInteger, + Float, + Integer, + ) + from sqlalchemy.schema import MetaData + + V = 1.23456789101112131415 + + df = DataFrame( + { + "f32": Series([V], dtype="float32"), + "f64": Series([V], dtype="float64"), + "f64_as_f32": Series([V], dtype="float64"), + "i32": Series([5], dtype="int32"), + "i64": Series([5], dtype="int64"), } - df = DataFrame(cols) + ) - tbl = "notna_dtype_test" - assert df.to_sql(name=tbl, con=self.conn) == 2 - _ = sql.read_sql_table(tbl, self.conn) - meta = MetaData() - meta.reflect(bind=self.conn) - my_type = Integer if self.flavor == "mysql" else Boolean - col_dict = meta.tables[tbl].columns - assert isinstance(col_dict["Bool"].type, my_type) - assert isinstance(col_dict["Date"].type, DateTime) - assert isinstance(col_dict["Int"].type, Integer) - assert isinstance(col_dict["Float"].type, Float) - - def test_double_precision(self): - from sqlalchemy import ( - BigInteger, - Float, - Integer, + assert ( + df.to_sql( + name="test_dtypes", + con=conn, + index=False, + if_exists="replace", + dtype={"f64_as_f32": Float(precision=23)}, ) - from sqlalchemy.schema import MetaData + == 1 + ) + res = sql.read_sql_table("test_dtypes", conn) - V = 1.23456789101112131415 + # check precision of float64 + assert np.round(df["f64"].iloc[0], 14) == np.round(res["f64"].iloc[0], 14) - df = DataFrame( - { - "f32": Series([V], dtype="float32"), - "f64": Series([V], dtype="float64"), - "f64_as_f32": Series([V], dtype="float64"), - "i32": Series([5], dtype="int32"), - "i64": Series([5], dtype="int64"), - } - ) + # check sql types + meta = MetaData() + meta.reflect(bind=conn) + col_dict = meta.tables["test_dtypes"].columns + assert str(col_dict["f32"].type) == str(col_dict["f64_as_f32"].type) + assert isinstance(col_dict["f32"].type, Float) + assert isinstance(col_dict["f64"].type, Float) + assert isinstance(col_dict["i32"].type, Integer) + assert isinstance(col_dict["i64"].type, BigInteger) - assert ( - df.to_sql( - name="test_dtypes", - con=self.conn, - index=False, - if_exists="replace", - dtype={"f64_as_f32": Float(precision=23)}, - ) - == 1 - ) - res = sql.read_sql_table("test_dtypes", self.conn) - - # check precision of float64 - assert np.round(df["f64"].iloc[0], 14) == np.round(res["f64"].iloc[0], 14) - - # check sql types - meta = MetaData() - meta.reflect(bind=self.conn) - col_dict = meta.tables["test_dtypes"].columns - assert str(col_dict["f32"].type) == str(col_dict["f64_as_f32"].type) - assert isinstance(col_dict["f32"].type, Float) - assert isinstance(col_dict["f64"].type, Float) - assert isinstance(col_dict["i32"].type, Integer) - assert isinstance(col_dict["i64"].type, BigInteger) - - def test_connectable_issue_example(self): - # This tests the example raised in issue - # https://github.com/pandas-dev/pandas/issues/10104 - from sqlalchemy.engine import 
Engine - def test_select(connection): - query = "SELECT test_foo_data FROM test_foo_data" - return sql.read_sql_query(query, con=connection) - - def test_append(connection, data): - data.to_sql(name="test_foo_data", con=connection, if_exists="append") - - def test_connectable(conn): - # https://github.com/sqlalchemy/sqlalchemy/commit/ - # 00b5c10846e800304caa86549ab9da373b42fa5d#r48323973 - foo_data = test_select(conn) - test_append(conn, foo_data) - - def main(connectable): - if isinstance(connectable, Engine): - with connectable.connect() as conn: - with conn.begin(): - test_connectable(conn) - else: - test_connectable(connectable) +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_connectable_issue_example(conn, request): + conn = request.getfixturevalue(conn) - assert ( - DataFrame({"test_foo_data": [0, 1, 2]}).to_sql( - name="test_foo_data", con=self.conn + # This tests the example raised in issue + # https://github.com/pandas-dev/pandas/issues/10104 + from sqlalchemy.engine import Engine + + def test_select(connection): + query = "SELECT test_foo_data FROM test_foo_data" + return sql.read_sql_query(query, con=connection) + + def test_append(connection, data): + data.to_sql(name="test_foo_data", con=connection, if_exists="append") + + def test_connectable(conn): + # https://github.com/sqlalchemy/sqlalchemy/commit/ + # 00b5c10846e800304caa86549ab9da373b42fa5d#r48323973 + foo_data = test_select(conn) + test_append(conn, foo_data) + + def main(connectable): + if isinstance(connectable, Engine): + with connectable.connect() as conn: + with conn.begin(): + test_connectable(conn) + else: + test_connectable(connectable) + + assert ( + DataFrame({"test_foo_data": [0, 1, 2]}).to_sql(name="test_foo_data", con=conn) + == 3 + ) + main(conn) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +@pytest.mark.parametrize( + "input", + [{"foo": [np.inf]}, {"foo": [-np.inf]}, {"foo": [-np.inf], "infe0": ["bar"]}], +) +def test_to_sql_with_negative_npinf(conn, request, input): + # GH 34431 + + df = DataFrame(input) + conn_name = conn + conn = request.getfixturevalue(conn) + + if "mysql" in conn_name: + # GH 36465 + # The input {"foo": [-np.inf], "infe0": ["bar"]} does not raise any error + # for pymysql version >= 0.10 + # TODO(GH#36465): remove this version check after GH 36465 is fixed + pymysql = pytest.importorskip("pymysql") + + if Version(pymysql.__version__) < Version("1.0.3") and "infe0" in df.columns: + mark = pytest.mark.xfail(reason="GH 36465") + request.applymarker(mark) + + msg = "inf cannot be used with MySQL" + with pytest.raises(ValueError, match=msg): + df.to_sql(name="foobar", con=conn, index=False) + else: + assert df.to_sql(name="foobar", con=conn, index=False) == 1 + res = sql.read_sql_table("foobar", conn) + tm.assert_equal(df, res) + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_temporary_table(conn, request): + if conn == "sqlite_str": + pytest.skip("test does not work with str connection") + + conn = request.getfixturevalue(conn) + + from sqlalchemy import ( + Column, + Integer, + Unicode, + select, + ) + from sqlalchemy.orm import ( + Session, + declarative_base, + ) + + test_data = "Hello, World!" 
+ expected = DataFrame({"spam": [test_data]}) + Base = declarative_base() + + class Temporary(Base): + __tablename__ = "temp_test" + __table_args__ = {"prefixes": ["TEMPORARY"]} + id = Column(Integer, primary_key=True) + spam = Column(Unicode(30), nullable=False) + + with Session(conn) as session: + with session.begin(): + conn = session.connection() + Temporary.__table__.create(conn) + session.add(Temporary(spam=test_data)) + session.flush() + df = sql.read_sql_query(sql=select(Temporary.spam), con=conn) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("conn", all_connectable) +def test_invalid_engine(conn, request, test_frame1): + if conn == "sqlite_buildin" or "adbc" in conn: + request.applymarker( + pytest.mark.xfail( + reason="SQLiteDatabase/ADBCDatabase does not raise for bad engine" ) - == 3 ) - main(self.conn) - @pytest.mark.parametrize( - "input", - [{"foo": [np.inf]}, {"foo": [-np.inf]}, {"foo": [-np.inf], "infe0": ["bar"]}], - ) - def test_to_sql_with_negative_npinf(self, input, request): - # GH 34431 - - df = DataFrame(input) - - if self.flavor == "mysql": - # GH 36465 - # The input {"foo": [-np.inf], "infe0": ["bar"]} does not raise any error - # for pymysql version >= 0.10 - # TODO(GH#36465): remove this version check after GH 36465 is fixed - pymysql = pytest.importorskip("pymysql") - - if ( - Version(pymysql.__version__) < Version("1.0.3") - and "infe0" in df.columns - ): - mark = pytest.mark.xfail(reason="GH 36465") - request.node.add_marker(mark) - - msg = "inf cannot be used with MySQL" - with pytest.raises(ValueError, match=msg): - df.to_sql(name="foobar", con=self.conn, index=False) - else: - assert df.to_sql(name="foobar", con=self.conn, index=False) == 1 - res = sql.read_sql_table("foobar", self.conn) - tm.assert_equal(df, res) - - def test_temporary_table(self): - from sqlalchemy import ( - Column, - Integer, - Unicode, - select, - ) - from sqlalchemy.orm import ( - Session, - declarative_base, - ) - - test_data = "Hello, World!" 
- expected = DataFrame({"spam": [test_data]}) - Base = declarative_base() - - class Temporary(Base): - __tablename__ = "temp_test" - __table_args__ = {"prefixes": ["TEMPORARY"]} - id = Column(Integer, primary_key=True) - spam = Column(Unicode(30), nullable=False) - - with Session(self.conn) as session: - with session.begin(): - conn = session.connection() - Temporary.__table__.create(conn) - session.add(Temporary(spam=test_data)) - session.flush() - df = sql.read_sql_query(sql=select(Temporary.spam), con=conn) - tm.assert_frame_equal(df, expected) - - # -- SQL Engine tests (in the base class for now) - def test_invalid_engine(self, test_frame1): - msg = "engine must be one of 'auto', 'sqlalchemy'" + conn = request.getfixturevalue(conn) + msg = "engine must be one of 'auto', 'sqlalchemy'" + with pandasSQL_builder(conn) as pandasSQL: with pytest.raises(ValueError, match=msg): - self._to_sql_with_sql_engine(test_frame1, "bad_engine") + pandasSQL.to_sql(test_frame1, "test_frame1", engine="bad_engine") + + +@pytest.mark.parametrize("conn", all_connectable) +def test_to_sql_with_sql_engine(conn, request, test_frame1): + """`to_sql` with the `engine` param""" + # mostly copied from this class's `_to_sql()` method + conn = request.getfixturevalue(conn) + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1", engine="auto") == 4 + assert pandasSQL.has_table("test_frame1") + + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame1") + assert num_rows == num_entries + + +@pytest.mark.parametrize("conn", sqlalchemy_connectable) +def test_options_sqlalchemy(conn, request, test_frame1): + # use the set option + conn = request.getfixturevalue(conn) + with pd.option_context("io.sql.engine", "sqlalchemy"): + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 + assert pandasSQL.has_table("test_frame1") + + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame1") + assert num_rows == num_entries + + +@pytest.mark.parametrize("conn", all_connectable) +def test_options_auto(conn, request, test_frame1): + # use the set option + conn = request.getfixturevalue(conn) + with pd.option_context("io.sql.engine", "auto"): + with pandasSQL_builder(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4 + assert pandasSQL.has_table("test_frame1") + + num_entries = len(test_frame1) + num_rows = count_rows(conn, "test_frame1") + assert num_rows == num_entries - def test_options_sqlalchemy(self, test_frame1): - # use the set option - with pd.option_context("io.sql.engine", "sqlalchemy"): - self._to_sql_with_sql_engine(test_frame1) - - def test_options_auto(self, test_frame1): - # use the set option - with pd.option_context("io.sql.engine", "auto"): - self._to_sql_with_sql_engine(test_frame1) - def test_options_get_engine(self): +def test_options_get_engine(): + pytest.importorskip("sqlalchemy") + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) + + with pd.option_context("io.sql.engine", "sqlalchemy"): + assert isinstance(get_engine("auto"), SQLAlchemyEngine) assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) - with pd.option_context("io.sql.engine", "sqlalchemy"): - assert isinstance(get_engine("auto"), SQLAlchemyEngine) - assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) - - with pd.option_context("io.sql.engine", "auto"): - 
assert isinstance(get_engine("auto"), SQLAlchemyEngine) - assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) - - def test_get_engine_auto_error_message(self): - # Expect different error messages from get_engine(engine="auto") - # if engines aren't installed vs. are installed but bad version - pass - # TODO(GH#36893) fill this in when we add more engines - - @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) - def test_read_sql_dtype_backend(self, string_storage, func, dtype_backend): - # GH#50048 - table = "test" - df = self.dtype_backend_data() - df.to_sql(name=table, con=self.conn, index=False, if_exists="replace") - - with pd.option_context("mode.string_storage", string_storage): - result = getattr(pd, func)( - f"Select * from {table}", self.conn, dtype_backend=dtype_backend - ) - expected = self.dtype_backend_expected(string_storage, dtype_backend) - tm.assert_frame_equal(result, expected) + with pd.option_context("io.sql.engine", "auto"): + assert isinstance(get_engine("auto"), SQLAlchemyEngine) + assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine) - with pd.option_context("mode.string_storage", string_storage): - iterator = getattr(pd, func)( - f"Select * from {table}", - con=self.conn, - dtype_backend=dtype_backend, - chunksize=3, - ) - expected = self.dtype_backend_expected(string_storage, dtype_backend) - for result in iterator: - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) - def test_read_sql_dtype_backend_table(self, string_storage, func, dtype_backend): - # GH#50048 - table = "test" - df = self.dtype_backend_data() - df.to_sql(name=table, con=self.conn, index=False, if_exists="replace") - - with pd.option_context("mode.string_storage", string_storage): - result = getattr(pd, func)(table, self.conn, dtype_backend=dtype_backend) - expected = self.dtype_backend_expected(string_storage, dtype_backend) - tm.assert_frame_equal(result, expected) - with pd.option_context("mode.string_storage", string_storage): - iterator = getattr(pd, func)( - table, - self.conn, - dtype_backend=dtype_backend, - chunksize=3, - ) - expected = self.dtype_backend_expected(string_storage, dtype_backend) - for result in iterator: - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("func", ["read_sql", "read_sql_table", "read_sql_query"]) - def test_read_sql_invalid_dtype_backend_table(self, func): - table = "test" - df = self.dtype_backend_data() - df.to_sql(name=table, con=self.conn, index=False, if_exists="replace") - - msg = ( - "dtype_backend numpy is invalid, only 'numpy_nullable' and " - "'pyarrow' are allowed." +def test_get_engine_auto_error_message(): + # Expect different error messages from get_engine(engine="auto") + # if engines aren't installed vs. 
are installed but bad version + pass + # TODO(GH#36893) fill this in when we add more engines + + +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) +def test_read_sql_dtype_backend( + conn, + request, + string_storage, + func, + dtype_backend, + dtype_backend_data, + dtype_backend_expected, +): + # GH#50048 + conn_name = conn + conn = request.getfixturevalue(conn) + table = "test" + df = dtype_backend_data + df.to_sql(name=table, con=conn, index=False, if_exists="replace") + + with pd.option_context("mode.string_storage", string_storage): + result = getattr(pd, func)( + f"Select * from {table}", conn, dtype_backend=dtype_backend ) - with pytest.raises(ValueError, match=msg): - getattr(pd, func)(table, self.conn, dtype_backend="numpy") + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + tm.assert_frame_equal(result, expected) - def dtype_backend_data(self) -> DataFrame: - return DataFrame( - { - "a": Series([1, np.nan, 3], dtype="Int64"), - "b": Series([1, 2, 3], dtype="Int64"), - "c": Series([1.5, np.nan, 2.5], dtype="Float64"), - "d": Series([1.5, 2.0, 2.5], dtype="Float64"), - "e": [True, False, None], - "f": [True, False, True], - "g": ["a", "b", "c"], - "h": ["a", "b", None], - } + if "adbc" in conn_name: + # adbc does not support chunksize argument + request.applymarker( + pytest.mark.xfail(reason="adbc does not support chunksize argument") + ) + + with pd.option_context("mode.string_storage", string_storage): + iterator = getattr(pd, func)( + f"Select * from {table}", + con=conn, + dtype_backend=dtype_backend, + chunksize=3, + ) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + for result in iterator: + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) +def test_read_sql_dtype_backend_table( + conn, + request, + string_storage, + func, + dtype_backend, + dtype_backend_data, + dtype_backend_expected, +): + if "sqlite" in conn and "adbc" not in conn: + request.applymarker( + pytest.mark.xfail( + reason=( + "SQLite actually returns proper boolean values via " + "read_sql_table, but before pytest refactor was skipped" + ) + ) + ) + # GH#50048 + conn_name = conn + conn = request.getfixturevalue(conn) + table = "test" + df = dtype_backend_data + df.to_sql(name=table, con=conn, index=False, if_exists="replace") + + with pd.option_context("mode.string_storage", string_storage): + result = getattr(pd, func)(table, conn, dtype_backend=dtype_backend) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + tm.assert_frame_equal(result, expected) + + if "adbc" in conn_name: + # adbc does not support chunksize argument + return + + with pd.option_context("mode.string_storage", string_storage): + iterator = getattr(pd, func)( + table, + conn, + dtype_backend=dtype_backend, + chunksize=3, ) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + for result in iterator: + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("func", ["read_sql", "read_sql_table", "read_sql_query"]) +def test_read_sql_invalid_dtype_backend_table(conn, request, func, dtype_backend_data): + conn = request.getfixturevalue(conn) + table = "test" + df = dtype_backend_data + df.to_sql(name=table, con=conn, index=False, if_exists="replace") + + msg = ( + "dtype_backend numpy is 
invalid, only 'numpy_nullable' and " + "'pyarrow' are allowed." + ) + with pytest.raises(ValueError, match=msg): + getattr(pd, func)(table, conn, dtype_backend="numpy") + + +@pytest.fixture +def dtype_backend_data() -> DataFrame: + return DataFrame( + { + "a": Series([1, np.nan, 3], dtype="Int64"), + "b": Series([1, 2, 3], dtype="Int64"), + "c": Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": [True, False, None], + "f": [True, False, True], + "g": ["a", "b", "c"], + "h": ["a", "b", None], + } + ) - def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: + +@pytest.fixture +def dtype_backend_expected(): + def func(storage, dtype_backend, conn_name) -> DataFrame: string_array: StringArray | ArrowStringArray string_array_na: StringArray | ArrowStringArray if storage == "python": string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) # type: ignore[assignment] + string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) # type: ignore[assignment] + else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["a", "b", "c"])) @@ -2786,188 +3708,164 @@ for col in df.columns } ) + + if "mysql" in conn_name or "sqlite" in conn_name: + if dtype_backend == "numpy_nullable": + df = df.astype({"e": "Int64", "f": "Int64"}) + else: + df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) + return df - def test_chunksize_empty_dtypes(self): - # GH#50245 - dtypes = {"a": "int64", "b": "object"} - df = DataFrame(columns=["a", "b"]).astype(dtypes) - expected = df.copy() - df.to_sql(name="test", con=self.conn, index=False, if_exists="replace") - - for result in read_sql_query( - "SELECT * FROM test", - self.conn, - dtype=dtypes, - chunksize=1, - ): - tm.assert_frame_equal(result, expected) + return func - @pytest.mark.parametrize("dtype_backend", [lib.no_default, "numpy_nullable"]) - @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) - def test_read_sql_dtype(self, func, dtype_backend): - # GH#50797 - table = "test" - df = DataFrame({"a": [1, 2, 3], "b": 5}) - df.to_sql(name=table, con=self.conn, index=False, if_exists="replace") - result = getattr(pd, func)( - f"Select * from {table}", - self.conn, - dtype={"a": np.float64}, - dtype_backend=dtype_backend, - ) - expected = DataFrame( - { - "a": Series([1, 2, 3], dtype=np.float64), - "b": Series( - [5, 5, 5], - dtype="int64" if not dtype_backend == "numpy_nullable" else "Int64", - ), - } +@pytest.mark.parametrize("conn", all_connectable) +def test_chunksize_empty_dtypes(conn, request): + # GH#50245 + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") ) + conn = request.getfixturevalue(conn) + dtypes = {"a": "int64", "b": "object"} + df = DataFrame(columns=["a", "b"]).astype(dtypes) + expected = df.copy() + df.to_sql(name="test", con=conn, index=False, if_exists="replace") + + for result in read_sql_query( + "SELECT * FROM test", + conn, + dtype=dtypes, + chunksize=1, + ): tm.assert_frame_equal(result, expected) -class TestSQLiteAlchemy(_TestSQLAlchemy): - """ - Test the sqlalchemy backend against an in-memory sqlite database. 
+@pytest.mark.parametrize("conn", all_connectable) +@pytest.mark.parametrize("dtype_backend", [lib.no_default, "numpy_nullable"]) +@pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) +def test_read_sql_dtype(conn, request, func, dtype_backend): + # GH#50797 + conn = request.getfixturevalue(conn) + table = "test" + df = DataFrame({"a": [1, 2, 3], "b": 5}) + df.to_sql(name=table, con=conn, index=False, if_exists="replace") + + result = getattr(pd, func)( + f"Select * from {table}", + conn, + dtype={"a": np.float64}, + dtype_backend=dtype_backend, + ) + expected = DataFrame( + { + "a": Series([1, 2, 3], dtype=np.float64), + "b": Series( + [5, 5, 5], + dtype="int64" if not dtype_backend == "numpy_nullable" else "Int64", + ), + } + ) + tm.assert_frame_equal(result, expected) - """ - flavor = "sqlite" +def test_keyword_deprecation(sqlite_engine): + conn = sqlite_engine + # GH 54397 + msg = ( + "Starting with pandas version 3.0 all arguments of to_sql except for the " + "arguments 'name' and 'con' will be keyword-only." + ) + df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) + df.to_sql("example", conn) - @classmethod - def setup_engine(cls): - cls.engine = sqlalchemy.create_engine("sqlite:///:memory:") + with tm.assert_produces_warning(FutureWarning, match=msg): + df.to_sql("example", conn, None, if_exists="replace") - @classmethod - def setup_driver(cls): - # sqlite3 is built-in - cls.driver = None - def test_keyword_deprecation(self): - # GH 54397 - msg = ( - "Starting with pandas version 3.0 all arguments of to_sql except for the " - "arguments 'name' and 'con' will be keyword-only." - ) - df = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 1, "B": 2, "C": 3}]) - df.to_sql("example", self.conn) +def test_bigint_warning(sqlite_engine): + conn = sqlite_engine + # test no warning for BIGINT (to support int64) is raised (GH7433) + df = DataFrame({"a": [1, 2]}, dtype="int64") + assert df.to_sql(name="test_bigintwarning", con=conn, index=False) == 2 - with tm.assert_produces_warning(FutureWarning, match=msg): - df.to_sql("example", self.conn, None, if_exists="replace") + with tm.assert_produces_warning(None): + sql.read_sql_table("test_bigintwarning", conn) - def test_default_type_conversion(self): - df = sql.read_sql_table("types", self.conn) - assert issubclass(df.FloatCol.dtype.type, np.floating) - assert issubclass(df.IntCol.dtype.type, np.integer) +def test_valueerror_exception(sqlite_engine): + conn = sqlite_engine + df = DataFrame({"col1": [1, 2], "col2": [3, 4]}) + with pytest.raises(ValueError, match="Empty table name specified"): + df.to_sql(name="", con=conn, if_exists="replace", index=False) - # sqlite has no boolean type, so integer type is returned - assert issubclass(df.BoolCol.dtype.type, np.integer) - # Int column with NA values stays as float - assert issubclass(df.IntColWithNull.dtype.type, np.floating) +def test_row_object_is_named_tuple(sqlite_engine): + conn = sqlite_engine + # GH 40682 + # Test for the is_named_tuple() function + # Placed here due to its usage of sqlalchemy - # Non-native Bool column with NA values stays as float - assert issubclass(df.BoolColWithNull.dtype.type, np.floating) + from sqlalchemy import ( + Column, + Integer, + String, + ) + from sqlalchemy.orm import ( + declarative_base, + sessionmaker, + ) - def test_default_date_load(self): - df = sql.read_sql_table("types", self.conn) + BaseModel = declarative_base() - # IMPORTANT - sqlite has no native date type, so shouldn't parse, but - assert not 
issubclass(df.DateCol.dtype.type, np.datetime64) + class Test(BaseModel): + __tablename__ = "test_frame" + id = Column(Integer, primary_key=True) + string_column = Column(String(50)) + + with conn.begin(): + BaseModel.metadata.create_all(conn) + Session = sessionmaker(bind=conn) + with Session() as session: + df = DataFrame({"id": [0, 1], "string_column": ["hello", "world"]}) + assert ( + df.to_sql(name="test_frame", con=conn, index=False, if_exists="replace") + == 2 + ) + session.commit() + test_query = session.query(Test.id, Test.string_column) + df = DataFrame(test_query) - def test_bigint_warning(self): - # test no warning for BIGINT (to support int64) is raised (GH7433) - df = DataFrame({"a": [1, 2]}, dtype="int64") - assert df.to_sql(name="test_bigintwarning", con=self.conn, index=False) == 2 - - with tm.assert_produces_warning(None): - sql.read_sql_table("test_bigintwarning", self.conn) - - def test_valueerror_exception(self): - df = DataFrame({"col1": [1, 2], "col2": [3, 4]}) - with pytest.raises(ValueError, match="Empty table name specified"): - df.to_sql(name="", con=self.conn, if_exists="replace", index=False) - - def test_row_object_is_named_tuple(self): - # GH 40682 - # Test for the is_named_tuple() function - # Placed here due to its usage of sqlalchemy - - from sqlalchemy import ( - Column, - Integer, - String, - ) - from sqlalchemy.orm import ( - declarative_base, - sessionmaker, - ) - - BaseModel = declarative_base() - - class Test(BaseModel): - __tablename__ = "test_frame" - id = Column(Integer, primary_key=True) - string_column = Column(String(50)) - - with self.conn.begin(): - BaseModel.metadata.create_all(self.conn) - Session = sessionmaker(bind=self.conn) - with Session() as session: - df = DataFrame({"id": [0, 1], "string_column": ["hello", "world"]}) - assert ( - df.to_sql( - name="test_frame", con=self.conn, index=False, if_exists="replace" - ) - == 2 - ) - session.commit() - test_query = session.query(Test.id, Test.string_column) - df = DataFrame(test_query) - - assert list(df.columns) == ["id", "string_column"] - - def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: - df = super().dtype_backend_expected(storage, dtype_backend) - if dtype_backend == "numpy_nullable": - df = df.astype({"e": "Int64", "f": "Int64"}) - else: - df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) + assert list(df.columns) == ["id", "string_column"] - return df - @pytest.mark.parametrize("func", ["read_sql", "read_sql_table"]) - def test_read_sql_dtype_backend_table(self, string_storage, func): - # GH#50048 Not supported for sqlite - pass - - def test_read_sql_string_inference(self): - # GH#54430 - pytest.importorskip("pyarrow") - table = "test" - df = DataFrame({"a": ["x", "y"]}) - df.to_sql(table, con=self.conn, index=False, if_exists="replace") - - with pd.option_context("future.infer_string", True): - result = read_sql_table(table, self.conn) - - dtype = "string[pyarrow_numpy]" - expected = DataFrame( - {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) - ) +def test_read_sql_string_inference(sqlite_engine): + conn = sqlite_engine + # GH#54430 + pytest.importorskip("pyarrow") + table = "test" + df = DataFrame({"a": ["x", "y"]}) + df.to_sql(table, con=conn, index=False, if_exists="replace") + + with pd.option_context("future.infer_string", True): + result = read_sql_table(table, conn) + + dtype = "string[pyarrow_numpy]" + expected = DataFrame( + {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) + ) + + 
tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result, expected) - def test_roundtripping_datetimes(self): - # GH#54877 - df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]") - df.to_sql("test", self.conn, if_exists="replace", index=False) - result = pd.read_sql("select * from test", self.conn).iloc[0, 0] - assert result == "2020-12-31 12:00:00.000000" +def test_roundtripping_datetimes(sqlite_engine): + conn = sqlite_engine + # GH#54877 + df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]") + df.to_sql("test", conn, if_exists="replace", index=False) + result = pd.read_sql("select * from test", conn).iloc[0, 0] + assert result == "2020-12-31 12:00:00.000000" @pytest.fixture @@ -2989,392 +3887,252 @@ @pytest.mark.db -class TestMySQLAlchemy(_TestSQLAlchemy): - """ - Test the sqlalchemy backend against an MySQL database. - - """ - - flavor = "mysql" - port = 3306 - - @classmethod - def setup_engine(cls): - cls.engine = sqlalchemy.create_engine( - f"mysql+{cls.driver}://root@localhost:{cls.port}/pandas", - connect_args=cls.connect_args, - ) - - @classmethod - def setup_driver(cls): - pymysql = pytest.importorskip("pymysql") - cls.driver = "pymysql" - cls.connect_args = {"client_flag": pymysql.constants.CLIENT.MULTI_STATEMENTS} - - def test_default_type_conversion(self): - pass - - def dtype_backend_expected(self, storage, dtype_backend) -> DataFrame: - df = super().dtype_backend_expected(storage, dtype_backend) - if dtype_backend == "numpy_nullable": - df = df.astype({"e": "Int64", "f": "Int64"}) - else: - df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"}) - - return df - - -@pytest.mark.db -class TestPostgreSQLAlchemy(_TestSQLAlchemy): - """ - Test the sqlalchemy backend against an PostgreSQL database. 
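# Context sketch, not part of the upstream diff: the SQLite datetime round trip
# that test_roundtripping_datetimes (GH#54877) pins down.  SQLite has no native
# datetime type, so the value comes back as text.  Assumes SQLAlchemy is installed.
from datetime import datetime

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("sqlite:///:memory:")
df = pd.DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]")
df.to_sql("test", engine, if_exists="replace", index=False)

value = pd.read_sql("select * from test", engine).iloc[0, 0]
print(repr(value))  # a string such as '2020-12-31 12:00:00.000000'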
- - """ - - flavor = "postgresql" - port = 5432 - - @classmethod - def setup_engine(cls): - cls.engine = sqlalchemy.create_engine( - f"postgresql+{cls.driver}://postgres:postgres@localhost:{cls.port}/pandas" - ) - - @classmethod - def setup_driver(cls): - pytest.importorskip("psycopg2") - cls.driver = "psycopg2" - - def test_schema_support(self): - from sqlalchemy.engine import Engine +def test_psycopg2_schema_support(postgresql_psycopg2_engine): + conn = postgresql_psycopg2_engine - # only test this for postgresql (schema's not supported in - # mysql/sqlite) - df = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) - - # create a schema - with self.conn.begin(): - self.conn.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;") - self.conn.exec_driver_sql("CREATE SCHEMA other;") - - # write dataframe to different schema's - assert df.to_sql(name="test_schema_public", con=self.conn, index=False) == 2 - assert ( - df.to_sql( - name="test_schema_public_explicit", - con=self.conn, - index=False, - schema="public", - ) - == 2 - ) - assert ( - df.to_sql( - name="test_schema_other", con=self.conn, index=False, schema="other" - ) - == 2 + # only test this for postgresql (schema's not supported in + # mysql/sqlite) + df = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) + + # create a schema + with conn.connect() as con: + with con.begin(): + con.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;") + con.exec_driver_sql("CREATE SCHEMA other;") + + # write dataframe to different schema's + assert df.to_sql(name="test_schema_public", con=conn, index=False) == 2 + assert ( + df.to_sql( + name="test_schema_public_explicit", + con=conn, + index=False, + schema="public", ) + == 2 + ) + assert ( + df.to_sql(name="test_schema_other", con=conn, index=False, schema="other") == 2 + ) - # read dataframes back in - res1 = sql.read_sql_table("test_schema_public", self.conn) - tm.assert_frame_equal(df, res1) - res2 = sql.read_sql_table("test_schema_public_explicit", self.conn) - tm.assert_frame_equal(df, res2) - res3 = sql.read_sql_table( - "test_schema_public_explicit", self.conn, schema="public" - ) - tm.assert_frame_equal(df, res3) - res4 = sql.read_sql_table("test_schema_other", self.conn, schema="other") - tm.assert_frame_equal(df, res4) - msg = "Table test_schema_other not found" - with pytest.raises(ValueError, match=msg): - sql.read_sql_table("test_schema_other", self.conn, schema="public") - - # different if_exists options - - # create a schema - with self.conn.begin(): - self.conn.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;") - self.conn.exec_driver_sql("CREATE SCHEMA other;") - - # write dataframe with different if_exists options - assert ( - df.to_sql( - name="test_schema_other", con=self.conn, schema="other", index=False - ) - == 2 - ) + # read dataframes back in + res1 = sql.read_sql_table("test_schema_public", conn) + tm.assert_frame_equal(df, res1) + res2 = sql.read_sql_table("test_schema_public_explicit", conn) + tm.assert_frame_equal(df, res2) + res3 = sql.read_sql_table("test_schema_public_explicit", conn, schema="public") + tm.assert_frame_equal(df, res3) + res4 = sql.read_sql_table("test_schema_other", conn, schema="other") + tm.assert_frame_equal(df, res4) + msg = "Table test_schema_other not found" + with pytest.raises(ValueError, match=msg): + sql.read_sql_table("test_schema_other", conn, schema="public") + + # different if_exists options + + # create a schema + with conn.connect() as con: + with con.begin(): + con.exec_driver_sql("DROP 
SCHEMA IF EXISTS other CASCADE;") + con.exec_driver_sql("CREATE SCHEMA other;") + + # write dataframe with different if_exists options + assert ( + df.to_sql(name="test_schema_other", con=conn, schema="other", index=False) == 2 + ) + df.to_sql( + name="test_schema_other", + con=conn, + schema="other", + index=False, + if_exists="replace", + ) + assert ( df.to_sql( name="test_schema_other", - con=self.conn, + con=conn, schema="other", index=False, - if_exists="replace", - ) - assert ( - df.to_sql( - name="test_schema_other", - con=self.conn, - schema="other", - index=False, - if_exists="append", - ) - == 2 + if_exists="append", ) - res = sql.read_sql_table("test_schema_other", self.conn, schema="other") - tm.assert_frame_equal(concat([df, df], ignore_index=True), res) - - # specifying schema in user-provided meta - - # The schema won't be applied on another Connection - # because of transactional schemas - if isinstance(self.conn, Engine): - engine2 = self.connect() - pdsql = sql.SQLDatabase(engine2, schema="other") - assert pdsql.to_sql(df, "test_schema_other2", index=False) == 2 - assert ( - pdsql.to_sql(df, "test_schema_other2", index=False, if_exists="replace") - == 2 - ) - assert ( - pdsql.to_sql(df, "test_schema_other2", index=False, if_exists="append") - == 2 - ) - res1 = sql.read_sql_table("test_schema_other2", self.conn, schema="other") - res2 = pdsql.read_table("test_schema_other2") - tm.assert_frame_equal(res1, res2) + == 2 + ) + res = sql.read_sql_table("test_schema_other", conn, schema="other") + tm.assert_frame_equal(concat([df, df], ignore_index=True), res) - def test_self_join_date_columns(self): - # GH 44421 - from sqlalchemy.engine import Engine - from sqlalchemy.sql import text - create_table = text( - """ - CREATE TABLE person - ( - id serial constraint person_pkey primary key, - created_dt timestamp with time zone - ); +@pytest.mark.db +def test_self_join_date_columns(postgresql_psycopg2_engine): + # GH 44421 + conn = postgresql_psycopg2_engine + from sqlalchemy.sql import text - INSERT INTO person - VALUES (1, '2021-01-01T00:00:00Z'); + create_table = text( """ - ) - if isinstance(self.conn, Engine): - with self.conn.connect() as con: - with con.begin(): - con.execute(create_table) - else: - with self.conn.begin(): - self.conn.execute(create_table) - - sql_query = ( - 'SELECT * FROM "person" AS p1 INNER JOIN "person" AS p2 ON p1.id = p2.id;' - ) - result = pd.read_sql(sql_query, self.conn) - expected = DataFrame( - [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2 - ) - tm.assert_frame_equal(result, expected) - - # Cleanup - with sql.SQLDatabase(self.conn, need_transaction=True) as pandasSQL: - pandasSQL.drop_table("person") - - -# ----------------------------------------------------------------------------- -# -- Test Sqlite / MySQL fallback - - -class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): - """ - Test the fallback mode against an in-memory sqlite database. 
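# Context sketch, not part of the upstream diff: writing to and reading from an
# explicit schema, as test_psycopg2_schema_support does.  Assumes a running local
# PostgreSQL reachable with the same DSN the removed TestPostgreSQLAlchemy used.
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql+psycopg2://postgres:postgres@localhost:5432/pandas")
df = pd.DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]})

with engine.connect() as con:
    with con.begin():
        con.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;")
        con.exec_driver_sql("CREATE SCHEMA other;")

df.to_sql(name="test_schema_other", con=engine, schema="other", index=False)
print(pd.read_sql_table("test_schema_other", engine, schema="other"))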
+ CREATE TABLE person + ( + id serial constraint person_pkey primary key, + created_dt timestamp with time zone + ); + INSERT INTO person + VALUES (1, '2021-01-01T00:00:00Z'); """ + ) + with conn.connect() as con: + with con.begin(): + con.execute(create_table) - flavor = "sqlite" - - @pytest.fixture(autouse=True) - def setup_method(self, iris_path, types_data): - self.conn = self.connect() - self.load_iris_data(iris_path) - self.load_types_data(types_data) - self.pandasSQL = sql.SQLiteDatabase(self.conn) - - def test_read_sql_parameter(self, sql_strings): - self._read_sql_iris_parameter(sql_strings) - - def test_read_sql_named_parameter(self, sql_strings): - self._read_sql_iris_named_parameter(sql_strings) - - def test_to_sql_empty(self, test_frame1): - self._to_sql_empty(test_frame1) - - def test_create_and_drop_table(self): - temp_frame = DataFrame( - {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} - ) - - assert self.pandasSQL.to_sql(temp_frame, "drop_test_frame") == 4 - - assert self.pandasSQL.has_table("drop_test_frame") - - self.pandasSQL.drop_table("drop_test_frame") - - assert not self.pandasSQL.has_table("drop_test_frame") - - def test_roundtrip(self, test_frame1): - self._roundtrip(test_frame1) - - def test_execute_sql(self): - self._execute_sql() - - def test_datetime_date(self): - # test support for datetime.date - df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - assert df.to_sql(name="test_date", con=self.conn, index=False) == 2 - res = read_sql_query("SELECT * FROM test_date", self.conn) - if self.flavor == "sqlite": - # comes back as strings - tm.assert_frame_equal(res, df.astype(str)) - elif self.flavor == "mysql": - tm.assert_frame_equal(res, df) - - @pytest.mark.parametrize("tz_aware", [False, True]) - def test_datetime_time(self, tz_aware): - # test support for datetime.time, GH #8341 - if not tz_aware: - tz_times = [time(9, 0, 0), time(9, 1, 30)] - else: - tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific") - tz_times = Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz()) + sql_query = ( + 'SELECT * FROM "person" AS p1 INNER JOIN "person" AS p2 ON p1.id = p2.id;' + ) + result = pd.read_sql(sql_query, conn) + expected = DataFrame( + [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2 + ) + tm.assert_frame_equal(result, expected) - df = DataFrame(tz_times, columns=["a"]) + # Cleanup + with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: + pandasSQL.drop_table("person") - assert df.to_sql(name="test_time", con=self.conn, index=False) == 2 - res = read_sql_query("SELECT * FROM test_time", self.conn) - if self.flavor == "sqlite": - # comes back as strings - expected = df.map(lambda _: _.strftime("%H:%M:%S.%f")) - tm.assert_frame_equal(res, expected) - def _get_index_columns(self, tbl_name): - ixs = sql.read_sql_query( - "SELECT * FROM sqlite_master WHERE type = 'index' " - f"AND tbl_name = '{tbl_name}'", - self.conn, - ) - ix_cols = [] - for ix_name in ixs.name: - ix_info = sql.read_sql_query(f"PRAGMA index_info({ix_name})", self.conn) - ix_cols.append(ix_info.name.tolist()) - return ix_cols +def test_create_and_drop_table(sqlite_engine): + conn = sqlite_engine + temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}) + with sql.SQLDatabase(conn) as pandasSQL: + with pandasSQL.run_transaction(): + assert pandasSQL.to_sql(temp_frame, "drop_test_frame") == 4 - def test_to_sql_save_index(self): - self._to_sql_save_index() + assert 
pandasSQL.has_table("drop_test_frame") - def test_transactions(self): - self._transaction_test() + with pandasSQL.run_transaction(): + pandasSQL.drop_table("drop_test_frame") - def _get_sqlite_column_type(self, table, column): - recs = self.conn.execute(f"PRAGMA table_info({table})") - for cid, name, ctype, not_null, default, pk in recs: - if name == column: - return ctype - raise ValueError(f"Table {table}, column {column} not found") - - def test_dtype(self): - if self.flavor == "mysql": - pytest.skip("Not applicable to MySQL legacy") - cols = ["A", "B"] - data = [(0.8, True), (0.9, None)] - df = DataFrame(data, columns=cols) - assert df.to_sql(name="dtype_test", con=self.conn) == 2 - assert df.to_sql(name="dtype_test2", con=self.conn, dtype={"B": "STRING"}) == 2 + assert not pandasSQL.has_table("drop_test_frame") - # sqlite stores Boolean values as INTEGER - assert self._get_sqlite_column_type("dtype_test", "B") == "INTEGER" - assert self._get_sqlite_column_type("dtype_test2", "B") == "STRING" - msg = r"B \(\) not a string" - with pytest.raises(ValueError, match=msg): - df.to_sql(name="error", con=self.conn, dtype={"B": bool}) +def test_sqlite_datetime_date(sqlite_buildin): + conn = sqlite_buildin + df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) + assert df.to_sql(name="test_date", con=conn, index=False) == 2 + res = read_sql_query("SELECT * FROM test_date", conn) + # comes back as strings + tm.assert_frame_equal(res, df.astype(str)) - # single dtype - assert df.to_sql(name="single_dtype_test", con=self.conn, dtype="STRING") == 2 - assert self._get_sqlite_column_type("single_dtype_test", "A") == "STRING" - assert self._get_sqlite_column_type("single_dtype_test", "B") == "STRING" - - def test_notna_dtype(self): - if self.flavor == "mysql": - pytest.skip("Not applicable to MySQL legacy") - - cols = { - "Bool": Series([True, None]), - "Date": Series([datetime(2012, 5, 1), None]), - "Int": Series([1, None], dtype="object"), - "Float": Series([1.1, None]), - } - df = DataFrame(cols) - tbl = "notna_dtype_test" - assert df.to_sql(name=tbl, con=self.conn) == 2 +@pytest.mark.parametrize("tz_aware", [False, True]) +def test_sqlite_datetime_time(tz_aware, sqlite_buildin): + conn = sqlite_buildin + # test support for datetime.time, GH #8341 + if not tz_aware: + tz_times = [time(9, 0, 0), time(9, 1, 30)] + else: + tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific") + tz_times = Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz()) - assert self._get_sqlite_column_type(tbl, "Bool") == "INTEGER" - assert self._get_sqlite_column_type(tbl, "Date") == "TIMESTAMP" - assert self._get_sqlite_column_type(tbl, "Int") == "INTEGER" - assert self._get_sqlite_column_type(tbl, "Float") == "REAL" + df = DataFrame(tz_times, columns=["a"]) - def test_illegal_names(self): - # For sqlite, these should work fine - df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) + assert df.to_sql(name="test_time", con=conn, index=False) == 2 + res = read_sql_query("SELECT * FROM test_time", conn) + # comes back as strings + expected = df.map(lambda _: _.strftime("%H:%M:%S.%f")) + tm.assert_frame_equal(res, expected) + + +def get_sqlite_column_type(conn, table, column): + recs = conn.execute(f"PRAGMA table_info({table})") + for cid, name, ctype, not_null, default, pk in recs: + if name == column: + return ctype + raise ValueError(f"Table {table}, column {column} not found") + + +def test_sqlite_test_dtype(sqlite_buildin): + conn = sqlite_buildin + cols = ["A", "B"] + data = [(0.8, True), 
(0.9, None)] + df = DataFrame(data, columns=cols) + assert df.to_sql(name="dtype_test", con=conn) == 2 + assert df.to_sql(name="dtype_test2", con=conn, dtype={"B": "STRING"}) == 2 + + # sqlite stores Boolean values as INTEGER + assert get_sqlite_column_type(conn, "dtype_test", "B") == "INTEGER" + + assert get_sqlite_column_type(conn, "dtype_test2", "B") == "STRING" + msg = r"B \(\) not a string" + with pytest.raises(ValueError, match=msg): + df.to_sql(name="error", con=conn, dtype={"B": bool}) + + # single dtype + assert df.to_sql(name="single_dtype_test", con=conn, dtype="STRING") == 2 + assert get_sqlite_column_type(conn, "single_dtype_test", "A") == "STRING" + assert get_sqlite_column_type(conn, "single_dtype_test", "B") == "STRING" + + +def test_sqlite_notna_dtype(sqlite_buildin): + conn = sqlite_buildin + cols = { + "Bool": Series([True, None]), + "Date": Series([datetime(2012, 5, 1), None]), + "Int": Series([1, None], dtype="object"), + "Float": Series([1.1, None]), + } + df = DataFrame(cols) - msg = "Empty table or column name specified" - with pytest.raises(ValueError, match=msg): - df.to_sql(name="", con=self.conn) + tbl = "notna_dtype_test" + assert df.to_sql(name=tbl, con=conn) == 2 - for ndx, weird_name in enumerate( - [ - "test_weird_name]", - "test_weird_name[", - "test_weird_name`", - 'test_weird_name"', - "test_weird_name'", - "_b.test_weird_name_01-30", - '"_b.test_weird_name_01-30"', - "99beginswithnumber", - "12345", - "\xe9", - ] - ): - assert df.to_sql(name=weird_name, con=self.conn) == 2 - sql.table_exists(weird_name, self.conn) + assert get_sqlite_column_type(conn, tbl, "Bool") == "INTEGER" + assert get_sqlite_column_type(conn, tbl, "Date") == "TIMESTAMP" + assert get_sqlite_column_type(conn, tbl, "Int") == "INTEGER" + assert get_sqlite_column_type(conn, tbl, "Float") == "REAL" - df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name]) - c_tbl = f"test_weird_col_name{ndx:d}" - assert df2.to_sql(name=c_tbl, con=self.conn) == 2 - sql.table_exists(c_tbl, self.conn) +def test_sqlite_illegal_names(sqlite_buildin): + # For sqlite, these should work fine + conn = sqlite_buildin + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) -# ----------------------------------------------------------------------------- -# -- Old tests from 0.13.1 (before refactor using sqlalchemy) + msg = "Empty table or column name specified" + with pytest.raises(ValueError, match=msg): + df.to_sql(name="", con=conn) + for ndx, weird_name in enumerate( + [ + "test_weird_name]", + "test_weird_name[", + "test_weird_name`", + 'test_weird_name"', + "test_weird_name'", + "_b.test_weird_name_01-30", + '"_b.test_weird_name_01-30"', + "99beginswithnumber", + "12345", + "\xe9", + ] + ): + assert df.to_sql(name=weird_name, con=conn) == 2 + sql.table_exists(weird_name, conn) -_formatters = { - datetime: "'{}'".format, - str: "'{}'".format, - np.str_: "'{}'".format, - bytes: "'{}'".format, - float: "{:.8f}".format, - int: "{:d}".format, - type(None): lambda x: "NULL", - np.float64: "{:.10f}".format, - bool: "'{!s}'".format, -} + df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name]) + c_tbl = f"test_weird_col_name{ndx:d}" + assert df2.to_sql(name=c_tbl, con=conn) == 2 + sql.table_exists(c_tbl, conn) def format_query(sql, *args): + _formatters = { + datetime: "'{}'".format, + str: "'{}'".format, + np.str_: "'{}'".format, + bytes: "'{}'".format, + float: "{:.8f}".format, + int: "{:d}".format, + type(None): lambda x: "NULL", + np.float64: "{:.10f}".format, + bool: "'{!s}'".format, + } processed_args = 
[] for arg in args: if isinstance(arg, float) and isna(arg): @@ -3393,227 +4151,238 @@ return None if res is None else list(res) -class TestXSQLite: - def drop_table(self, table_name, conn): - cur = conn.cursor() - cur.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}") - conn.commit() +def test_xsqlite_basic(sqlite_buildin): + frame = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + assert sql.to_sql(frame, name="test_table", con=sqlite_buildin, index=False) == 10 + result = sql.read_sql("select * from test_table", sqlite_buildin) - def test_basic(self, sqlite_buildin): - frame = tm.makeTimeDataFrame() - assert ( - sql.to_sql(frame, name="test_table", con=sqlite_buildin, index=False) == 30 - ) - result = sql.read_sql("select * from test_table", sqlite_buildin) + # HACK! Change this once indexes are handled properly. + result.index = frame.index - # HACK! Change this once indexes are handled properly. - result.index = frame.index + expected = frame + tm.assert_frame_equal(result, frame) - expected = frame - tm.assert_frame_equal(result, frame) + frame["txt"] = ["a"] * len(frame) + frame2 = frame.copy() + new_idx = Index(np.arange(len(frame2)), dtype=np.int64) + 10 + frame2["Idx"] = new_idx.copy() + assert sql.to_sql(frame2, name="test_table2", con=sqlite_buildin, index=False) == 10 + result = sql.read_sql("select * from test_table2", sqlite_buildin, index_col="Idx") + expected = frame.copy() + expected.index = new_idx + expected.index.name = "Idx" + tm.assert_frame_equal(expected, result) + + +def test_xsqlite_write_row_by_row(sqlite_buildin): + frame = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + frame.iloc[0, 0] = np.nan + create_sql = sql.get_schema(frame, "test") + cur = sqlite_buildin.cursor() + cur.execute(create_sql) + + ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" + for _, row in frame.iterrows(): + fmt_sql = format_query(ins, *row) + tquery(fmt_sql, con=sqlite_buildin) + + sqlite_buildin.commit() + + result = sql.read_sql("select * from test", con=sqlite_buildin) + result.index = frame.index + tm.assert_frame_equal(result, frame, rtol=1e-3) + + +def test_xsqlite_execute(sqlite_buildin): + frame = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + create_sql = sql.get_schema(frame, "test") + cur = sqlite_buildin.cursor() + cur.execute(create_sql) + ins = "INSERT INTO test VALUES (?, ?, ?, ?)" - frame["txt"] = ["a"] * len(frame) - frame2 = frame.copy() - new_idx = Index(np.arange(len(frame2)), dtype=np.int64) + 10 - frame2["Idx"] = new_idx.copy() - assert ( - sql.to_sql(frame2, name="test_table2", con=sqlite_buildin, index=False) - == 30 - ) - result = sql.read_sql( - "select * from test_table2", sqlite_buildin, index_col="Idx" - ) - expected = frame.copy() - expected.index = new_idx - expected.index.name = "Idx" - tm.assert_frame_equal(expected, result) - - def test_write_row_by_row(self, sqlite_buildin): - frame = tm.makeTimeDataFrame() - frame.iloc[0, 0] = np.nan - create_sql = sql.get_schema(frame, "test") - cur = sqlite_buildin.cursor() - cur.execute(create_sql) + row = frame.iloc[0] + with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: + pandas_sql.execute(ins, tuple(row)) + 
sqlite_buildin.commit() + + result = sql.read_sql("select * from test", sqlite_buildin) + result.index = frame.index[:1] + tm.assert_frame_equal(result, frame[:1]) - ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" - for _, row in frame.iterrows(): - fmt_sql = format_query(ins, *row) - tquery(fmt_sql, con=sqlite_buildin) - - sqlite_buildin.commit() - - result = sql.read_sql("select * from test", con=sqlite_buildin) - result.index = frame.index - tm.assert_frame_equal(result, frame, rtol=1e-3) - - def test_execute(self, sqlite_buildin): - frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, "test") - cur = sqlite_buildin.cursor() - cur.execute(create_sql) - ins = "INSERT INTO test VALUES (?, ?, ?, ?)" - row = frame.iloc[0] - with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: - pandas_sql.execute(ins, tuple(row)) - sqlite_buildin.commit() - - result = sql.read_sql("select * from test", sqlite_buildin) - result.index = frame.index[:1] - tm.assert_frame_equal(result, frame[:1]) - - def test_schema(self, sqlite_buildin): - frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, "test") - lines = create_sql.splitlines() - for line in lines: - tokens = line.split(" ") - if len(tokens) == 2 and tokens[0] == "A": - assert tokens[1] == "DATETIME" - - create_sql = sql.get_schema(frame, "test", keys=["A", "B"]) - lines = create_sql.splitlines() - assert 'PRIMARY KEY ("A", "B")' in create_sql - cur = sqlite_buildin.cursor() - cur.execute(create_sql) +def test_xsqlite_schema(sqlite_buildin): + frame = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) + create_sql = sql.get_schema(frame, "test") + lines = create_sql.splitlines() + for line in lines: + tokens = line.split(" ") + if len(tokens) == 2 and tokens[0] == "A": + assert tokens[1] == "DATETIME" + + create_sql = sql.get_schema(frame, "test", keys=["A", "B"]) + lines = create_sql.splitlines() + assert 'PRIMARY KEY ("A", "B")' in create_sql + cur = sqlite_buildin.cursor() + cur.execute(create_sql) + + +def test_xsqlite_execute_fail(sqlite_buildin): + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a, b) + ); + """ + cur = sqlite_buildin.cursor() + cur.execute(create_sql) - def test_execute_fail(self, sqlite_buildin): - create_sql = """ - CREATE TABLE test - ( - a TEXT, - b TEXT, - c REAL, - PRIMARY KEY (a, b) - ); - """ - cur = sqlite_buildin.cursor() + with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: + pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') + pandas_sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)') + + with pytest.raises(sql.DatabaseError, match="Execution failed on sql"): + pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 7)') + + +def test_xsqlite_execute_closed_connection(): + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a, b) + ); + """ + with contextlib.closing(sqlite3.connect(":memory:")) as conn: + cur = conn.cursor() cur.execute(create_sql) - with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql: + with sql.pandasSQL_builder(conn) as pandas_sql: pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') - pandas_sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)') - with pytest.raises(sql.DatabaseError, match="Execution failed on sql"): - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 7)') + msg = "Cannot operate on a closed 
database." + with pytest.raises(sqlite3.ProgrammingError, match=msg): + tquery("select * from test", con=conn) - def test_execute_closed_connection(self): - create_sql = """ - CREATE TABLE test - ( - a TEXT, - b TEXT, - c REAL, - PRIMARY KEY (a, b) - ); - """ - with contextlib.closing(sqlite3.connect(":memory:")) as conn: - cur = conn.cursor() - cur.execute(create_sql) - - with sql.pandasSQL_builder(conn) as pandas_sql: - pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)') - - msg = "Cannot operate on a closed database." - with pytest.raises(sqlite3.ProgrammingError, match=msg): - tquery("select * from test", con=conn) - - def test_keyword_as_column_names(self, sqlite_buildin): - df = DataFrame({"From": np.ones(5)}) - assert sql.to_sql(df, con=sqlite_buildin, name="testkeywords", index=False) == 5 - - def test_onecolumn_of_integer(self, sqlite_buildin): - # GH 3628 - # a column_of_integers dataframe should transfer well to sql - - mono_df = DataFrame([1, 2], columns=["c0"]) - assert sql.to_sql(mono_df, con=sqlite_buildin, name="mono_df", index=False) == 2 - # computing the sum via sql - con_x = sqlite_buildin - the_sum = sum(my_c0[0] for my_c0 in con_x.execute("select * from mono_df")) - # it should not fail, and gives 3 ( Issue #3628 ) - assert the_sum == 3 - - result = sql.read_sql("select * from mono_df", con_x) - tm.assert_frame_equal(result, mono_df) - - def test_if_exists(self, sqlite_buildin): - df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) - df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) - table_name = "table_if_exists" - sql_select = f"SELECT * FROM {table_name}" - msg = "'notvalidvalue' is not valid for if_exists" - with pytest.raises(ValueError, match=msg): - sql.to_sql( - frame=df_if_exists_1, - con=sqlite_buildin, - name=table_name, - if_exists="notvalidvalue", - ) - self.drop_table(table_name, sqlite_buildin) +def test_xsqlite_keyword_as_column_names(sqlite_buildin): + df = DataFrame({"From": np.ones(5)}) + assert sql.to_sql(df, con=sqlite_buildin, name="testkeywords", index=False) == 5 + - # test if_exists='fail' +def test_xsqlite_onecolumn_of_integer(sqlite_buildin): + # GH 3628 + # a column_of_integers dataframe should transfer well to sql + + mono_df = DataFrame([1, 2], columns=["c0"]) + assert sql.to_sql(mono_df, con=sqlite_buildin, name="mono_df", index=False) == 2 + # computing the sum via sql + con_x = sqlite_buildin + the_sum = sum(my_c0[0] for my_c0 in con_x.execute("select * from mono_df")) + # it should not fail, and gives 3 ( Issue #3628 ) + assert the_sum == 3 + + result = sql.read_sql("select * from mono_df", con_x) + tm.assert_frame_equal(result, mono_df) + + +def test_xsqlite_if_exists(sqlite_buildin): + df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) + df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) + table_name = "table_if_exists" + sql_select = f"SELECT * FROM {table_name}" + + msg = "'notvalidvalue' is not valid for if_exists" + with pytest.raises(ValueError, match=msg): sql.to_sql( - frame=df_if_exists_1, con=sqlite_buildin, name=table_name, if_exists="fail" + frame=df_if_exists_1, + con=sqlite_buildin, + name=table_name, + if_exists="notvalidvalue", ) - msg = "Table 'table_if_exists' already exists" - with pytest.raises(ValueError, match=msg): - sql.to_sql( - frame=df_if_exists_1, - con=sqlite_buildin, - name=table_name, - if_exists="fail", - ) - # test if_exists='replace' + drop_table(table_name, sqlite_buildin) + + # test if_exists='fail' + 
sql.to_sql( + frame=df_if_exists_1, con=sqlite_buildin, name=table_name, if_exists="fail" + ) + msg = "Table 'table_if_exists' already exists" + with pytest.raises(ValueError, match=msg): sql.to_sql( frame=df_if_exists_1, con=sqlite_buildin, name=table_name, + if_exists="fail", + ) + # test if_exists='replace' + sql.to_sql( + frame=df_if_exists_1, + con=sqlite_buildin, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")] + assert ( + sql.to_sql( + frame=df_if_exists_2, + con=sqlite_buildin, + name=table_name, if_exists="replace", index=False, ) - assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")] - assert ( - sql.to_sql( - frame=df_if_exists_2, - con=sqlite_buildin, - name=table_name, - if_exists="replace", - index=False, - ) - == 3 - ) - assert tquery(sql_select, con=sqlite_buildin) == [(3, "C"), (4, "D"), (5, "E")] - self.drop_table(table_name, sqlite_buildin) + == 3 + ) + assert tquery(sql_select, con=sqlite_buildin) == [(3, "C"), (4, "D"), (5, "E")] + drop_table(table_name, sqlite_buildin) - # test if_exists='append' - assert ( - sql.to_sql( - frame=df_if_exists_1, - con=sqlite_buildin, - name=table_name, - if_exists="fail", - index=False, - ) - == 2 + # test if_exists='append' + assert ( + sql.to_sql( + frame=df_if_exists_1, + con=sqlite_buildin, + name=table_name, + if_exists="fail", + index=False, ) - assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")] - assert ( - sql.to_sql( - frame=df_if_exists_2, - con=sqlite_buildin, - name=table_name, - if_exists="append", - index=False, - ) - == 3 - ) - assert tquery(sql_select, con=sqlite_buildin) == [ - (1, "A"), - (2, "B"), - (3, "C"), - (4, "D"), - (5, "E"), - ] - self.drop_table(table_name, sqlite_buildin) + == 2 + ) + assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")] + assert ( + sql.to_sql( + frame=df_if_exists_2, + con=sqlite_buildin, + name=table_name, + if_exists="append", + index=False, + ) + == 3 + ) + assert tquery(sql_select, con=sqlite_buildin) == [ + (1, "A"), + (2, "B"), + (3, "C"), + (4, "D"), + (5, "E"), + ] + drop_table(table_name, sqlite_buildin) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_stata.py pandas-2.2.2+dfsg/pandas/tests/io/test_stata.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_stata.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_stata.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,6 +11,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import CategoricalDtype import pandas._testing as tm @@ -135,7 +137,6 @@ tm.assert_frame_equal(parsed, expected) - @pytest.mark.filterwarnings("always") def test_read_dta2(self, datapath): expected = DataFrame.from_records( [ @@ -184,11 +185,13 @@ parsed_115 = self.read_dta(path2) with tm.assert_produces_warning(UserWarning): parsed_117 = self.read_dta(path3) + # FIXME: don't leave commented-out # 113 is buggy due to limits of date format support in Stata # parsed_113 = self.read_dta( # datapath("io", "data", "stata", "stata2_113.dta") # ) + # FIXME: don't leave commented-out # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats # tm.assert_frame_equal(parsed_113, expected) @@ -798,7 +801,7 @@ expected_values.insert(0, ".") for t in types: offset = valid_range[t][1] - for i in range(0, 27): + for i in range(27): val = StataMissingValue(offset + 1 + i) assert 
val.string == expected_values[i] @@ -1541,14 +1544,22 @@ df.to_stata(path) def test_path_pathlib(self): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) df.index.name = "index" reader = lambda x: read_stata(x).set_index("index") result = tm.round_trip_pathlib(df.to_stata, reader) tm.assert_frame_equal(df, result) def test_pickle_path_localpath(self): - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) df.index.name = "index" reader = lambda x: read_stata(x).set_index("index") result = tm.round_trip_localpath(df.to_stata, reader) @@ -1569,7 +1580,11 @@ def test_set_index(self): # GH 17328 - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) df.index.name = "index" with tm.ensure_clean() as path: df.to_stata(path) @@ -1706,7 +1721,11 @@ def test_nonfile_writing(self, version): # GH 21041 bio = io.BytesIO() - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) df.index.name = "index" with tm.ensure_clean() as path: df.to_stata(bio, version=version) @@ -1718,7 +1737,11 @@ def test_gzip_writing(self): # writing version 117 requires seek and cannot be used with gzip - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=pd.Index(list("ABCD"), dtype=object), + index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + ) df.index.name = "index" with tm.ensure_clean() as path: with gzip.GzipFile(path, "wb") as gz: @@ -1832,15 +1855,14 @@ @pytest.mark.slow def test_stata_119(self, datapath): # Gzipped since contains 32,999 variables and uncompressed is 20MiB + # Just validate that the reader reports correct number of variables + # to avoid high peak memory with gzip.open( datapath("io", "data", "stata", "stata1_119.dta.gz"), "rb" ) as gz: - df = read_stata(gz) - assert df.shape == (1, 32999) - assert df.iloc[0, 6] == "A" * 3000 - assert df.iloc[0, 7] == 3.14 - assert df.iloc[0, -1] == 1 - assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21)) + with StataReader(gz) as reader: + reader._ensure_open() + assert reader._nvar == 32999 @pytest.mark.parametrize("version", [118, 119, None]) def test_utf8_writer(self, version): @@ -1901,6 +1923,41 @@ with pytest.raises(ValueError, match="You must use version 119"): StataWriterUTF8(path, df, version=118) + @pytest.mark.parametrize( + "dtype_backend", + ["numpy_nullable", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))], + ) + def test_read_write_ea_dtypes(self, dtype_backend): + df = DataFrame( + { + "a": [1, 2, None], + "b": ["a", "b", "c"], + "c": [True, False, None], + "d": [1.5, 2.5, 3.5], + "e": pd.date_range("2020-12-31", periods=3, freq="D"), + }, + index=pd.Index([0, 1, 2], name="index"), + ) + df = df.convert_dtypes(dtype_backend=dtype_backend) + df.to_stata("test_stata.dta", version=118) + + with tm.ensure_clean() as path: + df.to_stata(path) + written_and_read_again = self.read_dta(path) + + expected = DataFrame( + { + "a": [1, 2, np.nan], + "b": ["a", "b", "c"], + "c": [1.0, 0, np.nan], + "d": 
[1.5, 2.5, 3.5], + "e": pd.date_range("2020-12-31", periods=3, freq="D"), + }, + index=pd.Index([0, 1, 2], name="index", dtype=np.int32), + ) + + tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) + @pytest.mark.parametrize("version", [105, 108, 111, 113, 114]) def test_backward_compat(version, datapath): diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/test_user_agent.py pandas-2.2.2+dfsg/pandas/tests/io/test_user_agent.py --- pandas-2.1.4+dfsg/pandas/tests/io/test_user_agent.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/test_user_agent.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,400 +0,0 @@ -""" -Tests for the pandas custom headers in http(s) requests -""" -import gzip -import http.server -from io import BytesIO -import multiprocessing -import socket -import time -import urllib.error - -import pytest - -from pandas.compat import is_ci_environment -import pandas.util._test_decorators as td - -import pandas as pd -import pandas._testing as tm - -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.skipif( - is_ci_environment(), - reason="GH 45651: This test can hang in our CI min_versions build", - ), -] - - -class BaseUserAgentResponder(http.server.BaseHTTPRequestHandler): - """ - Base class for setting up a server that can be set up to respond - with a particular file format with accompanying content-type headers. - The interfaces on the different io methods are different enough - that this seemed logical to do. - """ - - def start_processing_headers(self): - """ - shared logic at the start of a GET request - """ - self.send_response(200) - self.requested_from_user_agent = self.headers["User-Agent"] - response_df = pd.DataFrame( - { - "header": [self.requested_from_user_agent], - } - ) - return response_df - - def gzip_bytes(self, response_bytes): - """ - some web servers will send back gzipped files to save bandwidth - """ - with BytesIO() as bio: - with gzip.GzipFile(fileobj=bio, mode="w") as zipper: - zipper.write(response_bytes) - response_bytes = bio.getvalue() - return response_bytes - - def write_back_bytes(self, response_bytes): - """ - shared logic at the end of a GET request - """ - self.wfile.write(response_bytes) - - -class CSVUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - - self.send_header("Content-Type", "text/csv") - self.end_headers() - - response_bytes = response_df.to_csv(index=False).encode("utf-8") - self.write_back_bytes(response_bytes) - - -class GzippedCSVUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "text/csv") - self.send_header("Content-Encoding", "gzip") - self.end_headers() - - response_bytes = response_df.to_csv(index=False).encode("utf-8") - response_bytes = self.gzip_bytes(response_bytes) - - self.write_back_bytes(response_bytes) - - -class JSONUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/json") - self.end_headers() - - response_bytes = response_df.to_json().encode("utf-8") - - self.write_back_bytes(response_bytes) - - -class GzippedJSONUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/json") - self.send_header("Content-Encoding", "gzip") - self.end_headers() - - response_bytes = response_df.to_json().encode("utf-8") - 
response_bytes = self.gzip_bytes(response_bytes) - - self.write_back_bytes(response_bytes) - - -class HTMLUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "text/html") - self.end_headers() - - response_bytes = response_df.to_html(index=False).encode("utf-8") - - self.write_back_bytes(response_bytes) - - -class ParquetPyArrowUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/octet-stream") - self.end_headers() - - response_bytes = response_df.to_parquet(index=False, engine="pyarrow") - - self.write_back_bytes(response_bytes) - - -class ParquetFastParquetUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/octet-stream") - self.end_headers() - - # the fastparquet engine doesn't like to write to a buffer - # it can do it via the open_with function being set appropriately - # however it automatically calls the close method and wipes the buffer - # so just overwrite that attribute on this instance to not do that - - # protected by an importorskip in the respective test - import fsspec - - response_df.to_parquet( - "memory://fastparquet_user_agent.parquet", - index=False, - engine="fastparquet", - compression=None, - ) - with fsspec.open("memory://fastparquet_user_agent.parquet", "rb") as f: - response_bytes = f.read() - - self.write_back_bytes(response_bytes) - - -class PickleUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/octet-stream") - self.end_headers() - - bio = BytesIO() - response_df.to_pickle(bio) - response_bytes = bio.getvalue() - - self.write_back_bytes(response_bytes) - - -class StataUserAgentResponder(BaseUserAgentResponder): - def do_GET(self): - response_df = self.start_processing_headers() - self.send_header("Content-Type", "application/octet-stream") - self.end_headers() - - bio = BytesIO() - response_df.to_stata(bio, write_index=False) - response_bytes = bio.getvalue() - - self.write_back_bytes(response_bytes) - - -class AllHeaderCSVResponder(http.server.BaseHTTPRequestHandler): - """ - Send all request headers back for checking round trip - """ - - def do_GET(self): - response_df = pd.DataFrame(self.headers.items()) - self.send_response(200) - self.send_header("Content-Type", "text/csv") - self.end_headers() - response_bytes = response_df.to_csv(index=False).encode("utf-8") - self.wfile.write(response_bytes) - - -def wait_until_ready(func, *args, **kwargs): - def inner(*args, **kwargs): - while True: - try: - return func(*args, **kwargs) - except urllib.error.URLError: - # Connection refused as http server is starting - time.sleep(0.1) - - return inner - - -def process_server(responder, port): - with http.server.HTTPServer(("localhost", port), responder) as server: - server.handle_request() - server.server_close() - - -@pytest.fixture -def responder(request): - """ - Fixture that starts a local http server in a separate process on localhost - and returns the port. - - Running in a separate process instead of a thread to allow termination/killing - of http server upon cleanup. 
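# Context sketch, not part of the upstream diff: the user-facing feature the
# removed test_user_agent.py covered -- custom HTTP headers (User-Agent, Auth)
# passed through storage_options when reading from an http(s) URL.  The URL here
# is a hypothetical placeholder; any endpoint serving CSV would do.
import pandas as pd

df_http = pd.read_csv(
    "http://localhost:8080/data.csv",  # placeholder endpoint
    storage_options={"User-Agent": "Super Cool One", "Auth": "Super Secret One"},
)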
- """ - # Find an available port - with socket.socket() as sock: - sock.bind(("localhost", 0)) - port = sock.getsockname()[1] - - server_process = multiprocessing.Process( - target=process_server, args=(request.param, port) - ) - server_process.start() - yield port - server_process.join(10) - server_process.terminate() - kill_time = 5 - wait_time = 0 - while server_process.is_alive(): - if wait_time > kill_time: - server_process.kill() - break - wait_time += 0.1 - time.sleep(0.1) - server_process.close() - - -@pytest.mark.parametrize( - "responder, read_method, parquet_engine", - [ - (CSVUserAgentResponder, pd.read_csv, None), - (JSONUserAgentResponder, pd.read_json, None), - ( - HTMLUserAgentResponder, - lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0], - None, - ), - (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"), - pytest.param( - ParquetFastParquetUserAgentResponder, - pd.read_parquet, - "fastparquet", - # TODO(ArrayManager) fastparquet - marks=[ - td.skip_array_manager_not_yet_implemented, - ], - ), - (PickleUserAgentResponder, pd.read_pickle, None), - (StataUserAgentResponder, pd.read_stata, None), - (GzippedCSVUserAgentResponder, pd.read_csv, None), - (GzippedJSONUserAgentResponder, pd.read_json, None), - ], - indirect=["responder"], -) -def test_server_and_default_headers(responder, read_method, parquet_engine): - if parquet_engine is not None: - pytest.importorskip(parquet_engine) - if parquet_engine == "fastparquet": - pytest.importorskip("fsspec") - - read_method = wait_until_ready(read_method) - if parquet_engine is None: - df_http = read_method(f"http://localhost:{responder}") - else: - df_http = read_method(f"http://localhost:{responder}", engine=parquet_engine) - - assert not df_http.empty - - -@pytest.mark.parametrize( - "responder, read_method, parquet_engine", - [ - (CSVUserAgentResponder, pd.read_csv, None), - (JSONUserAgentResponder, pd.read_json, None), - ( - HTMLUserAgentResponder, - lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0], - None, - ), - (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"), - pytest.param( - ParquetFastParquetUserAgentResponder, - pd.read_parquet, - "fastparquet", - # TODO(ArrayManager) fastparquet - marks=[ - td.skip_array_manager_not_yet_implemented, - ], - ), - (PickleUserAgentResponder, pd.read_pickle, None), - (StataUserAgentResponder, pd.read_stata, None), - (GzippedCSVUserAgentResponder, pd.read_csv, None), - (GzippedJSONUserAgentResponder, pd.read_json, None), - ], - indirect=["responder"], -) -def test_server_and_custom_headers(responder, read_method, parquet_engine): - if parquet_engine is not None: - pytest.importorskip(parquet_engine) - if parquet_engine == "fastparquet": - pytest.importorskip("fsspec") - - custom_user_agent = "Super Cool One" - df_true = pd.DataFrame({"header": [custom_user_agent]}) - - read_method = wait_until_ready(read_method) - if parquet_engine is None: - df_http = read_method( - f"http://localhost:{responder}", - storage_options={"User-Agent": custom_user_agent}, - ) - else: - df_http = read_method( - f"http://localhost:{responder}", - storage_options={"User-Agent": custom_user_agent}, - engine=parquet_engine, - ) - - tm.assert_frame_equal(df_true, df_http) - - -@pytest.mark.parametrize( - "responder, read_method", - [ - (AllHeaderCSVResponder, pd.read_csv), - ], - indirect=["responder"], -) -def test_server_and_all_custom_headers(responder, read_method): - custom_user_agent = "Super Cool One" - custom_auth_token = "Super Secret One" - storage_options = { - 
"User-Agent": custom_user_agent, - "Auth": custom_auth_token, - } - read_method = wait_until_ready(read_method) - df_http = read_method( - f"http://localhost:{responder}", - storage_options=storage_options, - ) - - df_http = df_http[df_http["0"].isin(storage_options.keys())] - df_http = df_http.sort_values(["0"]).reset_index() - df_http = df_http[["0", "1"]] - - keys = list(storage_options.keys()) - df_true = pd.DataFrame({"0": keys, "1": [storage_options[k] for k in keys]}) - df_true = df_true.sort_values(["0"]) - df_true = df_true.reset_index().drop(["index"], axis=1) - - tm.assert_frame_equal(df_true, df_http) - - -@pytest.mark.parametrize( - "engine", - [ - "pyarrow", - "fastparquet", - ], -) -def test_to_parquet_to_disk_with_storage_options(engine): - headers = { - "User-Agent": "custom", - "Auth": "other_custom", - } - - pytest.importorskip(engine) - - true_df = pd.DataFrame({"column_name": ["column_value"]}) - msg = ( - "storage_options passed with file object or non-fsspec file path|" - "storage_options passed with buffer, or non-supported URL" - ) - with pytest.raises(ValueError, match=msg): - true_df.to_parquet("/tmp/junk.parquet", storage_options=headers, engine=engine) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/xml/conftest.py pandas-2.2.2+dfsg/pandas/tests/io/xml/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/io/xml/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/xml/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,9 +1,11 @@ +from pathlib import Path + import pytest @pytest.fixture -def xml_data_path(tests_io_data_path, datapath): - return tests_io_data_path / "xml" +def xml_data_path(): + return Path(__file__).parent.parent / "data" / "xml" @pytest.fixture diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/xml/test_xml.py pandas-2.2.2+dfsg/pandas/tests/io/xml/test_xml.py --- pandas-2.1.4+dfsg/pandas/tests/io/xml/test_xml.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/io/xml/test_xml.py 2024-04-10 17:42:52.000000000 +0000 @@ -32,6 +32,7 @@ ArrowStringArray, StringArray, ) +from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.common import get_handle from pandas.io.xml import read_xml @@ -2004,7 +2005,9 @@ tm.assert_frame_equal(df_lxml, df_etree) -def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend): +def test_read_xml_nullable_dtypes( + parser, string_storage, dtype_backend, using_infer_string +): # GH#50500 data = """ @@ -2032,10 +2035,22 @@ """ - if string_storage == "python": + if using_infer_string: + pa = pytest.importorskip("pyarrow") + string_array = ArrowStringArrayNumpySemantics(pa.array(["x", "y"])) + string_array_na = ArrowStringArrayNumpySemantics(pa.array(["x", None])) + + elif string_storage == "python": string_array = StringArray(np.array(["x", "y"], dtype=np.object_)) string_array_na = StringArray(np.array(["x", NA], dtype=np.object_)) + elif dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + from pandas.arrays import ArrowExtensionArray + + string_array = ArrowExtensionArray(pa.array(["x", "y"])) + string_array_na = ArrowExtensionArray(pa.array(["x", None])) + else: pa = pytest.importorskip("pyarrow") string_array = ArrowStringArray(pa.array(["x", "y"])) diff -Nru pandas-2.1.4+dfsg/pandas/tests/io/xml/test_xml_dtypes.py pandas-2.2.2+dfsg/pandas/tests/io/xml/test_xml_dtypes.py --- pandas-2.1.4+dfsg/pandas/tests/io/xml/test_xml_dtypes.py 2023-12-08 14:17:35.000000000 +0000 +++ 
pandas-2.2.2+dfsg/pandas/tests/io/xml/test_xml_dtypes.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,7 @@ from pandas import ( DataFrame, + DatetimeIndex, Series, to_datetime, ) @@ -146,7 +147,9 @@ "Col1": ["square", "circle", "triangle"], "Col2": Series(["00360", "00360", "00180"]).astype("string"), "Col3": Series([4.0, float("nan"), 3.0]).astype("Int64"), - "Col4": to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + "Col4": DatetimeIndex( + ["2020-01-01", "2021-01-01", "2022-01-01"], dtype="M8[ns]" + ), } ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/libs/test_hashtable.py pandas-2.2.2+dfsg/pandas/tests/libs/test_hashtable.py --- pandas-2.1.4+dfsg/pandas/tests/libs/test_hashtable.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/libs/test_hashtable.py 2024-04-10 17:42:52.000000000 +0000 @@ -586,15 +586,26 @@ expected = (np.arange(N) + N).astype(dtype) values = np.repeat(expected, 5) values.flags.writeable = writable - keys, counts = ht.value_count(values, False) + keys, counts, _ = ht.value_count(values, False) tm.assert_numpy_array_equal(np.sort(keys), expected) assert np.all(counts == 5) + def test_value_count_mask(self, dtype): + if dtype == np.object_: + pytest.skip("mask not implemented for object dtype") + values = np.array([1] * 5, dtype=dtype) + mask = np.zeros((5,), dtype=np.bool_) + mask[1] = True + mask[4] = True + keys, counts, na_counter = ht.value_count(values, False, mask=mask) + assert len(keys) == 2 + assert na_counter == 2 + def test_value_count_stable(self, dtype, writable): # GH12679 values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype) values.flags.writeable = writable - keys, counts = ht.value_count(values, False) + keys, counts, _ = ht.value_count(values, False) tm.assert_numpy_array_equal(keys, values) assert np.all(counts == 1) @@ -633,13 +644,13 @@ values = np.repeat(np.arange(N).astype(dtype), 5) values[0] = 42 values.flags.writeable = writable - result = ht.mode(values, False) + result = ht.mode(values, False)[0] assert result == 42 def test_mode_stable(self, dtype, writable): values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype) values.flags.writeable = writable - keys = ht.mode(values, False) + keys = ht.mode(values, False)[0] tm.assert_numpy_array_equal(keys, values) @@ -647,7 +658,7 @@ # GH42688, nans aren't mangled nulls = [pd.NA, np.nan, pd.NaT, None] values = np.array([True] + nulls * 2, dtype=np.object_) - modes = ht.mode(values, False) + modes = ht.mode(values, False)[0] assert modes.size == len(nulls) @@ -685,9 +696,9 @@ class TestHelpFunctionsWithNans: def test_value_count(self, dtype): values = np.array([np.nan, np.nan, np.nan], dtype=dtype) - keys, counts = ht.value_count(values, True) + keys, counts, _ = ht.value_count(values, True) assert len(keys) == 0 - keys, counts = ht.value_count(values, False) + keys, counts, _ = ht.value_count(values, False) assert len(keys) == 1 and np.all(np.isnan(keys)) assert counts[0] == 3 @@ -713,8 +724,8 @@ def test_mode(self, dtype): values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype) - assert ht.mode(values, True) == 42 - assert np.isnan(ht.mode(values, False)) + assert ht.mode(values, True)[0] == 42 + assert np.isnan(ht.mode(values, False)[0]) def test_ismember_tuple_with_nans(): diff -Nru pandas-2.1.4+dfsg/pandas/tests/libs/test_libalgos.py pandas-2.2.2+dfsg/pandas/tests/libs/test_libalgos.py --- pandas-2.1.4+dfsg/pandas/tests/libs/test_libalgos.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/libs/test_libalgos.py 2024-04-10 
17:42:52.000000000 +0000 @@ -0,0 +1,162 @@ +from datetime import datetime +from itertools import permutations + +import numpy as np + +from pandas._libs import algos as libalgos + +import pandas._testing as tm + + +def test_ensure_platform_int(): + arr = np.arange(100, dtype=np.intp) + + result = libalgos.ensure_platform_int(arr) + assert result is arr + + +def test_is_lexsorted(): + failure = [ + np.array( + ([3] * 32) + ([2] * 32) + ([1] * 32) + ([0] * 32), + dtype="int64", + ), + np.array( + list(range(31))[::-1] * 4, + dtype="int64", + ), + ] + + assert not libalgos.is_lexsorted(failure) + + +def test_groupsort_indexer(): + a = np.random.default_rng(2).integers(0, 1000, 100).astype(np.intp) + b = np.random.default_rng(2).integers(0, 1000, 100).astype(np.intp) + + result = libalgos.groupsort_indexer(a, 1000)[0] + + # need to use a stable sort + # np.argsort returns int, groupsort_indexer + # always returns intp + expected = np.argsort(a, kind="mergesort") + expected = expected.astype(np.intp) + + tm.assert_numpy_array_equal(result, expected) + + # compare with lexsort + # np.lexsort returns int, groupsort_indexer + # always returns intp + key = a * 1000 + b + result = libalgos.groupsort_indexer(key, 1000000)[0] + expected = np.lexsort((b, a)) + expected = expected.astype(np.intp) + + tm.assert_numpy_array_equal(result, expected) + + +class TestPadBackfill: + def test_backfill(self): + old = np.array([1, 5, 10], dtype=np.int64) + new = np.array(list(range(12)), dtype=np.int64) + + filler = libalgos.backfill["int64_t"](old, new) + + expect_filler = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(filler, expect_filler) + + # corner case + old = np.array([1, 4], dtype=np.int64) + new = np.array(list(range(5, 10)), dtype=np.int64) + filler = libalgos.backfill["int64_t"](old, new) + + expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(filler, expect_filler) + + def test_pad(self): + old = np.array([1, 5, 10], dtype=np.int64) + new = np.array(list(range(12)), dtype=np.int64) + + filler = libalgos.pad["int64_t"](old, new) + + expect_filler = np.array([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=np.intp) + tm.assert_numpy_array_equal(filler, expect_filler) + + # corner case + old = np.array([5, 10], dtype=np.int64) + new = np.arange(5, dtype=np.int64) + filler = libalgos.pad["int64_t"](old, new) + expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.intp) + tm.assert_numpy_array_equal(filler, expect_filler) + + def test_pad_backfill_object_segfault(self): + old = np.array([], dtype="O") + new = np.array([datetime(2010, 12, 31)], dtype="O") + + result = libalgos.pad["object"](old, new) + expected = np.array([-1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + result = libalgos.pad["object"](new, old) + expected = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + result = libalgos.backfill["object"](old, new) + expected = np.array([-1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + result = libalgos.backfill["object"](new, old) + expected = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + +class TestInfinity: + def test_infinity_sort(self): + # GH#13445 + # numpy's argsort can be unhappy if something is less than + # itself. Instead, let's give our infinities a self-consistent + # ordering, but outside the float extended real line. 
+ + Inf = libalgos.Infinity() + NegInf = libalgos.NegInfinity() + + ref_nums = [NegInf, float("-inf"), -1e100, 0, 1e100, float("inf"), Inf] + + assert all(Inf >= x for x in ref_nums) + assert all(Inf > x or x is Inf for x in ref_nums) + assert Inf >= Inf and Inf == Inf + assert not Inf < Inf and not Inf > Inf + assert libalgos.Infinity() == libalgos.Infinity() + assert not libalgos.Infinity() != libalgos.Infinity() + + assert all(NegInf <= x for x in ref_nums) + assert all(NegInf < x or x is NegInf for x in ref_nums) + assert NegInf <= NegInf and NegInf == NegInf + assert not NegInf < NegInf and not NegInf > NegInf + assert libalgos.NegInfinity() == libalgos.NegInfinity() + assert not libalgos.NegInfinity() != libalgos.NegInfinity() + + for perm in permutations(ref_nums): + assert sorted(perm) == ref_nums + + # smoke tests + np.array([libalgos.Infinity()] * 32).argsort() + np.array([libalgos.NegInfinity()] * 32).argsort() + + def test_infinity_against_nan(self): + Inf = libalgos.Infinity() + NegInf = libalgos.NegInfinity() + + assert not Inf > np.nan + assert not Inf >= np.nan + assert not Inf < np.nan + assert not Inf <= np.nan + assert not Inf == np.nan + assert Inf != np.nan + + assert not NegInf > np.nan + assert not NegInf >= np.nan + assert not NegInf < np.nan + assert not NegInf <= np.nan + assert not NegInf == np.nan + assert NegInf != np.nan diff -Nru pandas-2.1.4+dfsg/pandas/tests/plotting/common.py pandas-2.2.2+dfsg/pandas/tests/plotting/common.py --- pandas-2.1.4+dfsg/pandas/tests/plotting/common.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/plotting/common.py 2024-04-10 17:42:52.000000000 +0000 @@ -328,7 +328,7 @@ ) -def _flatten_visible(axes): +def _flatten_visible(axes: Axes | Sequence[Axes]) -> Sequence[Axes]: """ Flatten axes, and filter only visible @@ -339,8 +339,8 @@ """ from pandas.plotting._matplotlib.tools import flatten_axes - axes = flatten_axes(axes) - axes = [ax for ax in axes if ax.get_visible()] + axes_ndarray = flatten_axes(axes) + axes = [ax for ax in axes_ndarray if ax.get_visible()] return axes @@ -535,9 +535,6 @@ for ret in gen_plots(f, fig, **kwargs): tm.assert_is_valid_plot_return_object(ret) - with tm.ensure_clean(return_filelike=True) as path: - plt.savefig(path) - finally: plt.close(fig) diff -Nru pandas-2.1.4+dfsg/pandas/tests/plotting/frame/test_frame.py pandas-2.2.2+dfsg/pandas/tests/plotting/frame/test_frame.py --- pandas-2.1.4+dfsg/pandas/tests/plotting/frame/test_frame.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/plotting/frame/test_frame.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,16 +12,20 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.api import is_list_like import pandas as pd from pandas import ( DataFrame, + Index, MultiIndex, PeriodIndex, Series, bdate_range, date_range, + option_context, plotting, ) import pandas._testing as tm @@ -50,19 +54,31 @@ class TestDataFramePlots: @pytest.mark.slow def test_plot(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) _check_plot_works(df.plot, grid=False) @pytest.mark.slow def test_plot_subplots(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) # _check_plot_works 
adds an ax so use default_axes=True to avoid warning axes = _check_plot_works(df.plot, default_axes=True, subplots=True) _check_axes_shape(axes, axes_num=4, layout=(4, 1)) @pytest.mark.slow def test_plot_subplots_negative_layout(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) axes = _check_plot_works( df.plot, default_axes=True, @@ -73,7 +89,11 @@ @pytest.mark.slow def test_plot_subplots_use_index(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) axes = _check_plot_works( df.plot, default_axes=True, @@ -283,7 +303,11 @@ def test_plot_xy(self): # columns.inferred_type == 'string' - df = tm.makeTimeDataFrame(5) + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=5, freq="B"), + ) _check_data(df.plot(x=0, y=1), df.set_index("A")["B"].plot()) _check_data(df.plot(x=0), df.set_index("A").plot()) _check_data(df.plot(y=0), df.B.plot()) @@ -292,7 +316,11 @@ _check_data(df.plot(y="B"), df.B.plot()) def test_plot_xy_int_cols(self): - df = tm.makeTimeDataFrame(5) + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=5, freq="B"), + ) # columns.inferred_type == 'integer' df.columns = np.arange(1, len(df.columns) + 1) _check_data(df.plot(x=1, y=2), df.set_index(1)[2].plot()) @@ -300,7 +328,11 @@ _check_data(df.plot(y=1), df[1].plot()) def test_plot_xy_figsize_and_title(self): - df = tm.makeTimeDataFrame(5) + df = DataFrame( + np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=5, freq="B"), + ) # figsize and title ax = df.plot(x=1, y=2, title="Test", figsize=(16, 8)) _check_text_labels(ax.title, "Test") @@ -333,19 +365,31 @@ # GH: 24867 df = DataFrame({"a": np.arange(100)}, index=np.arange(100)) - msg = "Boolean, None and 'sym' are valid options, 'sm' is given." 
+ msg = f"keyword '{input_param}' should be bool, None, or 'sym', not 'sm'" with pytest.raises(ValueError, match=msg): df.plot(**{input_param: "sm"}) + msg = f"PiePlot ignores the '{input_param}' keyword" + with tm.assert_produces_warning(UserWarning, match=msg): + df.plot.pie(subplots=True, **{input_param: True}) + def test_xcompat(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) ax = df.plot(x_compat=True) lines = ax.get_lines() assert not isinstance(lines[0].get_xdata(), PeriodIndex) _check_ticks_props(ax, xrot=30) def test_xcompat_plot_params(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) plotting.plot_params["xaxis.compat"] = True ax = df.plot() lines = ax.get_lines() @@ -353,7 +397,11 @@ _check_ticks_props(ax, xrot=30) def test_xcompat_plot_params_x_compat(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) plotting.plot_params["x_compat"] = False ax = df.plot() @@ -364,7 +412,11 @@ assert isinstance(PeriodIndex(lines[0].get_xdata()), PeriodIndex) def test_xcompat_plot_params_context_manager(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) # useful if you're plotting a bunch together with plotting.plot_params.use("x_compat", True): ax = df.plot() @@ -373,7 +425,11 @@ _check_ticks_props(ax, xrot=30) def test_xcompat_plot_period(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) ax = df.plot() lines = ax.get_lines() assert not isinstance(lines[0].get_xdata(), PeriodIndex) @@ -398,7 +454,7 @@ def test_unsorted_index(self, index_dtype): df = DataFrame( {"y": np.arange(100)}, - index=pd.Index(np.arange(99, -1, -1), dtype=index_dtype), + index=Index(np.arange(99, -1, -1), dtype=index_dtype), dtype=np.int64, ) ax = df.plot() @@ -716,7 +772,7 @@ expected = [0.0, 0.0, 0.0, 10.0, 0.0, 20.0, 15.0, 10.0, 40.0] assert result == expected - @pytest.mark.parametrize("idx", [pd.Index, pd.CategoricalIndex]) + @pytest.mark.parametrize("idx", [Index, pd.CategoricalIndex]) def test_bar_categorical(self, idx): # GH 13019 df = DataFrame( @@ -790,13 +846,17 @@ _check_plot_works(df.plot.scatter, x=x, y=y) + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) @pytest.mark.parametrize("x, y", [("a", "b"), (0, 1)]) @pytest.mark.parametrize("b_col", [[2, 3, 4], ["a", "b", "c"]]) - def test_scatterplot_object_data(self, b_col, x, y): + def test_scatterplot_object_data(self, b_col, x, y, infer_string): # GH 18755 - df = DataFrame({"a": ["A", "B", "C"], "b": b_col}) + with option_context("future.infer_string", infer_string): + df = DataFrame({"a": ["A", "B", "C"], "b": b_col}) - _check_plot_works(df.plot.scatter, x=x, y=y) + _check_plot_works(df.plot.scatter, x=x, y=y) @pytest.mark.parametrize("ordered", [True, False]) @pytest.mark.parametrize( @@ -1362,16 
+1422,27 @@ assert result[expected][0].get_color() == "C1" def test_unordered_ts(self): + # GH#2609, GH#55906 + index = [date(2012, 10, 1), date(2012, 9, 1), date(2012, 8, 1)] + values = [3.0, 2.0, 1.0] df = DataFrame( - np.array([3.0, 2.0, 1.0]), - index=[date(2012, 10, 1), date(2012, 9, 1), date(2012, 8, 1)], + np.array(values), + index=index, columns=["test"], ) ax = df.plot() xticks = ax.lines[0].get_xdata() - assert xticks[0] < xticks[1] + tm.assert_numpy_array_equal(xticks, np.array(index, dtype=object)) ydata = ax.lines[0].get_ydata() - tm.assert_numpy_array_equal(ydata, np.array([1.0, 2.0, 3.0])) + tm.assert_numpy_array_equal(ydata, np.array(values)) + + # even though we don't sort the data before passing it to matplotlib, + # the ticks are sorted + xticks = ax.xaxis.get_ticklabels() + xlocs = [x.get_position()[0] for x in xticks] + assert Index(xlocs).is_monotonic_increasing + xlabels = [x.get_text() for x in xticks] + assert pd.to_datetime(xlabels, format="%Y-%m-%d").is_monotonic_increasing @pytest.mark.parametrize("kind", plotting.PlotAccessor._common_kinds) def test_kind_both_ways(self, kind): @@ -1759,7 +1830,7 @@ df_err = DataFrame( np.abs(np.random.default_rng(2).standard_normal((10, 2))), columns=[0, 2] ) - ix = date_range("1/1/2000", periods=10, freq="M") + ix = date_range("1/1/2000", periods=10, freq="ME") df.set_index(ix, inplace=True) df_err.set_index(ix, inplace=True) ax = _check_plot_works(df.plot, yerr=df_err, kind="line") @@ -1780,7 +1851,7 @@ d_err = {"x": np.ones(12) * 0.2, "y": np.ones(12) * 0.4} # check time-series plots - ix = date_range("1/1/2000", "1/1/2001", freq="M") + ix = date_range("1/1/2000", "1/1/2001", freq="ME") tdf = DataFrame(d, index=ix) tdf_err = DataFrame(d_err, index=ix) @@ -2040,9 +2111,17 @@ ) args = {"x": "A", "y": "B"} elif kind == "area": - df = tm.makeTimeDataFrame().abs() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ).abs() else: - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) # Use a weakref so we can see if the object gets collected without # also preventing it from being collected @@ -2491,7 +2570,11 @@ def test_plot_no_warning(self): # GH 55138 # TODO(3.0): this can be removed once Period[B] deprecation is enforced - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) with tm.assert_produces_warning(False): _ = df.plot() _ = df.T.plot() diff -Nru pandas-2.1.4+dfsg/pandas/tests/plotting/frame/test_frame_legend.py pandas-2.2.2+dfsg/pandas/tests/plotting/frame/test_frame_legend.py --- pandas-2.1.4+dfsg/pandas/tests/plotting/frame/test_frame_legend.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/plotting/frame/test_frame_legend.py 2024-04-10 17:42:52.000000000 +0000 @@ -231,7 +231,7 @@ "line", "bar", "barh", - pytest.param("kde", marks=td.skip_if_no_scipy), + pytest.param("kde", marks=td.skip_if_no("scipy")), "area", "hist", ], diff -Nru pandas-2.1.4+dfsg/pandas/tests/plotting/frame/test_frame_subplots.py pandas-2.2.2+dfsg/pandas/tests/plotting/frame/test_frame_subplots.py --- pandas-2.1.4+dfsg/pandas/tests/plotting/frame/test_frame_subplots.py 2023-12-08 14:17:35.000000000 +0000 
+++ pandas-2.2.2+dfsg/pandas/tests/plotting/frame/test_frame_subplots.py 2024-04-10 17:42:52.000000000 +0000 @@ -89,7 +89,7 @@ @pytest.mark.parametrize("kind", ["line", "area"]) def test_subplots_timeseries(self, kind): - idx = date_range(start="2014-07-01", freq="M", periods=10) + idx = date_range(start="2014-07-01", freq="ME", periods=10) df = DataFrame(np.random.default_rng(2).random((10, 3)), index=idx) axes = df.plot(kind=kind, subplots=True, sharex=True) @@ -112,7 +112,7 @@ @pytest.mark.parametrize("kind", ["line", "area"]) def test_subplots_timeseries_rot(self, kind): - idx = date_range(start="2014-07-01", freq="M", periods=10) + idx = date_range(start="2014-07-01", freq="ME", periods=10) df = DataFrame(np.random.default_rng(2).random((10, 3)), index=idx) axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45, fontsize=7) for ax in axes: @@ -361,7 +361,7 @@ mpl.pyplot.subplots_adjust(left=0.05, right=0.95, hspace=0.3, wspace=0.3) df = DataFrame( np.random.default_rng(2).standard_normal((10, 9)), - index=date_range(start="2014-07-01", freq="M", periods=10), + index=date_range(start="2014-07-01", freq="ME", periods=10), ) for i, ax in enumerate(axes.ravel()): df[i].plot(ax=ax, fontsize=5) diff -Nru pandas-2.1.4+dfsg/pandas/tests/plotting/test_backend.py pandas-2.2.2+dfsg/pandas/tests/plotting/test_backend.py --- pandas-2.1.4+dfsg/pandas/tests/plotting/test_backend.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/plotting/test_backend.py 2024-04-10 17:42:52.000000000 +0000 @@ -80,7 +80,7 @@ assert pandas.options.plotting.backend == "matplotlib" -@td.skip_if_mpl +@td.skip_if_installed("matplotlib") def test_no_matplotlib_ok(): msg = ( 'matplotlib is required for plotting when the default backend "matplotlib" is ' diff -Nru pandas-2.1.4+dfsg/pandas/tests/plotting/test_boxplot_method.py pandas-2.2.2+dfsg/pandas/tests/plotting/test_boxplot_method.py --- pandas-2.1.4+dfsg/pandas/tests/plotting/test_boxplot_method.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/plotting/test_boxplot_method.py 2024-04-10 17:42:52.000000000 +0000 @@ -330,6 +330,22 @@ assert ax.get_ylabel() == ylabel @pytest.mark.parametrize("vert", [True, False]) + def test_plot_box(self, vert): + # GH 54941 + rng = np.random.default_rng(2) + df1 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) + df2 = DataFrame(rng.integers(0, 100, size=(100, 4)), columns=list("ABCD")) + + xlabel, ylabel = "x", "y" + _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) + df1.plot.box(ax=axs[0], vert=vert, xlabel=xlabel, ylabel=ylabel) + df2.plot.box(ax=axs[1], vert=vert, xlabel=xlabel, ylabel=ylabel) + for ax in axs: + assert ax.get_xlabel() == xlabel + assert ax.get_ylabel() == ylabel + mpl.pyplot.close() + + @pytest.mark.parametrize("vert", [True, False]) def test_boxplot_xlabel_ylabel(self, vert): df = DataFrame( { diff -Nru pandas-2.1.4+dfsg/pandas/tests/plotting/test_converter.py pandas-2.2.2+dfsg/pandas/tests/plotting/test_converter.py --- pandas-2.1.4+dfsg/pandas/tests/plotting/test_converter.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/plotting/test_converter.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,6 +10,8 @@ import pandas._config.config as cf +from pandas._libs.tslibs import to_offset + from pandas import ( Index, Period, @@ -255,10 +257,10 @@ result = converter.TimeFormatter(None)(time) assert result == format_expected - @pytest.mark.parametrize("freq", ("B", "L", "S")) + @pytest.mark.parametrize("freq", ("B", 
"ms", "s")) def test_dateindex_conversion(self, freq, dtc): rtol = 10**-9 - dateindex = tm.makeDateIndex(k=10, freq=freq) + dateindex = date_range("2020-01-01", periods=10, freq=freq) rs = dtc.convert(dateindex, None, None) xp = converter.mdates.date2num(dateindex._mpl_repr()) tm.assert_almost_equal(rs, xp, rtol=rtol) @@ -390,7 +392,7 @@ pytest.skip("the quarterly finder is only invoked if the span is >= 45") nyears = span / 4 (min_anndef, maj_anndef) = converter._get_default_annual_spacing(nyears) - result = converter._quarterly_finder(vmin, vmax, "Q") + result = converter._quarterly_finder(vmin, vmax, to_offset("QE")) quarters = PeriodIndex( arrays.PeriodArray(np.array([x[0] for x in result]), dtype="period[Q]") ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/plotting/test_datetimelike.py pandas-2.2.2+dfsg/pandas/tests/plotting/test_datetimelike.py --- pandas-2.1.4+dfsg/pandas/tests/plotting/test_datetimelike.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/plotting/test_datetimelike.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,6 +14,7 @@ BaseOffset, to_offset, ) +from pandas._libs.tslibs.dtypes import freq_to_period_freqstr from pandas import ( DataFrame, @@ -48,7 +49,7 @@ def test_ts_plot_with_tz(self, tz_aware_fixture): # GH2877, GH17173, GH31205, GH31580 tz = tz_aware_fixture - index = date_range("1/1/2011", periods=2, freq="H", tz=tz) + index = date_range("1/1/2011", periods=2, freq="h", tz=tz) ts = Series([188.5, 328.25], index=index) _check_plot_works(ts.plot) ax = ts.plot() @@ -86,7 +87,7 @@ def test_frame_inferred_n_gt_1(self): # N > 1 - idx = date_range("2008-1-1 00:15:00", freq="15T", periods=10) + idx = date_range("2008-1-1 00:15:00", freq="15min", periods=10) idx = DatetimeIndex(idx.values, freq=None) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx @@ -101,7 +102,7 @@ _check_plot_works(a.plot, yerr=a) def test_nonnumeric_exclude(self): - idx = date_range("1/1/1987", freq="A", periods=3) + idx = date_range("1/1/1987", freq="YE", periods=3) df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}, idx) fig, ax = mpl.pyplot.subplots() @@ -110,13 +111,13 @@ mpl.pyplot.close(fig) def test_nonnumeric_exclude_error(self): - idx = date_range("1/1/1987", freq="A", periods=3) + idx = date_range("1/1/1987", freq="YE", periods=3) df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}, idx) msg = "no numeric data to plot" with pytest.raises(TypeError, match=msg): df["A"].plot() - @pytest.mark.parametrize("freq", ["S", "T", "H", "D", "W", "M", "Q", "A"]) + @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_tsplot_period(self, freq): idx = period_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) @@ -124,7 +125,7 @@ _check_plot_works(ser.plot, ax=ax) @pytest.mark.parametrize( - "freq", ["S", "T", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_tsplot_datetime(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -133,14 +134,18 @@ _check_plot_works(ser.plot, ax=ax) def test_tsplot(self): - ts = tm.makeTimeSeries() + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) _, ax = mpl.pyplot.subplots() ts.plot(style="k", ax=ax) color = (0.0, 0.0, 0.0, 1) assert color == ax.get_lines()[0].get_color() def test_both_style_and_color(self): - ts = tm.makeTimeSeries() + ts = Series( + np.arange(10, dtype=np.float64), 
index=date_range("2020-01-01", periods=10) + ) msg = ( "Cannot pass 'style' string with a color symbol and 'color' " "keyword argument. Please use one or the other or pass 'style' " @@ -164,8 +169,8 @@ from pandas.plotting._matplotlib.converter import get_datevalue assert get_datevalue(None, "D") is None - assert get_datevalue(1987, "A") == 1987 - assert get_datevalue(Period(1987, "A"), "M") == Period("1987-12", "M").ordinal + assert get_datevalue(1987, "Y") == 1987 + assert get_datevalue(Period(1987, "Y"), "M") == Period("1987-12", "M").ordinal assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal def test_ts_plot_format_coord(self): @@ -175,7 +180,7 @@ first_y = first_line.get_ydata()[0] assert expected_string == ax.format_coord(first_x, first_y) - annual = Series(1, index=date_range("2014-01-01", periods=3, freq="A-DEC")) + annual = Series(1, index=date_range("2014-01-01", periods=3, freq="YE-DEC")) _, ax = mpl.pyplot.subplots() annual.plot(ax=ax) check_format_of_first_point(ax, "t = 2014 y = 1.000000") @@ -186,14 +191,14 @@ daily.plot(ax=ax) check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") - @pytest.mark.parametrize("freq", ["S", "T", "H", "D", "W", "M", "Q", "A"]) + @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"]) def test_line_plot_period_series(self, freq): idx = period_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq) @pytest.mark.parametrize( - "frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"] + "frqncy", ["1s", "3s", "5min", "7h", "4D", "8W", "11M", "3Y"] ) def test_line_plot_period_mlt_series(self, frqncy): # test period index line plot for series with multiples (`mlt`) of the @@ -203,14 +208,14 @@ _check_plot_works(s.plot, s.index.freq.rule_code) @pytest.mark.parametrize( - "freq", ["S", "T", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_datetime_series(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx) _check_plot_works(ser.plot, ser.index.freq.rule_code) - @pytest.mark.parametrize("freq", ["S", "T", "H", "D", "W", "M", "Q", "A"]) + @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "ME", "QE", "YE"]) def test_line_plot_period_frame(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) df = DataFrame( @@ -221,7 +226,7 @@ _check_plot_works(df.plot, df.index.freq) @pytest.mark.parametrize( - "frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"] + "frqncy", ["1s", "3s", "5min", "7h", "4D", "8W", "11M", "3Y"] ) def test_line_plot_period_mlt_frame(self, frqncy): # test period index line plot for DataFrames with multiples (`mlt`) @@ -233,12 +238,13 @@ index=idx, columns=["A", "B", "C"], ) - freq = df.index.asfreq(df.index.freq.rule_code).freq + freq = freq_to_period_freqstr(1, df.index.freq.rule_code) + freq = df.index.asfreq(freq).freq _check_plot_works(df.plot, freq) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( - "freq", ["S", "T", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_datetime_frame(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -247,11 +253,12 @@ index=idx, columns=["A", "B", "C"], ) - freq = 
df.index.to_period(df.index.freq.rule_code).freq + freq = freq_to_period_freqstr(1, df.index.freq.rule_code) + freq = df.index.to_period(freq).freq _check_plot_works(df.plot, freq) @pytest.mark.parametrize( - "freq", ["S", "T", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"] ) def test_line_plot_inferred_freq(self, freq): idx = date_range("12/31/1999", freq=freq, periods=100) @@ -271,7 +278,9 @@ assert not hasattr(ax, "freq") def test_plot_offset_freq(self): - ser = tm.makeTimeSeries() + ser = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) _check_plot_works(ser.plot) def test_plot_offset_freq_business(self): @@ -288,7 +297,7 @@ def test_uhf(self): import pandas.plotting._matplotlib.converter as conv - idx = date_range("2012-6-22 21:59:51.960928", freq="L", periods=500) + idx = date_range("2012-6-22 21:59:51.960928", freq="ms", periods=500) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 2)), index=idx ) @@ -306,7 +315,7 @@ assert xp == rs def test_irreg_hf(self): - idx = date_range("2012-6-22 21:59:51", freq="S", periods=10) + idx = date_range("2012-6-22 21:59:51", freq="s", periods=10) df = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 2)), index=idx ) @@ -320,7 +329,7 @@ assert (np.fabs(diffs[1:] - [sec, sec * 2, sec]) < 1e-8).all() def test_irreg_hf_object(self): - idx = date_range("2012-6-22 21:59:51", freq="S", periods=10) + idx = date_range("2012-6-22 21:59:51", freq="s", periods=10) df2 = DataFrame( np.random.default_rng(2).standard_normal((len(idx), 2)), index=idx ) @@ -332,7 +341,9 @@ assert (np.fabs(diffs[1:] - sec) < 1e-8).all() def test_irregular_datetime64_repr_bug(self): - ser = tm.makeTimeSeries() + ser = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) ser = ser.iloc[[0, 1, 2, 7]] _, ax = mpl.pyplot.subplots() @@ -344,7 +355,7 @@ assert rs == xp def test_business_freq(self): - bts = tm.makePeriodSeries() + bts = Series(range(5), period_range("2020-01-01", periods=5)) msg = r"PeriodDtype\[B\] is deprecated" dt = bts.index[0].to_timestamp() with tm.assert_produces_warning(FutureWarning, match=msg): @@ -357,7 +368,10 @@ assert PeriodIndex(data=idx).freqstr == "B" def test_business_freq_convert(self): - bts = tm.makeTimeSeries(300).asfreq("BM") + bts = Series( + np.arange(300, dtype=np.float64), + index=date_range("2020-01-01", periods=300, freq="B"), + ).asfreq("BME") ts = bts.to_period("M") _, ax = mpl.pyplot.subplots() bts.plot(ax=ax) @@ -368,7 +382,9 @@ def test_freq_with_no_period_alias(self): # GH34487 freq = WeekOfMonth() - bts = tm.makeTimeSeries(5).asfreq(freq) + bts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ).asfreq(freq) _, ax = mpl.pyplot.subplots() bts.plot(ax=ax) @@ -379,7 +395,7 @@ def test_nonzero_base(self): # GH2571 - idx = date_range("2012-12-20", periods=24, freq="H") + timedelta(minutes=30) + idx = date_range("2012-12-20", periods=24, freq="h") + timedelta(minutes=30) df = DataFrame(np.arange(24), index=idx) _, ax = mpl.pyplot.subplots() df.plot(ax=ax) @@ -387,13 +403,18 @@ assert not Index(rs).is_normalized def test_dataframe(self): - bts = DataFrame({"a": tm.makeTimeSeries()}) + bts = DataFrame( + { + "a": Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + ) + } + ) _, ax = mpl.pyplot.subplots() bts.plot(ax=ax) idx = ax.get_lines()[0].get_xdata() - msg = r"PeriodDtype\[B\] is 
deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_index_equal(bts.index.to_period(), PeriodIndex(idx)) + tm.assert_index_equal(bts.index.to_period(), PeriodIndex(idx)) @pytest.mark.filterwarnings( "ignore:Period with BDay freq is deprecated:FutureWarning" @@ -401,8 +422,23 @@ @pytest.mark.parametrize( "obj", [ - tm.makeTimeSeries(), - DataFrame({"a": tm.makeTimeSeries(), "b": tm.makeTimeSeries() + 1}), + Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + ), + DataFrame( + { + "a": Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + ), + "b": Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + ) + + 1, + } + ), ], ) def test_axis_limits(self, obj): @@ -435,9 +471,9 @@ assert conv.get_finder(to_offset("B")) == conv._daily_finder assert conv.get_finder(to_offset("D")) == conv._daily_finder - assert conv.get_finder(to_offset("M")) == conv._monthly_finder - assert conv.get_finder(to_offset("Q")) == conv._quarterly_finder - assert conv.get_finder(to_offset("A")) == conv._annual_finder + assert conv.get_finder(to_offset("ME")) == conv._monthly_finder + assert conv.get_finder(to_offset("QE")) == conv._quarterly_finder + assert conv.get_finder(to_offset("YE")) == conv._annual_finder assert conv.get_finder(to_offset("W")) == conv._daily_finder def test_finder_daily(self): @@ -520,10 +556,10 @@ def test_finder_annual(self): xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] - xp = [Period(x, freq="A").ordinal for x in xp] + xp = [Period(x, freq="Y").ordinal for x in xp] rs = [] for nyears in [5, 10, 19, 49, 99, 199, 599, 1001]: - rng = period_range("1987", periods=nyears, freq="A") + rng = period_range("1987", periods=nyears, freq="Y") ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() ser.plot(ax=ax) @@ -548,18 +584,20 @@ def test_finder_hourly(self): nhours = 23 - rng = date_range("1/1/1999", freq="H", periods=nhours) + rng = date_range("1/1/1999", freq="h", periods=nhours) ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) _, ax = mpl.pyplot.subplots() ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] - xp = Period("1/1/1999", freq="H").ordinal + xp = Period("1/1/1999", freq="h").ordinal assert rs == xp def test_gaps(self): - ts = tm.makeTimeSeries() + ts = Series( + np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + ) ts.iloc[5:25] = np.nan _, ax = mpl.pyplot.subplots() ts.plot(ax=ax) @@ -577,7 +615,9 @@ def test_gaps_irregular(self): # irregular - ts = tm.makeTimeSeries() + ts = Series( + np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + ) ts = ts.iloc[[0, 1, 2, 5, 7, 9, 12, 15, 20]] ts.iloc[2:5] = np.nan _, ax = mpl.pyplot.subplots() @@ -612,7 +652,9 @@ assert mask[2:5, 1].all() def test_gap_upsample(self): - low = tm.makeTimeSeries() + low = Series( + np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + ) low.iloc[5:25] = np.nan _, ax = mpl.pyplot.subplots() low.plot(ax=ax) @@ -731,7 +773,10 @@ def test_mixed_freq_regular_first(self): # TODO - s1 = tm.makeTimeSeries() + s1 = Series( + np.arange(20, dtype=np.float64), + index=date_range("2020-01-01", periods=20, freq="B"), + ) s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15]] # it works! 
@@ -754,7 +799,9 @@ assert right >= pidx[-1].ordinal def test_mixed_freq_irregular_first(self): - s1 = tm.makeTimeSeries() + s1 = Series( + np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) + ) s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15]] _, ax = mpl.pyplot.subplots() s2.plot(style="g", ax=ax) @@ -768,7 +815,10 @@ def test_mixed_freq_regular_first_df(self): # GH 9852 - s1 = tm.makeTimeSeries().to_frame() + s1 = Series( + np.arange(20, dtype=np.float64), + index=date_range("2020-01-01", periods=20, freq="B"), + ).to_frame() s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] _, ax = mpl.pyplot.subplots() s1.plot(ax=ax) @@ -787,7 +837,9 @@ def test_mixed_freq_irregular_first_df(self): # GH 9852 - s1 = tm.makeTimeSeries().to_frame() + s1 = Series( + np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) + ).to_frame() s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] _, ax = mpl.pyplot.subplots() s2.plot(style="g", ax=ax) @@ -801,7 +853,7 @@ def test_mixed_freq_hf_first(self): idxh = date_range("1/1/1999", periods=365, freq="D") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -811,11 +863,11 @@ assert PeriodIndex(data=line.get_xdata()).freq == "D" def test_mixed_freq_alignment(self): - ts_ind = date_range("2012-01-01 13:00", "2012-01-02", freq="H") + ts_ind = date_range("2012-01-01 13:00", "2012-01-02", freq="h") ts_data = np.random.default_rng(2).standard_normal(12) ts = Series(ts_data, index=ts_ind) - ts2 = ts.asfreq("T").interpolate() + ts2 = ts.asfreq("min").interpolate() _, ax = mpl.pyplot.subplots() ax = ts.plot(ax=ax) @@ -825,7 +877,7 @@ def test_mixed_freq_lf_first(self): idxh = date_range("1/1/1999", periods=365, freq="D") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -838,19 +890,21 @@ mpl.pyplot.close(ax.get_figure()) def test_mixed_freq_lf_first_hourly(self): - idxh = date_range("1/1/1999", periods=240, freq="T") - idxl = date_range("1/1/1999", periods=4, freq="H") + idxh = date_range("1/1/1999", periods=240, freq="min") + idxl = date_range("1/1/1999", periods=4, freq="h") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() low.plot(ax=ax) high.plot(ax=ax) for line in ax.get_lines(): - assert PeriodIndex(data=line.get_xdata()).freq == "T" + assert PeriodIndex(data=line.get_xdata()).freq == "min" @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_mixed_freq_irreg_period(self): - ts = tm.makeTimeSeries() + ts = Series( + np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30) + ) irreg = ts.iloc[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]] msg = r"PeriodDtype\[B\] is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -862,7 +916,7 @@ def test_mixed_freq_shared_ax(self): # GH13341, using sharex=True - idx1 = date_range("2015-01-01", periods=3, freq="M") + idx1 = date_range("2015-01-01", periods=3, freq="ME") idx2 = idx1[:1].union(idx1[2:]) s1 = 
Series(range(len(idx1)), idx1) s2 = Series(range(len(idx2)), idx2) @@ -877,7 +931,7 @@ def test_mixed_freq_shared_ax_twin_x(self): # GH13341, using sharex=True - idx1 = date_range("2015-01-01", periods=3, freq="M") + idx1 = date_range("2015-01-01", periods=3, freq="ME") idx2 = idx1[:1].union(idx1[2:]) s1 = Series(range(len(idx1)), idx1) s2 = Series(range(len(idx2)), idx2) @@ -913,9 +967,24 @@ assert s.index.min() <= Series(xdata).min() assert Series(xdata).max() <= s.index.max() + def test_to_weekly_resampling_disallow_how_kwd(self): + idxh = date_range("1/1/1999", periods=52, freq="W") + idxl = date_range("1/1/1999", periods=12, freq="ME") + high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) + low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) + _, ax = mpl.pyplot.subplots() + high.plot(ax=ax) + + msg = ( + "'how' is not a valid keyword for plotting functions. If plotting " + "multiple objects on shared axes, resample manually first." + ) + with pytest.raises(ValueError, match=msg): + low.plot(ax=ax, how="foo") + def test_to_weekly_resampling(self): idxh = date_range("1/1/1999", periods=52, freq="W") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -926,7 +995,7 @@ def test_from_weekly_resampling(self): idxh = date_range("1/1/1999", periods=52, freq="W") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -949,7 +1018,7 @@ @pytest.mark.parametrize("kind1, kind2", [("line", "area"), ("area", "line")]) def test_from_resampling_area_line_mixed(self, kind1, kind2): idxh = date_range("1/1/1999", periods=52, freq="W") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = DataFrame( np.random.default_rng(2).random((len(idxh), 3)), index=idxh, @@ -1005,7 +1074,7 @@ @pytest.mark.parametrize("kind1, kind2", [("line", "area"), ("area", "line")]) def test_from_resampling_area_line_mixed_high_to_low(self, kind1, kind2): idxh = date_range("1/1/1999", periods=52, freq="W") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = DataFrame( np.random.default_rng(2).random((len(idxh), 3)), index=idxh, @@ -1058,8 +1127,8 @@ def test_mixed_freq_second_millisecond(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", freq="S", periods=50) - idxl = date_range("2014-07-01 09:00", freq="100L", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=50) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # high to low @@ -1068,12 +1137,12 @@ low.plot(ax=ax) assert len(ax.get_lines()) == 2 for line in ax.get_lines(): - assert PeriodIndex(data=line.get_xdata()).freq == "L" + assert PeriodIndex(data=line.get_xdata()).freq == "ms" def test_mixed_freq_second_millisecond_low_to_high(self): # GH 7772, GH 7760 - idxh = date_range("2014-07-01 09:00", freq="S", periods=50) - idxl = date_range("2014-07-01 09:00", 
freq="100L", periods=500) + idxh = date_range("2014-07-01 09:00", freq="s", periods=50) + idxl = date_range("2014-07-01 09:00", freq="100ms", periods=500) high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) # low to high @@ -1082,7 +1151,7 @@ high.plot(ax=ax) assert len(ax.get_lines()) == 2 for line in ax.get_lines(): - assert PeriodIndex(data=line.get_xdata()).freq == "L" + assert PeriodIndex(data=line.get_xdata()).freq == "ms" def test_irreg_dtypes(self): # date @@ -1211,7 +1280,7 @@ def test_secondary_upsample(self): idxh = date_range("1/1/1999", periods=365, freq="D") - idxl = date_range("1/1/1999", periods=12, freq="M") + idxl = date_range("1/1/1999", periods=12, freq="ME") high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh) low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl) _, ax = mpl.pyplot.subplots() @@ -1229,7 +1298,11 @@ ax = fig.add_subplot(211) # ts - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) df.plot(secondary_y=["A", "B"], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 @@ -1247,7 +1320,11 @@ mpl.pyplot.close(fig) def test_secondary_legend_right(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) df.plot(secondary_y=["A", "C"], mark_right=False, ax=ax) @@ -1260,7 +1337,11 @@ mpl.pyplot.close(fig) def test_secondary_legend_bar(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) fig, ax = mpl.pyplot.subplots() df.plot(kind="bar", secondary_y=["A"], ax=ax) leg = ax.get_legend() @@ -1269,7 +1350,11 @@ mpl.pyplot.close(fig) def test_secondary_legend_bar_right(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) fig, ax = mpl.pyplot.subplots() df.plot(kind="bar", secondary_y=["A"], mark_right=False, ax=ax) leg = ax.get_legend() @@ -1278,10 +1363,18 @@ mpl.pyplot.close(fig) def test_secondary_legend_multi_col(self): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) ax = df.plot(secondary_y=["C", "D"], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 @@ -1296,7 +1389,11 @@ def test_secondary_legend_nonts(self): # non-ts - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) ax = df.plot(secondary_y=["A", "B"], ax=ax) @@ -1313,7 +1410,11 @@ def 
test_secondary_legend_nonts_multi_col(self): # non-ts - df = tm.makeDataFrame() + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + ) fig = mpl.pyplot.figure() ax = fig.add_subplot(211) ax = df.plot(secondary_y=["C", "D"], ax=ax) @@ -1329,7 +1430,7 @@ @pytest.mark.xfail(reason="Api changed in 3.6.0") def test_format_date_axis(self): - rng = date_range("1/1/2012", periods=12, freq="M") + rng = date_range("1/1/2012", periods=12, freq="ME") df = DataFrame(np.random.default_rng(2).standard_normal((len(rng), 3)), rng) _, ax = mpl.pyplot.subplots() ax = df.plot(ax=ax) @@ -1367,7 +1468,9 @@ # GH 2960 from pandas.plotting._matplotlib.converter import DatetimeConverter - ts = tm.makeTimeSeries()[:20] + ts = Series( + np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) + ) ts_irregular = ts.iloc[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]] # plot the left section of the irregular series, then the right section @@ -1431,7 +1534,9 @@ # GH 3490 - irregular-timeseries with secondary y from pandas.plotting._matplotlib.converter import DatetimeConverter - ts = tm.makeTimeSeries()[:20] + ts = Series( + np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) + ) ts_irregular = ts.iloc[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]] _, ax = mpl.pyplot.subplots() @@ -1514,7 +1619,7 @@ def test_hist(self): # https://github.com/matplotlib/matplotlib/issues/8459 - rng = date_range("1/1/2011", periods=10, freq="H") + rng = date_range("1/1/2011", periods=10, freq="h") x = rng w1 = np.arange(0, 1, 0.1) w2 = np.arange(0, 1, 0.1)[::-1] @@ -1632,17 +1737,16 @@ if orig_axfreq is None: assert ax.freq == dfreq + if freq is not None: + ax_freq = to_offset(ax.freq, is_period=True) if freq is not None and orig_axfreq is None: - assert ax.freq == freq + assert ax_freq == freq ax = fig.add_subplot(212) kwargs["ax"] = ax ret = f(*args, **kwargs) assert ret is not None # TODO: do something more intelligent - with tm.ensure_clean(return_filelike=True) as path: - plt.savefig(path) - # GH18439, GH#24088, statsmodels#4772 with tm.ensure_clean(return_filelike=True) as path: pickle.dump(fig, path) diff -Nru pandas-2.1.4+dfsg/pandas/tests/plotting/test_hist_method.py pandas-2.2.2+dfsg/pandas/tests/plotting/test_hist_method.py --- pandas-2.1.4+dfsg/pandas/tests/plotting/test_hist_method.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/plotting/test_hist_method.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,7 @@ DataFrame, Index, Series, + date_range, to_datetime, ) import pandas._testing as tm @@ -29,7 +30,11 @@ @pytest.fixture def ts(): - return tm.makeTimeSeries(name="ts") + return Series( + np.arange(30, dtype=np.float64), + index=date_range("2020-01-01", periods=30, freq="B"), + name="ts", + ) class TestSeriesPlots: diff -Nru pandas-2.1.4+dfsg/pandas/tests/plotting/test_misc.py pandas-2.2.2+dfsg/pandas/tests/plotting/test_misc.py --- pandas-2.1.4+dfsg/pandas/tests/plotting/test_misc.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/plotting/test_misc.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,5 @@ """ Test cases for misc plot functions """ +import os import numpy as np import pytest @@ -10,8 +11,11 @@ Index, Series, Timestamp, + date_range, interval_range, + period_range, plotting, + read_csv, ) import pandas._testing as tm from pandas.tests.plotting.common import ( @@ -23,10 +27,19 @@ ) mpl = 
pytest.importorskip("matplotlib") +plt = pytest.importorskip("matplotlib.pyplot") cm = pytest.importorskip("matplotlib.cm") -@td.skip_if_mpl +@pytest.fixture +def iris(datapath) -> DataFrame: + """ + The iris dataset as a DataFrame. + """ + return read_csv(datapath("io", "data", "csv", "iris.csv")) + + +@td.skip_if_installed("matplotlib") def test_import_error_message(): # GH-19810 df = DataFrame({"A": [1, 2]}) @@ -69,11 +82,39 @@ assert len(kwargs) == 24 +@pytest.mark.parametrize("kind", plotting.PlotAccessor._all_kinds) +@pytest.mark.parametrize( + "data", [DataFrame(np.arange(15).reshape(5, 3)), Series(range(5))] +) +@pytest.mark.parametrize( + "index", + [ + Index(range(5)), + date_range("2020-01-01", periods=5), + period_range("2020-01-01", periods=5), + ], +) +def test_savefig(kind, data, index): + fig, ax = plt.subplots() + data.index = index + kwargs = {} + if kind in ["hexbin", "scatter", "pie"]: + if isinstance(data, Series): + pytest.skip(f"{kind} not supported with Series") + kwargs = {"x": 0, "y": 1} + data.plot(kind=kind, ax=ax, **kwargs) + fig.savefig(os.devnull) + + class TestSeriesPlots: def test_autocorrelation_plot(self): from pandas.plotting import autocorrelation_plot - ser = tm.makeTimeSeries(name="ts") + ser = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) # Ensure no UserWarning when making plot with tm.assert_produces_warning(None): _check_plot_works(autocorrelation_plot, series=ser) @@ -86,13 +127,21 @@ def test_lag_plot(self, kwargs): from pandas.plotting import lag_plot - ser = tm.makeTimeSeries(name="ts") + ser = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) _check_plot_works(lag_plot, series=ser, **kwargs) def test_bootstrap_plot(self): from pandas.plotting import bootstrap_plot - ser = tm.makeTimeSeries(name="ts") + ser = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) _check_plot_works(bootstrap_plot, series=ser, size=10) @@ -517,9 +566,9 @@ # Test barh plot with string and integer at the same column from matplotlib.text import Text - df = DataFrame([{"word": 1, "value": 0}, {"word": "knowledg", "value": 2}]) + df = DataFrame([{"word": 1, "value": 0}, {"word": "knowledge", "value": 2}]) plot_barh = df.plot.barh(x="word", legend=None) - expected_yticklabels = [Text(0, 0, "1"), Text(0, 1, "knowledg")] + expected_yticklabels = [Text(0, 0, "1"), Text(0, 1, "knowledge")] assert all( actual.get_text() == expected.get_text() for actual, expected in zip( diff -Nru pandas-2.1.4+dfsg/pandas/tests/plotting/test_series.py pandas-2.2.2+dfsg/pandas/tests/plotting/test_series.py --- pandas-2.1.4+dfsg/pandas/tests/plotting/test_series.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/plotting/test_series.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,6 +14,7 @@ DataFrame, Series, date_range, + period_range, plotting, ) import pandas._testing as tm @@ -37,17 +38,18 @@ @pytest.fixture def ts(): - return tm.makeTimeSeries(name="ts") + return Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) @pytest.fixture def series(): - return tm.makeStringSeries(name="series") - - -@pytest.fixture -def iseries(): - return tm.makePeriodSeries(name="iseries") + return Series( + range(20), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(20)] + ) class TestSeriesPlots: @@ -82,8 +84,9 @@ def test_plot_ts_area_stacked(self, ts): 
_check_plot_works(ts.plot.area, stacked=False) - def test_plot_iseries(self, iseries): - _check_plot_works(iseries.plot) + def test_plot_iseries(self): + ser = Series(range(5), period_range("2020-01-01", periods=5)) + _check_plot_works(ser.plot) @pytest.mark.parametrize( "kind", @@ -91,7 +94,7 @@ "line", "bar", "barh", - pytest.param("kde", marks=td.skip_if_no_scipy), + pytest.param("kde", marks=td.skip_if_no("scipy")), "hist", "box", ], @@ -238,7 +241,7 @@ with pytest.raises(TypeError, match=msg): _check_plot_works(s.plot) - @pytest.mark.parametrize("index", [None, tm.makeDateIndex(k=4)]) + @pytest.mark.parametrize("index", [None, date_range("2020-01-01", periods=4)]) def test_line_area_nan_series(self, index): values = [1, 2, np.nan, 3] d = Series(values, index=index) @@ -715,7 +718,7 @@ ) def test_errorbar_plot_ts(self, yerr): # test time series plotting - ix = date_range("1/1/2000", "1/1/2001", freq="M") + ix = date_range("1/1/2000", "1/1/2001", freq="ME") ts = Series(np.arange(12), index=ix, name="x") yerr.index = ix diff -Nru pandas-2.1.4+dfsg/pandas/tests/reductions/test_reductions.py pandas-2.2.2+dfsg/pandas/tests/reductions/test_reductions.py --- pandas-2.1.4+dfsg/pandas/tests/reductions/test_reductions.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reductions/test_reductions.py 2024-04-10 17:42:52.000000000 +0000 @@ -23,22 +23,26 @@ Timestamp, date_range, isna, + period_range, timedelta_range, to_timedelta, ) import pandas._testing as tm from pandas.core import nanops +from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics def get_objs(): indexes = [ - tm.makeBoolIndex(10, name="a"), - tm.makeIntIndex(10, name="a"), - tm.makeFloatIndex(10, name="a"), - tm.makeDateIndex(10, name="a"), - tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern"), - tm.makePeriodIndex(10, name="a"), - tm.makeStringIndex(10, name="a"), + Index([True, False] * 5, name="a"), + Index(np.arange(10), dtype=np.int64, name="a"), + Index(np.arange(10), dtype=np.float64, name="a"), + DatetimeIndex(date_range("2020-01-01", periods=10), name="a"), + DatetimeIndex(date_range("2020-01-01", periods=10), name="a").tz_localize( + tz="US/Eastern" + ), + PeriodIndex(period_range("2020-01-01", periods=10, freq="D"), name="a"), + Index([str(i) for i in range(10)], name="a"), ] arr = np.random.default_rng(2).standard_normal(10) @@ -57,7 +61,11 @@ def test_ops(self, opname, obj): result = getattr(obj, opname)() if not isinstance(obj, PeriodIndex): - expected = getattr(obj.values, opname)() + if isinstance(obj.values, ArrowStringArrayNumpySemantics): + # max not on the interface + expected = getattr(np.array(obj.values), opname)() + else: + expected = getattr(obj.values, opname)() else: expected = Period(ordinal=getattr(obj.asi8, opname)(), freq=obj.freq) @@ -529,7 +537,7 @@ assert result is NaT def test_numpy_minmax_period(self): - pr = pd.period_range(start="2016-01-15", end="2016-01-20") + pr = period_range(start="2016-01-15", end="2016-01-20") assert np.min(pr) == Period("2016-01-15", freq="D") assert np.max(pr) == Period("2016-01-20", freq="D") @@ -822,24 +830,23 @@ # See GH#16830 data = np.arange(1, 11) - s = Series(data, index=data) - result = np.argmax(s) + ser = Series(data, index=data) + result = np.argmax(ser) expected = np.argmax(data) assert result == expected - result = s.argmax() + result = ser.argmax() assert result == expected msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argmax(s, out=data) + np.argmax(ser, 
out=data) - def test_idxmin_dt64index(self): + def test_idxmin_dt64index(self, unit): # GH#43587 should have NaT instead of NaN - ser = Series( - [1.0, 2.0, np.nan], index=DatetimeIndex(["NaT", "2015-02-08", "NaT"]) - ) + dti = DatetimeIndex(["NaT", "2015-02-08", "NaT"]).as_unit(unit) + ser = Series([1.0, 2.0, np.nan], index=dti) msg = "The behavior of Series.idxmin with all-NA values" with tm.assert_produces_warning(FutureWarning, match=msg): res = ser.idxmin(skipna=False) @@ -853,18 +860,18 @@ msg = "The behavior of DataFrame.idxmin with all-NA values" with tm.assert_produces_warning(FutureWarning, match=msg): res = df.idxmin(skipna=False) - assert res.dtype == "M8[ns]" + assert res.dtype == f"M8[{unit}]" assert res.isna().all() msg = "The behavior of DataFrame.idxmax with all-NA values" with tm.assert_produces_warning(FutureWarning, match=msg): res = df.idxmax(skipna=False) - assert res.dtype == "M8[ns]" + assert res.dtype == f"M8[{unit}]" assert res.isna().all() def test_idxmin(self): # test idxmin # _check_stat_op approach can not be used here because of isna check. - string_series = tm.makeStringSeries().rename("series") + string_series = Series(range(20), dtype=np.float64, name="series") # add some NaNs string_series[5:15] = np.nan @@ -897,7 +904,7 @@ def test_idxmax(self): # test idxmax # _check_stat_op approach can not be used here because of isna check. - string_series = tm.makeStringSeries().rename("series") + string_series = Series(range(20), dtype=np.float64, name="series") # add some NaNs string_series[5:15] = np.nan @@ -942,7 +949,11 @@ assert result == 1.1 def test_all_any(self): - ts = tm.makeTimeSeries() + ts = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) bool_series = ts > 0 assert not bool_series.all() assert bool_series.any() @@ -1166,7 +1177,7 @@ with pytest.raises(ValueError, match=msg): test_input.idxmax(skipna=False) - def test_idxminmax_object_dtype(self): + def test_idxminmax_object_dtype(self, using_infer_string): # pre-2.1 object-dtype was disallowed for argmin/max ser = Series(["foo", "bar", "baz"]) assert ser.idxmax() == 0 @@ -1180,18 +1191,19 @@ assert ser2.idxmin() == 0 assert ser2.idxmin(skipna=False) == 0 - # attempting to compare np.nan with string raises - ser3 = Series(["foo", "foo", "bar", "bar", None, np.nan, "baz"]) - msg = "'>' not supported between instances of 'float' and 'str'" - with pytest.raises(TypeError, match=msg): - ser3.idxmax() - with pytest.raises(TypeError, match=msg): - ser3.idxmax(skipna=False) - msg = "'<' not supported between instances of 'float' and 'str'" - with pytest.raises(TypeError, match=msg): - ser3.idxmin() - with pytest.raises(TypeError, match=msg): - ser3.idxmin(skipna=False) + if not using_infer_string: + # attempting to compare np.nan with string raises + ser3 = Series(["foo", "foo", "bar", "bar", None, np.nan, "baz"]) + msg = "'>' not supported between instances of 'float' and 'str'" + with pytest.raises(TypeError, match=msg): + ser3.idxmax() + with pytest.raises(TypeError, match=msg): + ser3.idxmax(skipna=False) + msg = "'<' not supported between instances of 'float' and 'str'" + with pytest.raises(TypeError, match=msg): + ser3.idxmin() + with pytest.raises(TypeError, match=msg): + ser3.idxmin(skipna=False) def test_idxminmax_object_frame(self): # GH#4279 @@ -1446,14 +1458,14 @@ s = Series(data, dtype=object) result = s.mode(dropna) - expected2 = Series(expected2, dtype=object) + expected2 = Series(expected2, dtype=None if expected2 == ["bar"] else 
object) tm.assert_series_equal(result, expected2) data = ["foo", "bar", "bar", np.nan, np.nan, np.nan] s = Series(data, dtype=object).astype(str) result = s.mode(dropna) - expected3 = Series(expected3, dtype=str) + expected3 = Series(expected3) tm.assert_series_equal(result, expected3) @pytest.mark.parametrize( @@ -1468,7 +1480,7 @@ s = Series([1, "foo", "foo", np.nan, np.nan, np.nan]) result = s.mode(dropna) - expected = Series(expected2, dtype=object) + expected = Series(expected2, dtype=None if expected2 == ["foo"] else object) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff -Nru pandas-2.1.4+dfsg/pandas/tests/reductions/test_stat_reductions.py pandas-2.2.2+dfsg/pandas/tests/reductions/test_stat_reductions.py --- pandas-2.1.4+dfsg/pandas/tests/reductions/test_stat_reductions.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reductions/test_stat_reductions.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,21 +10,17 @@ from pandas import ( DataFrame, Series, + date_range, ) import pandas._testing as tm -from pandas.core.arrays import ( - DatetimeArray, - PeriodArray, - TimedeltaArray, -) class TestDatetimeLikeStatReductions: - @pytest.mark.parametrize("box", [Series, pd.Index, DatetimeArray]) + @pytest.mark.parametrize("box", [Series, pd.Index, pd.array]) def test_dt64_mean(self, tz_naive_fixture, box): tz = tz_naive_fixture - dti = pd.date_range("2001-01-01", periods=11, tz=tz) + dti = date_range("2001-01-01", periods=11, tz=tz) # shuffle so that we are not just working with monotone-increasing dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6]) dtarr = dti._data @@ -40,11 +36,11 @@ assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz) assert obj.mean(skipna=False) is pd.NaT - @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray]) - @pytest.mark.parametrize("freq", ["S", "H", "D", "W", "B"]) + @pytest.mark.parametrize("box", [Series, pd.Index, pd.array]) + @pytest.mark.parametrize("freq", ["s", "h", "D", "W", "B"]) def test_period_mean(self, box, freq): # GH#24757 - dti = pd.date_range("2001-01-01", periods=11) + dti = date_range("2001-01-01", periods=11) # shuffle so that we are not just working with monotone-increasing dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6]) @@ -66,9 +62,10 @@ with pytest.raises(TypeError, match="ambiguous"): obj.mean(skipna=True) - @pytest.mark.parametrize("box", [Series, pd.Index, TimedeltaArray]) + @pytest.mark.parametrize("box", [Series, pd.Index, pd.array]) def test_td64_mean(self, box): - tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D") + m8values = np.array([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], "m8[D]") + tdi = pd.TimedeltaIndex(m8values).as_unit("ns") tdarr = tdi._data obj = box(tdarr, copy=False) @@ -103,7 +100,7 @@ # mean, idxmax, idxmin, min, and max are valid for dates if name not in ["max", "min", "mean", "median", "std"]: - ds = Series(pd.date_range("1/1/2001", periods=10)) + ds = Series(date_range("1/1/2001", periods=10)) msg = f"does not support reduction '{name}'" with pytest.raises(TypeError, match=msg): f(ds) @@ -154,15 +151,15 @@ f(string_series_, numeric_only=True) def test_sum(self): - string_series = tm.makeStringSeries().rename("series") + string_series = Series(range(20), dtype=np.float64, name="series") self._check_stat_op("sum", np.sum, string_series, check_allna=False) def test_mean(self): - string_series = tm.makeStringSeries().rename("series") + string_series = Series(range(20), dtype=np.float64, name="series") self._check_stat_op("mean", 
np.mean, string_series) def test_median(self): - string_series = tm.makeStringSeries().rename("series") + string_series = Series(range(20), dtype=np.float64, name="series") self._check_stat_op("median", np.median, string_series) # test with integers, test failure @@ -170,20 +167,24 @@ tm.assert_almost_equal(np.median(int_ts), int_ts.median()) def test_prod(self): - string_series = tm.makeStringSeries().rename("series") + string_series = Series(range(20), dtype=np.float64, name="series") self._check_stat_op("prod", np.prod, string_series) def test_min(self): - string_series = tm.makeStringSeries().rename("series") + string_series = Series(range(20), dtype=np.float64, name="series") self._check_stat_op("min", np.min, string_series, check_objects=True) def test_max(self): - string_series = tm.makeStringSeries().rename("series") + string_series = Series(range(20), dtype=np.float64, name="series") self._check_stat_op("max", np.max, string_series, check_objects=True) def test_var_std(self): - string_series = tm.makeStringSeries().rename("series") - datetime_series = tm.makeTimeSeries().rename("ts") + string_series = Series(range(20), dtype=np.float64, name="series") + datetime_series = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) alt = lambda x: np.std(x, ddof=1) self._check_stat_op("std", alt, string_series) @@ -208,8 +209,12 @@ assert pd.isna(result) def test_sem(self): - string_series = tm.makeStringSeries().rename("series") - datetime_series = tm.makeTimeSeries().rename("ts") + string_series = Series(range(20), dtype=np.float64, name="series") + datetime_series = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x)) self._check_stat_op("sem", alt, string_series) @@ -228,7 +233,7 @@ def test_skew(self): sp_stats = pytest.importorskip("scipy.stats") - string_series = tm.makeStringSeries().rename("series") + string_series = Series(range(20), dtype=np.float64, name="series") alt = lambda x: sp_stats.skew(x, bias=False) self._check_stat_op("skew", alt, string_series) @@ -250,7 +255,7 @@ def test_kurt(self): sp_stats = pytest.importorskip("scipy.stats") - string_series = tm.makeStringSeries().rename("series") + string_series = Series(range(20), dtype=np.float64, name="series") alt = lambda x: sp_stats.kurtosis(x, bias=False) self._check_stat_op("kurt", alt, string_series) diff -Nru pandas-2.1.4+dfsg/pandas/tests/resample/conftest.py pandas-2.2.2+dfsg/pandas/tests/resample/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/resample/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/resample/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,5 +1,4 @@ from datetime import datetime -import warnings import numpy as np import pytest @@ -8,8 +7,6 @@ DataFrame, Series, ) -from pandas.core.indexes.datetimes import date_range -from pandas.core.indexes.period import period_range # The various methods we support downsample_methods = [ @@ -45,40 +42,6 @@ @pytest.fixture -def simple_date_range_series(): - """ - Series with date range index and random data for test purposes. - """ - - def _simple_date_range_series(start, end, freq="D"): - rng = date_range(start, end, freq=freq) - return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - - return _simple_date_range_series - - -@pytest.fixture -def simple_period_range_series(): - """ - Series with period range index and random data for test purposes. 
- """ - - def _simple_period_range_series(start, end, freq="D"): - with warnings.catch_warnings(): - # suppress Period[B] deprecation warning - msg = "|".join(["Period with BDay freq", r"PeriodDtype\[B\] is deprecated"]) - warnings.filterwarnings( - "ignore", - msg, - category=FutureWarning, - ) - rng = period_range(start, end, freq=freq) - return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - - return _simple_period_range_series - - -@pytest.fixture def _index_start(): """Fixture for parametrization of index, series and frame.""" return datetime(2005, 1, 1) diff -Nru pandas-2.1.4+dfsg/pandas/tests/resample/test_base.py pandas-2.2.2+dfsg/pandas/tests/resample/test_base.py --- pandas-2.1.4+dfsg/pandas/tests/resample/test_base.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/resample/test_base.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,8 +3,13 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_extension_array_dtype + +import pandas as pd from pandas import ( DataFrame, + DatetimeIndex, + Index, MultiIndex, NaT, PeriodIndex, @@ -44,7 +49,7 @@ return _create_index -@pytest.mark.parametrize("freq", ["2D", "1H"]) +@pytest.mark.parametrize("freq", ["2D", "1h"]) @pytest.mark.parametrize( "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE] ) @@ -65,16 +70,16 @@ ser = series - result = ser.resample("1H").asfreq() - new_index = create_index(ser.index[0], ser.index[-1], freq="1H") + result = ser.resample("1h").asfreq() + new_index = create_index(ser.index[0], ser.index[-1], freq="1h") expected = ser.reindex(new_index) tm.assert_series_equal(result, expected) # Explicit cast to float to avoid implicit cast when setting None frame = ser.astype("float").to_frame("value") frame.iloc[1] = None - result = frame.resample("1H").asfreq(fill_value=4.0) - new_index = create_index(frame.index[0], frame.index[-1], freq="1H") + result = frame.resample("1h").asfreq(fill_value=4.0) + new_index = create_index(frame.index[0], frame.index[-1], freq="1h") expected = frame.reindex(new_index, fill_value=4.0) tm.assert_frame_equal(result, expected) @@ -83,8 +88,13 @@ def test_resample_interpolate(frame): # GH#12925 df = frame - result = df.resample("1T").asfreq().interpolate() - expected = df.resample("1T").interpolate() + warn = None + if isinstance(df.index, PeriodIndex): + warn = FutureWarning + msg = "Resampling with a PeriodIndex is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = df.resample("1min").asfreq().interpolate() + expected = df.resample("1min").interpolate() tm.assert_frame_equal(result, expected) @@ -96,25 +106,33 @@ "but got an instance of 'RangeIndex'" ) with pytest.raises(TypeError, match=msg): - xp.resample("A") + xp.resample("YE") @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_resample_empty_series(freq, empty_series_dti, resample_method): # GH12771 & GH12868 ser = empty_series_dti - if freq == "M" and isinstance(ser.index, TimedeltaIndex): + if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. 
'24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): ser.resample(freq) return - - rs = ser.resample(freq) + elif freq == "ME" and isinstance(ser.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" + + warn = None + if isinstance(ser.index, PeriodIndex): + warn = FutureWarning + msg = "Resampling with a PeriodIndex is deprecated" + with tm.assert_produces_warning(warn, match=msg): + rs = ser.resample(freq) result = getattr(rs, resample_method)() if resample_method == "ohlc": @@ -136,9 +154,9 @@ @pytest.mark.parametrize( "freq", [ - pytest.param("M", marks=pytest.mark.xfail(reason="Don't know why this fails")), + pytest.param("ME", marks=pytest.mark.xfail(reason="Don't know why this fails")), "D", - "H", + "h", ], ) def test_resample_nat_index_series(freq, series, resample_method): @@ -146,7 +164,10 @@ ser = series.copy() ser.index = PeriodIndex([NaT] * len(ser), freq=freq) - rs = ser.resample(freq) + + msg = "Resampling with a PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.resample(freq) result = getattr(rs, resample_method)() if resample_method == "ohlc": @@ -162,21 +183,29 @@ @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) @pytest.mark.parametrize("resample_method", ["count", "size"]) def test_resample_count_empty_series(freq, empty_series_dti, resample_method): # GH28427 ser = empty_series_dti - if freq == "M" and isinstance(ser.index, TimedeltaIndex): + if freq == "ME" and isinstance(ser.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. '24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): ser.resample(freq) return - - rs = ser.resample(freq) + elif freq == "ME" and isinstance(ser.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" + + warn = None + if isinstance(ser.index, PeriodIndex): + warn = FutureWarning + msg = "Resampling with a PeriodIndex is deprecated" + with tm.assert_produces_warning(warn, match=msg): + rs = ser.resample(freq) result = getattr(rs, resample_method)() @@ -188,21 +217,29 @@ @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_resample_empty_dataframe(empty_frame_dti, freq, resample_method): # GH13212 df = empty_frame_dti # count retains dimensions too - if freq == "M" and isinstance(df.index, TimedeltaIndex): + if freq == "ME" and isinstance(df.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. 
'24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): df.resample(freq, group_keys=False) return - - rs = df.resample(freq, group_keys=False) + elif freq == "ME" and isinstance(df.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" + + warn = None + if isinstance(df.index, PeriodIndex): + warn = FutureWarning + msg = "Resampling with a PeriodIndex is deprecated" + with tm.assert_produces_warning(warn, match=msg): + rs = df.resample(freq, group_keys=False) result = getattr(rs, resample_method)() if resample_method == "ohlc": # TODO: no tests with len(df.columns) > 0 @@ -228,47 +265,65 @@ @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_resample_count_empty_dataframe(freq, empty_frame_dti): # GH28427 empty_frame_dti["a"] = [] - if freq == "M" and isinstance(empty_frame_dti.index, TimedeltaIndex): + if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. '24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): empty_frame_dti.resample(freq) return - - result = empty_frame_dti.resample(freq).count() + elif freq == "ME" and isinstance(empty_frame_dti.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" + + warn = None + if isinstance(empty_frame_dti.index, PeriodIndex): + warn = FutureWarning + msg = "Resampling with a PeriodIndex is deprecated" + with tm.assert_produces_warning(warn, match=msg): + rs = empty_frame_dti.resample(freq) + result = rs.count() index = _asfreq_compat(empty_frame_dti.index, freq) - expected = DataFrame({"a": []}, dtype="int64", index=index) + expected = DataFrame(dtype="int64", index=index, columns=Index(["a"], dtype=object)) tm.assert_frame_equal(result, expected) @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_resample_size_empty_dataframe(freq, empty_frame_dti): # GH28427 empty_frame_dti["a"] = [] - if freq == "M" and isinstance(empty_frame_dti.index, TimedeltaIndex): + if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. 
'24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): empty_frame_dti.resample(freq) return - - result = empty_frame_dti.resample(freq).size() + elif freq == "ME" and isinstance(empty_frame_dti.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" + + msg = "Resampling with a PeriodIndex" + warn = None + if isinstance(empty_frame_dti.index, PeriodIndex): + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + rs = empty_frame_dti.resample(freq) + result = rs.size() index = _asfreq_compat(empty_frame_dti.index, freq) @@ -277,18 +332,30 @@ tm.assert_series_equal(result, expected) -@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -@pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) +@pytest.mark.parametrize( + "index", + [ + PeriodIndex([], freq="M", name="a"), + DatetimeIndex([], name="a"), + TimedeltaIndex([], name="a"), + ], +) @pytest.mark.parametrize("dtype", [float, int, object, "datetime64[ns]"]) +@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_resample_empty_dtypes(index, dtype, resample_method): # Empty series were sometimes causing a segfault (for the functions # with Cython bounds-checking disabled) or an IndexError. We just run # them to ensure they no longer do. (GH #10228) + warn = None if isinstance(index, PeriodIndex): # GH#53511 index = PeriodIndex([], freq="B", name=index.name) + warn = FutureWarning + msg = "Resampling with a PeriodIndex is deprecated" + empty_series_dti = Series([], index, dtype) - rs = empty_series_dti.resample("d", group_keys=False) + with tm.assert_produces_warning(warn, match=msg): + rs = empty_series_dti.resample("d", group_keys=False) try: getattr(rs, resample_method)() except DataError: @@ -298,22 +365,34 @@ @all_ts -@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("freq", ["ME", "D", "h"]) def test_apply_to_empty_series(empty_series_dti, freq): # GH 14313 ser = empty_series_dti - if freq == "M" and isinstance(empty_series_dti.index, TimedeltaIndex): + if freq == "ME" and isinstance(empty_series_dti.index, TimedeltaIndex): msg = ( "Resampling on a TimedeltaIndex requires fixed-duration `freq`, " - "e.g. '24H' or '3D', not " + "e.g. 
'24h' or '3D', not " ) with pytest.raises(ValueError, match=msg): empty_series_dti.resample(freq) return - - result = ser.resample(freq, group_keys=False).apply(lambda x: 1) - expected = ser.resample(freq).apply("sum") + elif freq == "ME" and isinstance(empty_series_dti.index, PeriodIndex): + # index is PeriodIndex, so convert to corresponding Period freq + freq = "M" + + msg = "Resampling with a PeriodIndex" + warn = None + if isinstance(empty_series_dti.index, PeriodIndex): + warn = FutureWarning + + with tm.assert_produces_warning(warn, match=msg): + rs = ser.resample(freq, group_keys=False) + + result = rs.apply(lambda x: 1) + with tm.assert_produces_warning(warn, match=msg): + expected = ser.resample(freq).apply("sum") tm.assert_series_equal(result, expected, check_dtype=False) @@ -321,10 +400,18 @@ @all_ts def test_resampler_is_iterable(series): # GH 15314 - freq = "H" + freq = "h" tg = Grouper(freq=freq, convention="start") - grouped = series.groupby(tg) - resampled = series.resample(freq) + msg = "Resampling with a PeriodIndex" + warn = None + if isinstance(series.index, PeriodIndex): + warn = FutureWarning + + with tm.assert_produces_warning(warn, match=msg): + grouped = series.groupby(tg) + + with tm.assert_produces_warning(warn, match=msg): + resampled = series.resample(freq) for (rk, rv), (gk, gv) in zip(resampled, grouped): assert rk == gk tm.assert_series_equal(rv, gv) @@ -335,7 +422,39 @@ # GH 15023 ser = series q = 0.75 - freq = "H" - result = ser.resample(freq).quantile(q) - expected = ser.resample(freq).agg(lambda x: x.quantile(q)).rename(ser.name) + freq = "h" + + msg = "Resampling with a PeriodIndex" + warn = None + if isinstance(series.index, PeriodIndex): + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + result = ser.resample(freq).quantile(q) + expected = ser.resample(freq).agg(lambda x: x.quantile(q)).rename(ser.name) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, skipna, how): + # GH#57019 + if is_extension_array_dtype(any_real_nullable_dtype): + na_value = Series(dtype=any_real_nullable_dtype).dtype.na_value + else: + na_value = np.nan + df = DataFrame( + { + "a": [2, 1, 1, 2], + "b": [na_value, 3.0, na_value, 4.0], + "c": [na_value, 3.0, na_value, 4.0], + }, + index=date_range("2020-01-01", periods=4, freq="D"), + dtype=any_real_nullable_dtype, + ) + rs = df.resample("ME") + method = getattr(rs, how) + result = method(skipna=skipna) + + gb = df.groupby(df.shape[0] * [pd.to_datetime("2020-01-31")]) + expected = getattr(gb, how)(skipna=skipna) + expected.index.freq = "ME" + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/resample/test_datetime_index.py pandas-2.2.2+dfsg/pandas/tests/resample/test_datetime_index.py --- pandas-2.1.4+dfsg/pandas/tests/resample/test_datetime_index.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/resample/test_datetime_index.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,5 @@ from datetime import datetime from functools import partial -from io import StringIO import numpy as np import pytest @@ -8,10 +7,13 @@ from pandas._libs import lib from pandas._typing import DatetimeNaTType +from pandas.compat import is_platform_windows +import pandas.util._test_decorators as td import pandas as pd from pandas import ( DataFrame, + Index, Series, Timedelta, Timestamp, @@ -54,6 +56,19 @@ return request.param +@pytest.fixture +def simple_date_range_series(): + 
""" + Series with date range index and random data for test purposes. + """ + + def _simple_date_range_series(start, end, freq="D"): + rng = date_range(start, end, freq=freq) + return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + + return _simple_date_range_series + + def test_custom_grouper(index, unit): dti = index.as_unit(unit) s = Series(np.array([1] * len(dti)), index=dti, dtype="int64") @@ -82,7 +97,7 @@ arr = [1] + [5] * 2592 idx = dti[0:-1:5] idx = idx.append(dti[-1:]) - idx = DatetimeIndex(idx, freq="5T").as_unit(unit) + idx = DatetimeIndex(idx, freq="5min").as_unit(unit) expect = Series(arr, index=idx) # GH2763 - return input dtype if we can @@ -140,21 +155,21 @@ # GH 25580, resample on IntegerArray ts = Series( range(9), - index=date_range("1/1/2000", periods=9, freq="T").as_unit(unit), + index=date_range("1/1/2000", periods=9, freq="min").as_unit(unit), dtype="Int64", ) - result = ts.resample("3T").sum() + result = ts.resample("3min").sum() expected = Series( [3, 12, 21], - index=date_range("1/1/2000", periods=3, freq="3T").as_unit(unit), + index=date_range("1/1/2000", periods=3, freq="3min").as_unit(unit), dtype="Int64", ) tm.assert_series_equal(result, expected) - result = ts.resample("3T").mean() + result = ts.resample("3min").mean() expected = Series( [1, 4, 7], - index=date_range("1/1/2000", periods=3, freq="3T").as_unit(unit), + index=date_range("1/1/2000", periods=3, freq="3min").as_unit(unit), dtype="Float64", ) tm.assert_series_equal(result, expected) @@ -169,6 +184,9 @@ tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:The 'convention' keyword in Series.resample:FutureWarning" +) @pytest.mark.parametrize( "_index_start,_index_end,_index_name", [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], @@ -255,11 +273,11 @@ def __call__(self, x): return str(type(x)) - df_standard = df.resample("M").apply(fn) - df_lambda = df.resample("M").apply(lambda x: str(type(x))) - df_partial = df.resample("M").apply(partial(fn)) - df_partial2 = df.resample("M").apply(partial(fn, a=2)) - df_class = df.resample("M").apply(FnClass()) + df_standard = df.resample("ME").apply(fn) + df_lambda = df.resample("ME").apply(lambda x: str(type(x))) + df_partial = df.resample("ME").apply(partial(fn)) + df_partial2 = df.resample("ME").apply(partial(fn, a=2)) + df_class = df.resample("ME").apply(FnClass()) tm.assert_frame_equal(df_standard, df_lambda) tm.assert_frame_equal(df_standard, df_partial) @@ -271,34 +289,30 @@ # GH 8371 # odd results when rounding is needed - data = """date,time,value -11-08-2014,00:00:01.093,1 -11-08-2014,00:00:02.159,1 -11-08-2014,00:00:02.667,1 -11-08-2014,00:00:03.175,1 -11-08-2014,00:00:07.058,1 -11-08-2014,00:00:07.362,1 -11-08-2014,00:00:08.324,1 -11-08-2014,00:00:08.830,1 -11-08-2014,00:00:08.982,1 -11-08-2014,00:00:09.815,1 -11-08-2014,00:00:10.540,1 -11-08-2014,00:00:11.061,1 -11-08-2014,00:00:11.617,1 -11-08-2014,00:00:13.607,1 -11-08-2014,00:00:14.535,1 -11-08-2014,00:00:15.525,1 -11-08-2014,00:00:17.960,1 -11-08-2014,00:00:20.674,1 -11-08-2014,00:00:21.191,1""" - - df = pd.read_csv( - StringIO(data), - parse_dates={"timestamp": ["date", "time"]}, - index_col="timestamp", - ) + ts = [ + "2014-11-08 00:00:01", + "2014-11-08 00:00:02", + "2014-11-08 00:00:02", + "2014-11-08 00:00:03", + "2014-11-08 00:00:07", + "2014-11-08 00:00:07", + "2014-11-08 00:00:08", + "2014-11-08 00:00:08", + "2014-11-08 00:00:08", + "2014-11-08 00:00:09", + "2014-11-08 00:00:10", + "2014-11-08 00:00:11", + "2014-11-08 00:00:11", + 
"2014-11-08 00:00:13", + "2014-11-08 00:00:14", + "2014-11-08 00:00:15", + "2014-11-08 00:00:17", + "2014-11-08 00:00:20", + "2014-11-08 00:00:21", + ] + df = DataFrame({"value": [1] * 19}, index=pd.to_datetime(ts)) df.index = df.index.as_unit(unit) - df.index.name = None + result = df.resample("6s").sum() expected = DataFrame( {"value": [4, 9, 4, 2]}, @@ -425,29 +439,43 @@ @pytest.mark.parametrize("f", ["sum", "mean", "prod", "min", "max", "var"]) def test_resample_frame_basic_cy_funcs(f, unit): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((50, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=50, freq="B"), + ) df.index = df.index.as_unit(unit) - b = Grouper(freq="M") + b = Grouper(freq="ME") g = df.groupby(b) # check all cython functions work g._cython_agg_general(f, alt=None, numeric_only=True) -@pytest.mark.parametrize("freq", ["A", "M"]) +@pytest.mark.parametrize("freq", ["YE", "ME"]) def test_resample_frame_basic_M_A(freq, unit): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((50, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=50, freq="B"), + ) df.index = df.index.as_unit(unit) result = df.resample(freq).mean() tm.assert_series_equal(result["A"], df["A"].resample(freq).mean()) -@pytest.mark.parametrize("freq", ["W-WED", "M"]) +@pytest.mark.parametrize("freq", ["W-WED", "ME"]) def test_resample_frame_basic_kind(freq, unit): - df = tm.makeTimeDataFrame() + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) df.index = df.index.as_unit(unit) - df.resample(freq, kind="period").mean() + msg = "The 'kind' keyword in DataFrame.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + df.resample(freq, kind="period").mean() def test_resample_upsample(unit): @@ -493,31 +521,31 @@ ), ) expected.index = expected.index.as_unit(unit) - tm.assert_series_equal(s.resample("10S").mean(), expected) + tm.assert_series_equal(s.resample("10s").mean(), expected) def test_resample_extra_index_point(unit): # GH#9756 - index = date_range(start="20150101", end="20150331", freq="BM").as_unit(unit) + index = date_range(start="20150101", end="20150331", freq="BME").as_unit(unit) expected = DataFrame({"A": Series([21, 41, 63], index=index)}) index = date_range(start="20150101", end="20150331", freq="B").as_unit(unit) df = DataFrame({"A": Series(range(len(index)), index=index)}, dtype="int64") - result = df.resample("BM").last() + result = df.resample("BME").last() tm.assert_frame_equal(result, expected) def test_upsample_with_limit(unit): - rng = date_range("1/1/2000", periods=3, freq="5t").as_unit(unit) + rng = date_range("1/1/2000", periods=3, freq="5min").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) - result = ts.resample("t").ffill(limit=2) + result = ts.resample("min").ffill(limit=2) expected = ts.reindex(result.index, method="ffill", limit=2) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("freq", ["5D", "10H", "5Min", "10S"]) -@pytest.mark.parametrize("rule", ["Y", "3M", "15D", "30H", "15Min", "30S"]) +@pytest.mark.parametrize("freq", ["1D", "10h", "5Min", "10s"]) +@pytest.mark.parametrize("rule", ["YE", "3ME", "15D", "30h", "15Min", "30s"]) def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule, unit): # GH 33939 rng 
= date_range("1/1/2000", periods=3, freq=freq, tz=tz_aware_fixture).as_unit( @@ -560,10 +588,10 @@ index = index.union(date_range("4-15-2000", "5-15-2000", freq="h").as_unit(unit)) s = Series(range(len(index)), index=index) - a = s.loc[:"4-15-2000"].resample("30T").ohlc() + a = s.loc[:"4-15-2000"].resample("30min").ohlc() assert isinstance(a, DataFrame) - b = s.loc[:"4-14-2000"].resample("30T").ohlc() + b = s.loc[:"4-14-2000"].resample("30min").ohlc() assert isinstance(b, DataFrame) @@ -604,9 +632,9 @@ ).reindex(["VOLUME", "PRICE"], axis=1) df.index = df.index.as_unit(unit) df.columns.name = "Cols" - res = df.resample("H").ohlc() + res = df.resample("h").ohlc() exp = pd.concat( - [df["VOLUME"].resample("H").ohlc(), df["PRICE"].resample("H").ohlc()], + [df["VOLUME"].resample("h").ohlc(), df["PRICE"].resample("h").ohlc()], axis=1, keys=df.columns, ) @@ -614,7 +642,7 @@ tm.assert_frame_equal(exp, res) df.columns = [["a", "b"], ["c", "d"]] - res = df.resample("H").ohlc() + res = df.resample("h").ohlc() exp.columns = pd.MultiIndex.from_tuples( [ ("a", "c", "open"), @@ -644,7 +672,7 @@ df.iloc[3, :] = np.nan warning_msg = "DataFrame.resample with axis=1 is deprecated." with tm.assert_produces_warning(FutureWarning, match=warning_msg): - result = df.resample("Q", axis=1).mean() + result = df.resample("QE", axis=1).mean() msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -659,8 +687,8 @@ ).as_unit(unit) s = Series(np.random.default_rng(2).random(len(dti)), dti) bs = s.resample("B", closed="right", label="right").mean() - result = bs.resample("8H").mean() - assert len(result) == 22 + result = bs.resample("8h").mean() + assert len(result) == 25 assert isinstance(result.index.freq, offsets.DateOffset) assert result.index.freq == offsets.Hour(8) @@ -668,9 +696,9 @@ @pytest.mark.parametrize( "freq, expected_kwargs", [ - ["A-DEC", {"start": "1990", "end": "2000", "freq": "a-dec"}], - ["A-JUN", {"start": "1990", "end": "2000", "freq": "a-jun"}], - ["M", {"start": "1990-01", "end": "2000-01", "freq": "M"}], + ["YE-DEC", {"start": "1990", "end": "2000", "freq": "Y-DEC"}], + ["YE-JUN", {"start": "1990", "end": "2000", "freq": "Y-JUN"}], + ["ME", {"start": "1990-01", "end": "2000-01", "freq": "M"}], ], ) def test_resample_timestamp_to_period( @@ -679,7 +707,9 @@ ts = simple_date_range_series("1/1/1990", "1/1/2000") ts.index = ts.index.as_unit(unit) - result = ts.resample(freq, kind="period").mean() + msg = "The 'kind' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ts.resample(freq, kind="period").mean() expected = ts.resample(freq).mean() expected.index = period_range(**expected_kwargs) tm.assert_series_equal(result, expected) @@ -710,7 +740,7 @@ rng2 = rng.repeat(5).values ts = Series(np.random.default_rng(2).standard_normal(len(rng2)), index=rng2) - result = ts.resample("M").mean() + result = ts.resample("ME").mean() expected = ts.groupby(lambda x: x.month).mean() assert len(result) == 2 @@ -739,12 +769,12 @@ warning_msg = "DataFrame.resample with axis=1 is deprecated." 
with tm.assert_produces_warning(FutureWarning, match=warning_msg): - result = df.resample("M", axis=1).mean() - expected = df.T.resample("M").mean().T + result = df.resample("ME", axis=1).mean() + expected = df.T.resample("ME").mean().T tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("freq", ["t", "5t", "15t", "30t", "4h", "12h"]) +@pytest.mark.parametrize("freq", ["min", "5min", "15min", "30min", "4h", "12h"]) def test_resample_anchored_ticks(freq, unit): # If a fixed delta (5 minute, 4 hour) evenly divides a day, we should # "anchor" the origin at midnight so we get regular intervals rather @@ -765,7 +795,7 @@ rng = date_range("2000-1-1", f"2000-{end}-10", freq="D").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - tm.assert_series_equal(ts.resample("M").sum(), ts.resample("M").apply(mysum)) + tm.assert_series_equal(ts.resample("ME").sum(), ts.resample("ME").apply(mysum)) def test_resample_single_group_std(unit): @@ -921,13 +951,13 @@ ts_1 = Series(random_values, index=rng) result_1 = ts_1.resample("D", origin="epoch").mean() - result_2 = ts_1.resample("24H", origin="epoch").mean() + result_2 = ts_1.resample("24h", origin="epoch").mean() tm.assert_series_equal(result_1, result_2) # check that we have the same behavior with epoch even if we are not timezone aware ts_no_tz = ts_1.tz_localize(None) result_3 = ts_no_tz.resample("D", origin="epoch").mean() - result_4 = ts_no_tz.resample("24H", origin="epoch").mean() + result_4 = ts_no_tz.resample("24h", origin="epoch").mean() tm.assert_series_equal(result_1, result_3.tz_localize(rng.tz), check_freq=False) tm.assert_series_equal(result_1, result_4.tz_localize(rng.tz), check_freq=False) @@ -936,7 +966,7 @@ rng = date_range(start, end, freq="7min").as_unit(unit) ts_2 = Series(random_values, index=rng) result_5 = ts_2.resample("D", origin="epoch").mean() - result_6 = ts_2.resample("24H", origin="epoch").mean() + result_6 = ts_2.resample("24h", origin="epoch").mean() tm.assert_series_equal(result_1.tz_localize(None), result_5.tz_localize(None)) tm.assert_series_equal(result_1.tz_localize(None), result_6.tz_localize(None)) @@ -972,32 +1002,32 @@ expected_ts = ["2013-11-02 22:00-05:00", "2013-11-03 22:00-06:00"] expected = _create_series([23.0, 2.0], expected_ts) - result = ts.resample("D", origin="start", offset="-2H").sum() + result = ts.resample("D", origin="start", offset="-2h").sum() tm.assert_series_equal(result, expected) expected_ts = ["2013-11-02 22:00-05:00", "2013-11-03 21:00-06:00"] - expected = _create_series([22.0, 3.0], expected_ts, freq="24H") - result = ts.resample("24H", origin="start", offset="-2H").sum() + expected = _create_series([22.0, 3.0], expected_ts, freq="24h") + result = ts.resample("24h", origin="start", offset="-2h").sum() tm.assert_series_equal(result, expected) expected_ts = ["2013-11-02 02:00-05:00", "2013-11-03 02:00-06:00"] expected = _create_series([3.0, 22.0], expected_ts) - result = ts.resample("D", origin="start", offset="2H").sum() + result = ts.resample("D", origin="start", offset="2h").sum() tm.assert_series_equal(result, expected) expected_ts = ["2013-11-02 23:00-05:00", "2013-11-03 23:00-06:00"] expected = _create_series([24.0, 1.0], expected_ts) - result = ts.resample("D", origin="start", offset="-1H").sum() + result = ts.resample("D", origin="start", offset="-1h").sum() tm.assert_series_equal(result, expected) expected_ts = ["2013-11-02 01:00-05:00", "2013-11-03 01:00:00-0500"] expected = _create_series([1.0, 24.0], expected_ts) - result = 
ts.resample("D", origin="start", offset="1H").sum() + result = ts.resample("D", origin="start", offset="1h").sum() tm.assert_series_equal(result, expected) def test_resample_daily_anchored(unit): - rng = date_range("1/1/2000 0:00:00", periods=10000, freq="T").as_unit(unit) + rng = date_range("1/1/2000 0:00:00", periods=10000, freq="min").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) ts[:2] = np.nan # so results are the same @@ -1012,7 +1042,9 @@ rng = date_range("1/1/2000", "12/31/2000").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - result = ts.resample("M", kind="period").mean() + msg = "The 'kind' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ts.resample("ME", kind="period").mean() exp_index = period_range("Jan-2000", "Dec-2000", freq="M") tm.assert_index_equal(result.index, exp_index) @@ -1021,12 +1053,15 @@ # aggregate a period resampler with a lambda s2 = Series( np.random.default_rng(2).integers(0, 5, 50), - index=period_range("2012-01-01", freq="H", periods=50), + index=period_range("2012-01-01", freq="h", periods=50), dtype="float64", ) expected = s2.to_timestamp().resample("D").mean().to_period() - result = s2.resample("D").agg(lambda x: x.mean()) + msg = "Resampling with a PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = s2.resample("D") + result = rs.agg(lambda x: x.mean()) tm.assert_series_equal(result, expected) @@ -1044,8 +1079,12 @@ all_wins_and_wagers, columns=("ID", "timestamp", "A", "B") ).set_index("timestamp") df.index = df.index.as_unit(unit) - result = df.groupby("ID").resample("5min").sum() - expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("ID").resample("5min").sum() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) tm.assert_frame_equal(result, expected) @@ -1064,7 +1103,9 @@ result = df.resample("1D").ffill() assert result.val.dtype == np.int32 - result = df.groupby("group").resample("1D").ffill() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 @@ -1075,12 +1116,12 @@ df = {"a": [1, 3, 1, 4]} df = DataFrame(df, index=date_range("2017-01-01", "2017-01-04").as_unit(unit)) - expected = df.astype("float64").resample("H").mean()["a"].interpolate("cubic") + expected = df.astype("float64").resample("h").mean()["a"].interpolate("cubic") - result = df.resample("H")["a"].mean().interpolate("cubic") + result = df.resample("h")["a"].mean().interpolate("cubic") tm.assert_series_equal(result, expected) - result = df.resample("H").mean()["a"].interpolate("cubic") + result = df.resample("h").mean()["a"].interpolate("cubic") tm.assert_series_equal(result, expected) @@ -1099,7 +1140,7 @@ dates = date_range("4/16/2012 20:00", periods=5000, freq="h").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(dates)), index=dates) # it works! 
- ts.resample("M") + ts.resample("ME") def test_nanosecond_resample_error(): @@ -1107,66 +1148,80 @@ # Resampling using pd.tseries.offsets.Nano as period start = 1443707890427 exp_start = 1443707890400 - indx = date_range(start=pd.to_datetime(start), periods=10, freq="100n") + indx = date_range(start=pd.to_datetime(start), periods=10, freq="100ns") ts = Series(range(len(indx)), index=indx) r = ts.resample(pd.tseries.offsets.Nano(100)) result = r.agg("mean") - exp_indx = date_range(start=pd.to_datetime(exp_start), periods=10, freq="100n") + exp_indx = date_range(start=pd.to_datetime(exp_start), periods=10, freq="100ns") exp = Series(range(len(exp_indx)), index=exp_indx, dtype=float) tm.assert_series_equal(result, exp) -def test_resample_anchored_intraday(simple_date_range_series, unit): +def test_resample_anchored_intraday(unit): # #1471, #1458 rng = date_range("1/1/2012", "4/1/2012", freq="100min").as_unit(unit) df = DataFrame(rng.month, index=rng) - result = df.resample("M").mean() - expected = df.resample("M", kind="period").mean().to_timestamp(how="end") + result = df.resample("ME").mean() + msg = "The 'kind' keyword in DataFrame.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.resample("ME", kind="period").mean().to_timestamp(how="end") expected.index += Timedelta(1, "ns") - Timedelta(1, "D") expected.index = expected.index.as_unit(unit)._with_freq("infer") - assert expected.index.freq == "M" + assert expected.index.freq == "ME" tm.assert_frame_equal(result, expected) - result = df.resample("M", closed="left").mean() - exp = df.shift(1, freq="D").resample("M", kind="period").mean() + result = df.resample("ME", closed="left").mean() + msg = "The 'kind' keyword in DataFrame.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + exp = df.shift(1, freq="D").resample("ME", kind="period").mean() exp = exp.to_timestamp(how="end") exp.index = exp.index + Timedelta(1, "ns") - Timedelta(1, "D") exp.index = exp.index.as_unit(unit)._with_freq("infer") - assert exp.index.freq == "M" + assert exp.index.freq == "ME" tm.assert_frame_equal(result, exp) + +def test_resample_anchored_intraday2(unit): rng = date_range("1/1/2012", "4/1/2012", freq="100min").as_unit(unit) df = DataFrame(rng.month, index=rng) - result = df.resample("Q").mean() - expected = df.resample("Q", kind="period").mean().to_timestamp(how="end") + result = df.resample("QE").mean() + msg = "The 'kind' keyword in DataFrame.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = df.resample("QE", kind="period").mean().to_timestamp(how="end") expected.index += Timedelta(1, "ns") - Timedelta(1, "D") - expected.index._data.freq = "Q" + expected.index._data.freq = "QE" expected.index._freq = lib.no_default expected.index = expected.index.as_unit(unit) tm.assert_frame_equal(result, expected) - result = df.resample("Q", closed="left").mean() - expected = df.shift(1, freq="D").resample("Q", kind="period", closed="left").mean() + result = df.resample("QE", closed="left").mean() + msg = "The 'kind' keyword in DataFrame.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = ( + df.shift(1, freq="D").resample("QE", kind="period", closed="left").mean() + ) expected = expected.to_timestamp(how="end") expected.index += Timedelta(1, "ns") - Timedelta(1, "D") - expected.index._data.freq = "Q" + expected.index._data.freq = "QE" expected.index._freq = lib.no_default expected.index = 
expected.index.as_unit(unit) tm.assert_frame_equal(result, expected) + +def test_resample_anchored_intraday3(simple_date_range_series, unit): ts = simple_date_range_series("2012-04-29 23:00", "2012-04-30 5:00", freq="h") ts.index = ts.index.as_unit(unit) - resampled = ts.resample("M").mean() + resampled = ts.resample("ME").mean() assert len(resampled) == 1 -@pytest.mark.parametrize("freq", ["MS", "BMS", "QS-MAR", "AS-DEC", "AS-JUN"]) +@pytest.mark.parametrize("freq", ["MS", "BMS", "QS-MAR", "YS-DEC", "YS-JUN"]) def test_resample_anchored_monthstart(simple_date_range_series, freq, unit): ts = simple_date_range_series("1/1/2000", "12/31/2002") ts.index = ts.index.as_unit(unit) @@ -1181,41 +1236,35 @@ # # See: https://github.com/pandas-dev/pandas/issues/8683 - index1 = date_range("2014-10-14 23:06:23.206", periods=3, freq="400L") - index2 = date_range("2014-10-15 23:00:00", periods=2, freq="2200L") + index1 = date_range("2014-10-14 23:06:23.206", periods=3, freq="400ms") + index2 = date_range("2014-10-15 23:00:00", periods=2, freq="2200ms") index = index1.union(index2) s = Series(np.random.default_rng(2).standard_normal(5), index=index) # Ensure left closing works - result = s.resample("2200L", label=label).mean() + result = s.resample("2200ms", label=label).mean() assert result.index[-1] == Timestamp(f"2014-10-15 23:00:{sec}00") def test_corner_cases(unit): # miscellaneous test coverage - rng = date_range("1/1/2000", periods=12, freq="t").as_unit(unit) + rng = date_range("1/1/2000", periods=12, freq="min").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - result = ts.resample("5t", closed="right", label="left").mean() - ex_index = date_range("1999-12-31 23:55", periods=4, freq="5t").as_unit(unit) + result = ts.resample("5min", closed="right", label="left").mean() + ex_index = date_range("1999-12-31 23:55", periods=4, freq="5min").as_unit(unit) tm.assert_index_equal(result.index, ex_index) -def test_corner_cases_period(simple_period_range_series): - # miscellaneous test coverage - len0pts = simple_period_range_series("2007-01", "2010-05", freq="M")[:0] - # it works - result = len0pts.resample("A-DEC").mean() - assert len(result) == 0 - - def test_corner_cases_date(simple_date_range_series, unit): # resample to periods ts = simple_date_range_series("2000-04-28", "2000-04-30 11:00", freq="h") ts.index = ts.index.as_unit(unit) - result = ts.resample("M", kind="period").mean() + msg = "The 'kind' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ts.resample("ME", kind="period").mean() assert len(result) == 1 assert result.index[0] == Period("2000-04", freq="M") @@ -1262,19 +1311,23 @@ ), ], ) -def test_resample_median_bug_1688(dtype): +def test_resample_median_bug_1688(dtype, unit): + # GH#55958 + dti = DatetimeIndex( + [datetime(2012, 1, 1, 0, 0, 0), datetime(2012, 1, 1, 0, 5, 0)] + ).as_unit(unit) df = DataFrame( [1, 2], - index=[datetime(2012, 1, 1, 0, 0, 0), datetime(2012, 1, 1, 0, 5, 0)], + index=dti, dtype=dtype, ) - result = df.resample("T").apply(lambda x: x.mean()) - exp = df.asfreq("T") + result = df.resample("min").apply(lambda x: x.mean()) + exp = df.asfreq("min") tm.assert_frame_equal(result, exp) - result = df.resample("T").median() - exp = df.asfreq("T") + result = df.resample("min").median() + exp = df.asfreq("min") tm.assert_frame_equal(result, exp) @@ -1282,23 +1335,23 @@ ts = simple_date_range_series("1/1/2000", "4/1/2000") ts.index = ts.index.as_unit(unit) - result = 
ts.resample("M").apply(lambda x: x.mean()) - exp = ts.resample("M").mean() + result = ts.resample("ME").apply(lambda x: x.mean()) + exp = ts.resample("ME").mean() tm.assert_series_equal(result, exp) - foo_exp = ts.resample("M").mean() + foo_exp = ts.resample("ME").mean() foo_exp.name = "foo" - bar_exp = ts.resample("M").std() + bar_exp = ts.resample("ME").std() bar_exp.name = "bar" - result = ts.resample("M").apply([lambda x: x.mean(), lambda x: x.std(ddof=1)]) + result = ts.resample("ME").apply([lambda x: x.mean(), lambda x: x.std(ddof=1)]) result.columns = ["foo", "bar"] tm.assert_series_equal(result["foo"], foo_exp) tm.assert_series_equal(result["bar"], bar_exp) # this is a MI Series, so comparing the names of the results # doesn't make sense - result = ts.resample("M").aggregate( + result = ts.resample("ME").aggregate( {"foo": lambda x: x.mean(), "bar": lambda x: x.std(ddof=1)} ) tm.assert_series_equal(result["foo"], foo_exp, check_names=False) @@ -1314,19 +1367,19 @@ df = DataFrame({"close": 1}, index=bad_ind) # it works! - df.resample("AS").sum() + df.resample("YS").sum() def test_resample_consistency(unit): # GH 6418 # resample with bfill / limit / reindex consistency - i30 = date_range("2002-02-02", periods=4, freq="30T").as_unit(unit) + i30 = date_range("2002-02-02", periods=4, freq="30min").as_unit(unit) s = Series(np.arange(4.0), index=i30) s.iloc[2] = np.nan # Upsample by factor 3 with reindex() and resample() methods: - i10 = date_range(i30[0], i30[-1], freq="10T").as_unit(unit) + i10 = date_range(i30[0], i30[-1], freq="10min").as_unit(unit) s10 = s.reindex(index=i10, method="bfill") s10_2 = s.reindex(index=i10, method="bfill", limit=2) @@ -1356,25 +1409,37 @@ @pytest.mark.parametrize("dates", [dates1, dates2, dates3]) -def test_resample_timegrouper(dates): +def test_resample_timegrouper(dates, unit): # GH 7227 + dates = DatetimeIndex(dates).as_unit(unit) df = DataFrame({"A": dates, "B": np.arange(len(dates))}) - result = df.set_index("A").resample("M").count() + result = df.set_index("A").resample("ME").count() exp_idx = DatetimeIndex( ["2014-07-31", "2014-08-31", "2014-09-30", "2014-10-31", "2014-11-30"], - freq="M", + freq="ME", name="A", - ) + ).as_unit(unit) expected = DataFrame({"B": [1, 0, 2, 2, 1]}, index=exp_idx) if df["A"].isna().any(): expected.index = expected.index._with_freq(None) tm.assert_frame_equal(result, expected) - result = df.groupby(Grouper(freq="M", key="A")).count() + result = df.groupby(Grouper(freq="ME", key="A")).count() tm.assert_frame_equal(result, expected) + +@pytest.mark.parametrize("dates", [dates1, dates2, dates3]) +def test_resample_timegrouper2(dates, unit): + dates = DatetimeIndex(dates).as_unit(unit) + df = DataFrame({"A": dates, "B": np.arange(len(dates)), "C": np.arange(len(dates))}) - result = df.set_index("A").resample("M").count() + result = df.set_index("A").resample("ME").count() + + exp_idx = DatetimeIndex( + ["2014-07-31", "2014-08-31", "2014-09-30", "2014-10-31", "2014-11-30"], + freq="ME", + name="A", + ).as_unit(unit) expected = DataFrame( {"B": [1, 0, 2, 2, 1], "C": [1, 0, 2, 2, 1]}, index=exp_idx, @@ -1384,7 +1449,7 @@ expected.index = expected.index._with_freq(None) tm.assert_frame_equal(result, expected) - result = df.groupby(Grouper(freq="M", key="A")).count() + result = df.groupby(Grouper(freq="ME", key="A")).count() tm.assert_frame_equal(result, expected) @@ -1421,7 +1486,11 @@ def test_resample_nunique_preserves_column_level_names(unit): # see gh-23222 - df = tm.makeTimeDataFrame(freq="1D").abs() + df = DataFrame( + 
np.random.default_rng(2).standard_normal((5, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=5, freq="D"), + ).abs() df.index = df.index.as_unit(unit) df.columns = pd.MultiIndex.from_arrays( [df.columns.tolist()] * 2, names=["lev0", "lev1"] @@ -1446,7 +1515,7 @@ index2 = date_range("4-15-2000", "5-15-2000", freq="h").as_unit(unit) index3 = index.append(index2) s = Series(range(len(index3)), index=index3, dtype="int64") - r = s.resample("M") + r = s.resample("ME") result = r.count() expected = func(r) tm.assert_series_equal(result, expected) @@ -1460,11 +1529,13 @@ # use a fixed seed to always have the same uniques prng = np.random.default_rng(2) - dr = date_range(start="2015-08-27", periods=n // 10, freq="T").as_unit(unit) + dr = date_range(start="2015-08-27", periods=n // 10, freq="min").as_unit(unit) ts = Series(prng.integers(0, n // k, n).astype("int64"), index=prng.choice(dr, n)) - left = ts.resample("30T").nunique() - ix = date_range(start=ts.index.min(), end=ts.index.max(), freq="30T").as_unit(unit) + left = ts.resample("30min").nunique() + ix = date_range(start=ts.index.min(), end=ts.index.max(), freq="30min").as_unit( + unit + ) vals = ts.values bins = np.searchsorted(ix.values, ts.index, side="right") @@ -1483,14 +1554,16 @@ def test_resample_size(unit): n = 10000 - dr = date_range("2015-09-19", periods=n, freq="T").as_unit(unit) + dr = date_range("2015-09-19", periods=n, freq="min").as_unit(unit) ts = Series( np.random.default_rng(2).standard_normal(n), index=np.random.default_rng(2).choice(dr, n), ) - left = ts.resample("7T").size() - ix = date_range(start=left.index.min(), end=ts.index.max(), freq="7T").as_unit(unit) + left = ts.resample("7min").size() + ix = date_range(start=left.index.min(), end=ts.index.max(), freq="7min").as_unit( + unit + ) bins = np.searchsorted(ix.values, ts.index.values, side="right") val = np.bincount(bins, minlength=len(ix) + 1)[1:].astype("int64", copy=False) @@ -1521,11 +1594,11 @@ pd.to_datetime(df2.ts, unit="s") .dt.tz_localize("UTC") .dt.tz_convert("Europe/Madrid"), - freq="H", + freq="h", ) df = DataFrame([5, 5], index=dti1) - result = df.resample(rule="H").sum() + result = df.resample(rule="h").sum() expected = DataFrame([5, 5], index=dti2) tm.assert_frame_equal(result, expected) @@ -1569,6 +1642,8 @@ ), ) + +def test_resample_dst_anchor2(unit): dti = date_range( "2013-09-30", "2013-11-02", freq="30Min", tz="Europe/Paris" ).as_unit(unit) @@ -1576,73 +1651,86 @@ df = DataFrame({"a": values, "b": values, "c": values}, index=dti, dtype="int64") how = {"a": "min", "b": "max", "c": "count"} + rs = df.resample("W-MON") + result = rs.agg(how)[["a", "b", "c"]] + expected = DataFrame( + { + "a": [0, 48, 384, 720, 1056, 1394], + "b": [47, 383, 719, 1055, 1393, 1586], + "c": [48, 336, 336, 336, 338, 193], + }, + index=date_range( + "9/30/2013", "11/4/2013", freq="W-MON", tz="Europe/Paris" + ).as_unit(unit), + ) tm.assert_frame_equal( - df.resample("W-MON").agg(how)[["a", "b", "c"]], - DataFrame( - { - "a": [0, 48, 384, 720, 1056, 1394], - "b": [47, 383, 719, 1055, 1393, 1586], - "c": [48, 336, 336, 336, 338, 193], - }, - index=date_range( - "9/30/2013", "11/4/2013", freq="W-MON", tz="Europe/Paris" - ).as_unit(unit), - ), + result, + expected, "W-MON Frequency", ) + rs2 = df.resample("2W-MON") + result2 = rs2.agg(how)[["a", "b", "c"]] + expected2 = DataFrame( + { + "a": [0, 48, 720, 1394], + "b": [47, 719, 1393, 1586], + "c": [48, 672, 674, 193], + }, + index=date_range( + "9/30/2013", "11/11/2013", freq="2W-MON", 
tz="Europe/Paris" + ).as_unit(unit), + ) tm.assert_frame_equal( - df.resample("2W-MON").agg(how)[["a", "b", "c"]], - DataFrame( - { - "a": [0, 48, 720, 1394], - "b": [47, 719, 1393, 1586], - "c": [48, 672, 674, 193], - }, - index=date_range( - "9/30/2013", "11/11/2013", freq="2W-MON", tz="Europe/Paris" - ).as_unit(unit), - ), + result2, + expected2, "2W-MON Frequency", ) - tm.assert_frame_equal( - df.resample("MS").agg(how)[["a", "b", "c"]], - DataFrame( - {"a": [0, 48, 1538], "b": [47, 1537, 1586], "c": [48, 1490, 49]}, - index=date_range( - "9/1/2013", "11/1/2013", freq="MS", tz="Europe/Paris" - ).as_unit(unit), + rs3 = df.resample("MS") + result3 = rs3.agg(how)[["a", "b", "c"]] + expected3 = DataFrame( + {"a": [0, 48, 1538], "b": [47, 1537, 1586], "c": [48, 1490, 49]}, + index=date_range("9/1/2013", "11/1/2013", freq="MS", tz="Europe/Paris").as_unit( + unit ), + ) + tm.assert_frame_equal( + result3, + expected3, "MS Frequency", ) + rs4 = df.resample("2MS") + result4 = rs4.agg(how)[["a", "b", "c"]] + expected4 = DataFrame( + {"a": [0, 1538], "b": [1537, 1586], "c": [1538, 49]}, + index=date_range( + "9/1/2013", "11/1/2013", freq="2MS", tz="Europe/Paris" + ).as_unit(unit), + ) tm.assert_frame_equal( - df.resample("2MS").agg(how)[["a", "b", "c"]], - DataFrame( - {"a": [0, 1538], "b": [1537, 1586], "c": [1538, 49]}, - index=date_range( - "9/1/2013", "11/1/2013", freq="2MS", tz="Europe/Paris" - ).as_unit(unit), - ), + result4, + expected4, "2MS Frequency", ) df_daily = df["10/26/2013":"10/29/2013"] + rs_d = df_daily.resample("D") + result_d = rs_d.agg({"a": "min", "b": "max", "c": "count"})[["a", "b", "c"]] + expected_d = DataFrame( + { + "a": [1248, 1296, 1346, 1394], + "b": [1295, 1345, 1393, 1441], + "c": [48, 50, 48, 48], + }, + index=date_range( + "10/26/2013", "10/29/2013", freq="D", tz="Europe/Paris" + ).as_unit(unit), + ) tm.assert_frame_equal( - df_daily.resample("D").agg({"a": "min", "b": "max", "c": "count"})[ - ["a", "b", "c"] - ], - DataFrame( - { - "a": [1248, 1296, 1346, 1394], - "b": [1295, 1345, 1393, 1441], - "c": [48, 50, 48, 48], - }, - index=date_range( - "10/26/2013", "10/29/2013", freq="D", tz="Europe/Paris" - ).as_unit(unit), - ), + result_d, + expected_d, "D Frequency", ) @@ -1651,11 +1739,11 @@ # GH 8531 tz = pytz.timezone("Europe/Berlin") dt = datetime(2014, 10, 26) - dates = date_range(tz.localize(dt), periods=4, freq="2H").as_unit(unit) - result = Series(5, index=dates).resample("H").mean() + dates = date_range(tz.localize(dt), periods=4, freq="2h").as_unit(unit) + result = Series(5, index=dates).resample("h").mean() expected = Series( [5.0, np.nan] * 3 + [5.0], - index=date_range(tz.localize(dt), periods=7, freq="H").as_unit(unit), + index=date_range(tz.localize(dt), periods=7, freq="h").as_unit(unit), ) tm.assert_series_equal(result, expected) @@ -1681,7 +1769,7 @@ def test_downsample_across_dst_weekly_2(unit): # GH 9119, GH 21459 - idx = date_range("2013-04-01", "2013-05-01", tz="Europe/London", freq="H").as_unit( + idx = date_range("2013-04-01", "2013-05-01", tz="Europe/London", freq="h").as_unit( unit ) s = Series(index=idx, dtype=np.float64) @@ -1699,7 +1787,7 @@ # GH 25758 start = datetime(2018, 11, 3, 12) end = datetime(2018, 11, 5, 12) - index = date_range(start, end, freq="1H").as_unit(unit) + index = date_range(start, end, freq="1h").as_unit(unit) index = index.tz_localize("UTC").tz_convert("America/Havana") data = list(range(len(index))) dataframe = DataFrame(data, index=index) @@ -1723,9 +1811,8 @@ "1970-01-01 00:00:01", "1970-01-01 00:00:02", ] - ) 
+ ).as_unit(unit) frame = DataFrame([2, 3, 5, 7, 11], index=index) - frame.index = frame.index.as_unit(unit) index_1s = DatetimeIndex( ["1970-01-01 00:00:00", "1970-01-01 00:00:01", "1970-01-01 00:00:02"] @@ -1784,10 +1871,21 @@ expected = series.resample("D").mean().multiply(multiplier) tm.assert_series_equal(result, expected) + +def test_resample_apply_with_additional_args2(): # Testing dataframe + def f(data, add_arg): + return np.mean(data) * add_arg + + multiplier = 10 + df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) - result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) - expected = df.groupby("A").resample("D").mean().multiply(multiplier) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) @@ -1795,23 +1893,23 @@ @pytest.mark.parametrize( "n1, freq1, n2, freq2", [ - (30, "S", 0.5, "Min"), - (60, "S", 1, "Min"), - (3600, "S", 1, "H"), - (60, "Min", 1, "H"), - (21600, "S", 0.25, "D"), - (86400, "S", 1, "D"), - (43200, "S", 0.5, "D"), + (30, "s", 0.5, "Min"), + (60, "s", 1, "Min"), + (3600, "s", 1, "h"), + (60, "Min", 1, "h"), + (21600, "s", 0.25, "D"), + (86400, "s", 1, "D"), + (43200, "s", 0.5, "D"), (1440, "Min", 1, "D"), - (12, "H", 0.5, "D"), - (24, "H", 1, "D"), + (12, "h", 0.5, "D"), + (24, "h", 1, "D"), ], ) def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k, unit): # GH 24127 n1_ = n1 * k n2_ = n2 * k - dti = date_range("19910905 13:00", "19911005 07:00", freq=freq1).as_unit(unit) + dti = date_range("1991-09-05", "1991-09-12", freq=freq1).as_unit(unit) ser = Series(range(len(dti)), index=dti) result1 = ser.resample(str(n1_) + freq1).mean() @@ -1824,10 +1922,10 @@ [ ("19910905", "19920406", "D", "19910905", "19920407"), ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920407"), - ("19910905 06:00", "19920406 06:00", "H", "19910905 06:00", "19920406 07:00"), - ("19910906", "19920406", "M", "19910831", "19920430"), - ("19910831", "19920430", "M", "19910831", "19920531"), - ("1991-08", "1992-04", "M", "19910831", "19920531"), + ("19910905 06:00", "19920406 06:00", "h", "19910905 06:00", "19920406 07:00"), + ("19910906", "19920406", "ME", "19910831", "19920430"), + ("19910831", "19920430", "ME", "19910831", "19920531"), + ("1991-08", "1992-04", "ME", "19910831", "19920531"), ], ) def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last, unit): @@ -1848,7 +1946,7 @@ @pytest.mark.parametrize("duplicates", [True, False]) def test_resample_apply_product(duplicates, unit): # GH 5586 - index = date_range(start="2012-01-31", freq="M", periods=12).as_unit(unit) + index = date_range(start="2012-01-31", freq="ME", periods=12).as_unit(unit) ts = Series(range(12), index=index) df = DataFrame({"A": ts, "B": ts + 2}) @@ -1857,11 +1955,11 @@ msg = "using DatetimeIndexResampler.prod" with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.resample("Q").apply(np.prod) + result = df.resample("QE").apply(np.prod) expected = DataFrame( np.array([[0, 24], [60, 210], [336, 720], [990, 1716]], dtype=np.int64), index=DatetimeIndex( - ["2012-03-31", "2012-06-30", "2012-09-30", "2012-12-31"], freq="Q-DEC" + 
["2012-03-31", "2012-06-30", "2012-09-30", "2012-12-31"], freq="QE-DEC" ).as_unit(unit), columns=df.columns, ) @@ -1875,32 +1973,32 @@ "2020-03-28", "2020-03-31", "D", - "24H", + "24h", "2020-03-30 01:00", ), # includes transition into DST ( "2020-03-28", "2020-10-27", "D", - "24H", + "24h", "2020-10-27 00:00", ), # includes transition into and out of DST ( "2020-10-25", "2020-10-27", "D", - "24H", + "24h", "2020-10-26 23:00", ), # includes transition out of DST ( "2020-03-28", "2020-03-31", - "24H", + "24h", "D", "2020-03-30 00:00", ), # same as above, but from 24H to D - ("2020-03-28", "2020-10-27", "24H", "D", "2020-10-27 00:00"), - ("2020-10-25", "2020-10-27", "24H", "D", "2020-10-26 00:00"), + ("2020-03-28", "2020-10-27", "24h", "D", "2020-10-27 00:00"), + ("2020-10-25", "2020-10-27", "24h", "D", "2020-10-26 00:00"), ], ) def test_resample_calendar_day_with_dst( @@ -1921,12 +2019,12 @@ @pytest.mark.parametrize("func", ["min", "max", "first", "last"]) def test_resample_aggregate_functions_min_count(func, unit): # GH#37768 - index = date_range(start="2020", freq="M", periods=3).as_unit(unit) + index = date_range(start="2020", freq="ME", periods=3).as_unit(unit) ser = Series([1, np.nan, np.nan], index) - result = getattr(ser.resample("Q"), func)(min_count=2) + result = getattr(ser.resample("QE"), func)(min_count=2) expected = Series( [np.nan], - index=DatetimeIndex(["2020-03-31"], freq="Q-DEC").as_unit(unit), + index=DatetimeIndex(["2020-03-31"], freq="QE-DEC").as_unit(unit), ) tm.assert_series_equal(result, expected) @@ -1934,7 +2032,7 @@ def test_resample_unsigned_int(any_unsigned_int_numpy_dtype, unit): # gh-43329 df = DataFrame( - index=date_range(start="2000-01-01", end="2000-01-03 23", freq="12H").as_unit( + index=date_range(start="2000-01-01", end="2000-01-03 23", freq="12h").as_unit( unit ), columns=["x"], @@ -1957,9 +2055,9 @@ def test_long_rule_non_nano(): # https://github.com/pandas-dev/pandas/issues/51024 - idx = date_range("0300-01-01", "2000-01-01", unit="s", freq="100Y") + idx = date_range("0300-01-01", "2000-01-01", unit="s", freq="100YE") ser = Series([1, 4, 2, 8, 5, 7, 1, 4, 2, 8, 5, 7, 1, 4, 2, 8, 5], index=idx) - result = ser.resample("200Y").mean() + result = ser.resample("200YE").mean() expected_idx = DatetimeIndex( np.array( [ @@ -1974,7 +2072,7 @@ "1900-12-31", ] ).astype("datetime64[s]"), - freq="200A-DEC", + freq="200YE-DEC", ) expected = Series([1.0, 3.0, 6.5, 4.0, 3.0, 6.5, 4.0, 3.0, 6.5], index=expected_idx) tm.assert_series_equal(result, expected) @@ -1994,3 +2092,140 @@ ) expected = Series([], index=expected_idx, name="values", dtype="float64") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "freq, freq_depr", + [ + ("2ME", "2M"), + ("2QE", "2Q"), + ("2QE-SEP", "2Q-SEP"), + ("1YE", "1Y"), + ("2YE-MAR", "2Y-MAR"), + ("1YE", "1A"), + ("2YE-MAR", "2A-MAR"), + ], +) +def test_resample_M_Q_Y_A_deprecated(freq, freq_depr): + # GH#9586 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + f"in a future version, please use '{freq[1:]}' instead." 
+ + s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) + expected = s.resample(freq).mean() + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = s.resample(freq_depr).mean() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "freq, freq_depr", + [ + ("2BME", "2BM"), + ("2BQE", "2BQ"), + ("2BQE-MAR", "2BQ-MAR"), + ], +) +def test_resample_BM_BQ_deprecated(freq, freq_depr): + # GH#52064 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed " + f"in a future version, please use '{freq[1:]}' instead." + + s = Series(range(10), index=date_range("20130101", freq="d", periods=10)) + expected = s.resample(freq).mean() + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = s.resample(freq_depr).mean() + tm.assert_series_equal(result, expected) + + +def test_resample_ms_closed_right(unit): + # https://github.com/pandas-dev/pandas/issues/55271 + dti = date_range(start="2020-01-31", freq="1min", periods=6000, unit=unit) + df = DataFrame({"ts": dti}, index=dti) + grouped = df.resample("MS", closed="right") + result = grouped.last() + exp_dti = DatetimeIndex( + [datetime(2020, 1, 1), datetime(2020, 2, 1)], freq="MS" + ).as_unit(unit) + expected = DataFrame( + {"ts": [datetime(2020, 2, 1), datetime(2020, 2, 4, 3, 59)]}, + index=exp_dti, + ).astype(f"M8[{unit}]") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("freq", ["B", "C"]) +def test_resample_c_b_closed_right(freq: str, unit): + # https://github.com/pandas-dev/pandas/issues/55281 + dti = date_range(start="2020-01-31", freq="1min", periods=6000, unit=unit) + df = DataFrame({"ts": dti}, index=dti) + grouped = df.resample(freq, closed="right") + result = grouped.last() + + exp_dti = DatetimeIndex( + [ + datetime(2020, 1, 30), + datetime(2020, 1, 31), + datetime(2020, 2, 3), + datetime(2020, 2, 4), + ], + freq=freq, + ).as_unit(unit) + expected = DataFrame( + { + "ts": [ + datetime(2020, 1, 31), + datetime(2020, 2, 3), + datetime(2020, 2, 4), + datetime(2020, 2, 4, 3, 59), + ] + }, + index=exp_dti, + ).astype(f"M8[{unit}]") + tm.assert_frame_equal(result, expected) + + +def test_resample_b_55282(unit): + # https://github.com/pandas-dev/pandas/issues/55282 + dti = date_range("2023-09-26", periods=6, freq="12h", unit=unit) + ser = Series([1, 2, 3, 4, 5, 6], index=dti) + result = ser.resample("B", closed="right", label="right").mean() + + exp_dti = DatetimeIndex( + [ + datetime(2023, 9, 26), + datetime(2023, 9, 27), + datetime(2023, 9, 28), + datetime(2023, 9, 29), + ], + freq="B", + ).as_unit(unit) + expected = Series( + [1.0, 2.5, 4.5, 6.0], + index=exp_dti, + ) + tm.assert_series_equal(result, expected) + + +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize( + "tz", + [ + None, + pytest.param( + "UTC", + marks=pytest.mark.xfail( + condition=is_platform_windows(), + reason="TODO: Set ARROW_TIMEZONE_DATABASE env var in CI", + ), + ), + ], +) +def test_arrow_timestamp_resample(tz): + # GH 56371 + idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]") + if tz is not None: + idx = idx.dt.tz_localize(tz) + expected = Series(np.arange(5, dtype=np.float64), index=idx) + result = expected.resample("1D").mean() + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/resample/test_period_index.py pandas-2.2.2+dfsg/pandas/tests/resample/test_period_index.py --- pandas-2.1.4+dfsg/pandas/tests/resample/test_period_index.py 2023-12-08 14:17:35.000000000 +0000 +++ 
pandas-2.2.2+dfsg/pandas/tests/resample/test_period_index.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,5 @@ from datetime import datetime +import warnings import dateutil import numpy as np @@ -29,6 +30,10 @@ from pandas.tseries import offsets +pytestmark = pytest.mark.filterwarnings( + "ignore:Resampling with a PeriodIndex is deprecated:FutureWarning" +) + @pytest.fixture() def _index_factory(): @@ -40,8 +45,29 @@ return "pi" +@pytest.fixture +def simple_period_range_series(): + """ + Series with period range index and random data for test purposes. + """ + + def _simple_period_range_series(start, end, freq="D"): + with warnings.catch_warnings(): + # suppress Period[B] deprecation warning + msg = "|".join(["Period with BDay freq", r"PeriodDtype\[B\] is deprecated"]) + warnings.filterwarnings( + "ignore", + msg, + category=FutureWarning, + ) + rng = period_range(start, end, freq=freq) + return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + + return _simple_period_range_series + + class TestPeriodIndex: - @pytest.mark.parametrize("freq", ["2D", "1H", "2H"]) + @pytest.mark.parametrize("freq", ["2D", "1h", "2h"]) @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) def test_asfreq(self, series_and_frame, freq, kind): # GH 12884, 15944 @@ -55,7 +81,9 @@ end = (obj.index[-1] + obj.index.freq).to_timestamp(how="start") new_index = date_range(start=start, end=end, freq=freq, inclusive="left") expected = obj.to_timestamp().reindex(new_index).to_period(freq) - result = obj.resample(freq, kind=kind).asfreq() + msg = "The 'kind' keyword in (Series|DataFrame).resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = obj.resample(freq, kind=kind).asfreq() tm.assert_almost_equal(result, expected) def test_asfreq_fill_value(self, series): @@ -65,23 +93,27 @@ new_index = date_range( s.index[0].to_timestamp(how="start"), (s.index[-1]).to_timestamp(how="start"), - freq="1H", + freq="1h", ) expected = s.to_timestamp().reindex(new_index, fill_value=4.0) - result = s.resample("1H", kind="timestamp").asfreq(fill_value=4.0) + msg = "The 'kind' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.resample("1h", kind="timestamp").asfreq(fill_value=4.0) tm.assert_series_equal(result, expected) frame = s.to_frame("value") new_index = date_range( frame.index[0].to_timestamp(how="start"), (frame.index[-1]).to_timestamp(how="start"), - freq="1H", + freq="1h", ) expected = frame.to_timestamp().reindex(new_index, fill_value=3.0) - result = frame.resample("1H", kind="timestamp").asfreq(fill_value=3.0) + msg = "The 'kind' keyword in DataFrame.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = frame.resample("1h", kind="timestamp").asfreq(fill_value=3.0) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("freq", ["H", "12H", "2D", "W"]) + @pytest.mark.parametrize("freq", ["h", "12h", "2D", "W"]) @pytest.mark.parametrize("kind", [None, "period", "timestamp"]) @pytest.mark.parametrize("kwargs", [{"on": "date"}, {"level": "d"}]) def test_selection(self, index, freq, kind, kwargs): @@ -97,42 +129,49 @@ r"not currently supported, use \.set_index\(\.\.\.\) to " "explicitly set index" ) + depr_msg = "The 'kind' keyword in DataFrame.resample is deprecated" with pytest.raises(NotImplementedError, match=msg): - df.resample(freq, kind=kind, **kwargs) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + df.resample(freq, 
kind=kind, **kwargs) @pytest.mark.parametrize("month", MONTHS) @pytest.mark.parametrize("meth", ["ffill", "bfill"]) @pytest.mark.parametrize("conv", ["start", "end"]) - @pytest.mark.parametrize("targ", ["D", "B", "M"]) + @pytest.mark.parametrize( + ("offset", "period"), [("D", "D"), ("B", "B"), ("ME", "M"), ("QE", "Q")] + ) def test_annual_upsample_cases( - self, targ, conv, meth, month, simple_period_range_series + self, offset, period, conv, meth, month, simple_period_range_series ): - ts = simple_period_range_series("1/1/1990", "12/31/1991", freq=f"A-{month}") - warn = FutureWarning if targ == "B" else None + ts = simple_period_range_series("1/1/1990", "12/31/1991", freq=f"Y-{month}") + warn = FutureWarning if period == "B" else None msg = r"PeriodDtype\[B\] is deprecated" + if warn is None: + msg = "Resampling with a PeriodIndex is deprecated" + warn = FutureWarning with tm.assert_produces_warning(warn, match=msg): - result = getattr(ts.resample(targ, convention=conv), meth)() - expected = result.to_timestamp(targ, how=conv) - expected = expected.asfreq(targ, meth).to_period() + result = getattr(ts.resample(period, convention=conv), meth)() + expected = result.to_timestamp(period, how=conv) + expected = expected.asfreq(offset, meth).to_period() tm.assert_series_equal(result, expected) def test_basic_downsample(self, simple_period_range_series): ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M") - result = ts.resample("a-dec").mean() + result = ts.resample("Y-DEC").mean() expected = ts.groupby(ts.index.year).mean() - expected.index = period_range("1/1/1990", "6/30/1995", freq="a-dec") + expected.index = period_range("1/1/1990", "6/30/1995", freq="Y-DEC") tm.assert_series_equal(result, expected) # this is ok - tm.assert_series_equal(ts.resample("a-dec").mean(), result) - tm.assert_series_equal(ts.resample("a").mean(), result) + tm.assert_series_equal(ts.resample("Y-DEC").mean(), result) + tm.assert_series_equal(ts.resample("Y").mean(), result) @pytest.mark.parametrize( "rule,expected_error_msg", [ - ("a-dec", ""), - ("q-mar", ""), + ("Y-DEC", ""), + ("Q-MAR", ""), ("M", ""), ("w-thu", ""), ], @@ -150,29 +189,34 @@ @pytest.mark.parametrize("freq", ["D", "2D"]) def test_basic_upsample(self, freq, simple_period_range_series): ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M") - result = ts.resample("a-dec").mean() + result = ts.resample("Y-DEC").mean() - resampled = result.resample(freq, convention="end").ffill() + msg = "The 'convention' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + resampled = result.resample(freq, convention="end").ffill() expected = result.to_timestamp(freq, how="end") expected = expected.asfreq(freq, "ffill").to_period(freq) tm.assert_series_equal(resampled, expected) def test_upsample_with_limit(self): - rng = period_range("1/1/2000", periods=5, freq="A") + rng = period_range("1/1/2000", periods=5, freq="Y") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), rng) - result = ts.resample("M", convention="end").ffill(limit=2) + msg = "The 'convention' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ts.resample("M", convention="end").ffill(limit=2) expected = ts.asfreq("M").reindex(result.index, method="ffill", limit=2) tm.assert_series_equal(result, expected) def test_annual_upsample(self, simple_period_range_series): - ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="A-DEC") + ts = 
simple_period_range_series("1/1/1990", "12/31/1995", freq="Y-DEC") df = DataFrame({"a": ts}) rdf = df.resample("D").ffill() exp = df["a"].resample("D").ffill() tm.assert_series_equal(rdf["a"], exp) - rng = period_range("2000", "2003", freq="A-DEC") + def test_annual_upsample2(self): + rng = period_range("2000", "2003", freq="Y-DEC") ts = Series([1, 2, 3, 4], index=rng) result = ts.resample("M").ffill() @@ -182,19 +226,24 @@ tm.assert_series_equal(result, expected) @pytest.mark.parametrize("month", MONTHS) - @pytest.mark.parametrize("target", ["D", "B", "M"]) @pytest.mark.parametrize("convention", ["start", "end"]) + @pytest.mark.parametrize( + ("offset", "period"), [("D", "D"), ("B", "B"), ("ME", "M")] + ) def test_quarterly_upsample( - self, month, target, convention, simple_period_range_series + self, month, offset, period, convention, simple_period_range_series ): freq = f"Q-{month}" ts = simple_period_range_series("1/1/1990", "12/31/1995", freq=freq) - warn = FutureWarning if target == "B" else None + warn = FutureWarning if period == "B" else None msg = r"PeriodDtype\[B\] is deprecated" + if warn is None: + msg = "Resampling with a PeriodIndex is deprecated" + warn = FutureWarning with tm.assert_produces_warning(warn, match=msg): - result = ts.resample(target, convention=convention).ffill() - expected = result.to_timestamp(target, how=convention) - expected = expected.asfreq(target, "ffill").to_period() + result = ts.resample(period, convention=convention).ffill() + expected = result.to_timestamp(period, how=convention) + expected = expected.asfreq(offset, "ffill").to_period() tm.assert_series_equal(result, expected) @pytest.mark.parametrize("target", ["D", "B"]) @@ -204,6 +253,9 @@ warn = None if target == "D" else FutureWarning msg = r"PeriodDtype\[B\] is deprecated" + if warn is None: + msg = "Resampling with a PeriodIndex is deprecated" + warn = FutureWarning with tm.assert_produces_warning(warn, match=msg): result = ts.resample(target, convention=convention).ffill() expected = result.to_timestamp(target, how=convention) @@ -219,13 +271,16 @@ ) s[10:30] = np.nan index = PeriodIndex( - [Period("2013-01-01 00:00", "T"), Period("2013-01-01 00:01", "T")], + [Period("2013-01-01 00:00", "min"), Period("2013-01-01 00:01", "min")], name="idx", ) expected = Series([34.5, 79.5], index=index) - result = s.to_period().resample("T", kind="period").mean() + msg = "The 'kind' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.to_period().resample("min", kind="period").mean() tm.assert_series_equal(result, expected) - result2 = s.resample("T", kind="period").mean() + with tm.assert_produces_warning(FutureWarning, match=msg): + result2 = s.resample("min", kind="period").mean() tm.assert_series_equal(result2, expected) @pytest.mark.parametrize( @@ -254,86 +309,80 @@ "Frequency cannot be resampled to , " "as they are not sub or super periods" ) + pi = period_range(start="2000", periods=3, freq="M") + ser = Series(range(3), index=pi) + rs = ser.resample("W") with pytest.raises(IncompatibleFrequency, match=msg): - Series( - range(3), index=period_range(start="2000", periods=3, freq="M") - ).resample("W").mean() + # TODO: should this raise at the resample call instead of at the mean call? 
+ rs.mean() - def test_with_local_timezone_pytz(self): + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("America/Los_Angeles"), + dateutil.tz.gettz("America/Los_Angeles"), + ], + ) + def test_with_local_timezone(self, tz): # see gh-5430 - local_timezone = pytz.timezone("America/Los_Angeles") + local_timezone = tz start = datetime(year=2013, month=11, day=1, hour=0, minute=0, tzinfo=pytz.utc) # 1 day later end = datetime(year=2013, month=11, day=2, hour=0, minute=0, tzinfo=pytz.utc) - index = date_range(start, end, freq="H") + index = date_range(start, end, freq="h", name="idx") series = Series(1, index=index) series = series.tz_convert(local_timezone) - result = series.resample("D", kind="period").mean() + msg = "The 'kind' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = series.resample("D", kind="period").mean() # Create the expected series # Index is moved back a day with the timezone conversion from UTC to # Pacific - expected_index = period_range(start=start, end=end, freq="D") - offsets.Day() + expected_index = ( + period_range(start=start, end=end, freq="D", name="idx") - offsets.Day() + ) expected = Series(1.0, index=expected_index) tm.assert_series_equal(result, expected) - def test_resample_with_pytz(self): + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("America/Los_Angeles"), + dateutil.tz.gettz("America/Los_Angeles"), + ], + ) + def test_resample_with_tz(self, tz, unit): # GH 13238 - s = Series( - 2, index=date_range("2017-01-01", periods=48, freq="H", tz="US/Eastern") - ) - result = s.resample("D").mean() + dti = date_range("2017-01-01", periods=48, freq="h", tz=tz, unit=unit) + ser = Series(2, index=dti) + result = ser.resample("D").mean() + exp_dti = pd.DatetimeIndex( + ["2017-01-01", "2017-01-02"], tz=tz, freq="D" + ).as_unit(unit) expected = Series( 2.0, - index=pd.DatetimeIndex( - ["2017-01-01", "2017-01-02"], tz="US/Eastern", freq="D" - ), + index=exp_dti, ) tm.assert_series_equal(result, expected) # Especially assert that the timezone is LMT for pytz - assert result.index.tz == pytz.timezone("US/Eastern") - - def test_with_local_timezone_dateutil(self): - # see gh-5430 - local_timezone = "dateutil/America/Los_Angeles" - - start = datetime( - year=2013, month=11, day=1, hour=0, minute=0, tzinfo=dateutil.tz.tzutc() - ) - # 1 day later - end = datetime( - year=2013, month=11, day=2, hour=0, minute=0, tzinfo=dateutil.tz.tzutc() - ) - - index = date_range(start, end, freq="H", name="idx") - - series = Series(1, index=index) - series = series.tz_convert(local_timezone) - result = series.resample("D", kind="period").mean() - - # Create the expected series - # Index is moved back a day with the timezone conversion from UTC to - # Pacific - expected_index = ( - period_range(start=start, end=end, freq="D", name="idx") - offsets.Day() - ) - expected = Series(1.0, index=expected_index) - tm.assert_series_equal(result, expected) + assert result.index.tz == tz def test_resample_nonexistent_time_bin_edge(self): # GH 19375 - index = date_range("2017-03-12", "2017-03-12 1:45:00", freq="15T") + index = date_range("2017-03-12", "2017-03-12 1:45:00", freq="15min") s = Series(np.zeros(len(index)), index=index) expected = s.tz_localize("US/Pacific") - expected.index = pd.DatetimeIndex(expected.index, freq="900S") - result = expected.resample("900S").mean() + expected.index = pd.DatetimeIndex(expected.index, freq="900s") + result = expected.resample("900s").mean() tm.assert_series_equal(result, expected) + def 
test_resample_nonexistent_time_bin_edge2(self): # GH 23742 - index = date_range(start="2017-10-10", end="2017-10-20", freq="1H") + index = date_range(start="2017-10-10", end="2017-10-20", freq="1h") index = index.tz_localize("UTC").tz_convert("America/Sao_Paulo") df = DataFrame(data=list(range(len(index))), index=index) result = df.groupby(pd.Grouper(freq="1D")).count() @@ -350,20 +399,23 @@ def test_resample_ambiguous_time_bin_edge(self): # GH 10117 idx = date_range( - "2014-10-25 22:00:00", "2014-10-26 00:30:00", freq="30T", tz="Europe/London" + "2014-10-25 22:00:00", + "2014-10-26 00:30:00", + freq="30min", + tz="Europe/London", ) expected = Series(np.zeros(len(idx)), index=idx) - result = expected.resample("30T").mean() + result = expected.resample("30min").mean() tm.assert_series_equal(result, expected) def test_fill_method_and_how_upsample(self): # GH2073 s = Series( np.arange(9, dtype="int64"), - index=date_range("2010-01-01", periods=9, freq="Q"), + index=date_range("2010-01-01", periods=9, freq="QE"), ) - last = s.resample("M").ffill() - both = s.resample("M").ffill().resample("M").last().astype("int64") + last = s.resample("ME").ffill() + both = s.resample("ME").ffill().resample("ME").last().astype("int64") tm.assert_series_equal(last, both) @pytest.mark.parametrize("day", DAYS) @@ -375,6 +427,9 @@ warn = None if target == "D" else FutureWarning msg = r"PeriodDtype\[B\] is deprecated" + if warn is None: + msg = "Resampling with a PeriodIndex is deprecated" + warn = FutureWarning with tm.assert_produces_warning(warn, match=msg): result = ts.resample(target, convention=convention).ffill() expected = result.to_timestamp(target, how=convention) @@ -384,13 +439,15 @@ def test_resample_to_timestamps(self, simple_period_range_series): ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="M") - result = ts.resample("A-DEC", kind="timestamp").mean() - expected = ts.to_timestamp(how="start").resample("A-DEC").mean() + msg = "The 'kind' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ts.resample("Y-DEC", kind="timestamp").mean() + expected = ts.to_timestamp(how="start").resample("YE-DEC").mean() tm.assert_series_equal(result, expected) @pytest.mark.parametrize("month", MONTHS) def test_resample_to_quarterly(self, simple_period_range_series, month): - ts = simple_period_range_series("1990", "1992", freq=f"A-{month}") + ts = simple_period_range_series("1990", "1992", freq=f"Y-{month}") quar_ts = ts.resample(f"Q-{month}").ffill() stamps = ts.to_timestamp("D", how="start") @@ -408,42 +465,47 @@ @pytest.mark.parametrize("how", ["start", "end"]) def test_resample_to_quarterly_start_end(self, simple_period_range_series, how): # conforms, but different month - ts = simple_period_range_series("1990", "1992", freq="A-JUN") - result = ts.resample("Q-MAR", convention=how).ffill() + ts = simple_period_range_series("1990", "1992", freq="Y-JUN") + msg = "The 'convention' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ts.resample("Q-MAR", convention=how).ffill() expected = ts.asfreq("Q-MAR", how=how) expected = expected.reindex(result.index, method="ffill") + # FIXME: don't leave commented-out # .to_timestamp('D') # expected = expected.resample('Q-MAR').ffill() tm.assert_series_equal(result, expected) def test_resample_fill_missing(self): - rng = PeriodIndex([2000, 2005, 2007, 2009], freq="A") + rng = PeriodIndex([2000, 2005, 2007, 2009], freq="Y") s = 
Series(np.random.default_rng(2).standard_normal(4), index=rng) stamps = s.to_timestamp() - filled = s.resample("A").ffill() - expected = stamps.resample("A").ffill().to_period("A") + filled = s.resample("Y").ffill() + expected = stamps.resample("YE").ffill().to_period("Y") tm.assert_series_equal(filled, expected) def test_cant_fill_missing_dups(self): - rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq="A") + rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq="Y") s = Series(np.random.default_rng(2).standard_normal(5), index=rng) msg = "Reindexing only valid with uniquely valued Index objects" with pytest.raises(InvalidIndexError, match=msg): - s.resample("A").ffill() + s.resample("Y").ffill() @pytest.mark.parametrize("freq", ["5min"]) @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) def test_resample_5minute(self, freq, kind): - rng = period_range("1/1/2000", "1/5/2000", freq="T") + rng = period_range("1/1/2000", "1/5/2000", freq="min") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) expected = ts.to_timestamp().resample(freq).mean() if kind != "timestamp": expected = expected.to_period(freq) - result = ts.resample(freq, kind=kind).mean() + msg = "The 'kind' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ts.resample(freq, kind=kind).mean() tm.assert_series_equal(result, expected) def test_upsample_daily_business_daily(self, simple_period_range_series): @@ -454,9 +516,11 @@ tm.assert_series_equal(result, expected) ts = simple_period_range_series("1/1/2000", "2/1/2000") - result = ts.resample("H", convention="s").asfreq() - exp_rng = period_range("1/1/2000", "2/1/2000 23:00", freq="H") - expected = ts.asfreq("H", how="s").reindex(exp_rng) + msg = "The 'convention' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ts.resample("h", convention="s").asfreq() + exp_rng = period_range("1/1/2000", "2/1/2000 23:00", freq="h") + expected = ts.asfreq("h", how="s").reindex(exp_rng) tm.assert_series_equal(result, expected) def test_resample_irregular_sparse(self): @@ -481,8 +545,8 @@ expected = ts.asfreq("W-THU").ffill() tm.assert_series_equal(result, expected) - def test_resample_tz_localized(self): - dr = date_range(start="2012-4-13", end="2012-5-1") + def test_resample_tz_localized(self, unit): + dr = date_range(start="2012-4-13", end="2012-5-1", unit=unit) ts = Series(range(len(dr)), index=dr) ts_utc = ts.tz_localize("UTC") @@ -491,9 +555,7 @@ result = ts_local.resample("W").mean() ts_local_naive = ts_local.copy() - ts_local_naive.index = [ - x.replace(tzinfo=None) for x in ts_local_naive.index.to_pydatetime() - ] + ts_local_naive.index = ts_local_naive.index.tz_localize(None) exp = ts_local_naive.resample("W").mean().tz_localize("America/Los_Angeles") exp.index = pd.DatetimeIndex(exp.index, freq="W") @@ -503,9 +565,10 @@ # it works result = ts_local.resample("D").mean() + def test_resample_tz_localized2(self): # #2245 idx = date_range( - "2001-09-20 15:59", "2001-09-20 16:00", freq="T", tz="Australia/Sydney" + "2001-09-20 15:59", "2001-09-20 16:00", freq="min", tz="Australia/Sydney" ) s = Series([1, 2], index=idx) @@ -516,27 +579,30 @@ tm.assert_series_equal(result, expected) # for good measure - result = s.resample("D", kind="period").mean() + msg = "The 'kind' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.resample("D", kind="period").mean() 
ex_index = period_range("2001-09-20", periods=1, freq="D") expected = Series([1.5], index=ex_index) tm.assert_series_equal(result, expected) + def test_resample_tz_localized3(self): # GH 6397 # comparing an offset that doesn't propagate tz's - rng = date_range("1/1/2011", periods=20000, freq="H") + rng = date_range("1/1/2011", periods=20000, freq="h") rng = rng.tz_localize("EST") ts = DataFrame(index=rng) ts["first"] = np.random.default_rng(2).standard_normal(len(rng)) ts["second"] = np.cumsum(np.random.default_rng(2).standard_normal(len(rng))) expected = DataFrame( { - "first": ts.resample("A").sum()["first"], - "second": ts.resample("A").mean()["second"], + "first": ts.resample("YE").sum()["first"], + "second": ts.resample("YE").mean()["second"], }, columns=["first", "second"], ) result = ( - ts.resample("A") + ts.resample("YE") .agg({"first": "sum", "second": "mean"}) .reindex(columns=["first", "second"]) ) @@ -566,8 +632,8 @@ rng = period_range("2000Q1", periods=10, freq="Q-DEC") ts = Series(np.arange(10), index=rng) - result = ts.resample("A").mean() - exp = ts.to_timestamp().resample("A").mean().to_period() + result = ts.resample("Y").mean() + exp = ts.to_timestamp().resample("YE").mean().to_period() tm.assert_series_equal(result, exp) def test_resample_weekly_bug_1726(self): @@ -605,8 +671,10 @@ "2016-03-15 01:00:00-05:00", "2016-03-15 13:00:00-05:00", ] - index = pd.to_datetime(expected_index_values, utc=True).tz_convert( - "America/Chicago" + index = ( + pd.to_datetime(expected_index_values, utc=True) + .tz_convert("America/Chicago") + .as_unit(index.unit) ) index = pd.DatetimeIndex(index, freq="12h") expected = DataFrame( @@ -627,7 +695,7 @@ @pytest.mark.xfail(reason="Commented out for more than 3 years. Should this work?") def test_monthly_convention_span(self): - rng = period_range("2000-01", periods=3, freq="M") + rng = period_range("2000-01", periods=3, freq="ME") ts = Series(np.arange(3), index=rng) # hacky way to get same thing @@ -640,7 +708,7 @@ tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "from_freq, to_freq", [("D", "M"), ("Q", "A"), ("M", "Q"), ("D", "W")] + "from_freq, to_freq", [("D", "ME"), ("QE", "YE"), ("ME", "QE"), ("D", "W")] ) def test_default_right_closed_label(self, from_freq, to_freq): idx = date_range(start="8/15/2012", periods=100, freq=from_freq) @@ -653,7 +721,7 @@ @pytest.mark.parametrize( "from_freq, to_freq", - [("D", "MS"), ("Q", "AS"), ("M", "QS"), ("H", "D"), ("T", "H")], + [("D", "MS"), ("QE", "YS"), ("ME", "QS"), ("h", "D"), ("min", "h")], ) def test_default_left_closed_label(self, from_freq, to_freq): idx = date_range(start="8/15/2012", periods=100, freq=from_freq) @@ -665,15 +733,15 @@ ) def test_all_values_single_bin(self): - # 2070 + # GH#2070 index = period_range(start="2012-01-01", end="2012-12-31", freq="M") - s = Series(np.random.default_rng(2).standard_normal(len(index)), index=index) + ser = Series(np.random.default_rng(2).standard_normal(len(index)), index=index) - result = s.resample("A").mean() - tm.assert_almost_equal(result.iloc[0], s.mean()) + result = ser.resample("Y").mean() + tm.assert_almost_equal(result.iloc[0], ser.mean()) def test_evenly_divisible_with_no_extra_bins(self): - # 4076 + # GH#4076 # when the frequency is evenly divisible, sometimes extra bins df = DataFrame( @@ -683,10 +751,11 @@ result = df.resample("5D").mean() expected = pd.concat([df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T expected.index = pd.DatetimeIndex( - [Timestamp("2000-1-1"), Timestamp("2000-1-6")], freq="5D" + 
[Timestamp("2000-1-1"), Timestamp("2000-1-6")], dtype="M8[ns]", freq="5D" ) tm.assert_frame_equal(result, expected) + def test_evenly_divisible_with_no_extra_bins2(self): index = date_range(start="2001-5-4", periods=28) df = DataFrame( [ @@ -745,7 +814,7 @@ result = df.resample("7D").sum() tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("freq, period_mult", [("H", 24), ("12H", 2)]) + @pytest.mark.parametrize("freq, period_mult", [("h", 24), ("12h", 2)]) @pytest.mark.parametrize("kind", [None, "period"]) def test_upsampling_ohlc(self, freq, period_mult, kind): # GH 13083 @@ -757,7 +826,9 @@ # of the last original period, so extend accordingly: new_index = period_range(start="2000", freq=freq, periods=period_mult * len(pi)) expected = expected.reindex(new_index) - result = s.resample(freq, kind=kind).ohlc() + msg = "The 'kind' keyword in Series.resample is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.resample(freq, kind=kind).ohlc() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -800,19 +871,22 @@ ) def test_resample_with_nat(self, periods, values, freq, expected_values): # GH 13224 - index = PeriodIndex(periods, freq="S") + index = PeriodIndex(periods, freq="s") frame = DataFrame(values, index=index) expected_index = period_range( "1970-01-01 00:00:00", periods=len(expected_values), freq=freq ) expected = DataFrame(expected_values, index=expected_index) - result = frame.resample(freq).mean() + msg = "Resampling with a PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = frame.resample(freq) + result = rs.mean() tm.assert_frame_equal(result, expected) def test_resample_with_only_nat(self): # GH 13224 - pi = PeriodIndex([pd.NaT] * 3, freq="S") + pi = PeriodIndex([pd.NaT] * 3, freq="s") frame = DataFrame([2, 3, 5], index=pi, columns=["a"]) expected_index = PeriodIndex(data=[], freq=pi.freq) expected = DataFrame(index=expected_index, columns=["a"], dtype="float64") @@ -822,20 +896,19 @@ @pytest.mark.parametrize( "start,end,start_freq,end_freq,offset", [ - ("19910905", "19910909 03:00", "H", "24H", "10H"), - ("19910905", "19910909 12:00", "H", "24H", "10H"), - ("19910905", "19910909 23:00", "H", "24H", "10H"), - ("19910905 10:00", "19910909", "H", "24H", "10H"), - ("19910905 10:00", "19910909 10:00", "H", "24H", "10H"), - ("19910905", "19910909 10:00", "H", "24H", "10H"), - ("19910905 12:00", "19910909", "H", "24H", "10H"), - ("19910905 12:00", "19910909 03:00", "H", "24H", "10H"), - ("19910905 12:00", "19910909 12:00", "H", "24H", "10H"), - ("19910905 12:00", "19910909 12:00", "H", "24H", "34H"), - ("19910905 12:00", "19910909 12:00", "H", "17H", "10H"), - ("19910905 12:00", "19910909 12:00", "H", "17H", "3H"), - ("19910905 12:00", "19910909 1:00", "H", "M", "3H"), - ("19910905", "19910913 06:00", "2H", "24H", "10H"), + ("19910905", "19910909 03:00", "h", "24h", "10h"), + ("19910905", "19910909 12:00", "h", "24h", "10h"), + ("19910905", "19910909 23:00", "h", "24h", "10h"), + ("19910905 10:00", "19910909", "h", "24h", "10h"), + ("19910905 10:00", "19910909 10:00", "h", "24h", "10h"), + ("19910905", "19910909 10:00", "h", "24h", "10h"), + ("19910905 12:00", "19910909", "h", "24h", "10h"), + ("19910905 12:00", "19910909 03:00", "h", "24h", "10h"), + ("19910905 12:00", "19910909 12:00", "h", "24h", "10h"), + ("19910905 12:00", "19910909 12:00", "h", "24h", "34h"), + ("19910905 12:00", "19910909 12:00", "h", "17h", "10h"), + ("19910905 12:00", "19910909 12:00", "h", "17h", 
"3h"), + ("19910905", "19910913 06:00", "2h", "24h", "10h"), ("19910905", "19910905 01:39", "Min", "5Min", "3Min"), ("19910905", "19910905 03:18", "2Min", "5Min", "3Min"), ], @@ -844,52 +917,184 @@ # GH 23882 & 31809 pi = period_range(start, end, freq=start_freq) ser = Series(np.arange(len(pi)), index=pi) - result = ser.resample(end_freq, offset=offset).mean() + msg = "Resampling with a PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.resample(end_freq, offset=offset) + result = rs.mean() result = result.to_timestamp(end_freq) expected = ser.to_timestamp().resample(end_freq, offset=offset).mean() - if end_freq == "M": - # TODO: is non-tick the relevant characteristic? (GH 33815) - expected.index = expected.index._with_freq(None) + tm.assert_series_equal(result, expected) + + def test_resample_with_offset_month(self): + # GH 23882 & 31809 + pi = period_range("19910905 12:00", "19910909 1:00", freq="h") + ser = Series(np.arange(len(pi)), index=pi) + msg = "Resampling with a PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.resample("M", offset="3h") + result = rs.mean() + result = result.to_timestamp("M") + expected = ser.to_timestamp().resample("ME", offset="3h").mean() + # TODO: is non-tick the relevant characteristic? (GH 33815) + expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "first,last,freq,exp_first,exp_last", + "first,last,freq,freq_to_offset,exp_first,exp_last", [ - ("19910905", "19920406", "D", "19910905", "19920406"), - ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920406"), + ("19910905", "19920406", "D", "D", "19910905", "19920406"), + ("19910905 00:00", "19920406 06:00", "D", "D", "19910905", "19920406"), ( "19910905 06:00", "19920406 06:00", - "H", + "h", + "h", "19910905 06:00", "19920406 06:00", ), - ("19910906", "19920406", "M", "1991-09", "1992-04"), - ("19910831", "19920430", "M", "1991-08", "1992-04"), - ("1991-08", "1992-04", "M", "1991-08", "1992-04"), + ("19910906", "19920406", "M", "ME", "1991-09", "1992-04"), + ("19910831", "19920430", "M", "ME", "1991-08", "1992-04"), + ("1991-08", "1992-04", "M", "ME", "1991-08", "1992-04"), ], ) - def test_get_period_range_edges(self, first, last, freq, exp_first, exp_last): + def test_get_period_range_edges( + self, first, last, freq, freq_to_offset, exp_first, exp_last + ): first = Period(first) last = Period(last) exp_first = Period(exp_first, freq=freq) exp_last = Period(exp_last, freq=freq) - freq = pd.tseries.frequencies.to_offset(freq) + freq = pd.tseries.frequencies.to_offset(freq_to_offset) result = _get_period_range_edges(first, last, freq) expected = (exp_first, exp_last) assert result == expected def test_sum_min_count(self): # GH 19974 - index = date_range(start="2018", freq="M", periods=6) + index = date_range(start="2018", freq="ME", periods=6) data = np.ones(6) data[3:6] = np.nan s = Series(data, index).to_period() - result = s.resample("Q").sum(min_count=1) + msg = "Resampling with a PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = s.resample("Q") + result = rs.sum(min_count=1) expected = Series( [3.0, np.nan], index=PeriodIndex(["2018Q1", "2018Q2"], freq="Q-DEC") ) tm.assert_series_equal(result, expected) + + def test_resample_t_l_deprecated(self): + # GH#52536 + msg_t = "'T' is deprecated and will be removed in a future version." 
+ msg_l = "'L' is deprecated and will be removed in a future version." + + with tm.assert_produces_warning(FutureWarning, match=msg_l): + rng_l = period_range( + "2020-01-01 00:00:00 00:00", "2020-01-01 00:00:00 00:01", freq="L" + ) + ser = Series(np.arange(len(rng_l)), index=rng_l) + + rng = period_range( + "2020-01-01 00:00:00 00:00", "2020-01-01 00:00:00 00:01", freq="min" + ) + expected = Series([29999.5, 60000.0], index=rng) + with tm.assert_produces_warning(FutureWarning, match=msg_t): + result = ser.resample("T").mean() + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "freq, freq_depr, freq_res, freq_depr_res, data", + [ + ("2Q", "2q", "2Y", "2y", [0.5]), + ("2M", "2m", "2Q", "2q", [1.0, 3.0]), + ], + ) + def test_resample_lowercase_frequency_deprecated( + self, freq, freq_depr, freq_res, freq_depr_res, data + ): + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq[1:]}' instead." + depr_msg_res = f"'{freq_depr_res[1:]}' is deprecated and will be removed in a " + f"future version. Please use '{freq_res[1:]}' instead." + + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + rng_l = period_range("2020-01-01", "2020-08-01", freq=freq_depr) + ser = Series(np.arange(len(rng_l)), index=rng_l) + + rng = period_range("2020-01-01", "2020-08-01", freq=freq_res) + expected = Series(data=data, index=rng) + + with tm.assert_produces_warning(FutureWarning, match=depr_msg_res): + result = ser.resample(freq_depr_res).mean() + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "offset", + [ + offsets.MonthBegin(), + offsets.BYearBegin(2), + offsets.BusinessHour(2), + ], + ) + def test_asfreq_invalid_period_offset(self, offset, series_and_frame): + # GH#55785 + msg = f"Invalid offset: '{offset.base}' for converting time series " + + df = series_and_frame + with pytest.raises(ValueError, match=msg): + df.asfreq(freq=offset) + + +@pytest.mark.parametrize( + "freq,freq_depr", + [ + ("2M", "2ME"), + ("2Q", "2QE"), + ("2Q-FEB", "2QE-FEB"), + ("2Y", "2YE"), + ("2Y-MAR", "2YE-MAR"), + ("2M", "2me"), + ("2Q", "2qe"), + ("2Y-MAR", "2ye-mar"), + ], +) +def test_resample_frequency_ME_QE_YE_error_message(series_and_frame, freq, freq_depr): + # GH#9586 + msg = f"for Period, please use '{freq[1:]}' instead of '{freq_depr[1:]}'" + + obj = series_and_frame + with pytest.raises(ValueError, match=msg): + obj.resample(freq_depr) + + +def test_corner_cases_period(simple_period_range_series): + # miscellaneous test coverage + len0pts = simple_period_range_series("2007-01", "2010-05", freq="M")[:0] + # it works + msg = "Resampling with a PeriodIndex is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = len0pts.resample("Y-DEC").mean() + assert len(result) == 0 + + +@pytest.mark.parametrize( + "freq_depr", + [ + "2BME", + "2CBME", + "2SME", + "2BQE-FEB", + "2BYE-MAR", + ], +) +def test_resample_frequency_invalid_freq(series_and_frame, freq_depr): + # GH#9586 + msg = f"Invalid frequency: {freq_depr[1:]}" + + obj = series_and_frame + with pytest.raises(ValueError, match=msg): + obj.resample(freq_depr) diff -Nru pandas-2.1.4+dfsg/pandas/tests/resample/test_resample_api.py pandas-2.2.2+dfsg/pandas/tests/resample/test_resample_api.py --- pandas-2.1.4+dfsg/pandas/tests/resample/test_resample_api.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/resample/test_resample_api.py 2024-04-10 17:42:52.000000000 +0000 @@ -33,13 +33,13 @@ def 
test_str(_test_series): - r = _test_series.resample("H") + r = _test_series.resample("h") assert ( "DatetimeIndexResampler [freq=, axis=0, closed=left, " "label=left, convention=start, origin=start_day]" in str(r) ) - r = _test_series.resample("H", origin="2000-01-01") + r = _test_series.resample("h", origin="2000-01-01") assert ( "DatetimeIndexResampler [freq=, axis=0, closed=left, " "label=left, convention=start, origin=2000-01-01 00:00:00]" in str(r) @@ -47,12 +47,12 @@ def test_api(_test_series): - r = _test_series.resample("H") + r = _test_series.resample("h") result = r.mean() assert isinstance(result, Series) assert len(result) == 217 - r = _test_series.to_frame().resample("H") + r = _test_series.to_frame().resample("h") result = r.mean() assert isinstance(result, DataFrame) assert len(result) == 217 @@ -77,7 +77,9 @@ ) index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) - result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] tm.assert_frame_equal(result, expected) @@ -114,7 +116,10 @@ # group_keys=True expected.index = pd.MultiIndex.from_arrays( - [pd.to_datetime(["2000-01-01", "2000-01-06"]).repeat(5), expected.index] + [ + pd.to_datetime(["2000-01-01", "2000-01-06"]).as_unit("ns").repeat(5), + expected.index, + ] ) g = df.resample("5D", group_keys=True) result = g.apply(lambda x: x) @@ -125,36 +130,36 @@ # GH17905 # series - r = _test_series.resample("H") + r = _test_series.resample("h") expected = r.max() - r.mean() result = r.pipe(lambda x: x.max() - x.mean()) tm.assert_series_equal(result, expected) # dataframe - r = test_frame.resample("H") + r = test_frame.resample("h") expected = r.max() - r.mean() result = r.pipe(lambda x: x.max() - x.mean()) tm.assert_frame_equal(result, expected) def test_getitem(test_frame): - r = test_frame.resample("H") + r = test_frame.resample("h") tm.assert_index_equal(r._selected_obj.columns, test_frame.columns) - r = test_frame.resample("H")["B"] + r = test_frame.resample("h")["B"] assert r._selected_obj.name == test_frame.columns[1] # technically this is allowed - r = test_frame.resample("H")["A", "B"] + r = test_frame.resample("h")["A", "B"] tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]]) - r = test_frame.resample("H")["A", "B"] + r = test_frame.resample("h")["A", "B"] tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]]) @pytest.mark.parametrize("key", [["D"], ["A", "D"]]) def test_select_bad_cols(key, test_frame): - g = test_frame.resample("H") + g = test_frame.resample("h") # 'A' should not be referenced as a bad column... # will have to rethink regex if you change message! 
msg = r"^\"Columns not found: 'D'\"$" @@ -163,7 +168,7 @@ def test_attribute_access(test_frame): - r = test_frame.resample("H") + r = test_frame.resample("h") tm.assert_series_equal(r.A.sum(), r["A"].sum()) @@ -171,7 +176,7 @@ def test_api_compat_before_use(attr): # make sure that we are setting the binner # on these attributes - rng = date_range("1/1/2012", periods=100, freq="S") + rng = date_range("1/1/2012", periods=100, freq="s") ts = Series(np.arange(len(rng)), index=rng) rs = ts.resample("30s") @@ -186,13 +191,13 @@ def tests_raises_on_nuisance(test_frame): df = test_frame df["D"] = "foo" - r = df.resample("H") + r = df.resample("h") result = r[["A", "B"]].mean() expected = pd.concat([r.A.mean(), r.B.mean()], axis=1) tm.assert_frame_equal(result, expected) expected = r[["A", "B", "C"]].mean() - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): r.mean() result = r.mean(numeric_only=True) @@ -201,7 +206,7 @@ def test_downsample_but_actually_upsampling(): # this is reindex / asfreq - rng = date_range("1/1/2012", periods=100, freq="S") + rng = date_range("1/1/2012", periods=100, freq="s") ts = Series(np.arange(len(rng), dtype="int64"), index=rng) result = ts.resample("20s").asfreq() expected = Series( @@ -216,7 +221,7 @@ # ts2.resample('2s').mean().ffill() # preserve these semantics - rng = date_range("1/1/2012", periods=100, freq="S") + rng = date_range("1/1/2012", periods=100, freq="s") ts = Series(np.arange(len(rng)), index=rng) ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]] @@ -260,7 +265,7 @@ "2012-01-01 00:00:30", ], dtype="datetime64[ns]", - freq="2S", + freq="2s", ), ) tm.assert_series_equal(result, expected) @@ -294,7 +299,7 @@ def test_fillna(): # need to upsample here - rng = date_range("1/1/2012", periods=10, freq="2S") + rng = date_range("1/1/2012", periods=10, freq="2s") ts = Series(np.arange(len(rng), dtype="int64"), index=rng) r = ts.resample("s") @@ -344,11 +349,11 @@ # similar aggregations with and w/o selection list df = DataFrame( np.random.default_rng(2).standard_normal((1000, 3)), - index=date_range("1/1/2012", freq="S", periods=1000), + index=date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) - r = df.resample("3T") + r = df.resample("3min") msg = r"Column\(s\) \['r1', 'r2'\] do not exist" with pytest.raises(KeyError, match=msg): @@ -359,11 +364,11 @@ # GH#39025 df = DataFrame( np.random.default_rng(2).standard_normal((1000, 2)), - index=date_range("1/1/2012", freq="S", periods=1000), + index=date_range("1/1/2012", freq="s", periods=1000), columns=[1, "a"], ) - r = df.resample("3T") + r = df.resample("3min") msg = r"Column\(s\) \[2, 'b'\] do not exist" with pytest.raises(KeyError, match=msg): @@ -374,214 +379,245 @@ # `Base` test class -def test_agg(): - # test with all three Resampler apis and TimeGrouper - +@pytest.fixture +def index(): index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") index.name = "date" - df = DataFrame( + return index + + +@pytest.fixture +def df(index): + frame = DataFrame( np.random.default_rng(2).random((10, 2)), columns=list("AB"), index=index ) - df_col = df.reset_index() + return frame + + +@pytest.fixture +def df_col(df): + return df.reset_index() + + +@pytest.fixture +def df_mult(df_col, index): df_mult = df_col.copy() df_mult.index = pd.MultiIndex.from_arrays( - [range(10), df.index], names=["index", "date"] + [range(10), index], names=["index", "date"] ) - r = 
df.resample("2D") - cases = [ - r, - df_col.resample("2D", on="date"), - df_mult.resample("2D", level="date"), - df.groupby(pd.Grouper(freq="2D")), - ] + return df_mult + + +@pytest.fixture +def a_mean(df): + return df.resample("2D")["A"].mean() + + +@pytest.fixture +def a_std(df): + return df.resample("2D")["A"].std() + + +@pytest.fixture +def a_sum(df): + return df.resample("2D")["A"].sum() + + +@pytest.fixture +def b_mean(df): + return df.resample("2D")["B"].mean() + + +@pytest.fixture +def b_std(df): + return df.resample("2D")["B"].std() + + +@pytest.fixture +def b_sum(df): + return df.resample("2D")["B"].sum() + + +@pytest.fixture +def df_resample(df): + return df.resample("2D") + + +@pytest.fixture +def df_col_resample(df_col): + return df_col.resample("2D", on="date") + + +@pytest.fixture +def df_mult_resample(df_mult): + return df_mult.resample("2D", level="date") + + +@pytest.fixture +def df_grouper_resample(df): + return df.groupby(pd.Grouper(freq="2D")) - a_mean = r["A"].mean() - a_std = r["A"].std() - a_sum = r["A"].sum() - b_mean = r["B"].mean() - b_std = r["B"].std() - b_sum = r["B"].sum() +@pytest.fixture( + params=["df_resample", "df_col_resample", "df_mult_resample", "df_grouper_resample"] +) +def cases(request): + return request.getfixturevalue(request.param) + + +def test_agg_mixed_column_aggregation(cases, a_mean, a_std, b_mean, b_std, request): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) msg = "using SeriesGroupBy.[mean|std]" - for t in cases: - # In case 2, "date" is an index and a column, so get included in the agg - if t == cases[2]: - date_mean = t["date"].mean() - date_std = t["date"].std() - exp = pd.concat([date_mean, date_std, expected], axis=1) - exp.columns = pd.MultiIndex.from_product( - [["date", "A", "B"], ["mean", "std"]] - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.aggregate([np.mean, np.std]) - tm.assert_frame_equal(result, exp) - else: - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.aggregate([np.mean, np.std]) - tm.assert_frame_equal(result, expected) + # "date" is an index and a column, so get included in the agg + if "df_mult" in request.node.callspec.id: + date_mean = cases["date"].mean() + date_std = cases["date"].std() + expected = pd.concat([date_mean, date_std, expected], axis=1) + expected.columns = pd.MultiIndex.from_product( + [["date", "A", "B"], ["mean", "std"]] + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = cases.aggregate([np.mean, np.std]) + tm.assert_frame_equal(result, expected) - expected = pd.concat([a_mean, b_std], axis=1) - for t in cases: - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.aggregate({"A": np.mean, "B": np.std}) - tm.assert_frame_equal(result, expected, check_like=True) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.aggregate(A=("A", np.mean), B=("B", np.std)) - tm.assert_frame_equal(result, expected, check_like=True) +@pytest.mark.parametrize( + "agg", + [ + {"func": {"A": np.mean, "B": np.std}}, + {"A": ("A", np.mean), "B": ("B", np.std)}, + {"A": NamedAgg("A", np.mean), "B": NamedAgg("B", np.std)}, + ], +) +def test_agg_both_mean_std_named_result(cases, a_mean, b_std, agg): + msg = "using SeriesGroupBy.[mean|std]" + expected = pd.concat([a_mean, b_std], axis=1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = cases.aggregate(**agg) + 
tm.assert_frame_equal(result, expected, check_like=True) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.aggregate(A=NamedAgg("A", np.mean), B=NamedAgg("B", np.std)) - tm.assert_frame_equal(result, expected, check_like=True) +def test_agg_both_mean_std_dict_of_list(cases, a_mean, a_std): expected = pd.concat([a_mean, a_std], axis=1) expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) - for t in cases: - result = t.aggregate({"A": ["mean", "std"]}) - tm.assert_frame_equal(result, expected) + result = cases.aggregate({"A": ["mean", "std"]}) + tm.assert_frame_equal(result, expected) + +@pytest.mark.parametrize( + "agg", [{"func": ["mean", "sum"]}, {"mean": "mean", "sum": "sum"}] +) +def test_agg_both_mean_sum(cases, a_mean, a_sum, agg): expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = ["mean", "sum"] - for t in cases: - result = t["A"].aggregate(["mean", "sum"]) - tm.assert_frame_equal(result, expected) + result = cases["A"].aggregate(**agg) + tm.assert_frame_equal(result, expected) - result = t["A"].aggregate(mean="mean", sum="sum") - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "agg", + [ + {"A": {"mean": "mean", "sum": "sum"}}, + { + "A": {"mean": "mean", "sum": "sum"}, + "B": {"mean2": "mean", "sum2": "sum"}, + }, + ], +) +def test_agg_dict_of_dict_specificationerror(cases, agg): msg = "nested renamer is not supported" - for t in cases: - with pytest.raises(pd.errors.SpecificationError, match=msg): - t.aggregate({"A": {"mean": "mean", "sum": "sum"}}) + with pytest.raises(pd.errors.SpecificationError, match=msg): + cases.aggregate(agg) - expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples( - [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] - ) - for t in cases: - with pytest.raises(pd.errors.SpecificationError, match=msg): - t.aggregate( - { - "A": {"mean": "mean", "sum": "sum"}, - "B": {"mean2": "mean", "sum2": "sum"}, - } - ) +def test_agg_dict_of_lists(cases, a_mean, a_std, b_mean, b_std): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_tuples( [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] ) - for t in cases: - result = t.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) - tm.assert_frame_equal(result, expected, check_like=True) - - expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples( - [ - ("r1", "A", "mean"), - ("r1", "A", "sum"), - ("r2", "B", "mean"), - ("r2", "B", "sum"), - ] - ) - + result = cases.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) + tm.assert_frame_equal(result, expected, check_like=True) -def test_agg_misc(): - # test with all three Resampler apis and TimeGrouper - - index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") - index.name = "date" - df = DataFrame( - np.random.default_rng(2).random((10, 2)), columns=list("AB"), index=index - ) - df_col = df.reset_index() - df_mult = df_col.copy() - df_mult.index = pd.MultiIndex.from_arrays( - [range(10), df.index], names=["index", "date"] - ) - - r = df.resample("2D") - cases = [ - r, - df_col.resample("2D", on="date"), - df_mult.resample("2D", level="date"), - df.groupby(pd.Grouper(freq="2D")), - ] +@pytest.mark.parametrize( + "agg", + [ + {"func": {"A": np.sum, "B": lambda x: np.std(x, ddof=1)}}, + {"A": ("A", np.sum), "B": ("B", lambda x: np.std(x, ddof=1))}, + {"A": NamedAgg("A", np.sum), "B": NamedAgg("B", 
lambda x: np.std(x, ddof=1))}, + ], +) +def test_agg_with_lambda(cases, agg): # passed lambda msg = "using SeriesGroupBy.sum" - for t in cases: - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) - rcustom = t["B"].apply(lambda x: np.std(x, ddof=1)) - expected = pd.concat([r["A"].sum(), rcustom], axis=1) - tm.assert_frame_equal(result, expected, check_like=True) - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.agg(A=("A", np.sum), B=("B", lambda x: np.std(x, ddof=1))) - tm.assert_frame_equal(result, expected, check_like=True) - - with tm.assert_produces_warning(FutureWarning, match=msg): - result = t.agg( - A=NamedAgg("A", np.sum), B=NamedAgg("B", lambda x: np.std(x, ddof=1)) - ) - tm.assert_frame_equal(result, expected, check_like=True) + rcustom = cases["B"].apply(lambda x: np.std(x, ddof=1)) + expected = pd.concat([cases["A"].sum(), rcustom], axis=1) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = cases.agg(**agg) + tm.assert_frame_equal(result, expected, check_like=True) - # agg with renamers - expected = pd.concat( - [t["A"].sum(), t["B"].sum(), t["A"].mean(), t["B"].mean()], axis=1 - ) - expected.columns = pd.MultiIndex.from_tuples( - [("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")] - ) +@pytest.mark.parametrize( + "agg", + [ + {"func": {"result1": np.sum, "result2": np.mean}}, + {"A": ("result1", np.sum), "B": ("result2", np.mean)}, + {"A": NamedAgg("result1", np.sum), "B": NamedAgg("result2", np.mean)}, + ], +) +def test_agg_no_column(cases, agg): msg = r"Column\(s\) \['result1', 'result2'\] do not exist" - for t in cases: - with pytest.raises(KeyError, match=msg): - t[["A", "B"]].agg({"result1": np.sum, "result2": np.mean}) + with pytest.raises(KeyError, match=msg): + cases[["A", "B"]].agg(**agg) - with pytest.raises(KeyError, match=msg): - t[["A", "B"]].agg(A=("result1", np.sum), B=("result2", np.mean)) - - with pytest.raises(KeyError, match=msg): - t[["A", "B"]].agg( - A=NamedAgg("result1", np.sum), B=NamedAgg("result2", np.mean) - ) +@pytest.mark.parametrize( + "cols, agg", + [ + [None, {"A": ["sum", "std"], "B": ["mean", "std"]}], + [ + [ + "A", + "B", + ], + {"A": ["sum", "std"], "B": ["mean", "std"]}, + ], + ], +) +def test_agg_specificationerror_nested(cases, cols, agg, a_sum, a_std, b_mean, b_std): # agg with different hows - expected = pd.concat( - [t["A"].sum(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1 - ) + # equivalent of using a selection list / or not + expected = pd.concat([a_sum, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_tuples( [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")] ) - for t in cases: - result = t.agg({"A": ["sum", "std"], "B": ["mean", "std"]}) - tm.assert_frame_equal(result, expected, check_like=True) + if cols is not None: + obj = cases[cols] + else: + obj = cases + + result = obj.agg(agg) + tm.assert_frame_equal(result, expected, check_like=True) - # equivalent of using a selection list / or not - for t in cases: - result = t[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) - tm.assert_frame_equal(result, expected, check_like=True) +@pytest.mark.parametrize( + "agg", [{"A": ["sum", "std"]}, {"A": ["sum", "std"], "B": ["mean", "std"]}] +) +def test_agg_specificationerror_series(cases, agg): msg = "nested renamer is not supported" # series like aggs - for t in cases: - with pytest.raises(pd.errors.SpecificationError, match=msg): - t["A"].agg({"A": 
["sum", "std"]}) + with pytest.raises(pd.errors.SpecificationError, match=msg): + cases["A"].agg(agg) - with pytest.raises(pd.errors.SpecificationError, match=msg): - t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) +def test_agg_specificationerror_invalid_names(cases): # errors # invalid names in the agg specification msg = r"Column\(s\) \['B'\] do not exist" - for t in cases: - with pytest.raises(KeyError, match=msg): - t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) + with pytest.raises(KeyError, match=msg): + cases[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) @pytest.mark.parametrize( @@ -597,7 +633,7 @@ ).T warning_msg = "DataFrame.resample with axis=1 is deprecated." with tm.assert_produces_warning(FutureWarning, match=warning_msg): - res = df.resample("M", axis=1) + res = df.resample("ME", axis=1) with pytest.raises( NotImplementedError, match="axis other than 0 is not supported" ): @@ -650,7 +686,7 @@ # Error as we don't have 'z' column msg = r"Column\(s\) \['z'\] do not exist" with pytest.raises(KeyError, match=msg): - df.resample("30T").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]}) + df.resample("30min").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]}) def test_agg_list_like_func_with_args(): @@ -900,9 +936,9 @@ # GH#46442 test if `numeric_only` behave as expected for DataFrameGroupBy index = date_range("2018-01-01", periods=2, freq="D") - expected_index = date_range("2018-12-31", periods=1, freq="Y") + expected_index = date_range("2018-12-31", periods=1, freq="YE") df = DataFrame({"cat": ["cat_1", "cat_2"], "num": [5, 20]}, index=index) - resampled = df.resample("Y") + resampled = df.resample("YE") if numeric_only is lib.no_default: kwargs = {} else: @@ -912,7 +948,7 @@ if isinstance(expected_data, str): if method in ("var", "mean", "median", "prod"): klass = TypeError - msg = re.escape(f"agg function failed [how->{method},dtype->object]") + msg = re.escape(f"agg function failed [how->{method},dtype->") else: klass = ValueError msg = expected_data @@ -951,9 +987,9 @@ # GH#46442 test if `numeric_only` behave as expected for SeriesGroupBy index = date_range("2018-01-01", periods=2, freq="D") - expected_index = date_range("2018-12-31", periods=1, freq="Y") + expected_index = date_range("2018-12-31", periods=1, freq="YE") df = Series(["cat_1", "cat_2"], index=index) - resampled = df.resample("Y") + resampled = df.resample("YE") kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only} func = getattr(resampled, method) @@ -962,7 +998,7 @@ with pytest.raises(TypeError, match=msg): func(**kwargs) elif method == "prod": - msg = re.escape("agg function failed [how->prod,dtype->object]") + msg = re.escape("agg function failed [how->prod,dtype->") with pytest.raises(TypeError, match=msg): func(**kwargs) else: @@ -1004,11 +1040,11 @@ if raises: with tm.assert_produces_warning(FutureWarning, match=warn_msg): with pytest.raises(UnsupportedFunctionCall, match=error_msg): - func(*args, 1, 2, 3) + func(*args, 1, 2, 3, 4) else: with tm.assert_produces_warning(FutureWarning, match=warn_msg): with pytest.raises(TypeError, match=error_msg_type): - func(*args, 1, 2, 3) + func(*args, 1, 2, 3, 4) def test_df_axis_param_depr(): @@ -1021,7 +1057,7 @@ # Deprecation error when axis=1 is explicitly passed warning_msg = "DataFrame.resample with axis=1 is deprecated." 
with tm.assert_produces_warning(FutureWarning, match=warning_msg): - df.resample("M", axis=1) + df.resample("ME", axis=1) # Deprecation error when axis=0 is explicitly passed df = df.T @@ -1030,7 +1066,7 @@ "will be removed in a future version." ) with tm.assert_produces_warning(FutureWarning, match=warning_msg): - df.resample("M", axis=0) + df.resample("ME", axis=0) def test_series_axis_param_depr(_test_series): @@ -1039,7 +1075,7 @@ "deprecated and will be removed in a future version." ) with tm.assert_produces_warning(FutureWarning, match=warning_msg): - _test_series.resample("H", axis=0) + _test_series.resample("h", axis=0) def test_resample_empty(): @@ -1059,5 +1095,5 @@ ] ) ) - result = df.resample("8H").mean() + result = df.resample("8h").mean() tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/resample/test_resampler_grouper.py pandas-2.2.2+dfsg/pandas/tests/resample/test_resampler_grouper.py --- pandas-2.1.4+dfsg/pandas/tests/resample/test_resampler_grouper.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/resample/test_resampler_grouper.py 2024-04-10 17:42:52.000000000 +0000 @@ -30,8 +30,11 @@ code = dedent( """\ - import pandas._testing as tm - s = tm.makeTimeSeries() + import numpy as np + from pandas import Series, date_range + data = np.arange(10, dtype=np.float64) + index = date_range("2020-01-01", periods=len(data)) + s = Series(data, index=index) rs = s.resample("D") """ ) @@ -66,8 +69,12 @@ def f_0(x): return x.set_index("date").resample("D").asfreq() - expected = df.groupby("id").apply(f_0) - result = df.set_index("date").groupby("id").resample("D").asfreq() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = df.groupby("id").apply(f_0) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.set_index("date").groupby("id").resample("D").asfreq() tm.assert_frame_equal(result, expected) df = DataFrame( @@ -81,8 +88,12 @@ def f_1(x): return x.resample("1D").ffill() - expected = df.groupby("group").apply(f_1) - result = df.groupby("group").resample("1D").ffill() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = df.groupby("group").apply(f_1) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("group").resample("1D").ffill() tm.assert_frame_equal(result, expected) @@ -97,7 +108,9 @@ result = g.B.resample("2s").mean() tm.assert_series_equal(result, expected) - result = g.resample("2s").mean().B + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = g.resample("2s").mean().B tm.assert_series_equal(result, expected) @@ -108,12 +121,11 @@ df = DataFrame(data, index=date_range("2016-01-01", periods=2)) r = df.groupby("id").resample("1D") result = r["buyer"].count() + + exp_mi = pd.MultiIndex.from_arrays([[1, 2], df.index], names=("id", None)) expected = Series( [1, 1], - index=pd.MultiIndex.from_tuples( - [(1, Timestamp("2016-01-01")), (2, Timestamp("2016-01-02"))], - names=["id", None], - ), + index=exp_mi, name="buyer", ) tm.assert_series_equal(result, expected) @@ -176,7 +188,7 @@ def test_nearest(): # GH 17496 # Resample nearest - index = 
date_range("1/1/2000", periods=3, freq="T") + index = date_range("1/1/2000", periods=3, freq="min") result = Series(range(3), index=index).resample("20s").nearest() expected = Series( @@ -192,7 +204,7 @@ "2000-01-01 00:02:00", ], dtype="datetime64[ns]", - freq="20S", + freq="20s", ), ) tm.assert_series_equal(result, expected) @@ -222,8 +234,12 @@ g = test_frame.groupby("A") r = g.resample("2s") - result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = getattr(r, f)() + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) tm.assert_equal(result, expected) @@ -240,8 +256,12 @@ def test_methods_std_var(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = getattr(r, f)(ddof=1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -250,18 +270,24 @@ r = g.resample("2s") # reduction - expected = g.resample("2s").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.resample("2s").sum() def f_0(x): return x.resample("2s").sum() - result = r.apply(f_0) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = r.apply(f_0) tm.assert_frame_equal(result, expected) def f_1(x): return x.resample("2s").apply(lambda y: y.sum()) - result = g.apply(f_1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = g.apply(f_1) # y.sum() results in int64 instead of int32 on 32-bit architectures expected = expected.astype("int64") tm.assert_frame_equal(result, expected) @@ -278,14 +304,14 @@ s = Series([1, 2], index=["a", "b"]) return s - expected = df.groupby(pd.Grouper(freq="M")).apply(f) + expected = df.groupby(pd.Grouper(freq="ME")).apply(f) - result = df.resample("M").apply(f) + result = df.resample("ME").apply(f) tm.assert_frame_equal(result, expected) # A case for series - expected = df["col1"].groupby(pd.Grouper(freq="M"), group_keys=False).apply(f) - result = df["col1"].resample("M").apply(f) + expected = df["col1"].groupby(pd.Grouper(freq="ME"), group_keys=False).apply(f) + result = df["col1"].resample("ME").apply(f) tm.assert_series_equal(result, expected) @@ -295,10 +321,10 @@ ind = date_range(start="2017-01-01", freq="15Min", periods=8) df = DataFrame(np.array([0] * 16).reshape(8, 2), index=ind, columns=cols) agg_dict = {col: (np.sum if col[3] == "one" else np.mean) for col in df.columns} - result = df.resample("H").apply(lambda x: agg_dict[x.name](x)) + result = df.resample("h").apply(lambda x: agg_dict[x.name](x)) expected = DataFrame( 2 * [[0, 0.0]], - index=date_range(start="2017-01-01", freq="1H", periods=2), + index=date_range(start="2017-01-01", freq="1h", periods=2), 
columns=pd.MultiIndex.from_tuples( [("A", "a", "", "one"), ("B", "b", "i", "two")] ), @@ -313,7 +339,7 @@ cutoff = cumsum.iloc[-1] * q return series[cumsum >= cutoff].iloc[0] - times = date_range("2017-6-23 18:00", periods=8, freq="15T", tz="UTC") + times = date_range("2017-6-23 18:00", periods=8, freq="15min", tz="UTC") data = Series([1.0, 1, 1, 1, 1, 2, 2, 0], index=times) weights = Series([160.0, 91, 65, 43, 24, 10, 1, 0], index=times) @@ -325,16 +351,19 @@ tm.assert_series_equal(result, expected) -def test_resample_groupby_with_label(): +def test_resample_groupby_with_label(unit): # GH 13235 - index = date_range("2000-01-01", freq="2D", periods=5) + index = date_range("2000-01-01", freq="2D", periods=5, unit=unit) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) - result = df.groupby("col0").resample("1W", label="left").sum() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("col0").resample("1W", label="left").sum() mi = [ np.array([0, 0, 1, 2], dtype=np.int64), - pd.to_datetime( - np.array(["1999-12-26", "2000-01-02", "2000-01-02", "2000-01-02"]) + np.array( + ["1999-12-26", "2000-01-02", "2000-01-02", "2000-01-02"], + dtype=f"M8[{unit}]", ), ] mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None]) @@ -349,7 +378,9 @@ # consistent return values with window df = test_frame expected = Index([1, 2, 3], name="A") - result = df.groupby("A").resample("2s").mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -382,14 +413,14 @@ ) # access "col" via getattr -> make sure we handle AttributeError - result = df.resample("H").apply(lambda group: group.col.sum()) + result = df.resample("h").apply(lambda group: group.col.sum()) expected = Series( - [3, 12, 21, 9], index=date_range("2012-01-01", periods=4, freq="H") + [3, 12, 21, 9], index=date_range("2012-01-01", periods=4, freq="h") ) tm.assert_series_equal(result, expected) # access "col" via _getitem__ -> make sure we handle KeyErrpr - result = df.resample("H").apply(lambda group: group["col"].sum()) + result = df.resample("h").apply(lambda group: group["col"].sum()) tm.assert_series_equal(result, expected) @@ -422,7 +453,7 @@ ) df["date"] = pd.to_datetime(df["date"]) - resampled = df.groupby("cat").resample("Y", on="date") + resampled = df.groupby("cat").resample("YE", on="date") expected = resampled[["num"]].sum() result = resampled.agg({"num": "sum"}) @@ -433,7 +464,7 @@ # GH 42905 ts = Timestamp("2021-02-28 00:00:00") df = DataFrame({"class": ["beta"], "value": [69]}, index=Index([ts], name="date")) - resampled = df.groupby("class").resample("M")["value"] + resampled = df.groupby("class").resample("ME")["value"] result = resampled.agg(["sum", "size"]) expected = DataFrame( [[69, 1]], @@ -447,7 +478,9 @@ def test_empty(keys): # GH 26411 df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = ( DataFrame(columns=["a", "b"]) .set_index(keys, drop=False) @@ -470,11 
+503,15 @@ if consolidate: df = df._consolidate() - result = df.groupby(["key"]).resample("W", on="date").min() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ ["A"] * 3 + ["B"] * 3, - pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2), + pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2).as_unit( + "ns" + ), ], names=["key", "date"], ) @@ -499,19 +536,15 @@ } ) result = df.groupby("group").resample("2D", on="date")[["val"]].mean() + + mi_exp = pd.MultiIndex.from_arrays( + [[0, 0, 1, 1], df["date"]._values[::2]], names=["group", "date"] + ) expected = DataFrame( data={ "val": [4.0, 3.5, 6.5, 3.0], }, - index=Index( - data=[ - (0, Timestamp("2016-01-01")), - (0, Timestamp("2016-01-03")), - (1, Timestamp("2016-01-05")), - (1, Timestamp("2016-01-07")), - ], - name=("group", "date"), - ), + index=mi_exp, ) tm.assert_frame_equal(result, expected) @@ -522,7 +555,9 @@ df = DataFrame([], columns=["a", "b", "date"]) df["date"] = pd.to_datetime(df["date"]) df = df.set_index("date") - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False) expected["date"] = pd.to_datetime(expected["date"]) expected = expected.set_index("date", append=True, drop=True) @@ -567,20 +602,22 @@ # GH 46826 df = DataFrame( {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)}, - index=date_range("31/12/2000 18:00", freq="H", periods=12), + index=date_range("31/12/2000 18:00", freq="h", periods=12), + ) + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = df.groupby("A").resample("D").size() + + mi_exp = pd.MultiIndex.from_arrays( + [ + [1, 1, 2, 2], + pd.DatetimeIndex(["2000-12-31", "2001-01-01"] * 2, dtype="M8[ns]"), + ], + names=["A", None], ) - result = df.groupby("A").resample("D").size() expected = Series( 3, - index=pd.MultiIndex.from_tuples( - [ - (1, Timestamp("2000-12-31")), - (1, Timestamp("2001-01-01")), - (2, Timestamp("2000-12-31")), - (2, Timestamp("2001-01-01")), - ], - names=["A", None], - ), + index=mi_exp, ) tm.assert_series_equal(result, expected) @@ -592,25 +629,18 @@ "group": [0, 0, 0, 0, 1, 1, 1, 1], "val": [3, 1, 4, 1, 5, 9, 2, 6], }, - index=Series( - date_range(start="2016-01-01", periods=8), - name="date", - ), + index=date_range(start="2016-01-01", periods=8, name="date"), ) result = df.groupby("group").resample("2D")[["val"]].mean() + + mi_exp = pd.MultiIndex.from_arrays( + [[0, 0, 1, 1], df.index[::2]], names=["group", "date"] + ) expected = DataFrame( data={ "val": [2.0, 2.5, 7.0, 4.0], }, - index=Index( - data=[ - (0, Timestamp("2016-01-01")), - (0, Timestamp("2016-01-03")), - (1, Timestamp("2016-01-05")), - (1, Timestamp("2016-01-07")), - ], - name=("group", "date"), - ), + index=mi_exp, ) tm.assert_frame_equal(result, expected) @@ -624,26 +654,19 @@ "second_val": [2, 7, 1, 8, 2, 8, 1, 8], "third_val": [1, 4, 1, 4, 2, 1, 3, 5], }, - index=Series( - date_range(start="2016-01-01", periods=8), - name="date", - ), + index=date_range(start="2016-01-01", periods=8, name="date"), ) result 
= df.groupby("group").resample("2D")[["first_val", "second_val"]].mean() + + mi_exp = pd.MultiIndex.from_arrays( + [[0, 0, 1, 1], df.index[::2]], names=["group", "date"] + ) expected = DataFrame( data={ "first_val": [2.0, 2.5, 7.0, 4.0], "second_val": [4.5, 4.5, 5.0, 4.5], }, - index=Index( - data=[ - (0, Timestamp("2016-01-01")), - (0, Timestamp("2016-01-03")), - (1, Timestamp("2016-01-05")), - (1, Timestamp("2016-01-07")), - ], - name=("group", "date"), - ), + index=mi_exp, ) tm.assert_frame_equal(result, expected) @@ -660,8 +683,10 @@ name="date", ), ) + gb = df.groupby("group") + rs = gb.resample("2D") with pytest.raises(KeyError, match="Columns not found"): - df.groupby("group").resample("2D")[["val_not_in_dataframe"]].mean() + rs[["val_not_in_dataframe"]] @pytest.mark.parametrize("kind", ["datetime", "period"]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/resample/test_time_grouper.py pandas-2.2.2+dfsg/pandas/tests/resample/test_time_grouper.py --- pandas-2.1.4+dfsg/pandas/tests/resample/test_time_grouper.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/resample/test_time_grouper.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,6 +7,7 @@ import pandas as pd from pandas import ( DataFrame, + Index, Series, Timestamp, ) @@ -24,7 +25,7 @@ def test_apply(test_series): - grouper = Grouper(freq="A", label="right", closed="right") + grouper = Grouper(freq="YE", label="right", closed="right") grouped = test_series.groupby(grouper) @@ -44,18 +45,18 @@ expected = test_series.groupby(lambda x: x.year).count() - grouper = Grouper(freq="A", label="right", closed="right") + grouper = Grouper(freq="YE", label="right", closed="right") result = test_series.groupby(grouper).count() expected.index = result.index tm.assert_series_equal(result, expected) - result = test_series.resample("A").count() + result = test_series.resample("YE").count() expected.index = result.index tm.assert_series_equal(result, expected) def test_numpy_reduction(test_series): - result = test_series.resample("A", closed="right").prod() + result = test_series.resample("YE", closed="right").prod() msg = "using SeriesGroupBy.prod" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -70,7 +71,7 @@ N = 1000 ind = date_range(start="2000-01-01", freq="D", periods=N) df = DataFrame({"open": 1, "close": 2}, index=ind) - tg = Grouper(freq="M") + tg = Grouper(freq="ME") grouper, _ = tg._get_grouper(df) @@ -86,19 +87,17 @@ @pytest.mark.parametrize( - "func", + "index", [ - tm.makeIntIndex, - tm.makeStringIndex, - tm.makeFloatIndex, - (lambda m: tm.makeCustomIndex(m, 2)), + Index([1, 2]), + Index(["a", "b"]), + Index([1.1, 2.2]), + pd.MultiIndex.from_arrays([[1, 2], ["a", "b"]]), ], ) -def test_fails_on_no_datetime_index(func): - n = 2 - index = func(n) +def test_fails_on_no_datetime_index(index): name = type(index).__name__ - df = DataFrame({"a": np.random.default_rng(2).standard_normal(n)}, index=index) + df = DataFrame({"a": range(len(index))}, index=index) msg = ( "Only valid with DatetimeIndex, TimedeltaIndex " @@ -138,13 +137,17 @@ normal_df["key"] = [1, 2, 3, 4, 5] * 4 dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) - dt_df["key"] = [ - datetime(2013, 1, 1), - datetime(2013, 1, 2), - datetime(2013, 1, 3), - datetime(2013, 1, 4), - datetime(2013, 1, 5), - ] * 4 + dt_df["key"] = Index( + [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + datetime(2013, 1, 3), + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] + * 4, + dtype="M8[ns]", + ) normal_grouped = normal_df.groupby("key") dt_grouped = 
dt_df.groupby(Grouper(key="key", freq="D")) @@ -193,11 +196,11 @@ ], ) def test_resample_entirely_nat_window(method, method_args, unit): - s = Series([0] * 2 + [np.nan] * 2, index=date_range("2017", periods=4)) - result = methodcaller(method, **method_args)(s.resample("2d")) - expected = Series( - [0.0, unit], index=pd.DatetimeIndex(["2017-01-01", "2017-01-03"], freq="2D") - ) + ser = Series([0] * 2 + [np.nan] * 2, index=date_range("2017", periods=4)) + result = methodcaller(method, **method_args)(ser.resample("2d")) + + exp_dti = pd.DatetimeIndex(["2017-01-01", "2017-01-03"], dtype="M8[ns]", freq="2D") + expected = Series([0.0, unit], index=exp_dti) tm.assert_series_equal(result, expected) @@ -216,13 +219,17 @@ normal_df["key"] = [1, 2, np.nan, 4, 5] * 4 dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) - dt_df["key"] = [ - datetime(2013, 1, 1), - datetime(2013, 1, 2), - pd.NaT, - datetime(2013, 1, 4), - datetime(2013, 1, 5), - ] * 4 + dt_df["key"] = Index( + [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + pd.NaT, + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] + * 4, + dtype="M8[ns]", + ) normal_grouped = normal_df.groupby("key") dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) @@ -233,7 +240,13 @@ pad = DataFrame([[fill_value] * 4], index=[3], columns=["A", "B", "C", "D"]) expected = pd.concat([normal_result, pad]) expected = expected.sort_index() - dti = date_range(start="2013-01-01", freq="D", periods=5, name="key") + dti = date_range( + start="2013-01-01", + freq="D", + periods=5, + name="key", + unit=dt_df["key"]._values.unit, + ) expected.index = dti._with_freq(None) # TODO: is this desired? tm.assert_frame_equal(expected, dt_result) assert dt_result.index.name == "key" @@ -247,13 +260,17 @@ normal_df["key"] = [1, 2, np.nan, 4, 5] * 4 dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) - dt_df["key"] = [ - datetime(2013, 1, 1), - datetime(2013, 1, 2), - pd.NaT, - datetime(2013, 1, 4), - datetime(2013, 1, 5), - ] * 4 + dt_df["key"] = Index( + [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + pd.NaT, + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] + * 4, + dtype="M8[ns]", + ) normal_grouped = normal_df.groupby("key") dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) @@ -265,7 +282,11 @@ expected = pd.concat([normal_result, pad]) expected = expected.sort_index() expected.index = date_range( - start="2013-01-01", freq="D", periods=5, name="key" + start="2013-01-01", + freq="D", + periods=5, + name="key", + unit=dt_df["key"]._values.unit, )._with_freq(None) tm.assert_series_equal(expected, dt_result) assert dt_result.index.name == "key" @@ -273,7 +294,7 @@ def test_repr(): # GH18203 - result = repr(Grouper(key="A", freq="H")) + result = repr(Grouper(key="A", freq="h")) expected = ( "TimeGrouper(key='A', freq=, axis=0, sort=True, dropna=True, " "closed='left', label='left', how='mean', " @@ -281,7 +302,7 @@ ) assert result == expected - result = repr(Grouper(key="A", freq="H", origin="2000-01-01")) + result = repr(Grouper(key="A", freq="h", origin="2000-01-01")) expected = ( "TimeGrouper(key='A', freq=, axis=0, sort=True, dropna=True, " "closed='left', label='left', how='mean', " @@ -304,11 +325,12 @@ ], ) def test_upsample_sum(method, method_args, expected_values): - s = Series(1, index=date_range("2017", periods=2, freq="H")) - resampled = s.resample("30T") + ser = Series(1, index=date_range("2017", periods=2, freq="h")) + resampled = ser.resample("30min") index = pd.DatetimeIndex( ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"], 
- freq="30T", + dtype="M8[ns]", + freq="30min", ) result = methodcaller(method, **method_args)(resampled) expected = Series(expected_values, index=index) @@ -323,32 +345,21 @@ df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") - result = ( - df.set_index("week_starting") - .groupby("volume") - .resample("1D") - .interpolate(method="linear") - ) - - expected_ind = pd.MultiIndex.from_tuples( - [ - (50, Timestamp("2018-01-07")), - (50, Timestamp("2018-01-08")), - (50, Timestamp("2018-01-09")), - (50, Timestamp("2018-01-10")), - (50, Timestamp("2018-01-11")), - (50, Timestamp("2018-01-12")), - (50, Timestamp("2018-01-13")), - (50, Timestamp("2018-01-14")), - (50, Timestamp("2018-01-15")), - (50, Timestamp("2018-01-16")), - (50, Timestamp("2018-01-17")), - (50, Timestamp("2018-01-18")), - (50, Timestamp("2018-01-19")), - (50, Timestamp("2018-01-20")), - (50, Timestamp("2018-01-21")), - (60, Timestamp("2018-01-14")), - ], + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = ( + df.set_index("week_starting") + .groupby("volume") + .resample("1D") + .interpolate(method="linear") + ) + + volume = [50] * 15 + [60] + week_starting = list(date_range("2018-01-07", "2018-01-21")) + [ + Timestamp("2018-01-14") + ] + expected_ind = pd.MultiIndex.from_arrays( + [volume, week_starting], names=["volume", "week_starting"], ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/resample/test_timedelta.py pandas-2.2.2+dfsg/pandas/tests/resample/test_timedelta.py --- pandas-2.1.4+dfsg/pandas/tests/resample/test_timedelta.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/resample/test_timedelta.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -14,10 +16,10 @@ def test_asfreq_bug(): df = DataFrame(data=[1, 3], index=[timedelta(), timedelta(minutes=3)]) - result = df.resample("1T").asfreq() + result = df.resample("1min").asfreq() expected = DataFrame( data=[1, np.nan, np.nan, 3], - index=timedelta_range("0 day", periods=4, freq="1T"), + index=timedelta_range("0 day", periods=4, freq="1min"), ) tm.assert_frame_equal(result, expected) @@ -28,19 +30,19 @@ result = DataFrame({"value": [2, 3, 5]}, index).resample("1s").mean() expected = DataFrame( {"value": [2.5, np.nan, 5.0]}, - index=timedelta_range("0 day", periods=3, freq="1S"), + index=timedelta_range("0 day", periods=3, freq="1s"), ) tm.assert_frame_equal(result, expected) def test_resample_as_freq_with_subperiod(): # GH 13022 - index = timedelta_range("00:00:00", "00:10:00", freq="5T") + index = timedelta_range("00:00:00", "00:10:00", freq="5min") df = DataFrame(data={"value": [1, 5, 10]}, index=index) - result = df.resample("2T").asfreq() + result = df.resample("2min").asfreq() expected_data = {"value": [1, np.nan, np.nan, np.nan, np.nan, 10]} expected = DataFrame( - data=expected_data, index=timedelta_range("00:00:00", "00:10:00", freq="2T") + data=expected_data, index=timedelta_range("00:00:00", "00:10:00", freq="2min") ) tm.assert_frame_equal(result, expected) @@ -71,9 +73,9 @@ def test_resample_timedelta_idempotency(): # GH 12072 - index = timedelta_range("0", periods=9, freq="10L") + index = timedelta_range("0", periods=9, freq="10ms") series = Series(range(9), index=index) - result = series.resample("10L").mean() + result = series.resample("10ms").mean() expected = series.astype(float) 
tm.assert_series_equal(result, expected) @@ -98,12 +100,15 @@ df = DataFrame({"Group_obj": "A"}, index=pd.to_timedelta(list(range(20)), unit="s")) df["Group"] = df["Group_obj"].astype("category") result = df.resample("10s").agg(lambda x: (x.value_counts().index[0])) + exp_tdi = pd.TimedeltaIndex(np.array([0, 10], dtype="m8[s]"), freq="10s").as_unit( + "ns" + ) expected = DataFrame( {"Group_obj": ["A", "A"], "Group": ["A", "A"]}, - index=pd.TimedeltaIndex([0, 10], unit="s", freq="10s"), + index=exp_tdi, ) expected = expected.reindex(["Group_obj", "Group"], axis=1) - expected["Group"] = expected["Group_obj"] + expected["Group"] = expected["Group_obj"].astype("category") tm.assert_frame_equal(result, expected) @@ -128,13 +133,13 @@ @pytest.mark.parametrize( "start, end, freq, resample_freq", [ - ("8H", "21h59min50s", "10S", "3H"), # GH 30353 example - ("3H", "22H", "1H", "5H"), + ("8h", "21h59min50s", "10s", "3h"), # GH 30353 example + ("3h", "22h", "1h", "5h"), ("527D", "5006D", "3D", "10D"), ("1D", "10D", "1D", "2D"), # GH 13022 example # tests that worked before GH 33498: - ("8H", "21h59min50s", "10S", "2H"), - ("0H", "21h59min50s", "10S", "3H"), + ("8h", "21h59min50s", "10s", "2h"), + ("0h", "21h59min50s", "10s", "3h"), ("10D", "85D", "D", "2D"), ], ) @@ -155,7 +160,7 @@ # GH 10603 df = DataFrame( np.random.default_rng(2).normal(size=(10000, 4)), - index=timedelta_range(start="0s", periods=10000, freq="3906250n"), + index=timedelta_range(start="0s", periods=10000, freq="3906250ns"), ) if duplicates: # case with non-unique columns @@ -196,11 +201,20 @@ # GH#45414 idx = pd.Index([pd.Timedelta(seconds=120 + i * 30) for i in range(10)]) ser = Series(range(10), index=idx) - result = ser.resample("T", closed="right", label="right").sum() + result = ser.resample("min", closed="right", label="right").sum() expected = Series( [0, 3, 7, 11, 15, 9], index=pd.TimedeltaIndex( - [pd.Timedelta(seconds=120 + i * 60) for i in range(6)], freq="T" + [pd.Timedelta(seconds=120 + i * 60) for i in range(6)], freq="min" ), ) tm.assert_series_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_arrow_duration_resample(): + # GH 56371 + idx = pd.Index(timedelta_range("1 day", periods=5), dtype="duration[ns][pyarrow]") + expected = Series(np.arange(5, dtype=np.float64), index=idx) + result = expected.resample("1D").mean() + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_append_common.py pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_append_common.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_append_common.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_append_common.py 2024-04-10 17:42:52.000000000 +0000 @@ -57,10 +57,12 @@ Test common dtype coercion rules between concat and append. 
""" - def test_dtypes(self, item, index_or_series): + def test_dtypes(self, item, index_or_series, using_infer_string): # to confirm test case covers intended dtypes typ, vals = item obj = index_or_series(vals) + if typ == "object" and using_infer_string: + typ = "string" if isinstance(obj, Index): assert obj.dtype == typ elif isinstance(obj, Series): @@ -197,11 +199,11 @@ # index doesn't because bool is object dtype exp_series_dtype = typ2 mark = pytest.mark.xfail(reason="GH#39187 casting to object") - request.node.add_marker(mark) + request.applymarker(mark) elif typ2 == "bool" and typ1 in ("int64", "float64"): exp_series_dtype = typ1 mark = pytest.mark.xfail(reason="GH#39187 casting to object") - request.node.add_marker(mark) + request.applymarker(mark) elif typ1 in {"datetime64[ns, US/Eastern]", "timedelta64[ns]"} or typ2 in { "datetime64[ns, US/Eastern]", "timedelta64[ns]", @@ -315,7 +317,7 @@ exp_idx = pd.DatetimeIndex( ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"], tz=tz, - ) + ).as_unit("ns") exp = DataFrame(0, index=exp_idx, columns=["A", "B"]) tm.assert_frame_equal(df1._append(df2), exp) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_categorical.py pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_categorical.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_categorical.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_categorical.py 2024-04-10 17:42:52.000000000 +0000 @@ -51,7 +51,7 @@ exp["h"] = exp["h"].astype(df2["h"].dtype) tm.assert_frame_equal(res, exp) - def test_categorical_concat_dtypes(self): + def test_categorical_concat_dtypes(self, using_infer_string): # GH8143 index = ["cat", "obj", "num"] cat = Categorical(["a", "b", "c"]) @@ -59,7 +59,9 @@ num = Series([1, 2, 3]) df = pd.concat([Series(cat), obj, num], axis=1, keys=index) - result = df.dtypes == "object" + result = df.dtypes == ( + object if not using_infer_string else "string[pyarrow_numpy]" + ) expected = Series([False, True, False], index=index) tm.assert_series_equal(result, expected) @@ -220,7 +222,7 @@ def test_categorical_index_upcast(self): # GH 17629 - # test upcasting to object when concatinating on categorical indexes + # test upcasting to object when concatenating on categorical indexes # with non-identical categories a = DataFrame({"foo": [1, 2]}, index=Categorical(["foo", "bar"])) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_concat.py pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_concat.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_concat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_concat.py 2024-04-10 17:42:52.000000000 +0000 @@ -30,8 +30,8 @@ class TestConcatenate: def test_append_concat(self): # GH#1815 - d1 = date_range("12/31/1990", "12/31/1999", freq="A-DEC") - d2 = date_range("12/31/2000", "12/31/2009", freq="A-DEC") + d1 = date_range("12/31/1990", "12/31/1999", freq="YE-DEC") + d2 = date_range("12/31/2000", "12/31/2009", freq="YE-DEC") s1 = Series(np.random.default_rng(2).standard_normal(10), d1) s2 = Series(np.random.default_rng(2).standard_normal(10), d2) @@ -267,12 +267,11 @@ # it works concat([df1, df2], sort=sort) - def test_concat_mixed_objs(self): - # concat mixed series/frames + def test_concat_mixed_objs_columns(self): + # Test column-wise concat for mixed series/frames (axis=1) # G2385 - # axis 1 - index = date_range("01-Jan-2013", periods=10, freq="H") + index = date_range("01-Jan-2013", periods=10, freq="h") 
arr = np.arange(10, dtype="int64") s1 = Series(arr, index=index) s2 = Series(arr, index=index) @@ -324,13 +323,41 @@ result = concat([s1, df, s2], axis=1, ignore_index=True) tm.assert_frame_equal(result, expected) - # axis 0 + def test_concat_mixed_objs_index(self): + # Test row-wise concat for mixed series/frames with a common name + # GH2385, GH15047 + + index = date_range("01-Jan-2013", periods=10, freq="h") + arr = np.arange(10, dtype="int64") + s1 = Series(arr, index=index) + s2 = Series(arr, index=index) + df = DataFrame(arr.reshape(-1, 1), index=index) + expected = DataFrame( np.tile(arr, 3).reshape(-1, 1), index=index.tolist() * 3, columns=[0] ) result = concat([s1, df, s2]) tm.assert_frame_equal(result, expected) + def test_concat_mixed_objs_index_names(self): + # Test row-wise concat for mixed series/frames with distinct names + # GH2385, GH15047 + + index = date_range("01-Jan-2013", periods=10, freq="h") + arr = np.arange(10, dtype="int64") + s1 = Series(arr, index=index, name="foo") + s2 = Series(arr, index=index, name="bar") + df = DataFrame(arr.reshape(-1, 1), index=index) + + expected = DataFrame( + np.kron(np.where(np.identity(3) == 1, 1, np.nan), arr).T, + index=index.tolist() * 3, + columns=["foo", 0, "bar"], + ) + result = concat([s1, df, s2]) + tm.assert_frame_equal(result, expected) + + # Rename all series to 0 when ignore_index=True expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0]) result = concat([s1, df, s2], ignore_index=True) tm.assert_frame_equal(result, expected) @@ -387,8 +414,10 @@ tm.assert_frame_equal(result, expected) def test_concat_bug_1719(self): - ts1 = tm.makeTimeSeries() - ts2 = tm.makeTimeSeries()[::2] + ts1 = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) + ts2 = ts1.copy()[::2] # to join with union # these two are of different length! 
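The concat tests in this region also drop the private `tm.makeTimeSeries()` helper in favour of spelling the test data out. The explicit construction used in the hunk above is roughly the following; the `name` and the slicing into pieces are only needed where the original series carried them:

import numpy as np
import pandas as pd

ts = pd.Series(
    np.arange(10, dtype=np.float64),
    index=pd.date_range("2020-01-01", periods=10),
    name="foo",
)
pieces = [ts[:5], ts[5:]]
result = pd.concat(pieces)   # round-trips back to the original series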
@@ -870,3 +899,14 @@ result = concat([df1, df2]) expected = DataFrame(["a", 1], index=[0, 0]) tm.assert_frame_equal(result, expected) + + +def test_concat_none_with_timezone_timestamp(): + # GH#52093 + df1 = DataFrame([{"A": None}]) + df2 = DataFrame([{"A": pd.Timestamp("1990-12-20 00:00:00+00:00")}]) + msg = "The behavior of DataFrame concatenation with empty or all-NA entries" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = concat([df1, df2], ignore_index=True) + expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]}) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_dataframe.py pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_dataframe.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_dataframe.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_dataframe.py 2024-04-10 17:42:52.000000000 +0000 @@ -197,7 +197,7 @@ @pytest.mark.parametrize("axis", [0, 1]) def test_concat_copies(self, axis, order, ignore_index, using_copy_on_write): # based on asv ConcatDataFrames - df = DataFrame(np.zeros((10000, 200), dtype=np.float32, order=order)) + df = DataFrame(np.zeros((10, 5), dtype=np.float32, order=order)) res = concat([df] * 5, axis=axis, ignore_index=ignore_index, copy=True) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_datetimes.py pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_datetimes.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_datetimes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_datetimes.py 2024-04-10 17:42:52.000000000 +0000 @@ -46,32 +46,28 @@ def test_concat_datetime_timezone(self): # GH 18523 - idx1 = date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris") - idx2 = date_range(start=idx1[0], end=idx1[-1], freq="H") + idx1 = date_range("2011-01-01", periods=3, freq="h", tz="Europe/Paris") + idx2 = date_range(start=idx1[0], end=idx1[-1], freq="h") df1 = DataFrame({"a": [1, 2, 3]}, index=idx1) df2 = DataFrame({"b": [1, 2, 3]}, index=idx2) result = concat([df1, df2], axis=1) - exp_idx = ( - DatetimeIndex( - [ - "2011-01-01 00:00:00+01:00", - "2011-01-01 01:00:00+01:00", - "2011-01-01 02:00:00+01:00", - ], - freq="H", - ) - .tz_convert("UTC") - .tz_convert("Europe/Paris") + exp_idx = DatetimeIndex( + [ + "2011-01-01 00:00:00+01:00", + "2011-01-01 01:00:00+01:00", + "2011-01-01 02:00:00+01:00", + ], + dtype="M8[ns, Europe/Paris]", + freq="h", ) - expected = DataFrame( [[1, 1], [2, 2], [3, 3]], index=exp_idx, columns=["a", "b"] ) tm.assert_frame_equal(result, expected) - idx3 = date_range("2011-01-01", periods=3, freq="H", tz="Asia/Tokyo") + idx3 = date_range("2011-01-01", periods=3, freq="h", tz="Asia/Tokyo") df3 = DataFrame({"b": [1, 2, 3]}, index=idx3) result = concat([df1, df3], axis=1) @@ -84,7 +80,7 @@ "2011-01-01 00:00:00+00:00", "2011-01-01 01:00:00+00:00", ] - ) + ).as_unit("ns") expected = DataFrame( [ @@ -102,7 +98,7 @@ tm.assert_frame_equal(result, expected) # GH 13783: Concat after resample - result = concat([df1.resample("H").mean(), df2.resample("H").mean()], sort=True) + result = concat([df1.resample("h").mean(), df2.resample("h").mean()], sort=True) expected = DataFrame( {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]}, index=idx1.append(idx1), @@ -112,7 +108,7 @@ def test_concat_datetimeindex_freq(self): # GH 3232 # Monotonic index result - dr = date_range("01-Jan-2013", periods=100, freq="50L", tz="UTC") + dr = 
date_range("01-Jan-2013", periods=100, freq="50ms", tz="UTC") data = list(range(100)) expected = DataFrame(data, index=dr) result = concat([expected[:50], expected[50:]]) @@ -178,6 +174,7 @@ result = concat([y, y], ignore_index=True) tm.assert_series_equal(result, expected) + def test_concat_NaT_series2(self): # without tz x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h")) y = Series(date_range("20151124 10:00", "20151124 11:00", freq="1h")) @@ -196,8 +193,8 @@ def test_concat_NaT_dataframes(self, tz): # GH 12396 - first = DataFrame([[pd.NaT], [pd.NaT]]) - first = first.apply(lambda x: x.dt.tz_localize(tz)) + dti = DatetimeIndex([pd.NaT, pd.NaT], tz=tz) + first = DataFrame({0: dti}) second = DataFrame( [[Timestamp("2015/01/01", tz=tz)], [Timestamp("2016/01/01", tz=tz)]], index=[2, 3], @@ -298,6 +295,7 @@ result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) + def test_concat_tz_series2(self): # gh-11887: concat tz and object x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) y = Series(["a", "b"]) @@ -305,46 +303,58 @@ result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) + def test_concat_tz_series3(self, unit, unit2): # see gh-12217 and gh-12306 # Concatenating two UTC times - first = DataFrame([[datetime(2016, 1, 1)]]) + first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]") first[0] = first[0].dt.tz_localize("UTC") - second = DataFrame([[datetime(2016, 1, 2)]]) + second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]") second[0] = second[0].dt.tz_localize("UTC") result = concat([first, second]) - assert result[0].dtype == "datetime64[ns, UTC]" + exp_unit = tm.get_finest_unit(unit, unit2) + assert result[0].dtype == f"datetime64[{exp_unit}, UTC]" + def test_concat_tz_series4(self, unit, unit2): # Concatenating two London times - first = DataFrame([[datetime(2016, 1, 1)]]) + first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]") first[0] = first[0].dt.tz_localize("Europe/London") - second = DataFrame([[datetime(2016, 1, 2)]]) + second = DataFrame([[datetime(2016, 1, 2)]], dtype=f"M8[{unit2}]") second[0] = second[0].dt.tz_localize("Europe/London") result = concat([first, second]) - assert result[0].dtype == "datetime64[ns, Europe/London]" + exp_unit = tm.get_finest_unit(unit, unit2) + assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]" + def test_concat_tz_series5(self, unit, unit2): # Concatenating 2+1 London times - first = DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) + first = DataFrame( + [[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]], dtype=f"M8[{unit}]" + ) first[0] = first[0].dt.tz_localize("Europe/London") - second = DataFrame([[datetime(2016, 1, 3)]]) + second = DataFrame([[datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]") second[0] = second[0].dt.tz_localize("Europe/London") result = concat([first, second]) - assert result[0].dtype == "datetime64[ns, Europe/London]" + exp_unit = tm.get_finest_unit(unit, unit2) + assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]" - # Concat'ing 1+2 London times - first = DataFrame([[datetime(2016, 1, 1)]]) + def test_concat_tz_series6(self, unit, unit2): + # Concatenating 1+2 London times + first = DataFrame([[datetime(2016, 1, 1)]], dtype=f"M8[{unit}]") first[0] = first[0].dt.tz_localize("Europe/London") - second = DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) + second = DataFrame( + [[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]], dtype=f"M8[{unit2}]" 
+ ) second[0] = second[0].dt.tz_localize("Europe/London") result = concat([first, second]) - assert result[0].dtype == "datetime64[ns, Europe/London]" + exp_unit = tm.get_finest_unit(unit, unit2) + assert result[0].dtype == f"datetime64[{exp_unit}, Europe/London]" def test_concat_tz_series_tzlocal(self): # see gh-13583 @@ -416,21 +426,25 @@ # GH 6606 df = DataFrame( { - "dt": [ - datetime(2014, 1, 1), - datetime(2014, 1, 2), - datetime(2014, 1, 3), - ], + "dt": DatetimeIndex( + [ + datetime(2014, 1, 1), + datetime(2014, 1, 2), + datetime(2014, 1, 3), + ], + dtype="M8[ns, US/Pacific]", + ), "b": ["A", "B", "C"], "c": [1, 2, 3], "d": [4, 5, 6], } ) - df["dt"] = df["dt"].apply(lambda d: Timestamp(d, tz="US/Pacific")) df = df.set_index(["dt", "b"]) exp_idx1 = DatetimeIndex( - ["2014-01-01", "2014-01-02", "2014-01-03"] * 2, tz="US/Pacific", name="dt" + ["2014-01-01", "2014-01-02", "2014-01-03"] * 2, + dtype="M8[ns, US/Pacific]", + name="dt", ) exp_idx2 = Index(["A", "B", "C"] * 2, name="b") exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) @@ -512,6 +526,7 @@ tm.assert_series_equal(result, expected) assert result.dtype == "object" + def test_concat_period_other_series2(self): # non-period x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) y = Series(DatetimeIndex(["2015-11-01", "2015-12-01"])) @@ -520,6 +535,7 @@ tm.assert_series_equal(result, expected) assert result.dtype == "object" + def test_concat_period_other_series3(self): x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) y = Series(["A", "B"]) expected = Series([x[0], x[1], y[0], y[1]], dtype="object") diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_empty.py pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_empty.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_empty.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_empty.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,7 +13,7 @@ class TestEmptyConcat: - def test_handle_empty_objects(self, sort): + def test_handle_empty_objects(self, sort, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd") ) @@ -26,7 +26,9 @@ concatted = concat(frames, axis=0, sort=sort) expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) - expected["foo"] = expected["foo"].astype("O") + expected["foo"] = expected["foo"].astype( + object if not using_infer_string else "string[pyarrow_numpy]" + ) expected.loc[0:4, "foo"] = "bar" tm.assert_frame_equal(concatted, expected) @@ -275,14 +277,14 @@ expected = DataFrame(columns=["a", "b"]) tm.assert_frame_equal(result, expected) - def test_concat_empty_dataframe_different_dtypes(self): + def test_concat_empty_dataframe_different_dtypes(self, using_infer_string): # 39037 df1 = DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) df2 = DataFrame({"a": [1, 2, 3]}) result = concat([df1[:0], df2[:0]]) assert result["a"].dtype == np.int64 - assert result["b"].dtype == np.object_ + assert result["b"].dtype == np.object_ if not using_infer_string else "string" def test_concat_to_empty_ea(self): """48510 `concat` to an empty EA should maintain type EA dtype.""" diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_index.py pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_index.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_index.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_index.py 2024-04-10 17:42:52.000000000 +0000 @@ -447,12 +447,14 @@ ) 
tm.assert_frame_equal(result, expected) - def test_concat_axis_1_sort_false_rangeindex(self): + def test_concat_axis_1_sort_false_rangeindex(self, using_infer_string): # GH 46675 s1 = Series(["a", "b", "c"]) s2 = Series(["a", "b"]) s3 = Series(["a", "b", "c", "d"]) - s4 = Series([], dtype=object) + s4 = Series( + [], dtype=object if not using_infer_string else "string[pyarrow_numpy]" + ) result = concat( [s1, s2, s3, s4], sort=False, join="outer", ignore_index=False, axis=1 ) @@ -463,7 +465,7 @@ ["c", np.nan] * 2, [np.nan] * 2 + ["d"] + [np.nan], ], - dtype=object, + dtype=object if not using_infer_string else "string[pyarrow_numpy]", ) tm.assert_frame_equal( result, expected, check_index_type=True, check_column_type=True diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_invalid.py pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_invalid.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_invalid.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_invalid.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,19 +12,19 @@ class TestInvalidConcat: - def test_concat_invalid(self): + @pytest.mark.parametrize("obj", [1, {}, [1, 2], (1, 2)]) + def test_concat_invalid(self, obj): # trying to concat a ndframe with a non-ndframe - df1 = tm.makeCustomDataframe(10, 2) - for obj in [1, {}, [1, 2], (1, 2)]: - msg = ( - f"cannot concatenate object of type '{type(obj)}'; " - "only Series and DataFrame objs are valid" - ) - with pytest.raises(TypeError, match=msg): - concat([df1, obj]) + df1 = DataFrame(range(2)) + msg = ( + f"cannot concatenate object of type '{type(obj)}'; " + "only Series and DataFrame objs are valid" + ) + with pytest.raises(TypeError, match=msg): + concat([df1, obj]) def test_concat_invalid_first_argument(self): - df1 = tm.makeCustomDataframe(10, 2) + df1 = DataFrame(range(2)) msg = ( "first argument must be an iterable of pandas " 'objects, you passed an object of type "DataFrame"' diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_series.py pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_series.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/concat/test_series.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/concat/test_series.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,7 +15,11 @@ class TestSeriesConcat: def test_concat_series(self): - ts = tm.makeTimeSeries() + ts = Series( + np.arange(20, dtype=np.float64), + index=date_range("2020-01-01", periods=20), + name="foo", + ) ts.name = "foo" pieces = [ts[:5], ts[5:15], ts[15:]] @@ -46,7 +50,9 @@ tm.assert_series_equal(result, expected) def test_concat_series_axis1(self): - ts = tm.makeTimeSeries() + ts = Series( + np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) + ) pieces = [ts[:-2], ts[2:], ts[2:-2]] diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/allow_exact_matches.csv pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/allow_exact_matches.csv --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/allow_exact_matches.csv 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/allow_exact_matches.csv 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -time,ticker,price,quantity,marketCenter,bid,ask -20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,, -20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 -20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 -20160525 
13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 -20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,, -20160525 13:30:00.075,AAPL,98.55,6,ARCA,, -20160525 13:30:00.075,AAPL,98.55,6,ARCA,, -20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/allow_exact_matches_and_tolerance.csv pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/allow_exact_matches_and_tolerance.csv --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/allow_exact_matches_and_tolerance.csv 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/allow_exact_matches_and_tolerance.csv 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -time,ticker,price,quantity,marketCenter,bid,ask -20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,, -20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 -20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 -20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,, -20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,, -20160525 13:30:00.075,AAPL,98.55,6,ARCA,, -20160525 13:30:00.075,AAPL,98.55,6,ARCA,, -20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/asof.csv pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/asof.csv --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/asof.csv 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/asof.csv 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -time,ticker,price,quantity,marketCenter,bid,ask -20160525 
13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95 -20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 -20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 -20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 -20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/asof2.csv pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/asof2.csv --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/asof2.csv 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/asof2.csv 1970-01-01 00:00:00.000000000 +0000 @@ -1,78 +0,0 @@ -time,ticker,price,quantity,marketCenter,bid,ask -20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95 -20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 -20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 -20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 -20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 -20160525 13:30:00.084,AAPL,98.64,40,NASDAQ,98.55,98.56 -20160525 13:30:00.084,AAPL,98.55,149,EDGX,98.55,98.56 -20160525 
13:30:00.086,AAPL,98.56,500,ARCA,98.55,98.63 -20160525 13:30:00.104,AAPL,98.63,647,EDGX,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,300,EDGX,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,50,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,50,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,70,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,70,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,1,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,62,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,10,NASDAQ,98.62,98.63 -20160525 13:30:00.104,AAPL,98.63,100,ARCA,98.62,98.63 -20160525 13:30:00.105,AAPL,98.63,100,ARCA,98.62,98.63 -20160525 13:30:00.105,AAPL,98.63,700,ARCA,98.62,98.63 -20160525 13:30:00.106,AAPL,98.63,61,EDGX,98.62,98.63 -20160525 13:30:00.107,AAPL,98.63,100,ARCA,98.62,98.63 -20160525 13:30:00.107,AAPL,98.63,53,ARCA,98.62,98.63 -20160525 13:30:00.108,AAPL,98.63,100,ARCA,98.62,98.63 -20160525 13:30:00.108,AAPL,98.63,839,ARCA,98.62,98.63 -20160525 13:30:00.115,AAPL,98.63,5,EDGX,98.62,98.63 -20160525 13:30:00.118,AAPL,98.63,295,EDGX,98.62,98.63 -20160525 13:30:00.118,AAPL,98.63,5,EDGX,98.62,98.63 -20160525 13:30:00.128,AAPL,98.63,100,NASDAQ,98.62,98.63 -20160525 13:30:00.128,AAPL,98.63,100,NASDAQ,98.62,98.63 -20160525 13:30:00.128,MSFT,51.92,100,ARCA,51.92,51.95 -20160525 13:30:00.129,AAPL,98.62,100,NASDAQ,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,10,NASDAQ,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,59,NASDAQ,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,31,NASDAQ,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,69,NASDAQ,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,12,NASDAQ,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,12,EDGX,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,100,ARCA,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,100,ARCA,98.61,98.63 -20160525 13:30:00.130,MSFT,51.95,317,ARCA,51.93,51.95 -20160525 13:30:00.130,MSFT,51.95,283,ARCA,51.93,51.95 -20160525 13:30:00.135,MSFT,51.93,100,EDGX,51.92,51.95 -20160525 13:30:00.135,AAPL,98.62,100,ARCA,98.61,98.62 -20160525 13:30:00.144,AAPL,98.62,12,NASDAQ,98.61,98.62 -20160525 13:30:00.144,AAPL,98.62,88,NASDAQ,98.61,98.62 -20160525 13:30:00.144,AAPL,98.62,162,NASDAQ,98.61,98.62 -20160525 13:30:00.144,AAPL,98.61,100,BATS,98.61,98.62 -20160525 13:30:00.144,AAPL,98.62,61,ARCA,98.61,98.62 -20160525 13:30:00.144,AAPL,98.62,25,ARCA,98.61,98.62 -20160525 13:30:00.144,AAPL,98.62,14,ARCA,98.61,98.62 -20160525 13:30:00.145,AAPL,98.62,12,ARCA,98.6,98.63 -20160525 13:30:00.145,AAPL,98.62,100,ARCA,98.6,98.63 -20160525 13:30:00.145,AAPL,98.63,100,NASDAQ,98.6,98.63 -20160525 13:30:00.145,AAPL,98.63,100,NASDAQ,98.6,98.63 diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/quotes.csv pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/quotes.csv --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/quotes.csv 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/quotes.csv 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -time,ticker,bid,ask -20160525 13:30:00.023,GOOG,720.50,720.93 -20160525 13:30:00.023,MSFT,51.95,51.95 -20160525 13:30:00.041,MSFT,51.95,51.95 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.072,GOOG,720.50,720.88 -20160525 13:30:00.075,AAPL,98.55,98.56 -20160525 13:30:00.076,AAPL,98.55,98.56 -20160525 13:30:00.076,AAPL,98.55,98.56 -20160525 13:30:00.076,AAPL,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,51.95 -20160525 
13:30:00.078,MSFT,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,51.95 -20160525 13:30:00.078,MSFT,51.92,51.95 diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/quotes2.csv pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/quotes2.csv --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/quotes2.csv 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/quotes2.csv 1970-01-01 00:00:00.000000000 +0000 @@ -1,57 +0,0 @@ -time,ticker,bid,ask -20160525 13:30:00.023,GOOG,720.50,720.93 -20160525 13:30:00.023,MSFT,51.95,51.95 -20160525 13:30:00.041,MSFT,51.95,51.95 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.048,GOOG,720.50,720.93 -20160525 13:30:00.072,GOOG,720.50,720.88 -20160525 13:30:00.075,AAPL,98.55,98.56 -20160525 13:30:00.076,AAPL,98.55,98.56 -20160525 13:30:00.076,AAPL,98.55,98.56 -20160525 13:30:00.076,AAPL,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,51.95 -20160525 13:30:00.078,MSFT,51.95,51.95 -20160525 13:30:00.078,MSFT,51.92,51.95 -20160525 13:30:00.079,MSFT,51.92,51.95 -20160525 13:30:00.080,AAPL,98.55,98.56 -20160525 13:30:00.084,AAPL,98.55,98.56 -20160525 13:30:00.086,AAPL,98.55,98.63 -20160525 13:30:00.088,AAPL,98.65,98.63 -20160525 13:30:00.089,AAPL,98.63,98.63 -20160525 13:30:00.104,AAPL,98.63,98.63 -20160525 13:30:00.104,AAPL,98.63,98.63 -20160525 13:30:00.104,AAPL,98.63,98.63 -20160525 13:30:00.104,AAPL,98.63,98.63 -20160525 13:30:00.104,AAPL,98.62,98.63 -20160525 13:30:00.105,AAPL,98.62,98.63 -20160525 13:30:00.107,AAPL,98.62,98.63 -20160525 13:30:00.115,AAPL,98.62,98.63 -20160525 13:30:00.115,AAPL,98.62,98.63 -20160525 13:30:00.118,AAPL,98.62,98.63 -20160525 13:30:00.128,AAPL,98.62,98.63 -20160525 13:30:00.128,AAPL,98.62,98.63 -20160525 13:30:00.129,AAPL,98.62,98.63 -20160525 13:30:00.129,AAPL,98.61,98.63 -20160525 13:30:00.129,AAPL,98.62,98.63 -20160525 13:30:00.129,AAPL,98.62,98.63 -20160525 13:30:00.129,AAPL,98.61,98.63 -20160525 13:30:00.130,MSFT,51.93,51.95 -20160525 13:30:00.130,MSFT,51.93,51.95 -20160525 13:30:00.130,AAPL,98.61,98.63 -20160525 13:30:00.131,AAPL,98.61,98.62 -20160525 13:30:00.131,AAPL,98.61,98.62 -20160525 13:30:00.135,MSFT,51.92,51.95 -20160525 13:30:00.135,AAPL,98.61,98.62 -20160525 13:30:00.136,AAPL,98.61,98.62 -20160525 13:30:00.136,AAPL,98.61,98.62 -20160525 13:30:00.144,AAPL,98.61,98.62 -20160525 13:30:00.144,AAPL,98.61,98.62 -20160525 13:30:00.145,AAPL,98.61,98.62 -20160525 13:30:00.145,AAPL,98.61,98.63 -20160525 13:30:00.145,AAPL,98.61,98.63 -20160525 13:30:00.145,AAPL,98.60,98.63 -20160525 13:30:00.145,AAPL,98.61,98.63 -20160525 13:30:00.145,AAPL,98.60,98.63 diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/tolerance.csv pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/tolerance.csv --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/tolerance.csv 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/tolerance.csv 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -time,ticker,price,quantity,marketCenter,bid,ask -20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95 -20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 -20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 -20160525 
13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 -20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 -20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, -20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56 -20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 -20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 -20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 -20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 -20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/trades.csv pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/trades.csv --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/trades.csv 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/trades.csv 1970-01-01 00:00:00.000000000 +0000 @@ -1,28 +0,0 @@ -time,ticker,price,quantity,marketCenter -20160525 13:30:00.023,MSFT,51.9500,75,NASDAQ -20160525 13:30:00.038,MSFT,51.9500,155,NASDAQ -20160525 13:30:00.048,GOOG,720.7700,100,NASDAQ -20160525 13:30:00.048,GOOG,720.9200,100,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,200,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,300,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,600,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,44,NASDAQ -20160525 13:30:00.074,AAPL,98.6700,478343,NASDAQ -20160525 13:30:00.075,AAPL,98.6700,478343,NASDAQ -20160525 13:30:00.075,AAPL,98.6600,6,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,30,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,75,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,20,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,35,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,10,NASDAQ -20160525 13:30:00.075,AAPL,98.5500,6,ARCA -20160525 13:30:00.075,AAPL,98.5500,6,ARCA -20160525 13:30:00.076,AAPL,98.5600,1000,ARCA -20160525 13:30:00.076,AAPL,98.5600,200,ARCA -20160525 13:30:00.076,AAPL,98.5600,300,ARCA -20160525 13:30:00.076,AAPL,98.5600,400,ARCA -20160525 13:30:00.076,AAPL,98.5600,600,ARCA -20160525 13:30:00.076,AAPL,98.5600,200,ARCA -20160525 13:30:00.078,MSFT,51.9500,783,NASDAQ -20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ -20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/trades2.csv pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/trades2.csv --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/data/trades2.csv 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/data/trades2.csv 1970-01-01 00:00:00.000000000 +0000 @@ -1,78 +0,0 @@ -time,ticker,price,quantity,marketCenter -20160525 13:30:00.023,MSFT,51.9500,75,NASDAQ -20160525 13:30:00.038,MSFT,51.9500,155,NASDAQ -20160525 13:30:00.048,GOOG,720.7700,100,NASDAQ -20160525 13:30:00.048,GOOG,720.9200,100,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,200,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,300,NASDAQ -20160525 13:30:00.048,GOOG,720.9300,600,NASDAQ -20160525 
13:30:00.048,GOOG,720.9300,44,NASDAQ -20160525 13:30:00.074,AAPL,98.6700,478343,NASDAQ -20160525 13:30:00.075,AAPL,98.6700,478343,NASDAQ -20160525 13:30:00.075,AAPL,98.6600,6,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,30,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,75,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,20,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,35,NASDAQ -20160525 13:30:00.075,AAPL,98.6500,10,NASDAQ -20160525 13:30:00.075,AAPL,98.5500,6,ARCA -20160525 13:30:00.075,AAPL,98.5500,6,ARCA -20160525 13:30:00.076,AAPL,98.5600,1000,ARCA -20160525 13:30:00.076,AAPL,98.5600,200,ARCA -20160525 13:30:00.076,AAPL,98.5600,300,ARCA -20160525 13:30:00.076,AAPL,98.5600,400,ARCA -20160525 13:30:00.076,AAPL,98.5600,600,ARCA -20160525 13:30:00.076,AAPL,98.5600,200,ARCA -20160525 13:30:00.078,MSFT,51.9500,783,NASDAQ -20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ -20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ -20160525 13:30:00.084,AAPL,98.6400,40,NASDAQ -20160525 13:30:00.084,AAPL,98.5500,149,EDGX -20160525 13:30:00.086,AAPL,98.5600,500,ARCA -20160525 13:30:00.104,AAPL,98.6300,647,EDGX -20160525 13:30:00.104,AAPL,98.6300,300,EDGX -20160525 13:30:00.104,AAPL,98.6300,50,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,50,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,70,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,70,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,1,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,62,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,10,NASDAQ -20160525 13:30:00.104,AAPL,98.6300,100,ARCA -20160525 13:30:00.105,AAPL,98.6300,100,ARCA -20160525 13:30:00.105,AAPL,98.6300,700,ARCA -20160525 13:30:00.106,AAPL,98.6300,61,EDGX -20160525 13:30:00.107,AAPL,98.6300,100,ARCA -20160525 13:30:00.107,AAPL,98.6300,53,ARCA -20160525 13:30:00.108,AAPL,98.6300,100,ARCA -20160525 13:30:00.108,AAPL,98.6300,839,ARCA -20160525 13:30:00.115,AAPL,98.6300,5,EDGX -20160525 13:30:00.118,AAPL,98.6300,295,EDGX -20160525 13:30:00.118,AAPL,98.6300,5,EDGX -20160525 13:30:00.128,AAPL,98.6300,100,NASDAQ -20160525 13:30:00.128,AAPL,98.6300,100,NASDAQ -20160525 13:30:00.128,MSFT,51.9200,100,ARCA -20160525 13:30:00.129,AAPL,98.6200,100,NASDAQ -20160525 13:30:00.129,AAPL,98.6200,10,NASDAQ -20160525 13:30:00.129,AAPL,98.6200,59,NASDAQ -20160525 13:30:00.129,AAPL,98.6200,31,NASDAQ -20160525 13:30:00.129,AAPL,98.6200,69,NASDAQ -20160525 13:30:00.129,AAPL,98.6200,12,NASDAQ -20160525 13:30:00.129,AAPL,98.6200,12,EDGX -20160525 13:30:00.129,AAPL,98.6200,100,ARCA -20160525 13:30:00.129,AAPL,98.6200,100,ARCA -20160525 13:30:00.130,MSFT,51.9500,317,ARCA -20160525 13:30:00.130,MSFT,51.9500,283,ARCA -20160525 13:30:00.135,MSFT,51.9300,100,EDGX -20160525 13:30:00.135,AAPL,98.6200,100,ARCA -20160525 13:30:00.144,AAPL,98.6200,12,NASDAQ -20160525 13:30:00.144,AAPL,98.6200,88,NASDAQ -20160525 13:30:00.144,AAPL,98.6200,162,NASDAQ -20160525 13:30:00.144,AAPL,98.6100,100,BATS -20160525 13:30:00.144,AAPL,98.6200,61,ARCA -20160525 13:30:00.144,AAPL,98.6200,25,ARCA -20160525 13:30:00.144,AAPL,98.6200,14,ARCA -20160525 13:30:00.145,AAPL,98.6200,12,ARCA -20160525 13:30:00.145,AAPL,98.6200,100,ARCA -20160525 13:30:00.145,AAPL,98.6300,100,NASDAQ -20160525 13:30:00.145,AAPL,98.6300,100,NASDAQ diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/test_join.py pandas-2.2.2+dfsg/pandas/tests/reshape/merge/test_join.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/test_join.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/test_join.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,8 @@ import numpy as np import pytest +import 
pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -11,8 +13,10 @@ MultiIndex, Series, Timestamp, + bdate_range, concat, merge, + option_context, ) import pandas._testing as tm @@ -57,8 +61,13 @@ @pytest.fixture def target_source(self): - index, data = tm.getMixedTypeDict() - target = DataFrame(data, index=index) + data = { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + } + target = DataFrame(data, index=Index(["a", "b", "c", "d", "e"], dtype=object)) # Join on string value @@ -112,7 +121,10 @@ assert "key1.foo" in joined assert "key2.bar" in joined - def test_join_on(self, target_source): + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_join_on(self, target_source, infer_string): target, source = target_source merged = target.join(source, on="C") @@ -144,8 +156,8 @@ # overlap source_copy = source.copy() msg = ( - "You are trying to merge on float64 and object columns for key 'A'. " - "If you wish to proceed you should use pd.concat" + "You are trying to merge on float64 and object|string columns for key " + "'A'. If you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): target.join(source_copy, on="A") @@ -162,7 +174,7 @@ "a": np.random.default_rng(2).choice(["m", "f"], size=10), "b": np.random.default_rng(2).standard_normal(10), }, - index=tm.makeCustomIndex(10, 2), + index=MultiIndex.from_product([range(5), ["A", "B"]]), ) msg = r'len\(left_on\) must equal the number of levels in the index of "right"' with pytest.raises(ValueError, match=msg): @@ -174,7 +186,7 @@ "a": np.random.default_rng(2).choice(["m", "f"], size=3), "b": np.random.default_rng(2).standard_normal(3), }, - index=tm.makeCustomIndex(3, 2), + index=MultiIndex.from_arrays([range(3), list("abc")]), ) df2 = DataFrame( { @@ -198,7 +210,7 @@ "a": np.random.default_rng(2).choice(["m", "f"], size=10), "b": np.random.default_rng(2).standard_normal(10), }, - index=tm.makeCustomIndex(10, 2), + index=MultiIndex.from_product([range(5), ["A", "B"]]), ) msg = r"len\(right_on\) must equal len\(left_on\)" with pytest.raises(ValueError, match=msg): @@ -224,7 +236,8 @@ def test_join_on_pass_vector(self, target_source): target, source = target_source expected = target.join(source, on="C") - del expected["C"] + expected = expected.rename(columns={"C": "key_0"}) + expected = expected[["key_0", "A", "B", "D", "MergedA", "MergedD"]] join_col = target.pop("C") result = target.join(source, on=join_col) @@ -551,24 +564,30 @@ tm.assert_frame_equal(inner, left) tm.assert_frame_equal(inner, right) - def test_join_sort(self): - left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) - right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) - - joined = left.join(right, on="key", sort=True) - expected = DataFrame( - { - "key": ["bar", "baz", "foo", "foo"], - "value": [2, 3, 1, 4], - "value2": ["a", "b", "c", "c"], - }, - index=[1, 2, 0, 3], - ) - tm.assert_frame_equal(joined, expected) - - # smoke test - joined = left.join(right, on="key", sort=False) - tm.assert_index_equal(joined.index, Index(range(4)), exact=True) + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_join_sort(self, infer_string): + with option_context("future.infer_string", infer_string): + left = DataFrame( + 
{"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]} + ) + right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) + + joined = left.join(right, on="key", sort=True) + expected = DataFrame( + { + "key": ["bar", "baz", "foo", "foo"], + "value": [2, 3, 1, 4], + "value2": ["a", "b", "c", "c"], + }, + index=[1, 2, 0, 3], + ) + tm.assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on="key", sort=False) + tm.assert_index_equal(joined.index, Index(range(4)), exact=True) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index @@ -612,7 +631,7 @@ df.insert(5, "dt", "foo") grouped = df.groupby("id") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.mean() mn = grouped.mean(numeric_only=True) @@ -757,7 +776,7 @@ ) result = df1.join(df2.set_index("date"), on="date") expected = df1.copy() - expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object) + expected["vals_2"] = Series([np.nan] * 2 + list("tuv")) tm.assert_frame_equal(result, expected) def test_join_datetime_string(self): @@ -771,13 +790,13 @@ ], columns=["x", "y", "a"], ) - dfa["x"] = pd.to_datetime(dfa["x"]) + dfa["x"] = pd.to_datetime(dfa["x"]).astype("M8[ns]") dfb = DataFrame( [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]], columns=["x", "y", "z"], index=[2, 4], ) - dfb["x"] = pd.to_datetime(dfb["x"]) + dfb["x"] = pd.to_datetime(dfb["x"]).astype("M8[ns]") result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"]) expected = DataFrame( [ @@ -787,6 +806,7 @@ index=[2, 4], columns=["x", "y", "z", "a"], ) + expected["x"] = expected["x"].astype("M8[ns]") tm.assert_frame_equal(result, expected) def test_join_with_categorical_index(self): @@ -813,9 +833,7 @@ left_grouped = left.groupby(join_col) right_grouped = right.groupby(join_col) - for group_key, group in result.groupby( - join_col if len(join_col) > 1 else join_col[0] - ): + for group_key, group in result.groupby(join_col): l_joined = _restrict_to_columns(group, left.columns, lsuffix) r_joined = _restrict_to_columns(group, right.columns, rsuffix) @@ -904,7 +922,7 @@ result = left.join(right, how="inner") expected = DataFrame( {"e": [5], "f": [6]}, - index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")), + index=MultiIndex.from_tuples([(1, 2, 4, 3)], names=("a", "b", "d", "c")), ) tm.assert_frame_equal(result, expected) @@ -928,10 +946,16 @@ ) right = DataFrame(data={"d": 4}, index=MultiIndex.from_tuples([(2,)], names=("b",))) result = left.join(right, how=join_type) - expected = DataFrame( - {"c": [3], "d": [4]}, - index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]), - ) + if join_type == "right": + expected = DataFrame( + {"c": [3], "d": [4]}, + index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]), + ) + else: + expected = DataFrame( + {"c": [3], "d": [4]}, + index=MultiIndex.from_tuples([(1, 2)], names=["a", "b"]), + ) tm.assert_frame_equal(result, expected) @@ -1012,7 +1036,28 @@ expected = DataFrame(columns=["B", "C"], dtype="int64") if how != "cross": expected = expected.rename_axis("A") + if how == "outer": + expected = expected.sort_index() + + tm.assert_frame_equal(result, expected) + + +def test_join_empty_uncomparable_columns(): + # GH 57048 + df1 = DataFrame() + df2 = DataFrame(columns=["test"]) + df3 = DataFrame(columns=["foo", ("bar", "baz")]) + + result = df1 + df2 + expected = DataFrame(columns=["test"]) 
+ tm.assert_frame_equal(result, expected) + + result = df2 + df3 + expected = DataFrame(columns=[("bar", "baz"), "foo", "test"]) + tm.assert_frame_equal(result, expected) + result = df1 + df3 + expected = DataFrame(columns=[("bar", "baz"), "foo"]) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/test_merge.py pandas-2.2.2+dfsg/pandas/tests/reshape/merge/test_merge.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/test_merge.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/test_merge.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas.core.dtypes.common import is_object_dtype +from pandas.core.dtypes.common import ( + is_object_dtype, + is_string_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -26,7 +29,6 @@ TimedeltaIndex, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import ( MergeError, @@ -317,14 +319,15 @@ merged["d"] = "peekaboo" assert (right["d"] == "bar").all() - def test_merge_nocopy(self, using_array_manager): + def test_merge_nocopy(self, using_array_manager, using_infer_string): left = DataFrame({"a": 0, "b": 1}, index=range(10)) right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) merged = merge(left, right, left_index=True, right_index=True, copy=False) assert np.shares_memory(merged["a"]._values, left["a"]._values) - assert np.shares_memory(merged["d"]._values, right["d"]._values) + if not using_infer_string: + assert np.shares_memory(merged["d"]._values, right["d"]._values) def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -582,11 +585,11 @@ df_empty = df[:0] expected = DataFrame( { - "value_x": Series(dtype=df.dtypes["value"]), "key": Series(dtype=df.dtypes["key"]), + "value_x": Series(dtype=df.dtypes["value"]), "value_y": Series(dtype=df.dtypes["value"]), }, - columns=["value_x", "key", "value_y"], + columns=["key", "value_x", "value_y"], ) actual = df_empty.merge(df, on="key") tm.assert_frame_equal(actual, expected) @@ -668,11 +671,13 @@ "i1_": {0: 0, 1: np.nan}, "i3": {0: 0.0, 1: np.nan}, None: {0: 0, 1: 0}, - } + }, + columns=Index(["i1", "i2", "i1_", "i3", None], dtype=object), ) .set_index(None) .reset_index()[["i1", "i2", "i1_", "i3"]] ) + result.columns = result.columns.astype("object") tm.assert_frame_equal(result, expected, check_dtype=False) def test_merge_nan_right2(self): @@ -821,7 +826,7 @@ # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|string'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) @@ -882,9 +887,9 @@ dtz = pd.DatetimeTZDtype(tz="UTC") right = DataFrame( { - "date": [pd.Timestamp("2018", tz=dtz.tz)], + "date": DatetimeIndex(["2018"], dtype=dtz), "value": [4.0], - "date2": [pd.Timestamp("2019", tz=dtz.tz)], + "date2": DatetimeIndex(["2019"], dtype=dtz), }, columns=["date", "value", "date2"], ) @@ -892,20 +897,20 @@ result = left.merge(right, on="date") expected = DataFrame( { + "date": Series(dtype=dtz), "value_x": Series(dtype=float), "date2_x": Series(dtype=dtz), - "date": Series(dtype=dtz), "value_y": Series(dtype=float), "date2_y": Series(dtype=dtz), }, - columns=["value_x", "date2_x", "date", "value_y", "date2_y"], + 
columns=["date", "value_x", "date2_x", "value_y", "date2_y"], ) tm.assert_frame_equal(result, expected) def test_merge_datetime64tz_with_dst_transition(self): # GH 18885 df1 = DataFrame( - pd.date_range("2017-10-29 01:00", periods=4, freq="H", tz="Europe/Madrid"), + pd.date_range("2017-10-29 01:00", periods=4, freq="h", tz="Europe/Madrid"), columns=["date"], ) df1["value"] = 1 @@ -926,7 +931,7 @@ expected = DataFrame( { "date": pd.date_range( - "2017-10-29 01:00", periods=7, freq="H", tz="Europe/Madrid" + "2017-10-29 01:00", periods=7, freq="h", tz="Europe/Madrid" ), "value_x": [1] * 4 + [np.nan] * 3, "value_y": [np.nan] * 4 + [2] * 3, @@ -1347,9 +1352,12 @@ CategoricalIndex([1, 2, 4, None, None, None]), ), ( - DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"]), DatetimeIndex( - ["2001-01-01", "2002-02-02", "2003-03-03", pd.NaT, pd.NaT, pd.NaT] + ["2001-01-01", "2002-02-02", "2003-03-03"], dtype="M8[ns]" + ), + DatetimeIndex( + ["2001-01-01", "2002-02-02", "2003-03-03", pd.NaT, pd.NaT, pd.NaT], + dtype="M8[ns]", ), ), *[ @@ -1465,13 +1473,14 @@ def _check_merge(x, y): for how in ["inner", "left", "outer"]: - result = x.join(y, how=how) + for sort in [True, False]: + result = x.join(y, how=how, sort=sort) - expected = merge(x.reset_index(), y.reset_index(), how=how, sort=True) - expected = expected.set_index("index") + expected = merge(x.reset_index(), y.reset_index(), how=how, sort=sort) + expected = expected.set_index("index") - # TODO check_names on merge? - tm.assert_frame_equal(result, expected, check_names=False) + # TODO check_names on merge? + tm.assert_frame_equal(result, expected, check_names=False) class TestMergeDtypes: @@ -1495,7 +1504,7 @@ # We allow merging on object and categorical cols and cast # categorical cols to object result = merge(left, right, on="A") - assert is_object_dtype(result.A.dtype) + assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) @pytest.mark.parametrize( "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8] @@ -1634,7 +1643,7 @@ result = merge(df1, df2, on=["A"]) assert is_object_dtype(result.A.dtype) result = merge(df2, df1, on=["A"]) - assert is_object_dtype(result.A.dtype) + assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) @pytest.mark.parametrize( "df1_vals, df2_vals", @@ -1754,7 +1763,7 @@ "how, expected_data", [ ("inner", [[True, 1, 4], [False, 5, 3]]), - ("outer", [[True, 1, 4], [False, 5, 3]]), + ("outer", [[False, 5, 3], [True, 1, 4]]), ("left", [[True, 1, 4], [False, 5, 3]]), ("right", [[False, 5, 3], [True, 1, 4]]), ], @@ -1829,14 +1838,15 @@ if exp == "left": expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]}) elif exp == "right": - expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]}) + expected = DataFrame({"A": [1], "B": [np.nan], "C": [5]}) elif exp == "empty": expected = DataFrame(columns=["A", "B", "C"], dtype="int64") - if left_empty: - expected = expected[["B", "A", "C"]] elif exp == "empty_cross": expected = DataFrame(columns=["A_x", "B", "A_y", "C"], dtype="int64") + if how == "outer": + expected = expected.sort_values("A", ignore_index=True) + tm.assert_frame_equal(result, expected) @@ -1846,7 +1856,7 @@ { "X": Series( np.random.default_rng(2).choice(["foo", "bar"], size=(10,)) - ).astype(CDT(["foo", "bar"])), + ).astype(CategoricalDtype(["foo", "bar"])), "Y": np.random.default_rng(2).choice(["one", "two", "three"], size=(10,)), } ) @@ -1855,30 +1865,35 @@ @pytest.fixture def right(): return DataFrame( - {"X": Series(["foo", 
"bar"]).astype(CDT(["foo", "bar"])), "Z": [1, 2]} + { + "X": Series(["foo", "bar"]).astype(CategoricalDtype(["foo", "bar"])), + "Z": [1, 2], + } ) class TestMergeCategorical: - def test_identical(self, left): + def test_identical(self, left, using_infer_string): # merging on the same, should preserve dtypes merged = merge(left, left, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "string" expected = Series( - [CategoricalDtype(categories=["foo", "bar"]), np.dtype("O"), np.dtype("O")], + [CategoricalDtype(categories=["foo", "bar"]), dtype, dtype], index=["X", "Y_x", "Y_y"], ) tm.assert_series_equal(result, expected) - def test_basic(self, left, right): + def test_basic(self, left, right, using_infer_string): # we have matching Categorical dtypes in X # so should preserve the merged column merged = merge(left, right, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "string" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), - np.dtype("O"), + dtype, np.dtype("int64"), ], index=["X", "Y", "Z"], @@ -1982,16 +1997,17 @@ ).set_index(["id", "p"]) tm.assert_frame_equal(result, expected) - def test_other_columns(self, left, right): + def test_other_columns(self, left, right, using_infer_string): # non-merge columns should preserve if possible right = right.assign(Z=right.Z.astype("category")) merged = merge(left, right, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "string" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), - np.dtype("O"), + dtype, CategoricalDtype(categories=[1, 2]), ], index=["X", "Y", "Z"], @@ -2006,11 +2022,13 @@ "change", [ lambda x: x, - lambda x: x.astype(CDT(["foo", "bar", "bah"])), - lambda x: x.astype(CDT(ordered=True)), + lambda x: x.astype(CategoricalDtype(["foo", "bar", "bah"])), + lambda x: x.astype(CategoricalDtype(ordered=True)), ], ) - def test_dtype_on_merged_different(self, change, join_type, left, right): + def test_dtype_on_merged_different( + self, change, join_type, left, right, using_infer_string + ): # our merging columns, X now has 2 different dtypes # so we must be object as a result @@ -2022,9 +2040,8 @@ merged = merge(left, right, on="X", how=join_type) result = merged.dtypes.sort_index() - expected = Series( - [np.dtype("O"), np.dtype("O"), np.dtype("int64")], index=["X", "Y", "Z"] - ) + dtype = np.dtype("O") if not using_infer_string else "string" + expected = Series([dtype, dtype, np.dtype("int64")], index=["X", "Y", "Z"]) tm.assert_series_equal(result, expected) def test_self_join_multiple_categories(self): @@ -2114,11 +2131,13 @@ # GH 17187 # merging with a boolean/int categorical column df1 = DataFrame({"id": [1, 2, 3, 4], "cat": category_column}) - df1["cat"] = df1["cat"].astype(CDT(categories, ordered=ordered)) + df1["cat"] = df1["cat"].astype(CategoricalDtype(categories, ordered=ordered)) df2 = DataFrame({"id": [2, 4], "num": [1, 9]}) result = df1.merge(df2) expected = DataFrame({"id": [2, 4], "cat": expected_categories, "num": [1, 9]}) - expected["cat"] = expected["cat"].astype(CDT(categories, ordered=ordered)) + expected["cat"] = expected["cat"].astype( + CategoricalDtype(categories, ordered=ordered) + ) tm.assert_frame_equal(expected, result) def test_merge_on_int_array(self): @@ -2334,9 +2353,9 @@ "outer", DataFrame( { - "A": [100, 200, 1, 300], - "B1": [60, 70, 80, np.nan], - "B2": [600, 700, np.nan, 800], + "A": [1, 100, 200, 300], + "B1": [80, 60, 70, 
np.nan], + "B2": [np.nan, 600, 700, 800], } ), ), @@ -2483,16 +2502,14 @@ result = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf))) # Constructing the expected results - expected_labels = [letter + l_suf for letter in letters] + [ - letter + r_suf for letter in letters - ] - expected_index = MultiIndex.from_product( - [expected_labels, numbers], names=["outer", "inner"] - ) + tuples = [(letter + l_suf, num) for letter in letters for num in numbers] + tuples += [("id", "")] + tuples += [(letter + r_suf, num) for letter in letters for num in numbers] + + expected_index = MultiIndex.from_tuples(tuples, names=["outer", "inner"]) expected = DataFrame(columns=expected_index) - expected["id"] = "" - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_dtype=False) def test_merge_datetime_upcast_dtype(): @@ -2755,9 +2772,9 @@ result = merge(right, left, on="key", how="outer") expected = DataFrame( { - "key": [np.nan, np.nan, 1, 2], - "col2": [3, 4, np.nan, np.nan], - "col1": [np.nan, np.nan, 1, 2], + "key": [1, 2, np.nan, np.nan], + "col2": [np.nan, np.nan, 3, 4], + "col1": [1, 2, np.nan, np.nan], }, dtype=dtype, ) @@ -2852,6 +2869,79 @@ tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("on_index", [True, False]) +@pytest.mark.parametrize("left_unique", [True, False]) +@pytest.mark.parametrize("left_monotonic", [True, False]) +@pytest.mark.parametrize("right_unique", [True, False]) +@pytest.mark.parametrize("right_monotonic", [True, False]) +def test_merge_combinations( + how, sort, on_index, left_unique, left_monotonic, right_unique, right_monotonic +): + # GH 54611 + left = [2, 3] + if left_unique: + left.append(4 if left_monotonic else 1) + else: + left.append(3 if left_monotonic else 2) + + right = [2, 3] + if right_unique: + right.append(4 if right_monotonic else 1) + else: + right.append(3 if right_monotonic else 2) + + left = DataFrame({"key": left}) + right = DataFrame({"key": right}) + + if on_index: + left = left.set_index("key") + right = right.set_index("key") + on_kwargs = {"left_index": True, "right_index": True} + else: + on_kwargs = {"on": "key"} + + result = merge(left, right, how=how, sort=sort, **on_kwargs) + + if on_index: + left = left.reset_index() + right = right.reset_index() + + if how in ["left", "right", "inner"]: + if how in ["left", "inner"]: + expected, other, other_unique = left, right, right_unique + else: + expected, other, other_unique = right, left, left_unique + if how == "inner": + keep_values = set(left["key"].values).intersection(right["key"].values) + keep_mask = expected["key"].isin(keep_values) + expected = expected[keep_mask] + if sort: + expected = expected.sort_values("key") + if not other_unique: + other_value_counts = other["key"].value_counts() + repeats = other_value_counts.reindex(expected["key"].values, fill_value=1) + repeats = repeats.astype(np.intp) + expected = expected["key"].repeat(repeats.values) + expected = expected.to_frame() + elif how == "outer": + left_counts = left["key"].value_counts() + right_counts = right["key"].value_counts() + expected_counts = left_counts.mul(right_counts, fill_value=1) + expected_counts = expected_counts.astype(np.intp) + expected = expected_counts.index.values.repeat(expected_counts.values) + expected = DataFrame({"key": expected}) + expected = expected.sort_values("key") + + if on_index: + expected = expected.set_index("key") + 
else: + expected = expected.reset_index(drop=True) + + tm.assert_frame_equal(result, expected) + + def test_merge_ea_int_and_float_numpy(): # GH#46178 df1 = DataFrame([1.0, np.nan], dtype=pd.Int64Dtype()) @@ -2885,3 +2975,46 @@ {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1, np.nan]} ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("left_empty", [True, False]) +@pytest.mark.parametrize("right_empty", [True, False]) +def test_merge_empty_frames_column_order(left_empty, right_empty): + # GH 51929 + df1 = DataFrame(1, index=[0], columns=["A", "B"]) + df2 = DataFrame(1, index=[0], columns=["A", "C", "D"]) + + if left_empty: + df1 = df1.iloc[:0] + if right_empty: + df2 = df2.iloc[:0] + + result = merge(df1, df2, on=["A"], how="outer") + expected = DataFrame(1, index=[0], columns=["A", "B", "C", "D"]) + if left_empty and right_empty: + expected = expected.iloc[:0] + elif left_empty: + expected["B"] = np.nan + elif right_empty: + expected[["C", "D"]] = np.nan + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) +def test_merge_datetime_and_timedelta(how): + left = DataFrame({"key": Series([1, None], dtype="datetime64[ns]")}) + right = DataFrame({"key": Series([1], dtype="timedelta64[ns]")}) + + msg = ( + f"You are trying to merge on {left['key'].dtype} and {right['key'].dtype} " + "columns for key 'key'. If you wish to proceed you should use pd.concat" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + left.merge(right, on="key", how=how) + + msg = ( + f"You are trying to merge on {right['key'].dtype} and {left['key'].dtype} " + "columns for key 'key'. If you wish to proceed you should use pd.concat" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + right.merge(left, on="key", how=how) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/test_merge_asof.py pandas-2.2.2+dfsg/pandas/tests/reshape/merge/test_merge_asof.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/test_merge_asof.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/test_merge_asof.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,7 +11,7 @@ Index, Timedelta, merge_asof, - read_csv, + option_context, to_datetime, ) import pandas._testing as tm @@ -27,39 +27,1070 @@ class TestAsOfMerge: - def read_data(self, datapath, name, dedupe=False): - path = datapath("reshape", "merge", "data", name) - x = read_csv(path) + def prep_data(self, df, dedupe=False): if dedupe: - x = x.drop_duplicates(["time", "ticker"], keep="last").reset_index( + df = df.drop_duplicates(["time", "ticker"], keep="last").reset_index( drop=True ) - x.time = to_datetime(x.time) - return x + df.time = to_datetime(df.time) + return df @pytest.fixture - def trades(self, datapath): - return self.read_data(datapath, "trades.csv") + def trades(self): + df = pd.DataFrame( + [ + ["20160525 13:30:00.023", "MSFT", "51.9500", "75", "NASDAQ"], + ["20160525 13:30:00.038", "MSFT", "51.9500", "155", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.7700", "100", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9200", "100", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "200", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "300", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "600", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "44", "NASDAQ"], + ["20160525 13:30:00.074", "AAPL", "98.6700", "478343", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6700", "478343", 
"NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6600", "6", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "30", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "75", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "20", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "35", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "10", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.5500", "6", "ARCA"], + ["20160525 13:30:00.075", "AAPL", "98.5500", "6", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "1000", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "200", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "300", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "400", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "600", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "200", "ARCA"], + ["20160525 13:30:00.078", "MSFT", "51.9500", "783", "NASDAQ"], + ["20160525 13:30:00.078", "MSFT", "51.9500", "100", "NASDAQ"], + ["20160525 13:30:00.078", "MSFT", "51.9500", "100", "NASDAQ"], + ], + columns="time,ticker,price,quantity,marketCenter".split(","), + ) + df["price"] = df["price"].astype("float64") + df["quantity"] = df["quantity"].astype("int64") + return self.prep_data(df) @pytest.fixture - def quotes(self, datapath): - return self.read_data(datapath, "quotes.csv", dedupe=True) + def quotes(self): + df = pd.DataFrame( + [ + ["20160525 13:30:00.023", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.023", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.041", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.072", "GOOG", "720.50", "720.88"], + ["20160525 13:30:00.075", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.078", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.078", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.078", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.078", "MSFT", "51.92", "51.95"], + ], + columns="time,ticker,bid,ask".split(","), + ) + df["bid"] = df["bid"].astype("float64") + df["ask"] = df["ask"].astype("float64") + return self.prep_data(df, dedupe=True) @pytest.fixture - def asof(self, datapath): - return self.read_data(datapath, "asof.csv") + def asof(self): + df = pd.DataFrame( + [ + [ + "20160525 13:30:00.023", + "MSFT", + "51.95", + "75", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.038", + "MSFT", + "51.95", + "155", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.77", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.92", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "200", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "300", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "600", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "44", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.074", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + 
[ + "20160525 13:30:00.075", + "AAPL", + "98.67", + "478343", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.66", + "6", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "30", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "75", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "20", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "35", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "10", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.55", + "6", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.55", + "6", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "1000", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "300", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "400", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "600", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "783", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.92", + "51.95", + ], + ], + columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), + ) + df["price"] = df["price"].astype("float64") + df["quantity"] = df["quantity"].astype("int64") + df["bid"] = df["bid"].astype("float64") + df["ask"] = df["ask"].astype("float64") + return self.prep_data(df) @pytest.fixture - def tolerance(self, datapath): - return self.read_data(datapath, "tolerance.csv") + def tolerance(self): + df = pd.DataFrame( + [ + [ + "20160525 13:30:00.023", + "MSFT", + "51.95", + "75", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.038", + "MSFT", + "51.95", + "155", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.77", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.92", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "200", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "300", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "600", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "44", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.074", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.67", + "478343", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.66", + "6", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "30", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "75", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + 
"AAPL", + "98.65", + "20", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "35", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "10", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.55", + "6", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.55", + "6", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "1000", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "300", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "400", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "600", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "783", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.92", + "51.95", + ], + ], + columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), + ) + df["price"] = df["price"].astype("float64") + df["quantity"] = df["quantity"].astype("int64") + df["bid"] = df["bid"].astype("float64") + df["ask"] = df["ask"].astype("float64") + return self.prep_data(df) @pytest.fixture def allow_exact_matches(self, datapath): - return self.read_data(datapath, "allow_exact_matches.csv") + df = pd.DataFrame( + [ + [ + "20160525 13:30:00.023", + "MSFT", + "51.95", + "75", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.038", + "MSFT", + "51.95", + "155", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.77", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.92", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "200", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "300", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "600", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "44", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.074", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.66", + "6", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "30", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "75", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "20", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "35", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "10", + "NASDAQ", + np.nan, + np.nan, + ], + ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], + ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], + [ + "20160525 13:30:00.076", + 
"AAPL", + "98.56", + "1000", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "300", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "400", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "600", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "783", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.95", + "51.95", + ], + ], + columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), + ) + df["price"] = df["price"].astype("float64") + df["quantity"] = df["quantity"].astype("int64") + df["bid"] = df["bid"].astype("float64") + df["ask"] = df["ask"].astype("float64") + return self.prep_data(df) @pytest.fixture - def allow_exact_matches_and_tolerance(self, datapath): - return self.read_data(datapath, "allow_exact_matches_and_tolerance.csv") + def allow_exact_matches_and_tolerance(self): + df = pd.DataFrame( + [ + [ + "20160525 13:30:00.023", + "MSFT", + "51.95", + "75", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.038", + "MSFT", + "51.95", + "155", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.77", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.92", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "200", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "300", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "600", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "44", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.074", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.66", + "6", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "30", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "75", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "20", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "35", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "10", + "NASDAQ", + np.nan, + np.nan, + ], + ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], + ["20160525 13:30:00.075", "AAPL", "98.55", "6", "ARCA", np.nan, np.nan], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "1000", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "300", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "400", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + 
"600", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "783", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.95", + "51.95", + ], + ], + columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), + ) + df["price"] = df["price"].astype("float64") + df["quantity"] = df["quantity"].astype("int64") + df["bid"] = df["bid"].astype("float64") + df["ask"] = df["ask"].astype("float64") + return self.prep_data(df) def test_examples1(self): """doc-string examples""" @@ -353,7 +1384,8 @@ result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"]) tm.assert_frame_equal(result, expected) - def test_multiby_heterogeneous_types(self): + @pytest.mark.parametrize("dtype", ["object", "string"]) + def test_multiby_heterogeneous_types(self, dtype): # GH13936 trades = pd.DataFrame( { @@ -373,6 +1405,7 @@ }, columns=["time", "ticker", "exch", "price", "quantity"], ) + trades = trades.astype({"ticker": dtype, "exch": dtype}) quotes = pd.DataFrame( { @@ -393,6 +1426,7 @@ }, columns=["time", "ticker", "exch", "bid", "ask"], ) + quotes = quotes.astype({"ticker": dtype, "exch": dtype}) expected = pd.DataFrame( { @@ -414,6 +1448,7 @@ }, columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"], ) + expected = expected.astype({"ticker": dtype, "exch": dtype}) result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"]) tm.assert_frame_equal(result, expected) @@ -497,9 +1532,860 @@ ) def test_basic2(self, datapath): - expected = self.read_data(datapath, "asof2.csv") - trades = self.read_data(datapath, "trades2.csv") - quotes = self.read_data(datapath, "quotes2.csv", dedupe=True) + expected = pd.DataFrame( + [ + [ + "20160525 13:30:00.023", + "MSFT", + "51.95", + "75", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.038", + "MSFT", + "51.95", + "155", + "NASDAQ", + "51.95", + "51.95", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.77", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.92", + "100", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "200", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "300", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "600", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.048", + "GOOG", + "720.93", + "44", + "NASDAQ", + "720.5", + "720.93", + ], + [ + "20160525 13:30:00.074", + "AAPL", + "98.67", + "478343", + "NASDAQ", + np.nan, + np.nan, + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.67", + "478343", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.66", + "6", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "30", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "75", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "20", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "35", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.65", + "10", + "NASDAQ", + "98.55", + 
"98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.55", + "6", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.075", + "AAPL", + "98.55", + "6", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "1000", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "300", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "400", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "600", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.076", + "AAPL", + "98.56", + "200", + "ARCA", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "783", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.078", + "MSFT", + "51.95", + "100", + "NASDAQ", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.084", + "AAPL", + "98.64", + "40", + "NASDAQ", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.084", + "AAPL", + "98.55", + "149", + "EDGX", + "98.55", + "98.56", + ], + [ + "20160525 13:30:00.086", + "AAPL", + "98.56", + "500", + "ARCA", + "98.55", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "647", + "EDGX", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "300", + "EDGX", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "50", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "50", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "70", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "70", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "1", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "62", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "10", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.104", + "AAPL", + "98.63", + "100", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.105", + "AAPL", + "98.63", + "100", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.105", + "AAPL", + "98.63", + "700", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.106", + "AAPL", + "98.63", + "61", + "EDGX", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.107", + "AAPL", + "98.63", + "100", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.107", + "AAPL", + "98.63", + "53", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.108", + "AAPL", + "98.63", + "100", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.108", + "AAPL", + "98.63", + "839", + "ARCA", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.115", + "AAPL", + "98.63", + "5", + "EDGX", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.118", + "AAPL", + "98.63", + "295", + "EDGX", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.118", + "AAPL", + "98.63", + "5", + "EDGX", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.128", + "AAPL", + "98.63", + "100", + "NASDAQ", + "98.62", + "98.63", + ], + [ + "20160525 13:30:00.128", + "AAPL", + "98.63", + "100", + "NASDAQ", + "98.62", + "98.63", + ], + [ 
+ "20160525 13:30:00.128", + "MSFT", + "51.92", + "100", + "ARCA", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "100", + "NASDAQ", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "10", + "NASDAQ", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "59", + "NASDAQ", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "31", + "NASDAQ", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "69", + "NASDAQ", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "12", + "NASDAQ", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "12", + "EDGX", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "100", + "ARCA", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.129", + "AAPL", + "98.62", + "100", + "ARCA", + "98.61", + "98.63", + ], + [ + "20160525 13:30:00.130", + "MSFT", + "51.95", + "317", + "ARCA", + "51.93", + "51.95", + ], + [ + "20160525 13:30:00.130", + "MSFT", + "51.95", + "283", + "ARCA", + "51.93", + "51.95", + ], + [ + "20160525 13:30:00.135", + "MSFT", + "51.93", + "100", + "EDGX", + "51.92", + "51.95", + ], + [ + "20160525 13:30:00.135", + "AAPL", + "98.62", + "100", + "ARCA", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.62", + "12", + "NASDAQ", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.62", + "88", + "NASDAQ", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.62", + "162", + "NASDAQ", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.61", + "100", + "BATS", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.62", + "61", + "ARCA", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.62", + "25", + "ARCA", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.144", + "AAPL", + "98.62", + "14", + "ARCA", + "98.61", + "98.62", + ], + [ + "20160525 13:30:00.145", + "AAPL", + "98.62", + "12", + "ARCA", + "98.6", + "98.63", + ], + [ + "20160525 13:30:00.145", + "AAPL", + "98.62", + "100", + "ARCA", + "98.6", + "98.63", + ], + [ + "20160525 13:30:00.145", + "AAPL", + "98.63", + "100", + "NASDAQ", + "98.6", + "98.63", + ], + [ + "20160525 13:30:00.145", + "AAPL", + "98.63", + "100", + "NASDAQ", + "98.6", + "98.63", + ], + ], + columns="time,ticker,price,quantity,marketCenter,bid,ask".split(","), + ) + expected["price"] = expected["price"].astype("float64") + expected["quantity"] = expected["quantity"].astype("int64") + expected["bid"] = expected["bid"].astype("float64") + expected["ask"] = expected["ask"].astype("float64") + expected = self.prep_data(expected) + + trades = pd.DataFrame( + [ + ["20160525 13:30:00.023", "MSFT", "51.9500", "75", "NASDAQ"], + ["20160525 13:30:00.038", "MSFT", "51.9500", "155", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.7700", "100", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9200", "100", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "200", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "300", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "600", "NASDAQ"], + ["20160525 13:30:00.048", "GOOG", "720.9300", "44", "NASDAQ"], + ["20160525 13:30:00.074", "AAPL", "98.6700", "478343", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6700", "478343", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6600", "6", "NASDAQ"], + 
["20160525 13:30:00.075", "AAPL", "98.6500", "30", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "75", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "20", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "35", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.6500", "10", "NASDAQ"], + ["20160525 13:30:00.075", "AAPL", "98.5500", "6", "ARCA"], + ["20160525 13:30:00.075", "AAPL", "98.5500", "6", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "1000", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "200", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "300", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "400", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "600", "ARCA"], + ["20160525 13:30:00.076", "AAPL", "98.5600", "200", "ARCA"], + ["20160525 13:30:00.078", "MSFT", "51.9500", "783", "NASDAQ"], + ["20160525 13:30:00.078", "MSFT", "51.9500", "100", "NASDAQ"], + ["20160525 13:30:00.078", "MSFT", "51.9500", "100", "NASDAQ"], + ["20160525 13:30:00.084", "AAPL", "98.6400", "40", "NASDAQ"], + ["20160525 13:30:00.084", "AAPL", "98.5500", "149", "EDGX"], + ["20160525 13:30:00.086", "AAPL", "98.5600", "500", "ARCA"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "647", "EDGX"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "300", "EDGX"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "50", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "50", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "70", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "70", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "1", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "62", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "10", "NASDAQ"], + ["20160525 13:30:00.104", "AAPL", "98.6300", "100", "ARCA"], + ["20160525 13:30:00.105", "AAPL", "98.6300", "100", "ARCA"], + ["20160525 13:30:00.105", "AAPL", "98.6300", "700", "ARCA"], + ["20160525 13:30:00.106", "AAPL", "98.6300", "61", "EDGX"], + ["20160525 13:30:00.107", "AAPL", "98.6300", "100", "ARCA"], + ["20160525 13:30:00.107", "AAPL", "98.6300", "53", "ARCA"], + ["20160525 13:30:00.108", "AAPL", "98.6300", "100", "ARCA"], + ["20160525 13:30:00.108", "AAPL", "98.6300", "839", "ARCA"], + ["20160525 13:30:00.115", "AAPL", "98.6300", "5", "EDGX"], + ["20160525 13:30:00.118", "AAPL", "98.6300", "295", "EDGX"], + ["20160525 13:30:00.118", "AAPL", "98.6300", "5", "EDGX"], + ["20160525 13:30:00.128", "AAPL", "98.6300", "100", "NASDAQ"], + ["20160525 13:30:00.128", "AAPL", "98.6300", "100", "NASDAQ"], + ["20160525 13:30:00.128", "MSFT", "51.9200", "100", "ARCA"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "100", "NASDAQ"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "10", "NASDAQ"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "59", "NASDAQ"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "31", "NASDAQ"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "69", "NASDAQ"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "12", "NASDAQ"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "12", "EDGX"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "100", "ARCA"], + ["20160525 13:30:00.129", "AAPL", "98.6200", "100", "ARCA"], + ["20160525 13:30:00.130", "MSFT", "51.9500", "317", "ARCA"], + ["20160525 13:30:00.130", "MSFT", "51.9500", "283", "ARCA"], + ["20160525 13:30:00.135", "MSFT", "51.9300", "100", "EDGX"], + ["20160525 13:30:00.135", "AAPL", "98.6200", "100", "ARCA"], + ["20160525 13:30:00.144", "AAPL", "98.6200", "12", "NASDAQ"], + 
["20160525 13:30:00.144", "AAPL", "98.6200", "88", "NASDAQ"], + ["20160525 13:30:00.144", "AAPL", "98.6200", "162", "NASDAQ"], + ["20160525 13:30:00.144", "AAPL", "98.6100", "100", "BATS"], + ["20160525 13:30:00.144", "AAPL", "98.6200", "61", "ARCA"], + ["20160525 13:30:00.144", "AAPL", "98.6200", "25", "ARCA"], + ["20160525 13:30:00.144", "AAPL", "98.6200", "14", "ARCA"], + ["20160525 13:30:00.145", "AAPL", "98.6200", "12", "ARCA"], + ["20160525 13:30:00.145", "AAPL", "98.6200", "100", "ARCA"], + ["20160525 13:30:00.145", "AAPL", "98.6300", "100", "NASDAQ"], + ["20160525 13:30:00.145", "AAPL", "98.6300", "100", "NASDAQ"], + ], + columns="time,ticker,price,quantity,marketCenter".split(","), + ) + trades["price"] = trades["price"].astype("float64") + trades["quantity"] = trades["quantity"].astype("int64") + trades = self.prep_data(trades) + + quotes = pd.DataFrame( + [ + ["20160525 13:30:00.023", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.023", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.041", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.048", "GOOG", "720.50", "720.93"], + ["20160525 13:30:00.072", "GOOG", "720.50", "720.88"], + ["20160525 13:30:00.075", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.076", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.078", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.078", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.078", "MSFT", "51.95", "51.95"], + ["20160525 13:30:00.078", "MSFT", "51.92", "51.95"], + ["20160525 13:30:00.079", "MSFT", "51.92", "51.95"], + ["20160525 13:30:00.080", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.084", "AAPL", "98.55", "98.56"], + ["20160525 13:30:00.086", "AAPL", "98.55", "98.63"], + ["20160525 13:30:00.088", "AAPL", "98.65", "98.63"], + ["20160525 13:30:00.089", "AAPL", "98.63", "98.63"], + ["20160525 13:30:00.104", "AAPL", "98.63", "98.63"], + ["20160525 13:30:00.104", "AAPL", "98.63", "98.63"], + ["20160525 13:30:00.104", "AAPL", "98.63", "98.63"], + ["20160525 13:30:00.104", "AAPL", "98.63", "98.63"], + ["20160525 13:30:00.104", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.105", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.107", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.115", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.115", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.118", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.128", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.128", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.129", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.129", "AAPL", "98.61", "98.63"], + ["20160525 13:30:00.129", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.129", "AAPL", "98.62", "98.63"], + ["20160525 13:30:00.129", "AAPL", "98.61", "98.63"], + ["20160525 13:30:00.130", "MSFT", "51.93", "51.95"], + ["20160525 13:30:00.130", "MSFT", "51.93", "51.95"], + ["20160525 13:30:00.130", "AAPL", "98.61", "98.63"], + ["20160525 13:30:00.131", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.131", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.135", "MSFT", "51.92", "51.95"], + ["20160525 13:30:00.135", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.136", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.136", "AAPL", "98.61", "98.62"], + ["20160525 
13:30:00.144", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.144", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.145", "AAPL", "98.61", "98.62"], + ["20160525 13:30:00.145", "AAPL", "98.61", "98.63"], + ["20160525 13:30:00.145", "AAPL", "98.61", "98.63"], + ["20160525 13:30:00.145", "AAPL", "98.60", "98.63"], + ["20160525 13:30:00.145", "AAPL", "98.61", "98.63"], + ["20160525 13:30:00.145", "AAPL", "98.60", "98.63"], + ], + columns="time,ticker,bid,ask".split(","), + ) + quotes["bid"] = quotes["bid"].astype("float64") + quotes["ask"] = quotes["ask"].astype("float64") + quotes = self.prep_data(quotes, dedupe=True) result = merge_asof(trades, quotes, on="time", by="ticker") tm.assert_frame_equal(result, expected) @@ -531,14 +2417,14 @@ with pytest.raises(MergeError, match="can only asof on a key for left"): merge_asof(trades, quotes, by="ticker") - def test_with_duplicates(self, datapath, trades, quotes): + def test_with_duplicates(self, datapath, trades, quotes, asof): q = ( pd.concat([quotes, quotes]) .sort_values(["time", "ticker"]) .reset_index(drop=True) ) result = merge_asof(trades, q, on="time", by="ticker") - expected = self.read_data(datapath, "asof.csv") + expected = self.prep_data(asof) tm.assert_frame_equal(result, expected) def test_with_duplicates_no_on(self): @@ -1195,8 +3081,11 @@ tm.assert_frame_equal(result, expected) - def test_merge_datatype_error_raises(self): - msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" + def test_merge_datatype_error_raises(self, using_infer_string): + if using_infer_string: + msg = "incompatible merge keys" + else: + msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]}) right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]}) @@ -1248,7 +3137,7 @@ else: merge_asof(df, df_null, on="a") - def test_by_nullable(self, any_numeric_ea_dtype): + def test_by_nullable(self, any_numeric_ea_dtype, using_infer_string): # Note: this test passes if instead of using pd.array we use # np.array([np.nan, 1]). Other than that, I (@jbrockmendel) # have NO IDEA what the expected behavior is. 
@@ -1290,6 +3179,8 @@ } ) expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object) + if using_infer_string: + expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) def test_merge_by_col_tz_aware(self): @@ -1315,7 +3206,7 @@ ) tm.assert_frame_equal(result, expected) - def test_by_mixed_tz_aware(self): + def test_by_mixed_tz_aware(self, using_infer_string): # GH 26649 left = pd.DataFrame( { @@ -1339,11 +3230,13 @@ columns=["by_col1", "by_col2", "on_col", "value_x"], ) expected["value_y"] = np.array([np.nan], dtype=object) + if using_infer_string: + expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[us]"]) - def test_by_datelike(self, dtype): - # GH 55453 + @pytest.mark.parametrize("dtype", ["float64", "int16", "m8[ns]", "M8[us]"]) + def test_by_dtype(self, dtype): + # GH 55453, GH 22794 left = pd.DataFrame( { "by_col": np.array([1], dtype=dtype), @@ -1488,21 +3381,25 @@ @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) +@pytest.mark.parametrize( "kwargs", [{"on": "x"}, {"left_index": True, "right_index": True}] ) @pytest.mark.parametrize( "data", [["2019-06-01 00:09:12", "2019-06-01 00:10:29"], [1.0, "2019-06-01 00:10:29"]], ) -def test_merge_asof_non_numerical_dtype(kwargs, data): +def test_merge_asof_non_numerical_dtype(kwargs, data, infer_string): # GH#29130 - left = pd.DataFrame({"x": data}, index=data) - right = pd.DataFrame({"x": data}, index=data) - with pytest.raises( - MergeError, - match=r"Incompatible merge dtype, .*, both sides must have numeric dtype", - ): - merge_asof(left, right, **kwargs) + with option_context("future.infer_string", infer_string): + left = pd.DataFrame({"x": data}, index=data) + right = pd.DataFrame({"x": data}, index=data) + with pytest.raises( + MergeError, + match=r"Incompatible merge dtype, .*, both sides must have numeric dtype", + ): + merge_asof(left, right, **kwargs) def test_merge_asof_non_numerical_dtype_object(): @@ -1592,16 +3489,19 @@ merge_asof(left, right, left_on="a", right_on="a") -def test_merge_asof_array_as_on(): +def test_merge_asof_array_as_on(unit): # GH#42844 + dti = pd.DatetimeIndex( + ["2021/01/01 00:37", "2021/01/01 01:40"], dtype=f"M8[{unit}]" + ) right = pd.DataFrame( { "a": [2, 6], - "ts": [pd.Timestamp("2021/01/01 00:37"), pd.Timestamp("2021/01/01 01:40")], + "ts": dti, } ) ts_merge = pd.date_range( - start=pd.Timestamp("2021/01/01 00:00"), periods=3, freq="1h" + start=pd.Timestamp("2021/01/01 00:00"), periods=3, freq="1h", unit=unit ) left = pd.DataFrame({"b": [4, 8, 7]}) result = merge_asof( @@ -1626,7 +3526,7 @@ expected = pd.DataFrame( { "a": [2, 6], - "ts": [pd.Timestamp("2021/01/01 00:37"), pd.Timestamp("2021/01/01 01:40")], + "ts": dti, "b": [4, 8], } ) @@ -1684,6 +3584,29 @@ tm.assert_frame_equal(result, expected) +@td.skip_if_no("pyarrow") +def test_merge_asof_pyarrow_td_tolerance(): + # GH 56486 + ser = pd.Series( + [datetime.datetime(2023, 1, 1)], dtype="timestamp[us, UTC][pyarrow]" + ) + df = pd.DataFrame( + { + "timestamp": ser, + "value": [1], + } + ) + result = merge_asof(df, df, on="timestamp", tolerance=Timedelta("1s")) + expected = pd.DataFrame( + { + "timestamp": ser, + "value_x": [1], + "value_y": [1], + } + ) + tm.assert_frame_equal(result, expected) + + def test_merge_asof_read_only_ndarray(): # GH 53513 left = pd.Series([2], index=[2], name="left") @@ -1694,3 
+3617,41 @@ result = merge_asof(left, right, left_index=True, right_index=True) expected = pd.DataFrame({"left": [2], "right": [1]}, index=[2]) tm.assert_frame_equal(result, expected) + + +def test_merge_asof_multiby_with_categorical(): + # GH 43541 + left = pd.DataFrame( + { + "c1": pd.Categorical(["a", "a", "b", "b"], categories=["a", "b"]), + "c2": ["x"] * 4, + "t": [1] * 4, + "v": range(4), + } + ) + right = pd.DataFrame( + { + "c1": pd.Categorical(["b", "b"], categories=["b", "a"]), + "c2": ["x"] * 2, + "t": [1, 2], + "v": range(2), + } + ) + result = merge_asof( + left, + right, + by=["c1", "c2"], + on="t", + direction="forward", + suffixes=["_left", "_right"], + ) + expected = pd.DataFrame( + { + "c1": pd.Categorical(["a", "a", "b", "b"], categories=["a", "b"]), + "c2": ["x"] * 4, + "t": [1] * 4, + "v_left": range(4), + "v_right": [np.nan, np.nan, 0.0, 0.0], + } + ) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/test_merge_ordered.py pandas-2.2.2+dfsg/pandas/tests/reshape/merge/test_merge_ordered.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/test_merge_ordered.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/test_merge_ordered.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -209,3 +211,34 @@ msg = r"\{'h'\} not found in left columns" with pytest.raises(KeyError, match=msg): merge_ordered(left, right, on="E", left_by=["G", "h"]) + + @pytest.mark.parametrize("invalid_method", ["linear", "carrot"]) + def test_ffill_validate_fill_method(self, left, right, invalid_method): + # GH 55884 + with pytest.raises( + ValueError, match=re.escape("fill_method must be 'ffill' or None") + ): + merge_ordered(left, right, on="key", fill_method=invalid_method) + + def test_ffill_left_merge(self): + # GH 57010 + df1 = DataFrame( + { + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3, 1, 2, 3], + "group": ["a", "a", "a", "b", "b", "b"], + } + ) + df2 = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + result = merge_ordered( + df1, df2, fill_method="ffill", left_by="group", how="left" + ) + expected = DataFrame( + { + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3, 1, 2, 3], + "group": ["a", "a", "a", "b", "b", "b"], + "rvalue": [np.nan, 2.0, 2.0, np.nan, 2.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/merge/test_multi.py pandas-2.2.2+dfsg/pandas/tests/reshape/merge/test_multi.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/merge/test_multi.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/merge/test_multi.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -9,6 +11,7 @@ RangeIndex, Series, Timestamp, + option_context, ) import pandas._testing as tm from pandas.core.reshape.concat import concat @@ -69,11 +72,6 @@ return ["Origin", "Destination", "Period"] -@pytest.fixture -def idx_cols_multi(): - return ["Origin", "Destination", "Period", "TripPurp", "LinkType"] - - class TestMergeMulti: def test_merge_on_multikey(self, left, right, join_type): on_cols = ["key1", "key2"] @@ -93,67 +91,71 @@ tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("sort", [False, True]) - def test_left_join_multi_index(self, sort): - icols = ["1st", "2nd", "3rd"] - - def bind_cols(df): - iord = lambda a: 0 
if a != a else ord(a) - f = lambda ts: ts.map(iord) - ord("a") - return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 1e4 - - def run_asserts(left, right, sort): - res = left.join(right, on=icols, how="left", sort=sort) - - assert len(left) < len(res) + 1 - assert not res["4th"].isna().any() - assert not res["5th"].isna().any() - - tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) - result = bind_cols(res.iloc[:, :-2]) - tm.assert_series_equal(res["4th"], result, check_names=False) - assert result.name is None - - if sort: - tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) - - out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") - - res.index = RangeIndex(len(res)) - tm.assert_frame_equal(out, res) - - lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) - left = DataFrame( - np.random.default_rng(2).choice(lc, (5000, 2)), columns=["1st", "3rd"] - ) - # Explicit cast to float to avoid implicit cast when setting nan - left.insert( - 1, - "2nd", - np.random.default_rng(2).integers(0, 1000, len(left)).astype("float"), - ) - - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i].copy() - - left["4th"] = bind_cols(left) - right["5th"] = -bind_cols(right) - right.set_index(icols, inplace=True) - - run_asserts(left, right, sort) + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + @pytest.mark.parametrize("sort", [True, False]) + def test_left_join_multi_index(self, sort, infer_string): + with option_context("future.infer_string", infer_string): + icols = ["1st", "2nd", "3rd"] + + def bind_cols(df): + iord = lambda a: 0 if a != a else ord(a) + f = lambda ts: ts.map(iord) - ord("a") + return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 + + def run_asserts(left, right, sort): + res = left.join(right, on=icols, how="left", sort=sort) + + assert len(left) < len(res) + 1 + assert not res["4th"].isna().any() + assert not res["5th"].isna().any() + + tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) + result = bind_cols(res.iloc[:, :-2]) + tm.assert_series_equal(res["4th"], result, check_names=False) + assert result.name is None + + if sort: + tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) + + out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") + + res.index = RangeIndex(len(res)) + tm.assert_frame_equal(out, res) + + lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) + left = DataFrame( + np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"] + ) + # Explicit cast to float to avoid implicit cast when setting nan + left.insert( + 1, + "2nd", + np.random.default_rng(2).integers(0, 10, len(left)).astype("float"), + ) - # inject some nulls - left.loc[1::23, "1st"] = np.nan - left.loc[2::37, "2nd"] = np.nan - left.loc[3::43, "3rd"] = np.nan - left["4th"] = bind_cols(left) + i = np.random.default_rng(2).permutation(len(left)) + right = left.iloc[i].copy() - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i, :-1] - right["5th"] = -bind_cols(right) - right.set_index(icols, inplace=True) + left["4th"] = bind_cols(left) + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) + + run_asserts(left, right, sort) + + # inject some nulls + left.loc[1::4, "1st"] = np.nan + left.loc[2::5, "2nd"] = np.nan + left.loc[3::6, "3rd"] = np.nan + left["4th"] = bind_cols(left) + + i = np.random.default_rng(2).permutation(len(left)) + right = 
left.iloc[i, :-1] + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) - run_asserts(left, right, sort) + run_asserts(left, right, sort) @pytest.mark.parametrize("sort", [False, True]) def test_merge_right_vs_left(self, left, right, sort): @@ -193,7 +195,7 @@ def test_compress_group_combinations(self): # ~ 40000000 possible unique groups - key1 = tm.makeStringIndex(10000) + key1 = [str(i) for i in range(10000)] key1 = np.tile(key1, 2) key2 = key1[::-1] @@ -637,7 +639,7 @@ axis=0, sort=True, ).reindex(columns=expected.columns) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_join_multi_levels_invalid(self, portfolio, household): portfolio = portfolio.copy() @@ -741,10 +743,8 @@ expected = ( DataFrame( { - "household_id": [1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], + "household_id": [2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 2, 4], "asset_id": [ - "nl0000301109", - "nl0000301109", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", @@ -754,11 +754,11 @@ "lu0197800237", "lu0197800237", "nl0000289965", + "nl0000301109", + "nl0000301109", None, ], "t": [ - None, - None, 233, 234, 235, @@ -769,10 +769,10 @@ 181, None, None, + None, + None, ], "share": [ - 1.0, - 0.4, 0.6, 0.6, 0.6, @@ -783,10 +783,10 @@ 0.6, 0.25, 1.0, + 0.4, + 1.0, ], "log_return": [ - None, - None, 0.09604978, -0.06524096, 0.03532373, @@ -797,6 +797,8 @@ 0.036997, None, None, + None, + None, ], } ) @@ -815,9 +817,13 @@ class TestJoinMultiMulti: - def test_join_multi_multi( - self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi - ): + def test_join_multi_multi(self, left_multi, right_multi, join_type, on_cols_multi): + left_names = left_multi.index.names + right_names = right_multi.index.names + if join_type == "right": + level_order = right_names + left_names.difference(right_names) + else: + level_order = left_names + right_names.difference(left_names) # Multi-index join tests expected = ( merge( @@ -826,7 +832,7 @@ how=join_type, on=on_cols_multi, ) - .set_index(idx_cols_multi) + .set_index(level_order) .sort_index() ) @@ -834,11 +840,18 @@ tm.assert_frame_equal(result, expected) def test_join_multi_empty_frames( - self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi + self, left_multi, right_multi, join_type, on_cols_multi ): left_multi = left_multi.drop(columns=left_multi.columns) right_multi = right_multi.drop(columns=right_multi.columns) + left_names = left_multi.index.names + right_names = right_multi.index.names + if join_type == "right": + level_order = right_names + left_names.difference(right_names) + else: + level_order = left_names + right_names.difference(left_names) + expected = ( merge( left_multi.reset_index(), @@ -846,7 +859,7 @@ how=join_type, on=on_cols_multi, ) - .set_index(idx_cols_multi) + .set_index(level_order) .sort_index() ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/test_crosstab.py pandas-2.2.2+dfsg/pandas/tests/reshape/test_crosstab.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/test_crosstab.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/test_crosstab.py 2024-04-10 17:42:52.000000000 +0000 @@ -562,7 +562,7 @@ codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], names=["A", "B"], ) - expected_column = Index(["bar", "foo", "All"], dtype="object", name="C") + expected_column = Index(["bar", "foo", "All"], name="C") expected_data = np.array( [ [2.0, 2.0, 4.0], @@ -670,7 +670,7 @@ ) expected = DataFrame( np.array([0] * 29 + [1], 
dtype=float).reshape(10, 3), - columns=Index(["bar", "foo", "All"], dtype="object", name="C"), + columns=Index(["bar", "foo", "All"], name="C"), index=MultiIndex.from_tuples( [ ("one", "A"), @@ -722,7 +722,7 @@ codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], names=["A", "B"], ) - expected.columns = Index(["large", "small"], dtype="object", name="C") + expected.columns = Index(["large", "small"], name="C") tm.assert_frame_equal(result, expected) # normalize on columns @@ -737,9 +737,7 @@ [0, 0.4, 0.222222], ] ) - expected.columns = Index( - ["large", "small", "Sub-Total"], dtype="object", name="C" - ) + expected.columns = Index(["large", "small", "Sub-Total"], name="C") expected.index = MultiIndex( levels=[["bar", "foo"], ["one", "two"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], @@ -760,9 +758,7 @@ [0.444444, 0.555555, 1], ] ) - expected.columns = Index( - ["large", "small", "Sub-Total"], dtype="object", name="C" - ) + expected.columns = Index(["large", "small", "Sub-Total"], name="C") expected.index = MultiIndex( levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], @@ -887,7 +883,4 @@ if not a_is_cat: expected = expected.loc[[0, 2, "All"]] expected["All"] = expected["All"].astype("int64") - repr(result) - repr(expected) - repr(expected.loc[[0, 2, "All"]]) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/test_cut.py pandas-2.2.2+dfsg/pandas/tests/reshape/test_cut.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/test_cut.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/test_cut.py 2024-04-10 17:42:52.000000000 +0000 @@ -21,7 +21,7 @@ to_datetime, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype import pandas.core.reshape.tile as tmod @@ -359,7 +359,7 @@ IntervalIndex.from_breaks(exp_bins, closed="right").take( [0, 0, 0, 1, 1, 1, 2, 2, 2] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -370,7 +370,7 @@ expected = Series( IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -445,65 +445,79 @@ Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) bins = [conv(v) for v in bin_data] result = Series(cut(data, bins=bins)) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "data", - [ - to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])), - [ - np.datetime64("2013-01-01"), - np.datetime64("2013-01-02"), - np.datetime64("2013-01-03"), - ], - np.array( - [ - np.datetime64("2013-01-01"), - np.datetime64("2013-01-02"), - np.datetime64("2013-01-03"), - ] - ), - DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"]), - ], -) -def test_datetime_cut(data): +@pytest.mark.parametrize("box", [Series, Index, np.array, list]) +def test_datetime_cut(unit, box): # see gh-14714 # # Testing time data when it comes in various collection types. 
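# A standalone sketch (not part of this patch) of what the reworked test below
# exercises, assuming pandas 2.2: cut() now keeps the resolution of datetime
# input ("s", "ms", "us", "ns") in the returned bin edges instead of always
# upcasting to nanoseconds, hence the new `unit` parametrization.
import pandas as pd
data_us = pd.to_datetime(["2013-01-01", "2013-01-02", "2013-01-03"]).astype("M8[us]")
_, bins = pd.cut(data_us, 3, retbins=True)
print(bins.dtype)  # expected: datetime64[us] rather than datetime64[ns]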
+ data = to_datetime(["2013-01-01", "2013-01-02", "2013-01-03"]).astype(f"M8[{unit}]") + data = box(data) result, _ = cut(data, 3, retbins=True) - expected = Series( - IntervalIndex( + + if box is list: + # We don't (yet) do inference on these, so get nanos + unit = "ns" + + if unit == "s": + # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425 + # for why we round to 8 seconds instead of 7 + left = DatetimeIndex( + ["2012-12-31 23:57:08", "2013-01-01 16:00:00", "2013-01-02 08:00:00"], + dtype=f"M8[{unit}]", + ) + else: + left = DatetimeIndex( [ - Interval( - Timestamp("2012-12-31 23:57:07.200000"), - Timestamp("2013-01-01 16:00:00"), - ), - Interval( - Timestamp("2013-01-01 16:00:00"), Timestamp("2013-01-02 08:00:00") - ), - Interval( - Timestamp("2013-01-02 08:00:00"), Timestamp("2013-01-03 00:00:00") - ), - ] + "2012-12-31 23:57:07.200000", + "2013-01-01 16:00:00", + "2013-01-02 08:00:00", + ], + dtype=f"M8[{unit}]", ) - ).astype(CDT(ordered=True)) + right = DatetimeIndex( + ["2013-01-01 16:00:00", "2013-01-02 08:00:00", "2013-01-03 00:00:00"], + dtype=f"M8[{unit}]", + ) + + exp_intervals = IntervalIndex.from_arrays(left, right) + expected = Series(exp_intervals).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(Series(result), expected) -@pytest.mark.parametrize( - "bins", - [ - 3, +@pytest.mark.parametrize("box", [list, np.array, Index, Series]) +def test_datetime_tz_cut_mismatched_tzawareness(box): + # GH#54964 + bins = box( [ Timestamp("2013-01-01 04:57:07.200000"), Timestamp("2013-01-01 21:00:00"), Timestamp("2013-01-02 13:00:00"), Timestamp("2013-01-03 05:00:00"), + ] + ) + ser = Series(date_range("20130101", periods=3, tz="US/Eastern")) + + msg = "Cannot use timezone-naive bins with timezone-aware values" + with pytest.raises(ValueError, match=msg): + cut(ser, bins) + + +@pytest.mark.parametrize( + "bins", + [ + 3, + [ + Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"), + Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"), + Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"), + Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"), ], ], ) @@ -511,12 +525,12 @@ def test_datetime_tz_cut(bins, box): # see gh-19872 tz = "US/Eastern" - s = Series(date_range("20130101", periods=3, tz=tz)) + ser = Series(date_range("20130101", periods=3, tz=tz)) if not isinstance(bins, int): bins = box(bins) - result = cut(s, bins) + result = cut(ser, bins) expected = Series( IntervalIndex( [ @@ -534,7 +548,7 @@ ), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -558,17 +572,33 @@ @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"]) -def test_datetime_cut_roundtrip(tz): +def test_datetime_cut_roundtrip(tz, unit): # see gh-19891 - ser = Series(date_range("20180101", periods=3, tz=tz)) + ser = Series(date_range("20180101", periods=3, tz=tz, unit=unit)) result, result_bins = cut(ser, 2, retbins=True) expected = cut(ser, result_bins) tm.assert_series_equal(result, expected) - expected_bins = DatetimeIndex( - ["2017-12-31 23:57:07.200000", "2018-01-02 00:00:00", "2018-01-03 00:00:00"] - ) + if unit == "s": + # TODO: constructing DatetimeIndex with dtype="M8[s]" without truncating + # the first entry here raises in array_to_datetime. Should truncate + # instead of raising? 
+ # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425 + # for why we round to 8 seconds instead of 7 + expected_bins = DatetimeIndex( + ["2017-12-31 23:57:08", "2018-01-02 00:00:00", "2018-01-03 00:00:00"], + dtype=f"M8[{unit}]", + ) + else: + expected_bins = DatetimeIndex( + [ + "2017-12-31 23:57:07.200000", + "2018-01-02 00:00:00", + "2018-01-03 00:00:00", + ], + dtype=f"M8[{unit}]", + ) expected_bins = expected_bins.tz_localize(tz) tm.assert_index_equal(result_bins, expected_bins) @@ -668,10 +698,10 @@ def test_cut_unordered_with_series_labels(): # https://github.com/pandas-dev/pandas/issues/36603 - s = Series([1, 2, 3, 4, 5]) + ser = Series([1, 2, 3, 4, 5]) bins = Series([0, 2, 4, 6]) labels = Series(["a", "b", "c"]) - result = cut(s, bins=bins, labels=labels, ordered=False) + result = cut(ser, bins=bins, labels=labels, ordered=False) expected = Series(["a", "a", "b", "b", "c"], dtype="category") tm.assert_series_equal(result, expected) @@ -692,15 +722,15 @@ dtype="category", ).cat.as_ordered() - s = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0]) - result = cut(s, bins=[0, 2, 4], include_lowest=True) + ser = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0]) + result = cut(ser, bins=[0, 2, 4], include_lowest=True) tm.assert_series_equal(result, expected) def test_cut_with_nonexact_categorical_indices(): # GH 42424 - ser = Series(range(0, 100)) + ser = Series(range(100)) ser1 = cut(ser, 10).value_counts().head(5) ser2 = cut(ser, 10).value_counts().tail(5) result = DataFrame({"1": ser1, "2": ser2}) @@ -741,7 +771,7 @@ # https://github.com/pandas-dev/pandas/issues/46218 bins = interval_range(Timestamp("2022-02-25"), Timestamp("2022-02-27"), freq="1D") # passing Series instead of list is important to trigger bug - result = cut(Series([Timestamp("2022-02-26")]), bins=bins) + result = cut(Series([Timestamp("2022-02-26")]).astype("M8[ns]"), bins=bins) expected = Categorical.from_codes([0], bins, ordered=True) tm.assert_categorical_equal(result.array, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/test_from_dummies.py pandas-2.2.2+dfsg/pandas/tests/reshape/test_from_dummies.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/test_from_dummies.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/test_from_dummies.py 2024-04-10 17:42:52.000000000 +0000 @@ -328,9 +328,13 @@ ), ], ) -def test_no_prefix_string_cats_default_category(default_category, expected): +def test_no_prefix_string_cats_default_category( + default_category, expected, using_infer_string +): dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]}) result = from_dummies(dummies, default_category=default_category) + if using_infer_string: + expected[""] = expected[""].astype("string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/test_get_dummies.py pandas-2.2.2+dfsg/pandas/tests/reshape/test_get_dummies.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/test_get_dummies.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/test_get_dummies.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,13 +4,18 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_integer_dtype import pandas as pd from pandas import ( + ArrowDtype, Categorical, + CategoricalDtype, CategoricalIndex, DataFrame, + Index, RangeIndex, Series, SparseDtype, @@ -19,6 +24,11 @@ import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray +try: + import 
pyarrow as pa +except ImportError: + pa = None + class TestGetDummies: @pytest.fixture @@ -69,7 +79,7 @@ result = get_dummies(s_series_index, sparse=sparse, dtype=dtype) tm.assert_frame_equal(result, expected) - def test_get_dummies_basic_types(self, sparse, dtype): + def test_get_dummies_basic_types(self, sparse, dtype, using_infer_string): # GH 10531 s_list = list("abc") s_series = Series(s_list) @@ -110,7 +120,8 @@ result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype) - expected_counts = {"int64": 1, "object": 1} + key = "string" if using_infer_string else "object" + expected_counts = {"int64": 1, key: 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) expected = Series(expected_counts, name="count").sort_index() @@ -203,7 +214,7 @@ tm.assert_frame_equal(result, expected) - def test_dataframe_dummies_string_dtype(self, df): + def test_dataframe_dummies_string_dtype(self, df, using_infer_string): # GH44965 df = df[["A", "B"]] df = df.astype({"A": "object", "B": "string"}) @@ -217,6 +228,9 @@ }, dtype=bool, ) + if not using_infer_string: + # infer_string returns numpy bools + expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean") tm.assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self, df, sparse, dtype): @@ -693,3 +707,37 @@ dtype=any_numeric_ea_and_arrow_dtype, ) tm.assert_frame_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_get_dummies_ea_dtype(self): + # GH#56273 + for dtype, exp_dtype in [ + ("string[pyarrow]", "boolean"), + ("string[pyarrow_numpy]", "bool"), + (CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"), + (CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"), + ]: + df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) + result = get_dummies(df) + expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_get_dummies_arrow_dtype(self): + # GH#56273 + df = DataFrame({"name": Series(["a"], dtype=ArrowDtype(pa.string())), "x": 1}) + result = get_dummies(df) + expected = DataFrame({"x": 1, "name_a": Series([True], dtype="bool[pyarrow]")}) + tm.assert_frame_equal(result, expected) + + df = DataFrame( + { + "name": Series( + ["a"], + dtype=CategoricalDtype(Index(["a"], dtype=ArrowDtype(pa.string()))), + ), + "x": 1, + } + ) + result = get_dummies(df) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/test_melt.py pandas-2.2.2+dfsg/pandas/tests/reshape/test_melt.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/test_melt.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/test_melt.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,6 +6,8 @@ import pandas as pd from pandas import ( DataFrame, + Index, + date_range, lreshape, melt, wide_to_long, @@ -15,7 +17,11 @@ @pytest.fixture def df(): - res = tm.makeTimeDataFrame()[:10] + res = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), + ) res["id1"] = (res["A"] > 0).astype(np.int64) res["id2"] = (res["B"] > 0).astype(np.int64) return res @@ -281,7 +287,7 @@ @pytest.mark.parametrize( "col", [ - pd.Series(pd.date_range("2010", periods=5, tz="US/Pacific")), + pd.Series(date_range("2010", periods=5, tz="US/Pacific")), pd.Series(["a", "b", "c", "a", "d"], dtype="category"), pd.Series([0, 1, 0, 0, 0]), ], @@ -327,32 +333,28 @@ ) # Try 
to melt with missing `value_vars` column name - msg = "The following '{Var}' are not present in the DataFrame: {Col}" - with pytest.raises( - KeyError, match=msg.format(Var="value_vars", Col="\\['C'\\]") - ): + msg = "The following id_vars or value_vars are not present in the DataFrame:" + with pytest.raises(KeyError, match=msg): df.melt(["a", "b"], ["C", "d"]) # Try to melt with missing `id_vars` column name - with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['A'\\]")): + with pytest.raises(KeyError, match=msg): df.melt(["A", "b"], ["c", "d"]) # Multiple missing with pytest.raises( KeyError, - match=msg.format(Var="id_vars", Col="\\['not_here', 'or_there'\\]"), + match=msg, ): df.melt(["a", "b", "not_here", "or_there"], ["c", "d"]) # Multiindex melt fails if column is missing from multilevel melt multi = df.copy() multi.columns = [list("ABCD"), list("abcd")] - with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['E'\\]")): + with pytest.raises(KeyError, match=msg): multi.melt([("E", "a")], [("B", "b")]) # Multiindex fails if column is missing from single level melt - with pytest.raises( - KeyError, match=msg.format(Var="value_vars", Col="\\['F'\\]") - ): + with pytest.raises(KeyError, match=msg): multi.melt(["A"], ["F"], col_level=0) def test_melt_mixed_int_str_id_vars(self): @@ -400,11 +402,11 @@ def test_ignore_index_name_and_type(self): # GH 17440 - index = pd.Index(["foo", "bar"], dtype="category", name="baz") + index = Index(["foo", "bar"], dtype="category", name="baz") df = DataFrame({"x": [0, 1], "y": [2, 3]}, index=index) result = melt(df, ignore_index=False) - expected_index = pd.Index(["foo", "bar"] * 2, dtype="category", name="baz") + expected_index = Index(["foo", "bar"] * 2, dtype="category", name="baz") expected = DataFrame( {"variable": ["x", "x", "y", "y"], "value": [0, 1, 2, 3]}, index=expected_index, @@ -459,6 +461,81 @@ ) tm.assert_frame_equal(result, expected) + def test_melt_preserves_datetime(self): + df = DataFrame( + data=[ + { + "type": "A0", + "start_date": pd.Timestamp("2023/03/01", tz="Asia/Tokyo"), + "end_date": pd.Timestamp("2023/03/10", tz="Asia/Tokyo"), + }, + { + "type": "A1", + "start_date": pd.Timestamp("2023/03/01", tz="Asia/Tokyo"), + "end_date": pd.Timestamp("2023/03/11", tz="Asia/Tokyo"), + }, + ], + index=["aaaa", "bbbb"], + ) + result = df.melt( + id_vars=["type"], + value_vars=["start_date", "end_date"], + var_name="start/end", + value_name="date", + ) + expected = DataFrame( + { + "type": {0: "A0", 1: "A1", 2: "A0", 3: "A1"}, + "start/end": { + 0: "start_date", + 1: "start_date", + 2: "end_date", + 3: "end_date", + }, + "date": { + 0: pd.Timestamp("2023-03-01 00:00:00+0900", tz="Asia/Tokyo"), + 1: pd.Timestamp("2023-03-01 00:00:00+0900", tz="Asia/Tokyo"), + 2: pd.Timestamp("2023-03-10 00:00:00+0900", tz="Asia/Tokyo"), + 3: pd.Timestamp("2023-03-11 00:00:00+0900", tz="Asia/Tokyo"), + }, + } + ) + tm.assert_frame_equal(result, expected) + + def test_melt_allows_non_scalar_id_vars(self): + df = DataFrame( + data={"a": [1, 2, 3], "b": [4, 5, 6]}, + index=["11", "22", "33"], + ) + result = df.melt( + id_vars="a", + var_name=0, + value_name=1, + ) + expected = DataFrame({"a": [1, 2, 3], 0: ["b"] * 3, 1: [4, 5, 6]}) + tm.assert_frame_equal(result, expected) + + def test_melt_allows_non_string_var_name(self): + df = DataFrame( + data={"a": [1, 2, 3], "b": [4, 5, 6]}, + index=["11", "22", "33"], + ) + result = df.melt( + id_vars=["a"], + var_name=0, + value_name=1, + ) + expected = DataFrame({"a": [1, 2, 3], 0: ["b"] * 
3, 1: [4, 5, 6]}) + tm.assert_frame_equal(result, expected) + + def test_melt_non_scalar_var_name_raises(self): + df = DataFrame( + data={"a": [1, 2, 3], "b": [4, 5, 6]}, + index=["11", "22", "33"], + ) + with pytest.raises(ValueError, match=r".* must be a scalar."): + df.melt(id_vars=["a"], var_name=[1, 2]) + class TestLreshape: def test_pairs(self): @@ -1132,7 +1209,7 @@ j="num", sep="-", ) - index = pd.Index( + index = Index( [("1", 1), ("2", 1), ("1", 2), ("2", 2)], name=("id", "num"), ) @@ -1143,3 +1220,33 @@ new_level = expected.index.levels[0].astype(dtype) expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected) + + +def test_wide_to_long_pyarrow_string_columns(): + # GH 57066 + pytest.importorskip("pyarrow") + df = DataFrame( + { + "ID": {0: 1}, + "R_test1": {0: 1}, + "R_test2": {0: 1}, + "R_test3": {0: 2}, + "D": {0: 1}, + } + ) + df.columns = df.columns.astype("string[pyarrow_numpy]") + result = wide_to_long( + df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*" + ) + expected = DataFrame( + [[1, 1], [1, 1], [1, 2]], + columns=Index(["D", "R"], dtype=object), + index=pd.MultiIndex.from_arrays( + [ + [1, 1, 1], + Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"), + ], + names=["ID", "UNPIVOTED"], + ), + ) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/test_pivot.py pandas-2.2.2+dfsg/pandas/tests/reshape/test_pivot.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/test_pivot.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/test_pivot.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.errors import PerformanceWarning import pandas as pd @@ -23,7 +25,7 @@ date_range, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype from pandas.core.reshape import reshape as reshape_lib from pandas.core.reshape.pivot import pivot_table @@ -33,7 +35,7 @@ return request.param -@pytest.fixture(params=[([0] * 4, [1] * 4), (range(0, 3), range(1, 4))]) +@pytest.fixture(params=[([0] * 4, [1] * 4), (range(3), range(1, 4))]) def interval_values(request, closed): left, right = request.param return Categorical(pd.IntervalIndex.from_arrays(left, right, closed)) @@ -201,7 +203,9 @@ ["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True ) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - result = pivot_table(df, values="values", index=["A", "B"], dropna=True) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = pivot_table(df, values="values", index=["A", "B"], dropna=True) exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) expected = DataFrame({"values": [1.0, 2.0, 3.0, 4.0]}, index=exp_index) @@ -215,14 +219,18 @@ { "A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], "B": [1, 2, 3, 1, 2, 3, 1, 2, 3], - "C": range(0, 9), + "C": range(9), } ) - df["A"] = df["A"].astype(CDT(categories, ordered=False)) - result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) + df["A"] = df["A"].astype(CategoricalDtype(categories, ordered=False)) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) expected_columns = 
Series(["a", "b", "c"], name="A") - expected_columns = expected_columns.astype(CDT(categories, ordered=False)) + expected_columns = expected_columns.astype( + CategoricalDtype(categories, ordered=False) + ) expected_index = Series([1, 2, 3], name="B") expected = DataFrame( [[0.0, 3.0, 6.0], [1.0, 4.0, 7.0], [2.0, 5.0, 8.0]], @@ -248,7 +256,9 @@ } ) - result = df.pivot_table(index="A", values="B", dropna=dropna) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.pivot_table(index="A", values="B", dropna=dropna) if dropna: values = [2.0, 3.0] codes = [0, 1] @@ -281,7 +291,9 @@ } ) - result = df.pivot_table(index="A", values="B", dropna=dropna) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.pivot_table(index="A", values="B", dropna=dropna) expected = DataFrame( {"B": [2.0, 3.0, 0.0]}, index=Index( @@ -299,7 +311,10 @@ def test_pivot_with_interval_index(self, interval_values, dropna): # GH 25814 df = DataFrame({"A": interval_values, "B": 1}) - result = df.pivot_table(index="A", values="B", dropna=dropna) + + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.pivot_table(index="A", values="B", dropna=dropna) expected = DataFrame( {"B": 1.0}, index=Index(interval_values.unique(), name="A") ) @@ -320,9 +335,11 @@ } ) - pivot_tab = pivot_table( - df, index="C", columns="B", values="A", aggfunc="sum", margins=True - ) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + pivot_tab = pivot_table( + df, index="C", columns="B", values="A", aggfunc="sum", margins=True + ) result = pivot_tab["All"] expected = Series( @@ -434,19 +451,23 @@ }, index=idx, ) - res = df.pivot_table(index=df.index.month, columns=Grouper(key="dt", freq="M")) - exp_columns = MultiIndex.from_tuples([("A", pd.Timestamp("2011-01-31"))]) - exp_columns.names = [None, "dt"] + res = df.pivot_table(index=df.index.month, columns=Grouper(key="dt", freq="ME")) + exp_columns = MultiIndex.from_arrays( + [["A"], pd.DatetimeIndex(["2011-01-31"], dtype="M8[ns]")], + names=[None, "dt"], + ) exp = DataFrame( [3.25, 2.0], index=Index([1, 2], dtype=np.int32), columns=exp_columns ) tm.assert_frame_equal(res, exp) res = df.pivot_table( - index=Grouper(freq="A"), columns=Grouper(key="dt", freq="M") + index=Grouper(freq="YE"), columns=Grouper(key="dt", freq="ME") ) exp = DataFrame( - [3.0], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns + [3.0], + index=pd.DatetimeIndex(["2011-12-31"], freq="YE"), + columns=exp_columns, ) tm.assert_frame_equal(res, exp) @@ -541,40 +562,48 @@ tm.assert_frame_equal(result, pv.T) @pytest.mark.parametrize("method", [True, False]) - def test_pivot_with_tz(self, method): + def test_pivot_with_tz(self, method, unit): # GH 5878 df = DataFrame( { - "dt1": [ - datetime(2013, 1, 1, 9, 0), - datetime(2013, 1, 2, 9, 0), - datetime(2013, 1, 1, 9, 0), - datetime(2013, 1, 2, 9, 0), - ], - "dt2": [ - datetime(2014, 1, 1, 9, 0), - datetime(2014, 1, 1, 9, 0), - datetime(2014, 1, 2, 9, 0), - datetime(2014, 1, 2, 9, 0), - ], + "dt1": pd.DatetimeIndex( + [ + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + ], + dtype=f"M8[{unit}, US/Pacific]", + ), + "dt2": pd.DatetimeIndex( + [ + datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 1, 
9, 0), + datetime(2014, 1, 2, 9, 0), + datetime(2014, 1, 2, 9, 0), + ], + dtype=f"M8[{unit}, Asia/Tokyo]", + ), "data1": np.arange(4, dtype="int64"), "data2": np.arange(4, dtype="int64"), } ) - df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) - df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) - exp_col1 = Index(["data1", "data1", "data2", "data2"]) exp_col2 = pd.DatetimeIndex( - ["2014/01/01 09:00", "2014/01/02 09:00"] * 2, name="dt2", tz="Asia/Tokyo" + ["2014/01/01 09:00", "2014/01/02 09:00"] * 2, + name="dt2", + dtype=f"M8[{unit}, Asia/Tokyo]", ) exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) + exp_idx = pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], + name="dt1", + dtype=f"M8[{unit}, US/Pacific]", + ) expected = DataFrame( [[0, 2, 0, 2], [1, 3, 1, 3]], - index=pd.DatetimeIndex( - ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" - ), + index=exp_idx, columns=exp_col, ) @@ -586,12 +615,8 @@ expected = DataFrame( [[0, 2], [1, 3]], - index=pd.DatetimeIndex( - ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" - ), - columns=pd.DatetimeIndex( - ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" - ), + index=exp_idx, + columns=exp_col2[:2], ) if method: @@ -758,7 +783,8 @@ codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], names=[None, "bar"], ) - expected = DataFrame(data=data, index=index, columns=columns, dtype="object") + expected = DataFrame(data=data, index=index, columns=columns) + expected["baz"] = expected["baz"].astype(object) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( @@ -801,7 +827,8 @@ codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[None, "foo"], ) - expected = DataFrame(data=data, index=index, columns=columns, dtype="object") + expected = DataFrame(data=data, index=index, columns=columns) + expected["baz"] = expected["baz"].astype(object) tm.assert_frame_equal(result, expected) def test_pivot_columns_none_raise_error(self): @@ -926,7 +953,7 @@ # to help with a buglet data.columns = [k * 2 for k in data.columns] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") table = data.drop(columns="CC").pivot_table( @@ -999,7 +1026,7 @@ } ) if aggfunc != "sum": - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) if "B" not in columns: @@ -1063,7 +1090,7 @@ expected = DataFrame( [[4.0, 5.0, 6.0]], columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), - index=Index(["v"]), + index=Index(["v"], dtype=object), ) tm.assert_frame_equal(result, expected) @@ -1271,7 +1298,7 @@ expected = DataFrame( np.array([10, 18, 3], dtype="int64").reshape(1, 3), - index=pd.DatetimeIndex([datetime(2013, 12, 31)], freq="A"), + index=pd.DatetimeIndex([datetime(2013, 12, 31)], freq="YE"), columns="Carl Joe Mark".split(), ) expected.index.name = "Date" @@ -1279,7 +1306,7 @@ result = pivot_table( df, - index=Grouper(freq="A"), + index=Grouper(freq="YE"), columns="Buyer", values="Quantity", aggfunc="sum", @@ -1289,7 +1316,7 @@ result = pivot_table( df, index="Buyer", - columns=Grouper(freq="A"), + columns=Grouper(freq="YE"), values="Quantity", aggfunc="sum", ) @@ -1434,8 +1461,8 @@ result = pivot_table( 
df, - index=Grouper(freq="M", key="Date"), - columns=Grouper(freq="M", key="PayDay"), + index=Grouper(freq="ME", key="Date"), + columns=Grouper(freq="ME", key="PayDay"), values="Quantity", aggfunc="sum", ) @@ -1467,7 +1494,7 @@ datetime(2013, 11, 30), datetime(2013, 12, 31), ], - freq="M", + freq="ME", ), columns=pd.DatetimeIndex( [ @@ -1476,7 +1503,7 @@ datetime(2013, 11, 30), datetime(2013, 12, 31), ], - freq="M", + freq="ME", ), ) expected.index.name = "Date" @@ -1486,8 +1513,8 @@ result = pivot_table( df, - index=Grouper(freq="M", key="PayDay"), - columns=Grouper(freq="M", key="Date"), + index=Grouper(freq="ME", key="PayDay"), + columns=Grouper(freq="ME", key="Date"), values="Quantity", aggfunc="sum", ) @@ -1513,7 +1540,7 @@ result = pivot_table( df, - index=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], + index=[Grouper(freq="ME", key="Date"), Grouper(freq="ME", key="PayDay")], columns=["Branch"], values="Quantity", aggfunc="sum", @@ -1523,29 +1550,36 @@ result = pivot_table( df, index=["Branch"], - columns=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], + columns=[Grouper(freq="ME", key="Date"), Grouper(freq="ME", key="PayDay")], values="Quantity", aggfunc="sum", ) tm.assert_frame_equal(result, expected.T) def test_pivot_datetime_tz(self): - dates1 = [ - "2011-07-19 07:00:00", - "2011-07-19 08:00:00", - "2011-07-19 09:00:00", - "2011-07-19 07:00:00", - "2011-07-19 08:00:00", - "2011-07-19 09:00:00", - ] - dates2 = [ - "2013-01-01 15:00:00", - "2013-01-01 15:00:00", - "2013-01-01 15:00:00", - "2013-02-01 15:00:00", - "2013-02-01 15:00:00", - "2013-02-01 15:00:00", - ] + dates1 = pd.DatetimeIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ], + dtype="M8[ns, US/Pacific]", + name="dt1", + ) + dates2 = pd.DatetimeIndex( + [ + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + ], + dtype="M8[ns, Asia/Tokyo]", + ) df = DataFrame( { "label": ["a", "a", "a", "b", "b", "b"], @@ -1555,14 +1589,8 @@ "value2": [1, 2] * 3, } ) - df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) - df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) - exp_idx = pd.DatetimeIndex( - ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], - tz="US/Pacific", - name="dt1", - ) + exp_idx = dates1[:3] exp_col1 = Index(["value1", "value1"]) exp_col2 = Index(["a", "b"], name="label") exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) @@ -1576,7 +1604,7 @@ exp_col2 = Index(["value1", "value1", "value2", "value2"] * 2) exp_col3 = pd.DatetimeIndex( ["2013-01-01 15:00:00", "2013-02-01 15:00:00"] * 4, - tz="Asia/Tokyo", + dtype="M8[ns, Asia/Tokyo]", name="dt2", ) exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) @@ -1621,22 +1649,26 @@ def test_pivot_dtaccessor(self): # GH 8103 - dates1 = [ - "2011-07-19 07:00:00", - "2011-07-19 08:00:00", - "2011-07-19 09:00:00", - "2011-07-19 07:00:00", - "2011-07-19 08:00:00", - "2011-07-19 09:00:00", - ] - dates2 = [ - "2013-01-01 15:00:00", - "2013-01-01 15:00:00", - "2013-01-01 15:00:00", - "2013-02-01 15:00:00", - "2013-02-01 15:00:00", - "2013-02-01 15:00:00", - ] + dates1 = pd.DatetimeIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ] + ) + dates2 = 
pd.DatetimeIndex( + [ + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + ] + ) df = DataFrame( { "label": ["a", "a", "a", "b", "b", "b"], @@ -1646,8 +1678,6 @@ "value2": [1, 2] * 3, } ) - df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d)) - df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d)) result = pivot_table( df, index="label", columns=df["dt1"].dt.hour, values="value1" @@ -1708,39 +1738,38 @@ ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("i", range(1, 367)) - def test_daily(self, i): + def test_daily(self): rng = date_range("1/1/2000", "12/31/2004", freq="D") - ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) + ts = Series(np.arange(len(rng)), index=rng) - annual = pivot_table( + result = pivot_table( DataFrame(ts), index=ts.index.year, columns=ts.index.dayofyear ) - annual.columns = annual.columns.droplevel(0) + result.columns = result.columns.droplevel(0) doy = np.asarray(ts.index.dayofyear) - subset = ts[doy == i] - subset.index = subset.index.year - - result = annual[i].dropna() - tm.assert_series_equal(result, subset, check_names=False) - assert result.name == i - - @pytest.mark.parametrize("i", range(1, 13)) - def test_monthly(self, i): - rng = date_range("1/1/2000", "12/31/2004", freq="M") - ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) - - annual = pivot_table(DataFrame(ts), index=ts.index.year, columns=ts.index.month) - annual.columns = annual.columns.droplevel(0) - - month = ts.index.month - subset = ts[month == i] - subset.index = subset.index.year - result = annual[i].dropna() - tm.assert_series_equal(result, subset, check_names=False) - assert result.name == i + expected = {} + for y in ts.index.year.unique().values: + mask = ts.index.year == y + expected[y] = Series(ts.values[mask], index=doy[mask]) + expected = DataFrame(expected, dtype=float).T + tm.assert_frame_equal(result, expected) + + def test_monthly(self): + rng = date_range("1/1/2000", "12/31/2004", freq="ME") + ts = Series(np.arange(len(rng)), index=rng) + + result = pivot_table(DataFrame(ts), index=ts.index.year, columns=ts.index.month) + result.columns = result.columns.droplevel(0) + + month = np.asarray(ts.index.month) + expected = {} + for y in ts.index.year.unique().values: + mask = ts.index.year == y + expected[y] = Series(ts.values[mask], index=month[mask]) + expected = DataFrame(expected, dtype=float).T + tm.assert_frame_equal(result, expected) def test_pivot_table_with_iterator_values(self, data): # GH 12017 @@ -1768,7 +1797,7 @@ { "item": ["bacon", "cheese", "bacon", "cheese"], "cost": [2.5, 4.5, 3.2, 3.3], - "day": ["M", "M", "T", "T"], + "day": ["ME", "ME", "T", "T"], } ) table = costs.pivot_table( @@ -1778,12 +1807,12 @@ margins_name=margins_name, aggfunc=["mean", "max"], ) - ix = Index(["bacon", "cheese", margins_name], dtype="object", name="item") + ix = Index(["bacon", "cheese", margins_name], name="item") tups = [ - ("mean", "cost", "M"), + ("mean", "cost", "ME"), ("mean", "cost", "T"), ("mean", "cost", margins_name), - ("max", "cost", "M"), + ("max", "cost", "ME"), ("max", "cost", "T"), ("max", "cost", margins_name), ] @@ -1815,7 +1844,9 @@ df.y = df.y.astype("category") df.z = df.z.astype("category") - table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + table = 
df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) def test_margins_casted_to_float(self): @@ -1877,9 +1908,11 @@ {"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]} ) df["C1"] = df["C1"].astype("category") - result = df.pivot_table( - "V", index="C1", columns="C2", dropna=observed, aggfunc="count" - ) + msg = "The default value of observed=False is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df.pivot_table( + "V", index="C1", columns="C2", dropna=observed, aggfunc="count" + ) expected_index = pd.CategoricalIndex( ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1" @@ -1966,7 +1999,7 @@ def test_pivot_margins_name_unicode(self): # issue #13292 greek = "\u0394\u03bf\u03ba\u03b9\u03bc\u03ae" - frame = DataFrame({"foo": [1, 2, 3]}) + frame = DataFrame({"foo": [1, 2, 3]}, columns=Index(["foo"], dtype=object)) table = pivot_table( frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek ) @@ -2491,12 +2524,12 @@ expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(result, expected, check_names=False) - def test_pivot_integer_bug(self): - df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")]) + @pytest.mark.parametrize("dtype", [object, "string"]) + def test_pivot_integer_bug(self, dtype): + df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype) result = df.pivot(index=1, columns=0, values=2) - repr(result) - tm.assert_index_equal(result.columns, Index(["A", "B"], name=0)) + tm.assert_index_equal(result.columns, Index(["A", "B"], name=0, dtype=dtype)) def test_pivot_index_none(self): # GH#3962 @@ -2578,6 +2611,7 @@ with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() # pylint: disable=missing-kwoa + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") def test_pivot_columns_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2593,6 +2627,7 @@ expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") def test_pivot_index_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2606,6 +2641,7 @@ expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="None is cast to NaN") def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2661,3 +2697,18 @@ names=["a", "date"], ) tm.assert_index_equal(pivot.index, expected) + + def test_pivot_table_with_margins_and_numeric_column_names(self): + # GH#26568 + df = DataFrame([["a", "x", 1], ["a", "y", 2], ["b", "y", 3], ["b", "z", 4]]) + + result = df.pivot_table( + index=0, columns=1, values=2, aggfunc="sum", fill_value=0, margins=True + ) + + expected = DataFrame( + [[1, 2, 0, 3], [0, 3, 4, 7], [1, 5, 4, 10]], + columns=Index(["x", "y", "z", "All"], name=1), + index=Index(["a", "b", "All"], name=0), + ) + tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/test_qcut.py pandas-2.2.2+dfsg/pandas/tests/reshape/test_qcut.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/test_qcut.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/test_qcut.py 2024-04-10 17:42:52.000000000 +0000 @@ -11,6 +11,7 @@ IntervalIndex, NaT, Series, + Timedelta, TimedeltaIndex, Timestamp, cut, 
@@ -20,12 +21,9 @@ timedelta_range, ) import pandas._testing as tm -from pandas.api.types import CategoricalDtype as CDT +from pandas.api.types import CategoricalDtype -from pandas.tseries.offsets import ( - Day, - Nano, -) +from pandas.tseries.offsets import Day def test_qcut(): @@ -129,7 +127,9 @@ exp_levels = np.array( [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)] ) - exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True)) + exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype( + CategoricalDtype(ordered=True) + ) tm.assert_series_equal(res, exp) @@ -199,7 +199,7 @@ if labels is None: intervals = IntervalIndex([Interval(start, end)] * length, closed="right") - expected = Series(intervals).astype(CDT(ordered=True)) + expected = Series(intervals).astype(CategoricalDtype(ordered=True)) else: expected = Series([0] * length, dtype=np.intp) @@ -214,11 +214,14 @@ ], ids=lambda x: str(x.dtype), ) -def test_qcut_nat(ser): +def test_qcut_nat(ser, unit): # see gh-19768 - intervals = IntervalIndex.from_tuples( - [(ser[0] - Nano(), ser[2] - Day()), np.nan, (ser[2] - Day(), ser[2])] - ) + ser = ser.dt.as_unit(unit) + td = Timedelta(1, unit=unit).as_unit(unit) + + left = Series([ser[0] - td, np.nan, ser[2] - Day()], dtype=ser.dtype) + right = Series([ser[2] - Day(), np.nan, ser[2]], dtype=ser.dtype) + intervals = IntervalIndex.from_arrays(left, right) expected = Series(Categorical(intervals, ordered=True)) result = qcut(ser, 2) @@ -249,7 +252,7 @@ ), ] ) - ).astype(CDT(ordered=True)) + ).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/reshape/test_union_categoricals.py pandas-2.2.2+dfsg/pandas/tests/reshape/test_union_categoricals.py --- pandas-2.1.4+dfsg/pandas/tests/reshape/test_union_categoricals.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/reshape/test_union_categoricals.py 2024-04-10 17:42:52.000000000 +0000 @@ -110,7 +110,7 @@ res = union_categoricals( [ Categorical(np.array([np.nan, np.nan], dtype=object)), - Categorical(["X"]), + Categorical(["X"], categories=pd.Index(["X"], dtype=object)), ] ) exp = Categorical([np.nan, np.nan, "X"]) @@ -123,8 +123,10 @@ tm.assert_categorical_equal(res, exp) @pytest.mark.parametrize("val", [[], ["1"]]) - def test_union_categoricals_empty(self, val): + def test_union_categoricals_empty(self, val, request, using_infer_string): # GH 13759 + if using_infer_string and val == ["1"]: + request.applymarker(pytest.mark.xfail("object and strings dont match")) res = union_categoricals([Categorical([]), Categorical(val)]) exp = Categorical(val) tm.assert_categorical_equal(res, exp) diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_arithmetic.py pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_arithmetic.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_arithmetic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_arithmetic.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,56 +8,185 @@ Timedelta, Timestamp, ) +import pandas._testing as tm -@pytest.mark.parametrize("method", ["__add__", "__sub__"]) -@pytest.mark.parametrize( - "interval", - [ - Interval(Timestamp("2017-01-01 00:00:00"), Timestamp("2018-01-01 00:00:00")), - Interval(Timedelta(days=7), Timedelta(days=14)), - ], -) -@pytest.mark.parametrize( - "delta", [Timedelta(days=7), timedelta(7), np.timedelta64(7, "D")] -) -def test_time_interval_add_subtract_timedelta(interval, delta, 
method): - # https://github.com/pandas-dev/pandas/issues/32023 - result = getattr(interval, method)(delta) - left = getattr(interval.left, method)(delta) - right = getattr(interval.right, method)(delta) - expected = Interval(left, right) - - assert result == expected +class TestIntervalArithmetic: + def test_interval_add(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(1, 2, closed=closed) + result = interval + 1 + assert result == expected -@pytest.mark.parametrize("interval", [Interval(1, 2), Interval(1.0, 2.0)]) -@pytest.mark.parametrize( - "delta", [Timedelta(days=7), timedelta(7), np.timedelta64(7, "D")] -) -def test_numeric_interval_add_timedelta_raises(interval, delta): - # https://github.com/pandas-dev/pandas/issues/32023 - msg = "|".join( - [ - "unsupported operand", - "cannot use operands", - "Only numeric, Timestamp and Timedelta endpoints are allowed", - ] - ) - with pytest.raises((TypeError, ValueError), match=msg): - interval + delta + result = 1 + interval + assert result == expected + + result = interval + result += 1 + assert result == expected + + msg = r"unsupported operand type\(s\) for \+" + with pytest.raises(TypeError, match=msg): + interval + interval + + with pytest.raises(TypeError, match=msg): + interval + "foo" + + def test_interval_sub(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(-1, 0, closed=closed) + + result = interval - 1 + assert result == expected + + result = interval + result -= 1 + assert result == expected + + msg = r"unsupported operand type\(s\) for -" + with pytest.raises(TypeError, match=msg): + interval - interval + + with pytest.raises(TypeError, match=msg): + interval - "foo" + + def test_interval_mult(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(0, 2, closed=closed) + + result = interval * 2 + assert result == expected - with pytest.raises((TypeError, ValueError), match=msg): - delta + interval + result = 2 * interval + assert result == expected + result = interval + result *= 2 + assert result == expected -@pytest.mark.parametrize("klass", [timedelta, np.timedelta64, Timedelta]) -def test_timedelta_add_timestamp_interval(klass): - delta = klass(0) - expected = Interval(Timestamp("2020-01-01"), Timestamp("2020-02-01")) + msg = r"unsupported operand type\(s\) for \*" + with pytest.raises(TypeError, match=msg): + interval * interval - result = delta + expected - assert result == expected + msg = r"can\'t multiply sequence by non-int" + with pytest.raises(TypeError, match=msg): + interval * "foo" - result = expected + delta - assert result == expected + def test_interval_div(self, closed): + interval = Interval(0, 1, closed=closed) + expected = Interval(0, 0.5, closed=closed) + + result = interval / 2.0 + assert result == expected + + result = interval + result /= 2.0 + assert result == expected + + msg = r"unsupported operand type\(s\) for /" + with pytest.raises(TypeError, match=msg): + interval / interval + + with pytest.raises(TypeError, match=msg): + interval / "foo" + + def test_interval_floordiv(self, closed): + interval = Interval(1, 2, closed=closed) + expected = Interval(0, 1, closed=closed) + + result = interval // 2 + assert result == expected + + result = interval + result //= 2 + assert result == expected + + msg = r"unsupported operand type\(s\) for //" + with pytest.raises(TypeError, match=msg): + interval // interval + + with pytest.raises(TypeError, match=msg): + interval // "foo" + + @pytest.mark.parametrize("method", 
["__add__", "__sub__"]) + @pytest.mark.parametrize( + "interval", + [ + Interval( + Timestamp("2017-01-01 00:00:00"), Timestamp("2018-01-01 00:00:00") + ), + Interval(Timedelta(days=7), Timedelta(days=14)), + ], + ) + @pytest.mark.parametrize( + "delta", [Timedelta(days=7), timedelta(7), np.timedelta64(7, "D")] + ) + def test_time_interval_add_subtract_timedelta(self, interval, delta, method): + # https://github.com/pandas-dev/pandas/issues/32023 + result = getattr(interval, method)(delta) + left = getattr(interval.left, method)(delta) + right = getattr(interval.right, method)(delta) + expected = Interval(left, right) + + assert result == expected + + @pytest.mark.parametrize("interval", [Interval(1, 2), Interval(1.0, 2.0)]) + @pytest.mark.parametrize( + "delta", [Timedelta(days=7), timedelta(7), np.timedelta64(7, "D")] + ) + def test_numeric_interval_add_timedelta_raises(self, interval, delta): + # https://github.com/pandas-dev/pandas/issues/32023 + msg = "|".join( + [ + "unsupported operand", + "cannot use operands", + "Only numeric, Timestamp and Timedelta endpoints are allowed", + ] + ) + with pytest.raises((TypeError, ValueError), match=msg): + interval + delta + + with pytest.raises((TypeError, ValueError), match=msg): + delta + interval + + @pytest.mark.parametrize("klass", [timedelta, np.timedelta64, Timedelta]) + def test_timedelta_add_timestamp_interval(self, klass): + delta = klass(0) + expected = Interval(Timestamp("2020-01-01"), Timestamp("2020-02-01")) + + result = delta + expected + assert result == expected + + result = expected + delta + assert result == expected + + +class TestIntervalComparisons: + def test_interval_equal(self): + assert Interval(0, 1) == Interval(0, 1, closed="right") + assert Interval(0, 1) != Interval(0, 1, closed="left") + assert Interval(0, 1) != 0 + + def test_interval_comparison(self): + msg = ( + "'<' not supported between instances of " + "'pandas._libs.interval.Interval' and 'int'" + ) + with pytest.raises(TypeError, match=msg): + Interval(0, 1) < 2 + + assert Interval(0, 1) < Interval(1, 2) + assert Interval(0, 1) < Interval(0, 2) + assert Interval(0, 1) < Interval(0.5, 1.5) + assert Interval(0, 1) <= Interval(0, 1) + assert Interval(0, 1) > Interval(-1, 2) + assert Interval(0, 1) >= Interval(0, 1) + + def test_equality_comparison_broadcasts_over_array(self): + # https://github.com/pandas-dev/pandas/issues/35931 + interval = Interval(0, 1) + arr = np.array([interval, interval]) + result = interval == arr + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_constructors.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,51 @@ +import pytest + +from pandas import ( + Interval, + Period, + Timestamp, +) + + +class TestIntervalConstructors: + @pytest.mark.parametrize( + "left, right", + [ + ("a", "z"), + (("a", "b"), ("c", "d")), + (list("AB"), list("ab")), + (Interval(0, 1), Interval(1, 2)), + (Period("2018Q1", freq="Q"), Period("2018Q1", freq="Q")), + ], + ) + def test_construct_errors(self, left, right): + # GH#23013 + msg = "Only numeric, Timestamp and Timedelta endpoints are allowed" + with pytest.raises(ValueError, match=msg): + Interval(left, right) + + def test_constructor_errors(self): + msg = "invalid 
option for 'closed': foo" + with pytest.raises(ValueError, match=msg): + Interval(0, 1, closed="foo") + + msg = "left side of interval must be <= right side" + with pytest.raises(ValueError, match=msg): + Interval(1, 0) + + @pytest.mark.parametrize( + "tz_left, tz_right", [(None, "UTC"), ("UTC", None), ("UTC", "US/Eastern")] + ) + def test_constructor_errors_tz(self, tz_left, tz_right): + # GH#18538 + left = Timestamp("2017-01-01", tz=tz_left) + right = Timestamp("2017-01-02", tz=tz_right) + + if tz_left is None or tz_right is None: + error = TypeError + msg = "Cannot compare tz-naive and tz-aware timestamps" + else: + error = ValueError + msg = "left and right must have the same time zone" + with pytest.raises(error, match=msg): + Interval(left, right) diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_contains.py pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_contains.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_contains.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_contains.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,73 @@ +import pytest + +from pandas import ( + Interval, + Timedelta, + Timestamp, +) + + +class TestContains: + def test_contains(self): + interval = Interval(0, 1) + assert 0.5 in interval + assert 1 in interval + assert 0 not in interval + + interval_both = Interval(0, 1, "both") + assert 0 in interval_both + assert 1 in interval_both + + interval_neither = Interval(0, 1, closed="neither") + assert 0 not in interval_neither + assert 0.5 in interval_neither + assert 1 not in interval_neither + + def test_contains_interval(self, inclusive_endpoints_fixture): + interval1 = Interval(0, 1, "both") + interval2 = Interval(0, 1, inclusive_endpoints_fixture) + assert interval1 in interval1 + assert interval2 in interval2 + assert interval2 in interval1 + assert interval1 not in interval2 or inclusive_endpoints_fixture == "both" + + def test_contains_infinite_length(self): + interval1 = Interval(0, 1, "both") + interval2 = Interval(float("-inf"), float("inf"), "neither") + assert interval1 in interval2 + assert interval2 not in interval1 + + def test_contains_zero_length(self): + interval1 = Interval(0, 1, "both") + interval2 = Interval(-1, -1, "both") + interval3 = Interval(0.5, 0.5, "both") + assert interval2 not in interval1 + assert interval3 in interval1 + assert interval2 not in interval3 and interval3 not in interval2 + assert interval1 not in interval2 and interval1 not in interval3 + + @pytest.mark.parametrize( + "type1", + [ + (0, 1), + (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)), + (Timedelta("0h"), Timedelta("1h")), + ], + ) + @pytest.mark.parametrize( + "type2", + [ + (0, 1), + (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)), + (Timedelta("0h"), Timedelta("1h")), + ], + ) + def test_contains_mixed_types(self, type1, type2): + interval1 = Interval(*type1) + interval2 = Interval(*type2) + if type1 == type2: + assert interval1 in interval2 + else: + msg = "^'<=' not supported between instances of" + with pytest.raises(TypeError, match=msg): + interval1 in interval2 diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_formats.py pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_formats.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_formats.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_formats.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,11 @@ +from pandas import Interval + + +def 
test_interval_repr(): + interval = Interval(0, 1) + assert repr(interval) == "Interval(0, 1, closed='right')" + assert str(interval) == "(0, 1]" + + interval_left = Interval(0, 1, closed="left") + assert repr(interval_left) == "Interval(0, 1, closed='left')" + assert str(interval_left) == "[0, 1)" diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_interval.py pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_interval.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_interval.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_interval.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,12 +3,9 @@ from pandas import ( Interval, - Period, Timedelta, Timestamp, ) -import pandas._testing as tm -import pandas.core.common as com @pytest.fixture @@ -23,48 +20,6 @@ assert interval.right == 1 assert interval.mid == 0.5 - def test_repr(self, interval): - assert repr(interval) == "Interval(0, 1, closed='right')" - assert str(interval) == "(0, 1]" - - interval_left = Interval(0, 1, closed="left") - assert repr(interval_left) == "Interval(0, 1, closed='left')" - assert str(interval_left) == "[0, 1)" - - def test_contains(self, interval): - assert 0.5 in interval - assert 1 in interval - assert 0 not in interval - - interval_both = Interval(0, 1, "both") - assert 0 in interval_both - assert 1 in interval_both - - interval_neither = Interval(0, 1, closed="neither") - assert 0 not in interval_neither - assert 0.5 in interval_neither - assert 1 not in interval_neither - - def test_equal(self): - assert Interval(0, 1) == Interval(0, 1, closed="right") - assert Interval(0, 1) != Interval(0, 1, closed="left") - assert Interval(0, 1) != 0 - - def test_comparison(self): - msg = ( - "'<' not supported between instances of " - "'pandas._libs.interval.Interval' and 'int'" - ) - with pytest.raises(TypeError, match=msg): - Interval(0, 1) < 2 - - assert Interval(0, 1) < Interval(1, 2) - assert Interval(0, 1) < Interval(0, 2) - assert Interval(0, 1) < Interval(0.5, 1.5) - assert Interval(0, 1) <= Interval(0, 1) - assert Interval(0, 1) > Interval(-1, 2) - assert Interval(0, 1) >= Interval(0, 1) - def test_hash(self, interval): # should not raise hash(interval) @@ -80,8 +35,8 @@ (-np.inf, np.inf, np.inf), (Timedelta("0 days"), Timedelta("5 days"), Timedelta("5 days")), (Timedelta("10 days"), Timedelta("10 days"), Timedelta("0 days")), - (Timedelta("1H10min"), Timedelta("5H5min"), Timedelta("3H55min")), - (Timedelta("5S"), Timedelta("1H"), Timedelta("59min55S")), + (Timedelta("1h10min"), Timedelta("5h5min"), Timedelta("3h55min")), + (Timedelta("5s"), Timedelta("1h"), Timedelta("59min55s")), ], ) def test_length(self, left, right, expected): @@ -130,150 +85,3 @@ result = iv.is_empty expected = closed != "both" assert result is expected - - @pytest.mark.parametrize( - "left, right", - [ - ("a", "z"), - (("a", "b"), ("c", "d")), - (list("AB"), list("ab")), - (Interval(0, 1), Interval(1, 2)), - (Period("2018Q1", freq="Q"), Period("2018Q1", freq="Q")), - ], - ) - def test_construct_errors(self, left, right): - # GH 23013 - msg = "Only numeric, Timestamp and Timedelta endpoints are allowed" - with pytest.raises(ValueError, match=msg): - Interval(left, right) - - def test_math_add(self, closed): - interval = Interval(0, 1, closed=closed) - expected = Interval(1, 2, closed=closed) - - result = interval + 1 - assert result == expected - - result = 1 + interval - assert result == expected - - result = interval - result += 1 - assert result == expected - - msg = 
r"unsupported operand type\(s\) for \+" - with pytest.raises(TypeError, match=msg): - interval + interval - - with pytest.raises(TypeError, match=msg): - interval + "foo" - - def test_math_sub(self, closed): - interval = Interval(0, 1, closed=closed) - expected = Interval(-1, 0, closed=closed) - - result = interval - 1 - assert result == expected - - result = interval - result -= 1 - assert result == expected - - msg = r"unsupported operand type\(s\) for -" - with pytest.raises(TypeError, match=msg): - interval - interval - - with pytest.raises(TypeError, match=msg): - interval - "foo" - - def test_math_mult(self, closed): - interval = Interval(0, 1, closed=closed) - expected = Interval(0, 2, closed=closed) - - result = interval * 2 - assert result == expected - - result = 2 * interval - assert result == expected - - result = interval - result *= 2 - assert result == expected - - msg = r"unsupported operand type\(s\) for \*" - with pytest.raises(TypeError, match=msg): - interval * interval - - msg = r"can\'t multiply sequence by non-int" - with pytest.raises(TypeError, match=msg): - interval * "foo" - - def test_math_div(self, closed): - interval = Interval(0, 1, closed=closed) - expected = Interval(0, 0.5, closed=closed) - - result = interval / 2.0 - assert result == expected - - result = interval - result /= 2.0 - assert result == expected - - msg = r"unsupported operand type\(s\) for /" - with pytest.raises(TypeError, match=msg): - interval / interval - - with pytest.raises(TypeError, match=msg): - interval / "foo" - - def test_math_floordiv(self, closed): - interval = Interval(1, 2, closed=closed) - expected = Interval(0, 1, closed=closed) - - result = interval // 2 - assert result == expected - - result = interval - result //= 2 - assert result == expected - - msg = r"unsupported operand type\(s\) for //" - with pytest.raises(TypeError, match=msg): - interval // interval - - with pytest.raises(TypeError, match=msg): - interval // "foo" - - def test_constructor_errors(self): - msg = "invalid option for 'closed': foo" - with pytest.raises(ValueError, match=msg): - Interval(0, 1, closed="foo") - - msg = "left side of interval must be <= right side" - with pytest.raises(ValueError, match=msg): - Interval(1, 0) - - @pytest.mark.parametrize( - "tz_left, tz_right", [(None, "UTC"), ("UTC", None), ("UTC", "US/Eastern")] - ) - def test_constructor_errors_tz(self, tz_left, tz_right): - # GH 18538 - left = Timestamp("2017-01-01", tz=tz_left) - right = Timestamp("2017-01-02", tz=tz_right) - - if com.any_none(tz_left, tz_right): - error = TypeError - msg = "Cannot compare tz-naive and tz-aware timestamps" - else: - error = ValueError - msg = "left and right must have the same time zone" - with pytest.raises(error, match=msg): - Interval(left, right) - - def test_equality_comparison_broadcasts_over_array(self): - # https://github.com/pandas-dev/pandas/issues/35931 - interval = Interval(0, 1) - arr = np.array([interval, interval]) - result = interval == arr - expected = np.array([True, True]) - tm.assert_numpy_array_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_ops.py pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_ops.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_ops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_ops.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,119 +0,0 @@ -"""Tests for Interval-Interval operations, such as overlaps, contains, etc.""" -import pytest - -from pandas import ( - 
Interval, - Timedelta, - Timestamp, -) - - -@pytest.fixture( - params=[ - (Timedelta("0 days"), Timedelta("1 day")), - (Timestamp("2018-01-01"), Timedelta("1 day")), - (0, 1), - ], - ids=lambda x: type(x[0]).__name__, -) -def start_shift(request): - """ - Fixture for generating intervals of types from a start value and a shift - value that can be added to start to generate an endpoint - """ - return request.param - - -class TestOverlaps: - def test_overlaps_self(self, start_shift, closed): - start, shift = start_shift - interval = Interval(start, start + shift, closed) - assert interval.overlaps(interval) - - def test_overlaps_nested(self, start_shift, closed, other_closed): - start, shift = start_shift - interval1 = Interval(start, start + 3 * shift, other_closed) - interval2 = Interval(start + shift, start + 2 * shift, closed) - - # nested intervals should always overlap - assert interval1.overlaps(interval2) - - def test_overlaps_disjoint(self, start_shift, closed, other_closed): - start, shift = start_shift - interval1 = Interval(start, start + shift, other_closed) - interval2 = Interval(start + 2 * shift, start + 3 * shift, closed) - - # disjoint intervals should never overlap - assert not interval1.overlaps(interval2) - - def test_overlaps_endpoint(self, start_shift, closed, other_closed): - start, shift = start_shift - interval1 = Interval(start, start + shift, other_closed) - interval2 = Interval(start + shift, start + 2 * shift, closed) - - # overlap if shared endpoint is closed for both (overlap at a point) - result = interval1.overlaps(interval2) - expected = interval1.closed_right and interval2.closed_left - assert result == expected - - @pytest.mark.parametrize( - "other", - [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")], - ids=lambda x: type(x).__name__, - ) - def test_overlaps_invalid_type(self, other): - interval = Interval(0, 1) - msg = f"`other` must be an Interval, got {type(other).__name__}" - with pytest.raises(TypeError, match=msg): - interval.overlaps(other) - - -class TestContains: - def test_contains_interval(self, inclusive_endpoints_fixture): - interval1 = Interval(0, 1, "both") - interval2 = Interval(0, 1, inclusive_endpoints_fixture) - assert interval1 in interval1 - assert interval2 in interval2 - assert interval2 in interval1 - assert interval1 not in interval2 or inclusive_endpoints_fixture == "both" - - def test_contains_infinite_length(self): - interval1 = Interval(0, 1, "both") - interval2 = Interval(float("-inf"), float("inf"), "neither") - assert interval1 in interval2 - assert interval2 not in interval1 - - def test_contains_zero_length(self): - interval1 = Interval(0, 1, "both") - interval2 = Interval(-1, -1, "both") - interval3 = Interval(0.5, 0.5, "both") - assert interval2 not in interval1 - assert interval3 in interval1 - assert interval2 not in interval3 and interval3 not in interval2 - assert interval1 not in interval2 and interval1 not in interval3 - - @pytest.mark.parametrize( - "type1", - [ - (0, 1), - (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)), - (Timedelta("0h"), Timedelta("1h")), - ], - ) - @pytest.mark.parametrize( - "type2", - [ - (0, 1), - (Timestamp(2000, 1, 1, 0), Timestamp(2000, 1, 1, 1)), - (Timedelta("0h"), Timedelta("1h")), - ], - ) - def test_contains_mixed_types(self, type1, type2): - interval1 = Interval(*type1) - interval2 = Interval(*type2) - if type1 == type2: - assert interval1 in interval2 - else: - msg = "^'<=' not supported between instances of" - with pytest.raises(TypeError, match=msg): - 
interval1 in interval2 diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_overlaps.py pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_overlaps.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/interval/test_overlaps.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/interval/test_overlaps.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,67 @@ +import pytest + +from pandas import ( + Interval, + Timedelta, + Timestamp, +) + + +@pytest.fixture( + params=[ + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timedelta("1 day")), + (0, 1), + ], + ids=lambda x: type(x[0]).__name__, +) +def start_shift(request): + """ + Fixture for generating intervals of types from a start value and a shift + value that can be added to start to generate an endpoint + """ + return request.param + + +class TestOverlaps: + def test_overlaps_self(self, start_shift, closed): + start, shift = start_shift + interval = Interval(start, start + shift, closed) + assert interval.overlaps(interval) + + def test_overlaps_nested(self, start_shift, closed, other_closed): + start, shift = start_shift + interval1 = Interval(start, start + 3 * shift, other_closed) + interval2 = Interval(start + shift, start + 2 * shift, closed) + + # nested intervals should always overlap + assert interval1.overlaps(interval2) + + def test_overlaps_disjoint(self, start_shift, closed, other_closed): + start, shift = start_shift + interval1 = Interval(start, start + shift, other_closed) + interval2 = Interval(start + 2 * shift, start + 3 * shift, closed) + + # disjoint intervals should never overlap + assert not interval1.overlaps(interval2) + + def test_overlaps_endpoint(self, start_shift, closed, other_closed): + start, shift = start_shift + interval1 = Interval(start, start + shift, other_closed) + interval2 = Interval(start + shift, start + 2 * shift, closed) + + # overlap if shared endpoint is closed for both (overlap at a point) + result = interval1.overlaps(interval2) + expected = interval1.closed_right and interval2.closed_left + assert result == expected + + @pytest.mark.parametrize( + "other", + [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")], + ids=lambda x: type(x).__name__, + ) + def test_overlaps_invalid_type(self, other): + interval = Interval(0, 1) + msg = f"`other` must be an Interval, got {type(other).__name__}" + with pytest.raises(TypeError, match=msg): + interval.overlaps(other) diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/period/test_arithmetic.py pandas-2.2.2+dfsg/pandas/tests/scalar/period/test_arithmetic.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/period/test_arithmetic.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/period/test_arithmetic.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,486 @@ +from datetime import timedelta + +import numpy as np +import pytest + +from pandas._libs.tslibs.period import IncompatibleFrequency + +from pandas import ( + NaT, + Period, + Timedelta, + Timestamp, + offsets, +) + + +class TestPeriodArithmetic: + def test_add_overflow_raises(self): + # GH#55503 + per = Timestamp.max.to_period("ns") + + msg = "|".join( + [ + "Python int too large to convert to C long", + # windows, 32bit linux builds + "int too big to convert", + ] + ) + with pytest.raises(OverflowError, match=msg): + per + 1 + + msg = "value too large" + with pytest.raises(OverflowError, match=msg): + per + Timedelta(1) + with pytest.raises(OverflowError, match=msg): + per + offsets.Nano(1) + + def 
test_period_add_integer(self): + per1 = Period(freq="D", year=2008, month=1, day=1) + per2 = Period(freq="D", year=2008, month=1, day=2) + assert per1 + 1 == per2 + assert 1 + per1 == per2 + + def test_period_add_invalid(self): + # GH#4731 + per1 = Period(freq="D", year=2008, month=1, day=1) + per2 = Period(freq="D", year=2008, month=1, day=2) + + msg = "|".join( + [ + r"unsupported operand type\(s\)", + "can only concatenate str", + "must be str, not Period", + ] + ) + with pytest.raises(TypeError, match=msg): + per1 + "str" + with pytest.raises(TypeError, match=msg): + "str" + per1 + with pytest.raises(TypeError, match=msg): + per1 + per2 + + def test_period_sub_period_annual(self): + left, right = Period("2011", freq="Y"), Period("2007", freq="Y") + result = left - right + assert result == 4 * right.freq + + msg = r"Input has different freq=M from Period\(freq=Y-DEC\)" + with pytest.raises(IncompatibleFrequency, match=msg): + left - Period("2007-01", freq="M") + + def test_period_sub_period(self): + per1 = Period("2011-01-01", freq="D") + per2 = Period("2011-01-15", freq="D") + + off = per1.freq + assert per1 - per2 == -14 * off + assert per2 - per1 == 14 * off + + msg = r"Input has different freq=M from Period\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): + per1 - Period("2011-02", freq="M") + + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + def test_sub_n_gt_1_ticks(self, tick_classes, n): + # GH#23878 + p1 = Period("19910905", freq=tick_classes(n)) + p2 = Period("19920406", freq=tick_classes(n)) + + expected = Period(str(p2), freq=p2.freq.base) - Period( + str(p1), freq=p1.freq.base + ) + + assert (p2 - p1) == expected + + @pytest.mark.parametrize("normalize", [True, False]) + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + @pytest.mark.parametrize( + "offset, kwd_name", + [ + (offsets.YearEnd, "month"), + (offsets.QuarterEnd, "startingMonth"), + (offsets.MonthEnd, None), + (offsets.Week, "weekday"), + ], + ) + def test_sub_n_gt_1_offsets(self, offset, kwd_name, n, normalize): + # GH#23878 + kwds = {kwd_name: 3} if kwd_name is not None else {} + p1_d = "19910905" + p2_d = "19920406" + p1 = Period(p1_d, freq=offset(n, normalize, **kwds)) + p2 = Period(p2_d, freq=offset(n, normalize, **kwds)) + + expected = Period(p2_d, freq=p2.freq.base) - Period(p1_d, freq=p1.freq.base) + + assert (p2 - p1) == expected + + def test_period_add_offset(self): + # freq is DateOffset + for freq in ["Y", "2Y", "3Y"]: + per = Period("2011", freq=freq) + exp = Period("2013", freq=freq) + assert per + offsets.YearEnd(2) == exp + assert offsets.YearEnd(2) + per == exp + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + msg = "Input has different freq|Input cannot be converted to Period" + with pytest.raises(IncompatibleFrequency, match=msg): + per + off + with pytest.raises(IncompatibleFrequency, match=msg): + off + per + + for freq in ["M", "2M", "3M"]: + per = Period("2011-03", freq=freq) + exp = Period("2011-05", freq=freq) + assert per + offsets.MonthEnd(2) == exp + assert offsets.MonthEnd(2) + per == exp + + exp = Period("2012-03", freq=freq) + assert per + offsets.MonthEnd(12) == exp + assert offsets.MonthEnd(12) + per == exp + + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with 
pytest.raises(IncompatibleFrequency, match=msg): + per + off + with pytest.raises(IncompatibleFrequency, match=msg): + off + per + + # freq is Tick + for freq in ["D", "2D", "3D"]: + per = Period("2011-04-01", freq=freq) + + exp = Period("2011-04-06", freq=freq) + assert per + offsets.Day(5) == exp + assert offsets.Day(5) + per == exp + + exp = Period("2011-04-02", freq=freq) + assert per + offsets.Hour(24) == exp + assert offsets.Hour(24) + per == exp + + exp = Period("2011-04-03", freq=freq) + assert per + np.timedelta64(2, "D") == exp + assert np.timedelta64(2, "D") + per == exp + + exp = Period("2011-04-02", freq=freq) + assert per + np.timedelta64(3600 * 24, "s") == exp + assert np.timedelta64(3600 * 24, "s") + per == exp + + exp = Period("2011-03-30", freq=freq) + assert per + timedelta(-2) == exp + assert timedelta(-2) + per == exp + + exp = Period("2011-04-03", freq=freq) + assert per + timedelta(hours=48) == exp + assert timedelta(hours=48) + per == exp + + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per + off + with pytest.raises(IncompatibleFrequency, match=msg): + off + per + + for freq in ["h", "2h", "3h"]: + per = Period("2011-04-01 09:00", freq=freq) + + exp = Period("2011-04-03 09:00", freq=freq) + assert per + offsets.Day(2) == exp + assert offsets.Day(2) + per == exp + + exp = Period("2011-04-01 12:00", freq=freq) + assert per + offsets.Hour(3) == exp + assert offsets.Hour(3) + per == exp + + msg = "cannot use operands with types" + exp = Period("2011-04-01 12:00", freq=freq) + assert per + np.timedelta64(3, "h") == exp + assert np.timedelta64(3, "h") + per == exp + + exp = Period("2011-04-01 10:00", freq=freq) + assert per + np.timedelta64(3600, "s") == exp + assert np.timedelta64(3600, "s") + per == exp + + exp = Period("2011-04-01 11:00", freq=freq) + assert per + timedelta(minutes=120) == exp + assert timedelta(minutes=120) + per == exp + + exp = Period("2011-04-05 12:00", freq=freq) + assert per + timedelta(days=4, minutes=180) == exp + assert timedelta(days=4, minutes=180) + per == exp + + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per + off + with pytest.raises(IncompatibleFrequency, match=msg): + off + per + + def test_period_sub_offset(self): + # freq is DateOffset + msg = "|".join( + [ + "Input has different freq", + "Input cannot be converted to Period", + ] + ) + + for freq in ["Y", "2Y", "3Y"]: + per = Period("2011", freq=freq) + assert per - offsets.YearEnd(2) == Period("2009", freq=freq) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per - off + + for freq in ["M", "2M", "3M"]: + per = Period("2011-03", freq=freq) + assert per - offsets.MonthEnd(2) == Period("2011-01", freq=freq) + assert per - offsets.MonthEnd(12) == Period("2010-03", freq=freq) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: + with 
pytest.raises(IncompatibleFrequency, match=msg): + per - off + + # freq is Tick + for freq in ["D", "2D", "3D"]: + per = Period("2011-04-01", freq=freq) + assert per - offsets.Day(5) == Period("2011-03-27", freq=freq) + assert per - offsets.Hour(24) == Period("2011-03-31", freq=freq) + assert per - np.timedelta64(2, "D") == Period("2011-03-30", freq=freq) + assert per - np.timedelta64(3600 * 24, "s") == Period( + "2011-03-31", freq=freq + ) + assert per - timedelta(-2) == Period("2011-04-03", freq=freq) + assert per - timedelta(hours=48) == Period("2011-03-30", freq=freq) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per - off + + for freq in ["h", "2h", "3h"]: + per = Period("2011-04-01 09:00", freq=freq) + assert per - offsets.Day(2) == Period("2011-03-30 09:00", freq=freq) + assert per - offsets.Hour(3) == Period("2011-04-01 06:00", freq=freq) + assert per - np.timedelta64(3, "h") == Period("2011-04-01 06:00", freq=freq) + assert per - np.timedelta64(3600, "s") == Period( + "2011-04-01 08:00", freq=freq + ) + assert per - timedelta(minutes=120) == Period("2011-04-01 07:00", freq=freq) + assert per - timedelta(days=4, minutes=180) == Period( + "2011-03-28 06:00", freq=freq + ) + + for off in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: + with pytest.raises(IncompatibleFrequency, match=msg): + per - off + + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) + def test_period_addsub_nat(self, freq): + # GH#13071 + per = Period("2011-01", freq=freq) + + # For subtraction, NaT is treated as another Period object + assert NaT - per is NaT + assert per - NaT is NaT + + # For addition, NaT is treated as offset-like + assert NaT + per is NaT + assert per + NaT is NaT + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "m"]) + def test_period_add_sub_td64_nat(self, unit): + # GH#47196 + per = Period("2022-06-01", "D") + nat = np.timedelta64("NaT", unit) + + assert per + nat is NaT + assert nat + per is NaT + assert per - nat is NaT + + with pytest.raises(TypeError, match="unsupported operand"): + nat - per + + def test_period_ops_offset(self): + per = Period("2011-04-01", freq="D") + result = per + offsets.Day() + exp = Period("2011-04-02", freq="D") + assert result == exp + + result = per - offsets.Day(2) + exp = Period("2011-03-30", freq="D") + assert result == exp + + msg = r"Input cannot be converted to Period\(freq=D\)" + with pytest.raises(IncompatibleFrequency, match=msg): + per + offsets.Hour(2) + + with pytest.raises(IncompatibleFrequency, match=msg): + per - offsets.Hour(2) + + def test_period_add_timestamp_raises(self): + # GH#17983 + ts = Timestamp("2017") + per = Period("2017", freq="M") + + msg = r"unsupported operand type\(s\) for \+: 'Timestamp' and 'Period'" + with pytest.raises(TypeError, match=msg): + ts + per + + msg = r"unsupported operand type\(s\) for \+: 'Period' and 'Timestamp'" + with pytest.raises(TypeError, match=msg): + per + ts + + +class TestPeriodComparisons: + def test_period_comparison_same_freq(self): + jan = Period("2000-01", "M") + feb = Period("2000-02", "M") + + assert not jan == feb + assert jan != feb + assert jan < feb + assert jan <= feb + assert not jan > feb + assert not jan >= feb + + def test_period_comparison_same_period_different_object(self): + # Separate Period objects for the same period + 
left = Period("2000-01", "M") + right = Period("2000-01", "M") + + assert left == right + assert left >= right + assert left <= right + assert not left < right + assert not left > right + + def test_period_comparison_mismatched_freq(self): + jan = Period("2000-01", "M") + day = Period("2012-01-01", "D") + + assert not jan == day + assert jan != day + msg = r"Input has different freq=D from Period\(freq=M\)" + with pytest.raises(IncompatibleFrequency, match=msg): + jan < day + with pytest.raises(IncompatibleFrequency, match=msg): + jan <= day + with pytest.raises(IncompatibleFrequency, match=msg): + jan > day + with pytest.raises(IncompatibleFrequency, match=msg): + jan >= day + + def test_period_comparison_invalid_type(self): + jan = Period("2000-01", "M") + + assert not jan == 1 + assert jan != 1 + + int_or_per = "'(Period|int)'" + msg = f"not supported between instances of {int_or_per} and {int_or_per}" + for left, right in [(jan, 1), (1, jan)]: + with pytest.raises(TypeError, match=msg): + left > right + with pytest.raises(TypeError, match=msg): + left >= right + with pytest.raises(TypeError, match=msg): + left < right + with pytest.raises(TypeError, match=msg): + left <= right + + def test_period_comparison_nat(self): + per = Period("2011-01-01", freq="D") + + ts = Timestamp("2011-01-01") + # confirm Period('NaT') work identical with Timestamp('NaT') + for left, right in [ + (NaT, per), + (per, NaT), + (NaT, ts), + (ts, NaT), + ]: + assert not left < right + assert not left > right + assert not left == right + assert left != right + assert not left <= right + assert not left >= right + + @pytest.mark.parametrize( + "zerodim_arr, expected", + ((np.array(0), False), (np.array(Period("2000-01", "M")), True)), + ) + def test_period_comparison_numpy_zerodim_arr(self, zerodim_arr, expected): + per = Period("2000-01", "M") + + assert (per == zerodim_arr) is expected + assert (zerodim_arr == per) is expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/period/test_asfreq.py pandas-2.2.2+dfsg/pandas/tests/scalar/period/test_asfreq.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/period/test_asfreq.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/period/test_asfreq.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,5 @@ import pytest -from pandas._libs.tslibs.dtypes import _period_code_map from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG from pandas.errors import OutOfBoundsDatetime @@ -18,7 +17,7 @@ """Test frequency conversion of date objects""" @pytest.mark.filterwarnings("ignore:Period with BDay:FutureWarning") - @pytest.mark.parametrize("freq", ["A", "Q", "M", "W", "B", "D"]) + @pytest.mark.parametrize("freq", ["Y", "Q", "M", "W", "B", "D"]) def test_asfreq_near_zero(self, freq): # GH#19643, GH#19650 per = Period("0001-01-01", freq=freq) @@ -49,23 +48,23 @@ per.to_timestamp() def test_asfreq_corner(self): - val = Period(freq="A", year=2007) - result1 = val.asfreq("5t") - result2 = val.asfreq("t") - expected = Period("2007-12-31 23:59", freq="t") + val = Period(freq="Y", year=2007) + result1 = val.asfreq("5min") + result2 = val.asfreq("min") + expected = Period("2007-12-31 23:59", freq="min") assert result1.ordinal == expected.ordinal - assert result1.freqstr == "5T" + assert result1.freqstr == "5min" assert result2.ordinal == expected.ordinal - assert result2.freqstr == "T" + assert result2.freqstr == "min" def test_conv_annual(self): # frequency conversion tests: from Annual Frequency - ival_A = Period(freq="A", year=2007) + ival_A = 
Period(freq="Y", year=2007) - ival_AJAN = Period(freq="A-JAN", year=2007) - ival_AJUN = Period(freq="A-JUN", year=2007) - ival_ANOV = Period(freq="A-NOV", year=2007) + ival_AJAN = Period(freq="Y-JAN", year=2007) + ival_AJUN = Period(freq="Y-JUN", year=2007) + ival_ANOV = Period(freq="Y-NOV", year=2007) ival_A_to_Q_start = Period(freq="Q", year=2007, quarter=1) ival_A_to_Q_end = Period(freq="Q", year=2007, quarter=4) @@ -78,8 +77,8 @@ ival_A_to_B_end = Period(freq="B", year=2007, month=12, day=31) ival_A_to_D_start = Period(freq="D", year=2007, month=1, day=1) ival_A_to_D_end = Period(freq="D", year=2007, month=12, day=31) - ival_A_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_A_to_H_end = Period(freq="H", year=2007, month=12, day=31, hour=23) + ival_A_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_A_to_H_end = Period(freq="h", year=2007, month=12, day=31, hour=23) ival_A_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -87,10 +86,10 @@ freq="Min", year=2007, month=12, day=31, hour=23, minute=59 ) ival_A_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_A_to_S_end = Period( - freq="S", year=2007, month=12, day=31, hour=23, minute=59, second=59 + freq="s", year=2007, month=12, day=31, hour=23, minute=59, second=59 ) ival_AJAN_to_D_end = Period(freq="D", year=2007, month=1, day=31) @@ -100,36 +99,42 @@ ival_ANOV_to_D_end = Period(freq="D", year=2007, month=11, day=30) ival_ANOV_to_D_start = Period(freq="D", year=2006, month=12, day=1) - assert ival_A.asfreq("Q", "S") == ival_A_to_Q_start + assert ival_A.asfreq("Q", "s") == ival_A_to_Q_start assert ival_A.asfreq("Q", "e") == ival_A_to_Q_end assert ival_A.asfreq("M", "s") == ival_A_to_M_start assert ival_A.asfreq("M", "E") == ival_A_to_M_end - assert ival_A.asfreq("W", "S") == ival_A_to_W_start + assert ival_A.asfreq("W", "s") == ival_A_to_W_start assert ival_A.asfreq("W", "E") == ival_A_to_W_end with tm.assert_produces_warning(FutureWarning, match=bday_msg): - assert ival_A.asfreq("B", "S") == ival_A_to_B_start + assert ival_A.asfreq("B", "s") == ival_A_to_B_start assert ival_A.asfreq("B", "E") == ival_A_to_B_end - assert ival_A.asfreq("D", "S") == ival_A_to_D_start + assert ival_A.asfreq("D", "s") == ival_A_to_D_start assert ival_A.asfreq("D", "E") == ival_A_to_D_end - assert ival_A.asfreq("H", "S") == ival_A_to_H_start - assert ival_A.asfreq("H", "E") == ival_A_to_H_end - assert ival_A.asfreq("min", "S") == ival_A_to_T_start + msg = "'H' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ival_A.asfreq("H", "s") == ival_A_to_H_start + assert ival_A.asfreq("H", "E") == ival_A_to_H_end + assert ival_A.asfreq("min", "s") == ival_A_to_T_start assert ival_A.asfreq("min", "E") == ival_A_to_T_end - assert ival_A.asfreq("T", "S") == ival_A_to_T_start - assert ival_A.asfreq("T", "E") == ival_A_to_T_end - assert ival_A.asfreq("S", "S") == ival_A_to_S_start - assert ival_A.asfreq("S", "E") == ival_A_to_S_end + msg = "'T' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + assert ival_A.asfreq("T", "s") == ival_A_to_T_start + assert ival_A.asfreq("T", "E") == ival_A_to_T_end + msg = "'S' is deprecated and will be removed in a future version." 
+ with tm.assert_produces_warning(FutureWarning, match=msg): + assert ival_A.asfreq("S", "S") == ival_A_to_S_start + assert ival_A.asfreq("S", "E") == ival_A_to_S_end - assert ival_AJAN.asfreq("D", "S") == ival_AJAN_to_D_start + assert ival_AJAN.asfreq("D", "s") == ival_AJAN_to_D_start assert ival_AJAN.asfreq("D", "E") == ival_AJAN_to_D_end - assert ival_AJUN.asfreq("D", "S") == ival_AJUN_to_D_start + assert ival_AJUN.asfreq("D", "s") == ival_AJUN_to_D_start assert ival_AJUN.asfreq("D", "E") == ival_AJUN_to_D_end - assert ival_ANOV.asfreq("D", "S") == ival_ANOV_to_D_start + assert ival_ANOV.asfreq("D", "s") == ival_ANOV_to_D_start assert ival_ANOV.asfreq("D", "E") == ival_ANOV_to_D_end - assert ival_A.asfreq("A") == ival_A + assert ival_A.asfreq("Y") == ival_A def test_conv_quarterly(self): # frequency conversion tests: from Quarterly Frequency @@ -140,7 +145,7 @@ ival_QEJAN = Period(freq="Q-JAN", year=2007, quarter=1) ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1) - ival_Q_to_A = Period(freq="A", year=2007) + ival_Q_to_A = Period(freq="Y", year=2007) ival_Q_to_M_start = Period(freq="M", year=2007, month=1) ival_Q_to_M_end = Period(freq="M", year=2007, month=3) ival_Q_to_W_start = Period(freq="W", year=2007, month=1, day=1) @@ -150,8 +155,8 @@ ival_Q_to_B_end = Period(freq="B", year=2007, month=3, day=30) ival_Q_to_D_start = Period(freq="D", year=2007, month=1, day=1) ival_Q_to_D_end = Period(freq="D", year=2007, month=3, day=31) - ival_Q_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_Q_to_H_end = Period(freq="H", year=2007, month=3, day=31, hour=23) + ival_Q_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_Q_to_H_end = Period(freq="h", year=2007, month=3, day=31, hour=23) ival_Q_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -159,10 +164,10 @@ freq="Min", year=2007, month=3, day=31, hour=23, minute=59 ) ival_Q_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_Q_to_S_end = Period( - freq="S", year=2007, month=3, day=31, hour=23, minute=59, second=59 + freq="s", year=2007, month=3, day=31, hour=23, minute=59, second=59 ) ival_QEJAN_to_D_start = Period(freq="D", year=2006, month=2, day=1) @@ -171,28 +176,28 @@ ival_QEJUN_to_D_start = Period(freq="D", year=2006, month=7, day=1) ival_QEJUN_to_D_end = Period(freq="D", year=2006, month=9, day=30) - assert ival_Q.asfreq("A") == ival_Q_to_A - assert ival_Q_end_of_year.asfreq("A") == ival_Q_to_A + assert ival_Q.asfreq("Y") == ival_Q_to_A + assert ival_Q_end_of_year.asfreq("Y") == ival_Q_to_A - assert ival_Q.asfreq("M", "S") == ival_Q_to_M_start + assert ival_Q.asfreq("M", "s") == ival_Q_to_M_start assert ival_Q.asfreq("M", "E") == ival_Q_to_M_end - assert ival_Q.asfreq("W", "S") == ival_Q_to_W_start + assert ival_Q.asfreq("W", "s") == ival_Q_to_W_start assert ival_Q.asfreq("W", "E") == ival_Q_to_W_end with tm.assert_produces_warning(FutureWarning, match=bday_msg): - assert ival_Q.asfreq("B", "S") == ival_Q_to_B_start + assert ival_Q.asfreq("B", "s") == ival_Q_to_B_start assert ival_Q.asfreq("B", "E") == ival_Q_to_B_end - assert ival_Q.asfreq("D", "S") == ival_Q_to_D_start + assert ival_Q.asfreq("D", "s") == ival_Q_to_D_start assert ival_Q.asfreq("D", "E") == ival_Q_to_D_end - assert ival_Q.asfreq("H", "S") == ival_Q_to_H_start - assert ival_Q.asfreq("H", "E") == ival_Q_to_H_end - assert ival_Q.asfreq("Min", "S") == ival_Q_to_T_start + assert 
ival_Q.asfreq("h", "s") == ival_Q_to_H_start + assert ival_Q.asfreq("h", "E") == ival_Q_to_H_end + assert ival_Q.asfreq("Min", "s") == ival_Q_to_T_start assert ival_Q.asfreq("Min", "E") == ival_Q_to_T_end - assert ival_Q.asfreq("S", "S") == ival_Q_to_S_start - assert ival_Q.asfreq("S", "E") == ival_Q_to_S_end + assert ival_Q.asfreq("s", "s") == ival_Q_to_S_start + assert ival_Q.asfreq("s", "E") == ival_Q_to_S_end - assert ival_QEJAN.asfreq("D", "S") == ival_QEJAN_to_D_start + assert ival_QEJAN.asfreq("D", "s") == ival_QEJAN_to_D_start assert ival_QEJAN.asfreq("D", "E") == ival_QEJAN_to_D_end - assert ival_QEJUN.asfreq("D", "S") == ival_QEJUN_to_D_start + assert ival_QEJUN.asfreq("D", "s") == ival_QEJUN_to_D_start assert ival_QEJUN.asfreq("D", "E") == ival_QEJUN_to_D_end assert ival_Q.asfreq("Q") == ival_Q @@ -203,7 +208,7 @@ ival_M = Period(freq="M", year=2007, month=1) ival_M_end_of_year = Period(freq="M", year=2007, month=12) ival_M_end_of_quarter = Period(freq="M", year=2007, month=3) - ival_M_to_A = Period(freq="A", year=2007) + ival_M_to_A = Period(freq="Y", year=2007) ival_M_to_Q = Period(freq="Q", year=2007, quarter=1) ival_M_to_W_start = Period(freq="W", year=2007, month=1, day=1) ival_M_to_W_end = Period(freq="W", year=2007, month=1, day=31) @@ -212,8 +217,8 @@ ival_M_to_B_end = Period(freq="B", year=2007, month=1, day=31) ival_M_to_D_start = Period(freq="D", year=2007, month=1, day=1) ival_M_to_D_end = Period(freq="D", year=2007, month=1, day=31) - ival_M_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_M_to_H_end = Period(freq="H", year=2007, month=1, day=31, hour=23) + ival_M_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_M_to_H_end = Period(freq="h", year=2007, month=1, day=31, hour=23) ival_M_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -221,30 +226,30 @@ freq="Min", year=2007, month=1, day=31, hour=23, minute=59 ) ival_M_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_M_to_S_end = Period( - freq="S", year=2007, month=1, day=31, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=31, hour=23, minute=59, second=59 ) - assert ival_M.asfreq("A") == ival_M_to_A - assert ival_M_end_of_year.asfreq("A") == ival_M_to_A + assert ival_M.asfreq("Y") == ival_M_to_A + assert ival_M_end_of_year.asfreq("Y") == ival_M_to_A assert ival_M.asfreq("Q") == ival_M_to_Q assert ival_M_end_of_quarter.asfreq("Q") == ival_M_to_Q - assert ival_M.asfreq("W", "S") == ival_M_to_W_start + assert ival_M.asfreq("W", "s") == ival_M_to_W_start assert ival_M.asfreq("W", "E") == ival_M_to_W_end with tm.assert_produces_warning(FutureWarning, match=bday_msg): - assert ival_M.asfreq("B", "S") == ival_M_to_B_start + assert ival_M.asfreq("B", "s") == ival_M_to_B_start assert ival_M.asfreq("B", "E") == ival_M_to_B_end - assert ival_M.asfreq("D", "S") == ival_M_to_D_start + assert ival_M.asfreq("D", "s") == ival_M_to_D_start assert ival_M.asfreq("D", "E") == ival_M_to_D_end - assert ival_M.asfreq("H", "S") == ival_M_to_H_start - assert ival_M.asfreq("H", "E") == ival_M_to_H_end - assert ival_M.asfreq("Min", "S") == ival_M_to_T_start + assert ival_M.asfreq("h", "s") == ival_M_to_H_start + assert ival_M.asfreq("h", "E") == ival_M_to_H_end + assert ival_M.asfreq("Min", "s") == ival_M_to_T_start assert ival_M.asfreq("Min", "E") == ival_M_to_T_end - assert ival_M.asfreq("S", "S") == ival_M_to_S_start - assert 
ival_M.asfreq("S", "E") == ival_M_to_S_end + assert ival_M.asfreq("s", "s") == ival_M_to_S_start + assert ival_M.asfreq("s", "E") == ival_M_to_S_end assert ival_M.asfreq("M") == ival_M @@ -278,14 +283,14 @@ ival_W_end_of_year = Period(freq="W", year=2007, month=12, day=31) ival_W_end_of_quarter = Period(freq="W", year=2007, month=3, day=31) ival_W_end_of_month = Period(freq="W", year=2007, month=1, day=31) - ival_W_to_A = Period(freq="A", year=2007) + ival_W_to_A = Period(freq="Y", year=2007) ival_W_to_Q = Period(freq="Q", year=2007, quarter=1) ival_W_to_M = Period(freq="M", year=2007, month=1) if Period(freq="D", year=2007, month=12, day=31).weekday == 6: - ival_W_to_A_end_of_year = Period(freq="A", year=2007) + ival_W_to_A_end_of_year = Period(freq="Y", year=2007) else: - ival_W_to_A_end_of_year = Period(freq="A", year=2008) + ival_W_to_A_end_of_year = Period(freq="Y", year=2008) if Period(freq="D", year=2007, month=3, day=31).weekday == 6: ival_W_to_Q_end_of_quarter = Period(freq="Q", year=2007, quarter=1) @@ -302,8 +307,8 @@ ival_W_to_B_end = Period(freq="B", year=2007, month=1, day=5) ival_W_to_D_start = Period(freq="D", year=2007, month=1, day=1) ival_W_to_D_end = Period(freq="D", year=2007, month=1, day=7) - ival_W_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_W_to_H_end = Period(freq="H", year=2007, month=1, day=7, hour=23) + ival_W_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_W_to_H_end = Period(freq="h", year=2007, month=1, day=7, hour=23) ival_W_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -311,14 +316,14 @@ freq="Min", year=2007, month=1, day=7, hour=23, minute=59 ) ival_W_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_W_to_S_end = Period( - freq="S", year=2007, month=1, day=7, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=7, hour=23, minute=59, second=59 ) - assert ival_W.asfreq("A") == ival_W_to_A - assert ival_W_end_of_year.asfreq("A") == ival_W_to_A_end_of_year + assert ival_W.asfreq("Y") == ival_W_to_A + assert ival_W_end_of_year.asfreq("Y") == ival_W_to_A_end_of_year assert ival_W.asfreq("Q") == ival_W_to_Q assert ival_W_end_of_quarter.asfreq("Q") == ival_W_to_Q_end_of_quarter @@ -327,33 +332,33 @@ assert ival_W_end_of_month.asfreq("M") == ival_W_to_M_end_of_month with tm.assert_produces_warning(FutureWarning, match=bday_msg): - assert ival_W.asfreq("B", "S") == ival_W_to_B_start + assert ival_W.asfreq("B", "s") == ival_W_to_B_start assert ival_W.asfreq("B", "E") == ival_W_to_B_end - assert ival_W.asfreq("D", "S") == ival_W_to_D_start + assert ival_W.asfreq("D", "s") == ival_W_to_D_start assert ival_W.asfreq("D", "E") == ival_W_to_D_end - assert ival_WSUN.asfreq("D", "S") == ival_WSUN_to_D_start + assert ival_WSUN.asfreq("D", "s") == ival_WSUN_to_D_start assert ival_WSUN.asfreq("D", "E") == ival_WSUN_to_D_end - assert ival_WSAT.asfreq("D", "S") == ival_WSAT_to_D_start + assert ival_WSAT.asfreq("D", "s") == ival_WSAT_to_D_start assert ival_WSAT.asfreq("D", "E") == ival_WSAT_to_D_end - assert ival_WFRI.asfreq("D", "S") == ival_WFRI_to_D_start + assert ival_WFRI.asfreq("D", "s") == ival_WFRI_to_D_start assert ival_WFRI.asfreq("D", "E") == ival_WFRI_to_D_end - assert ival_WTHU.asfreq("D", "S") == ival_WTHU_to_D_start + assert ival_WTHU.asfreq("D", "s") == ival_WTHU_to_D_start assert ival_WTHU.asfreq("D", "E") == ival_WTHU_to_D_end - assert 
ival_WWED.asfreq("D", "S") == ival_WWED_to_D_start + assert ival_WWED.asfreq("D", "s") == ival_WWED_to_D_start assert ival_WWED.asfreq("D", "E") == ival_WWED_to_D_end - assert ival_WTUE.asfreq("D", "S") == ival_WTUE_to_D_start + assert ival_WTUE.asfreq("D", "s") == ival_WTUE_to_D_start assert ival_WTUE.asfreq("D", "E") == ival_WTUE_to_D_end - assert ival_WMON.asfreq("D", "S") == ival_WMON_to_D_start + assert ival_WMON.asfreq("D", "s") == ival_WMON_to_D_start assert ival_WMON.asfreq("D", "E") == ival_WMON_to_D_end - assert ival_W.asfreq("H", "S") == ival_W_to_H_start - assert ival_W.asfreq("H", "E") == ival_W_to_H_end - assert ival_W.asfreq("Min", "S") == ival_W_to_T_start + assert ival_W.asfreq("h", "s") == ival_W_to_H_start + assert ival_W.asfreq("h", "E") == ival_W_to_H_end + assert ival_W.asfreq("Min", "s") == ival_W_to_T_start assert ival_W.asfreq("Min", "E") == ival_W_to_T_end - assert ival_W.asfreq("S", "S") == ival_W_to_S_start - assert ival_W.asfreq("S", "E") == ival_W_to_S_end + assert ival_W.asfreq("s", "s") == ival_W_to_S_start + assert ival_W.asfreq("s", "E") == ival_W_to_S_end assert ival_W.asfreq("W") == ival_W @@ -390,13 +395,13 @@ ival_B_end_of_month = Period(freq="B", year=2007, month=1, day=31) ival_B_end_of_week = Period(freq="B", year=2007, month=1, day=5) - ival_B_to_A = Period(freq="A", year=2007) + ival_B_to_A = Period(freq="Y", year=2007) ival_B_to_Q = Period(freq="Q", year=2007, quarter=1) ival_B_to_M = Period(freq="M", year=2007, month=1) ival_B_to_W = Period(freq="W", year=2007, month=1, day=7) ival_B_to_D = Period(freq="D", year=2007, month=1, day=1) - ival_B_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_B_to_H_end = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_B_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_B_to_H_end = Period(freq="h", year=2007, month=1, day=1, hour=23) ival_B_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -404,14 +409,14 @@ freq="Min", year=2007, month=1, day=1, hour=23, minute=59 ) ival_B_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_B_to_S_end = Period( - freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=1, hour=23, minute=59, second=59 ) - assert ival_B.asfreq("A") == ival_B_to_A - assert ival_B_end_of_year.asfreq("A") == ival_B_to_A + assert ival_B.asfreq("Y") == ival_B_to_A + assert ival_B_end_of_year.asfreq("Y") == ival_B_to_A assert ival_B.asfreq("Q") == ival_B_to_Q assert ival_B_end_of_quarter.asfreq("Q") == ival_B_to_Q assert ival_B.asfreq("M") == ival_B_to_M @@ -421,12 +426,12 @@ assert ival_B.asfreq("D") == ival_B_to_D - assert ival_B.asfreq("H", "S") == ival_B_to_H_start - assert ival_B.asfreq("H", "E") == ival_B_to_H_end - assert ival_B.asfreq("Min", "S") == ival_B_to_T_start + assert ival_B.asfreq("h", "s") == ival_B_to_H_start + assert ival_B.asfreq("h", "E") == ival_B_to_H_end + assert ival_B.asfreq("Min", "s") == ival_B_to_T_start assert ival_B.asfreq("Min", "E") == ival_B_to_T_end - assert ival_B.asfreq("S", "S") == ival_B_to_S_start - assert ival_B.asfreq("S", "E") == ival_B_to_S_end + assert ival_B.asfreq("s", "s") == ival_B_to_S_start + assert ival_B.asfreq("s", "E") == ival_B_to_S_end with tm.assert_produces_warning(FutureWarning, match=bday_msg): assert ival_B.asfreq("B") == ival_B @@ -448,11 +453,11 @@ ival_B_friday = Period(freq="B", year=2007, month=1, 
day=5) ival_B_monday = Period(freq="B", year=2007, month=1, day=8) - ival_D_to_A = Period(freq="A", year=2007) + ival_D_to_A = Period(freq="Y", year=2007) - ival_Deoq_to_AJAN = Period(freq="A-JAN", year=2008) - ival_Deoq_to_AJUN = Period(freq="A-JUN", year=2007) - ival_Deoq_to_ADEC = Period(freq="A-DEC", year=2007) + ival_Deoq_to_AJAN = Period(freq="Y-JAN", year=2008) + ival_Deoq_to_AJUN = Period(freq="Y-JUN", year=2007) + ival_Deoq_to_ADEC = Period(freq="Y-DEC", year=2007) ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4) ival_D_to_QEJUN = Period(freq="Q-JUN", year=2007, quarter=3) @@ -461,8 +466,8 @@ ival_D_to_M = Period(freq="M", year=2007, month=1) ival_D_to_W = Period(freq="W", year=2007, month=1, day=7) - ival_D_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_D_to_H_end = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_D_to_H_start = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_D_to_H_end = Period(freq="h", year=2007, month=1, day=1, hour=23) ival_D_to_T_start = Period( freq="Min", year=2007, month=1, day=1, hour=0, minute=0 ) @@ -470,19 +475,19 @@ freq="Min", year=2007, month=1, day=1, hour=23, minute=59 ) ival_D_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_D_to_S_end = Period( - freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=1, hour=23, minute=59, second=59 ) - assert ival_D.asfreq("A") == ival_D_to_A + assert ival_D.asfreq("Y") == ival_D_to_A - assert ival_D_end_of_quarter.asfreq("A-JAN") == ival_Deoq_to_AJAN - assert ival_D_end_of_quarter.asfreq("A-JUN") == ival_Deoq_to_AJUN - assert ival_D_end_of_quarter.asfreq("A-DEC") == ival_Deoq_to_ADEC + assert ival_D_end_of_quarter.asfreq("Y-JAN") == ival_Deoq_to_AJAN + assert ival_D_end_of_quarter.asfreq("Y-JUN") == ival_Deoq_to_AJUN + assert ival_D_end_of_quarter.asfreq("Y-DEC") == ival_Deoq_to_ADEC - assert ival_D_end_of_year.asfreq("A") == ival_D_to_A + assert ival_D_end_of_year.asfreq("Y") == ival_D_to_A assert ival_D_end_of_quarter.asfreq("Q") == ival_D_to_QEDEC assert ival_D.asfreq("Q-JAN") == ival_D_to_QEJAN assert ival_D.asfreq("Q-JUN") == ival_D_to_QEJUN @@ -494,32 +499,32 @@ with tm.assert_produces_warning(FutureWarning, match=bday_msg): assert ival_D_friday.asfreq("B") == ival_B_friday - assert ival_D_saturday.asfreq("B", "S") == ival_B_friday + assert ival_D_saturday.asfreq("B", "s") == ival_B_friday assert ival_D_saturday.asfreq("B", "E") == ival_B_monday - assert ival_D_sunday.asfreq("B", "S") == ival_B_friday + assert ival_D_sunday.asfreq("B", "s") == ival_B_friday assert ival_D_sunday.asfreq("B", "E") == ival_B_monday - assert ival_D.asfreq("H", "S") == ival_D_to_H_start - assert ival_D.asfreq("H", "E") == ival_D_to_H_end - assert ival_D.asfreq("Min", "S") == ival_D_to_T_start + assert ival_D.asfreq("h", "s") == ival_D_to_H_start + assert ival_D.asfreq("h", "E") == ival_D_to_H_end + assert ival_D.asfreq("Min", "s") == ival_D_to_T_start assert ival_D.asfreq("Min", "E") == ival_D_to_T_end - assert ival_D.asfreq("S", "S") == ival_D_to_S_start - assert ival_D.asfreq("S", "E") == ival_D_to_S_end + assert ival_D.asfreq("s", "s") == ival_D_to_S_start + assert ival_D.asfreq("s", "E") == ival_D_to_S_end assert ival_D.asfreq("D") == ival_D def test_conv_hourly(self): # frequency conversion tests: from Hourly Frequency" - ival_H = Period(freq="H", year=2007, month=1, day=1, hour=0) - ival_H_end_of_year = 
Period(freq="H", year=2007, month=12, day=31, hour=23) - ival_H_end_of_quarter = Period(freq="H", year=2007, month=3, day=31, hour=23) - ival_H_end_of_month = Period(freq="H", year=2007, month=1, day=31, hour=23) - ival_H_end_of_week = Period(freq="H", year=2007, month=1, day=7, hour=23) - ival_H_end_of_day = Period(freq="H", year=2007, month=1, day=1, hour=23) - ival_H_end_of_bus = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_H = Period(freq="h", year=2007, month=1, day=1, hour=0) + ival_H_end_of_year = Period(freq="h", year=2007, month=12, day=31, hour=23) + ival_H_end_of_quarter = Period(freq="h", year=2007, month=3, day=31, hour=23) + ival_H_end_of_month = Period(freq="h", year=2007, month=1, day=31, hour=23) + ival_H_end_of_week = Period(freq="h", year=2007, month=1, day=7, hour=23) + ival_H_end_of_day = Period(freq="h", year=2007, month=1, day=1, hour=23) + ival_H_end_of_bus = Period(freq="h", year=2007, month=1, day=1, hour=23) - ival_H_to_A = Period(freq="A", year=2007) + ival_H_to_A = Period(freq="Y", year=2007) ival_H_to_Q = Period(freq="Q", year=2007, quarter=1) ival_H_to_M = Period(freq="M", year=2007, month=1) ival_H_to_W = Period(freq="W", year=2007, month=1, day=7) @@ -534,14 +539,14 @@ freq="Min", year=2007, month=1, day=1, hour=0, minute=59 ) ival_H_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_H_to_S_end = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=59, second=59 + freq="s", year=2007, month=1, day=1, hour=0, minute=59, second=59 ) - assert ival_H.asfreq("A") == ival_H_to_A - assert ival_H_end_of_year.asfreq("A") == ival_H_to_A + assert ival_H.asfreq("Y") == ival_H_to_A + assert ival_H_end_of_year.asfreq("Y") == ival_H_to_A assert ival_H.asfreq("Q") == ival_H_to_Q assert ival_H_end_of_quarter.asfreq("Q") == ival_H_to_Q assert ival_H.asfreq("M") == ival_H_to_M @@ -554,12 +559,12 @@ assert ival_H.asfreq("B") == ival_H_to_B assert ival_H_end_of_bus.asfreq("B") == ival_H_to_B - assert ival_H.asfreq("Min", "S") == ival_H_to_T_start + assert ival_H.asfreq("Min", "s") == ival_H_to_T_start assert ival_H.asfreq("Min", "E") == ival_H_to_T_end - assert ival_H.asfreq("S", "S") == ival_H_to_S_start - assert ival_H.asfreq("S", "E") == ival_H_to_S_end + assert ival_H.asfreq("s", "s") == ival_H_to_S_start + assert ival_H.asfreq("s", "E") == ival_H_to_S_end - assert ival_H.asfreq("H") == ival_H + assert ival_H.asfreq("h") == ival_H def test_conv_minutely(self): # frequency conversion tests: from Minutely Frequency" @@ -587,24 +592,24 @@ freq="Min", year=2007, month=1, day=1, hour=0, minute=59 ) - ival_T_to_A = Period(freq="A", year=2007) + ival_T_to_A = Period(freq="Y", year=2007) ival_T_to_Q = Period(freq="Q", year=2007, quarter=1) ival_T_to_M = Period(freq="M", year=2007, month=1) ival_T_to_W = Period(freq="W", year=2007, month=1, day=7) ival_T_to_D = Period(freq="D", year=2007, month=1, day=1) with tm.assert_produces_warning(FutureWarning, match=bday_msg): ival_T_to_B = Period(freq="B", year=2007, month=1, day=1) - ival_T_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_T_to_H = Period(freq="h", year=2007, month=1, day=1, hour=0) ival_T_to_S_start = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0 ) ival_T_to_S_end = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=59 + freq="s", year=2007, month=1, day=1, 
hour=0, minute=0, second=59 ) - assert ival_T.asfreq("A") == ival_T_to_A - assert ival_T_end_of_year.asfreq("A") == ival_T_to_A + assert ival_T.asfreq("Y") == ival_T_to_A + assert ival_T_end_of_year.asfreq("Y") == ival_T_to_A assert ival_T.asfreq("Q") == ival_T_to_Q assert ival_T_end_of_quarter.asfreq("Q") == ival_T_to_Q assert ival_T.asfreq("M") == ival_T_to_M @@ -616,55 +621,55 @@ with tm.assert_produces_warning(FutureWarning, match=bday_msg): assert ival_T.asfreq("B") == ival_T_to_B assert ival_T_end_of_bus.asfreq("B") == ival_T_to_B - assert ival_T.asfreq("H") == ival_T_to_H - assert ival_T_end_of_hour.asfreq("H") == ival_T_to_H + assert ival_T.asfreq("h") == ival_T_to_H + assert ival_T_end_of_hour.asfreq("h") == ival_T_to_H - assert ival_T.asfreq("S", "S") == ival_T_to_S_start - assert ival_T.asfreq("S", "E") == ival_T_to_S_end + assert ival_T.asfreq("s", "s") == ival_T_to_S_start + assert ival_T.asfreq("s", "E") == ival_T_to_S_end assert ival_T.asfreq("Min") == ival_T def test_conv_secondly(self): # frequency conversion tests: from Secondly Frequency" - ival_S = Period(freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0) + ival_S = Period(freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=0) ival_S_end_of_year = Period( - freq="S", year=2007, month=12, day=31, hour=23, minute=59, second=59 + freq="s", year=2007, month=12, day=31, hour=23, minute=59, second=59 ) ival_S_end_of_quarter = Period( - freq="S", year=2007, month=3, day=31, hour=23, minute=59, second=59 + freq="s", year=2007, month=3, day=31, hour=23, minute=59, second=59 ) ival_S_end_of_month = Period( - freq="S", year=2007, month=1, day=31, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=31, hour=23, minute=59, second=59 ) ival_S_end_of_week = Period( - freq="S", year=2007, month=1, day=7, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=7, hour=23, minute=59, second=59 ) ival_S_end_of_day = Period( - freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=1, hour=23, minute=59, second=59 ) ival_S_end_of_bus = Period( - freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + freq="s", year=2007, month=1, day=1, hour=23, minute=59, second=59 ) ival_S_end_of_hour = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=59, second=59 + freq="s", year=2007, month=1, day=1, hour=0, minute=59, second=59 ) ival_S_end_of_minute = Period( - freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=59 + freq="s", year=2007, month=1, day=1, hour=0, minute=0, second=59 ) - ival_S_to_A = Period(freq="A", year=2007) + ival_S_to_A = Period(freq="Y", year=2007) ival_S_to_Q = Period(freq="Q", year=2007, quarter=1) ival_S_to_M = Period(freq="M", year=2007, month=1) ival_S_to_W = Period(freq="W", year=2007, month=1, day=7) ival_S_to_D = Period(freq="D", year=2007, month=1, day=1) with tm.assert_produces_warning(FutureWarning, match=bday_msg): ival_S_to_B = Period(freq="B", year=2007, month=1, day=1) - ival_S_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_S_to_H = Period(freq="h", year=2007, month=1, day=1, hour=0) ival_S_to_T = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) - assert ival_S.asfreq("A") == ival_S_to_A - assert ival_S_end_of_year.asfreq("A") == ival_S_to_A + assert ival_S.asfreq("Y") == ival_S_to_A + assert ival_S_end_of_year.asfreq("Y") == ival_S_to_A assert ival_S.asfreq("Q") == ival_S_to_Q assert ival_S_end_of_quarter.asfreq("Q") == ival_S_to_Q assert 
ival_S.asfreq("M") == ival_S_to_M @@ -676,17 +681,17 @@ with tm.assert_produces_warning(FutureWarning, match=bday_msg): assert ival_S.asfreq("B") == ival_S_to_B assert ival_S_end_of_bus.asfreq("B") == ival_S_to_B - assert ival_S.asfreq("H") == ival_S_to_H - assert ival_S_end_of_hour.asfreq("H") == ival_S_to_H + assert ival_S.asfreq("h") == ival_S_to_H + assert ival_S_end_of_hour.asfreq("h") == ival_S_to_H assert ival_S.asfreq("Min") == ival_S_to_T assert ival_S_end_of_minute.asfreq("Min") == ival_S_to_T - assert ival_S.asfreq("S") == ival_S + assert ival_S.asfreq("s") == ival_S def test_conv_microsecond(self): # GH#31475 Avoid floating point errors dropping the start_time to # before the beginning of the Period - per = Period("2020-01-30 15:57:27.576166", freq="U") + per = Period("2020-01-30 15:57:27.576166", freq="us") assert per.ordinal == 1580399847576166 start = per.start_time @@ -703,44 +708,44 @@ def test_asfreq_mult(self): # normal freq to mult freq - p = Period(freq="A", year=2007) + p = Period(freq="Y", year=2007) # ordinal will not change - for freq in ["3A", offsets.YearEnd(3)]: + for freq in ["3Y", offsets.YearEnd(3)]: result = p.asfreq(freq) - expected = Period("2007", freq="3A") + expected = Period("2007", freq="3Y") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # ordinal will not change - for freq in ["3A", offsets.YearEnd(3)]: + for freq in ["3Y", offsets.YearEnd(3)]: result = p.asfreq(freq, how="S") - expected = Period("2007", freq="3A") + expected = Period("2007", freq="3Y") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # mult freq to normal freq - p = Period(freq="3A", year=2007) + p = Period(freq="3Y", year=2007) # ordinal will change because how=E is the default - for freq in ["A", offsets.YearEnd()]: + for freq in ["Y", offsets.YearEnd()]: result = p.asfreq(freq) - expected = Period("2009", freq="A") + expected = Period("2009", freq="Y") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # ordinal will not change - for freq in ["A", offsets.YearEnd()]: - result = p.asfreq(freq, how="S") - expected = Period("2007", freq="A") + for freq in ["Y", offsets.YearEnd()]: + result = p.asfreq(freq, how="s") + expected = Period("2007", freq="Y") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq - p = Period(freq="A", year=2007) + p = Period(freq="Y", year=2007) for freq in ["2M", offsets.MonthEnd(2)]: result = p.asfreq(freq) expected = Period("2007-12", freq="2M") @@ -749,14 +754,14 @@ assert result.ordinal == expected.ordinal assert result.freq == expected.freq for freq in ["2M", offsets.MonthEnd(2)]: - result = p.asfreq(freq, how="S") + result = p.asfreq(freq, how="s") expected = Period("2007-01", freq="2M") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq - p = Period(freq="3A", year=2007) + p = Period(freq="3Y", year=2007) for freq in ["2M", offsets.MonthEnd(2)]: result = p.asfreq(freq) expected = Period("2009-12", freq="2M") @@ -765,7 +770,7 @@ assert result.ordinal == expected.ordinal assert result.freq == expected.freq for freq in ["2M", offsets.MonthEnd(2)]: - result = p.asfreq(freq, how="S") + result = p.asfreq(freq, how="s") expected = Period("2007-01", freq="2M") assert result == expected @@ -774,24 +779,24 @@ def test_asfreq_combined(self): # normal freq to combined freq - p = Period("2007", 
freq="H") + p = Period("2007", freq="h") # ordinal will not change - expected = Period("2007", freq="25H") - for freq, how in zip(["1D1H", "1H1D"], ["E", "S"]): + expected = Period("2007", freq="25h") + for freq, how in zip(["1D1h", "1h1D"], ["E", "S"]): result = p.asfreq(freq, how=how) assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # combined freq to normal freq - p1 = Period(freq="1D1H", year=2007) - p2 = Period(freq="1H1D", year=2007) + p1 = Period(freq="1D1h", year=2007) + p2 = Period(freq="1h1D", year=2007) # ordinal will change because how=E is the default - result1 = p1.asfreq("H") - result2 = p2.asfreq("H") - expected = Period("2007-01-02", freq="H") + result1 = p1.asfreq("h") + result2 = p2.asfreq("h") + expected = Period("2007-01-02", freq="h") assert result1 == expected assert result1.ordinal == expected.ordinal assert result1.freq == expected.freq @@ -800,9 +805,9 @@ assert result2.freq == expected.freq # ordinal will not change - result1 = p1.asfreq("H", how="S") - result2 = p2.asfreq("H", how="S") - expected = Period("2007-01-01", freq="H") + result1 = p1.asfreq("h", how="S") + result2 = p2.asfreq("h", how="S") + expected = Period("2007-01-01", freq="h") assert result1 == expected assert result1.ordinal == expected.ordinal assert result1.freq == expected.freq @@ -815,11 +820,9 @@ assert initial.asfreq(freq="M", how="S") == Period("2013-01", "M") - msg = INVALID_FREQ_ERR_MSG + msg = "MS is not supported as period frequency" with pytest.raises(ValueError, match=msg): initial.asfreq(freq="MS", how="S") with pytest.raises(ValueError, match=msg): Period("2013-01", "MS") - - assert _period_code_map.get("MS") is None diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/period/test_period.py pandas-2.2.2+dfsg/pandas/tests/scalar/period/test_period.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/period/test_period.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/period/test_period.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,26 +3,20 @@ datetime, timedelta, ) +import re import numpy as np import pytest -from pandas._libs.tslibs import ( - iNaT, - period as libperiod, -) +from pandas._libs.tslibs import iNaT from pandas._libs.tslibs.ccalendar import ( DAYS, MONTHS, ) from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.parsing import DateParseError -from pandas._libs.tslibs.period import ( - INVALID_FREQ_ERR_MSG, - IncompatibleFrequency, -) +from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG -import pandas as pd from pandas import ( NaT, Period, @@ -35,15 +29,43 @@ bday_msg = "Period with BDay freq is deprecated" -class TestPeriodConstruction: +class TestPeriodDisallowedFreqs: + @pytest.mark.parametrize( + "freq, freq_msg", + [ + (offsets.BYearBegin(), "BYearBegin"), + (offsets.YearBegin(2), "YearBegin"), + (offsets.QuarterBegin(startingMonth=12), "QuarterBegin"), + (offsets.BusinessMonthEnd(2), "BusinessMonthEnd"), + ], + ) + def test_offsets_not_supported(self, freq, freq_msg): + # GH#55785 + msg = re.escape(f"{freq} is not supported as period frequency") + with pytest.raises(ValueError, match=msg): + Period(year=2014, freq=freq) + def test_custom_business_day_freq_raises(self): # GH#52534 - msg = "CustomBusinessDay cannot be used with Period or PeriodDtype" - with pytest.raises(TypeError, match=msg): + msg = "C is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2023-04-10", freq="C") - with pytest.raises(TypeError, 
match=msg): + msg = f"{offsets.CustomBusinessDay().base} is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2023-04-10", freq=offsets.CustomBusinessDay()) + def test_invalid_frequency_error_message(self): + msg = "WOM-1MON is not supported as period frequency" + with pytest.raises(ValueError, match=msg): + Period("2012-01-02", freq="WOM-1MON") + + def test_invalid_frequency_period_error_message(self): + msg = "for Period, please use 'M' instead of 'ME'" + with pytest.raises(ValueError, match=msg): + Period("2012-01-02", freq="ME") + + +class TestPeriodConstruction: def test_from_td64nat_raises(self): # GH#44507 td = NaT.to_numpy("m8[ns]") @@ -61,25 +83,20 @@ assert i1 == i2 - i1 = Period("2005", freq="A") + # GH#54105 - Period can be confusingly instantiated with lowercase freq + # TODO: raise in the future an error when passing lowercase freq + i1 = Period("2005", freq="Y") i2 = Period("2005") - i3 = Period("2005", freq="a") assert i1 == i2 - assert i1 == i3 i4 = Period("2005", freq="M") - i5 = Period("2005", freq="m") - assert i1 != i4 - assert i4 == i5 i1 = Period.now(freq="Q") i2 = Period(datetime.now(), freq="Q") - i3 = Period.now("q") assert i1 == i2 - assert i1 == i3 # Pass in freq as a keyword argument sometimes as a test for # https://github.com/pandas-dev/pandas/issues/53369 @@ -91,7 +108,9 @@ assert i1 == i3 i1 = Period("1982", freq="min") - i2 = Period("1982", freq="MIN") + msg = "'MIN' is deprecated and will be removed in a future version." + with tm.assert_produces_warning(FutureWarning, match=msg): + i2 = Period("1982", freq="MIN") assert i1 == i2 i1 = Period(year=2005, month=3, day=1, freq="D") @@ -102,17 +121,17 @@ assert i1 == i3 i1 = Period("2007-01-01 09:00:00.001") - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="L") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="ms") assert i1 == expected - expected = Period("2007-01-01 09:00:00.001", freq="L") + expected = Period("2007-01-01 09:00:00.001", freq="ms") assert i1 == expected i1 = Period("2007-01-01 09:00:00.00101") - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="U") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="us") assert i1 == expected - expected = Period("2007-01-01 09:00:00.00101", freq="U") + expected = Period("2007-01-01 09:00:00.00101", freq="us") assert i1 == expected msg = "Must supply freq for ordinal value" @@ -123,10 +142,14 @@ with pytest.raises(ValueError, match=msg): Period("2007-1-1", freq="X") + def test_tuple_freq_disallowed(self): # GH#34703 tuple freq disallowed with pytest.raises(TypeError, match="pass as a string instead"): Period("1982", freq=("Min", 1)) + with pytest.raises(TypeError, match="pass as a string instead"): + Period("2006-12-31", ("w", 1)) + def test_construction_from_timestamp_nanos(self): # GH#46811 don't drop nanos from Timestamp ts = Timestamp("2022-04-20 09:23:24.123456789") @@ -227,7 +250,7 @@ assert Period("1/1/2005", freq=offsets.MonthEnd()) == Period( "1/1/2005", freq="M" ) - assert Period("2005", freq=offsets.YearEnd()) == Period("2005", freq="A") + assert Period("2005", freq=offsets.YearEnd()) == Period("2005", freq="Y") assert Period("2005", freq=offsets.MonthEnd()) == Period("2005", freq="M") with tm.assert_produces_warning(FutureWarning, match=bday_msg): assert Period("3/10/12", freq=offsets.BusinessDay()) == Period( @@ -282,17 +305,17 @@ assert i1 == i5 i1 = Period("2007-01-01 09:00:00.001") - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="L") + expected = 
Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="ms") assert i1 == expected - expected = Period("2007-01-01 09:00:00.001", freq="L") + expected = Period("2007-01-01 09:00:00.001", freq="ms") assert i1 == expected i1 = Period("2007-01-01 09:00:00.00101") - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="U") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="us") assert i1 == expected - expected = Period("2007-01-01 09:00:00.00101", freq="U") + expected = Period("2007-01-01 09:00:00.00101", freq="us") assert i1 == expected def test_invalid_arguments(self): @@ -318,13 +341,13 @@ msg = '^Given date string "-2000" not likely a datetime$' with pytest.raises(ValueError, match=msg): - Period("-2000", "A") + Period("-2000", "Y") msg = "day is out of range for month" with pytest.raises(DateParseError, match=msg): - Period("0", "A") + Period("0", "Y") msg = "Unknown datetime string format, unable to parse" with pytest.raises(DateParseError, match=msg): - Period("1/1/-2000", "A") + Period("1/1/-2000", "Y") def test_constructor_corner(self): expected = Period("2007-01", freq="2M") @@ -334,8 +357,8 @@ p = Period("2007-01-01", freq="D") - result = Period(p, freq="A") - exp = Period("2007", freq="A") + result = Period(p, freq="Y") + exp = Period("2007", freq="Y") assert result == exp def test_constructor_infer_freq(self): @@ -343,31 +366,31 @@ assert p.freq == "D" p = Period("2007-01-01 07") - assert p.freq == "H" + assert p.freq == "h" p = Period("2007-01-01 07:10") - assert p.freq == "T" + assert p.freq == "min" p = Period("2007-01-01 07:10:15") - assert p.freq == "S" + assert p.freq == "s" p = Period("2007-01-01 07:10:15.123") - assert p.freq == "L" + assert p.freq == "ms" # We see that there are 6 digits after the decimal, so get microsecond # even though they are all zeros. 
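The constructor used in this test infers the Period frequency from the resolution of the datetime string; with the 2.2 aliases the inferred frequencies read as follows (an illustrative sketch, assuming pandas >= 2.2):

from pandas import Period

assert Period("2007-01-01 07:10").freq == "min"             # formerly "T"
assert Period("2007-01-01 07:10:15").freq == "s"            # formerly "S"
assert Period("2007-01-01 07:10:15.123").freq == "ms"       # formerly "L"
assert Period("2007-01-01 07:10:15.123456").freq == "us"    # six decimal digits -> microsecond, formerly "U"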
p = Period("2007-01-01 07:10:15.123000") - assert p.freq == "U" + assert p.freq == "us" p = Period("2007-01-01 07:10:15.123400") - assert p.freq == "U" + assert p.freq == "us" def test_multiples(self): - result1 = Period("1989", freq="2A") - result2 = Period("1989", freq="A") + result1 = Period("1989", freq="2Y") + result2 = Period("1989", freq="Y") assert result1.ordinal == result2.ordinal - assert result1.freqstr == "2A-DEC" - assert result2.freqstr == "A-DEC" + assert result1.freqstr == "2Y-DEC" + assert result2.freqstr == "Y-DEC" assert result1.freq == offsets.YearEnd(2) assert result2.freq == offsets.YearEnd() @@ -393,7 +416,7 @@ @pytest.mark.parametrize("month", MONTHS) def test_period_cons_annual(self, month): # bugs in scikits.timeseries - freq = f"A-{month}" + freq = f"Y-{month}" exp = Period("1989", freq=freq) stamp = exp.to_timestamp("D", how="end") + timedelta(days=30) p = Period(stamp, freq=freq) @@ -427,11 +450,11 @@ def test_period_from_ordinal(self): p = Period("2011-01", freq="M") - res = Period._from_ordinal(p.ordinal, freq="M") + res = Period._from_ordinal(p.ordinal, freq=p.freq) assert p == res assert isinstance(res, Period) - @pytest.mark.parametrize("freq", ["A", "M", "D", "H"]) + @pytest.mark.parametrize("freq", ["Y", "M", "D", "h"]) def test_construct_from_nat_string_and_freq(self, freq): per = Period("NaT", freq=freq) assert per is NaT @@ -452,7 +475,7 @@ p = Period(iNaT, freq="3D") assert p is NaT - p = Period(iNaT, freq="1D1H") + p = Period(iNaT, freq="1D1h") assert p is NaT p = Period("NaT") @@ -494,14 +517,14 @@ def test_period_cons_combined(self): p = [ ( - Period("2011-01", freq="1D1H"), - Period("2011-01", freq="1H1D"), - Period("2011-01", freq="H"), + Period("2011-01", freq="1D1h"), + Period("2011-01", freq="1h1D"), + Period("2011-01", freq="h"), ), ( - Period(ordinal=1, freq="1D1H"), - Period(ordinal=1, freq="1H1D"), - Period(ordinal=1, freq="H"), + Period(ordinal=1, freq="1D1h"), + Period(ordinal=1, freq="1h1D"), + Period(ordinal=1, freq="h"), ), ] @@ -510,49 +533,49 @@ assert p2.ordinal == p3.ordinal assert p1.freq == offsets.Hour(25) - assert p1.freqstr == "25H" + assert p1.freqstr == "25h" assert p2.freq == offsets.Hour(25) - assert p2.freqstr == "25H" + assert p2.freqstr == "25h" assert p3.freq == offsets.Hour() - assert p3.freqstr == "H" + assert p3.freqstr == "h" result = p1 + 1 assert result.ordinal == (p3 + 25).ordinal assert result.freq == p1.freq - assert result.freqstr == "25H" + assert result.freqstr == "25h" result = p2 + 1 assert result.ordinal == (p3 + 25).ordinal assert result.freq == p2.freq - assert result.freqstr == "25H" + assert result.freqstr == "25h" result = p1 - 1 assert result.ordinal == (p3 - 25).ordinal assert result.freq == p1.freq - assert result.freqstr == "25H" + assert result.freqstr == "25h" result = p2 - 1 assert result.ordinal == (p3 - 25).ordinal assert result.freq == p2.freq - assert result.freqstr == "25H" + assert result.freqstr == "25h" - msg = "Frequency must be positive, because it represents span: -25H" + msg = "Frequency must be positive, because it represents span: -25h" with pytest.raises(ValueError, match=msg): - Period("2011-01", freq="-1D1H") + Period("2011-01", freq="-1D1h") with pytest.raises(ValueError, match=msg): - Period("2011-01", freq="-1H1D") + Period("2011-01", freq="-1h1D") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq="-1D1H") + Period(ordinal=1, freq="-1D1h") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq="-1H1D") + Period(ordinal=1, freq="-1h1D") msg 
= "Frequency must be positive, because it represents span: 0D" with pytest.raises(ValueError, match=msg): - Period("2011-01", freq="0D0H") + Period("2011-01", freq="0D0h") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq="0D0H") + Period(ordinal=1, freq="0D0h") # You can only combine together day and intraday offsets msg = "Invalid frequency: 1W1D" @@ -587,7 +610,7 @@ def test_period_large_ordinal(self, hour): # Issue #36430 # Integer overflow for Period over the maximum timestamp - p = Period(ordinal=2562048 + hour, freq="1H") + p = Period(ordinal=2562048 + hour, freq="1h") assert p.hour == hour @@ -624,7 +647,7 @@ "ignore:Period with BDay freq is deprecated:FutureWarning" ) def test_to_timestamp(self): - p = Period("1982", freq="A") + p = Period("1982", freq="Y") start_ts = p.to_timestamp(how="S") aliases = ["s", "StarT", "BEGIn"] for a in aliases: @@ -638,7 +661,7 @@ assert end_ts == p.to_timestamp("D", how=a) assert end_ts == p.to_timestamp("3D", how=a) - from_lst = ["A", "Q", "M", "W", "B", "D", "H", "Min", "S"] + from_lst = ["Y", "Q", "M", "W", "B", "D", "h", "Min", "s"] def _ex(p): if p.freq == "B": @@ -656,18 +679,18 @@ # Frequency other than daily - p = Period("1985", freq="A") + p = Period("1985", freq="Y") - result = p.to_timestamp("H", how="end") + result = p.to_timestamp("h", how="end") expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") assert result == expected - result = p.to_timestamp("3H", how="end") + result = p.to_timestamp("3h", how="end") assert result == expected - result = p.to_timestamp("T", how="end") + result = p.to_timestamp("min", how="end") expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") assert result == expected - result = p.to_timestamp("2T", how="end") + result = p.to_timestamp("2min", how="end") assert result == expected result = p.to_timestamp(how="end") @@ -675,15 +698,15 @@ assert result == expected expected = datetime(1985, 1, 1) - result = p.to_timestamp("H", how="start") + result = p.to_timestamp("h", how="start") assert result == expected - result = p.to_timestamp("T", how="start") + result = p.to_timestamp("min", how="start") assert result == expected - result = p.to_timestamp("S", how="start") + result = p.to_timestamp("s", how="start") assert result == expected - result = p.to_timestamp("3H", how="start") + result = p.to_timestamp("3h", how="start") assert result == expected - result = p.to_timestamp("5S", how="start") + result = p.to_timestamp("5s", how="start") assert result == expected def test_to_timestamp_business_end(self): @@ -724,18 +747,18 @@ ("2000-12-15", None, "2000-12-15", "D"), ( "2000-12-15 13:45:26.123456789", - "N", + "ns", "2000-12-15 13:45:26.123456789", - "N", + "ns", ), - ("2000-12-15 13:45:26.123456789", "U", "2000-12-15 13:45:26.123456", "U"), - ("2000-12-15 13:45:26.123456", None, "2000-12-15 13:45:26.123456", "U"), - ("2000-12-15 13:45:26.123456789", "L", "2000-12-15 13:45:26.123", "L"), - ("2000-12-15 13:45:26.123", None, "2000-12-15 13:45:26.123", "L"), - ("2000-12-15 13:45:26", "S", "2000-12-15 13:45:26", "S"), - ("2000-12-15 13:45:26", "T", "2000-12-15 13:45", "T"), - ("2000-12-15 13:45:26", "H", "2000-12-15 13:00", "H"), - ("2000-12-15", "Y", "2000", "A-DEC"), + ("2000-12-15 13:45:26.123456789", "us", "2000-12-15 13:45:26.123456", "us"), + ("2000-12-15 13:45:26.123456", None, "2000-12-15 13:45:26.123456", "us"), + ("2000-12-15 13:45:26.123456789", "ms", "2000-12-15 13:45:26.123", "ms"), + ("2000-12-15 13:45:26.123", None, "2000-12-15 13:45:26.123", "ms"), + ("2000-12-15 13:45:26", "s", 
"2000-12-15 13:45:26", "s"), + ("2000-12-15 13:45:26", "min", "2000-12-15 13:45", "min"), + ("2000-12-15 13:45:26", "h", "2000-12-15 13:00", "h"), + ("2000-12-15", "Y", "2000", "Y-DEC"), ("2000-12-15", "Q", "2000Q4", "Q-DEC"), ("2000-12-15", "M", "2000-12", "M"), ("2000-12-15", "W", "2000-12-11/2000-12-17", "W-SUN"), @@ -757,7 +780,7 @@ def test_strftime(self): # GH#3363 - p = Period("2000-1-1 12:34:12", freq="S") + p = Period("2000-1-1 12:34:12", freq="s") res = p.strftime("%Y-%m-%d %H:%M:%S") assert res == "2000-01-01 12:34:12" assert isinstance(res, str) @@ -766,7 +789,7 @@ class TestPeriodProperties: """Test properties such as year, month, weekday, etc....""" - @pytest.mark.parametrize("freq", ["A", "M", "D", "H"]) + @pytest.mark.parametrize("freq", ["Y", "M", "D", "h"]) def test_is_leap_year(self, freq): # GH 13727 p = Period("2000-01-01 00:00:00", freq=freq) @@ -801,7 +824,7 @@ def test_freq_str(self): i1 = Period("1982", freq="Min") assert i1.freq == offsets.Minute() - assert i1.freqstr == "T" + assert i1.freqstr == "min" @pytest.mark.filterwarnings( "ignore:Period with BDay freq is deprecated:FutureWarning" @@ -811,12 +834,12 @@ "M": ["MTH", "MONTH", "MONTHLY", "Mth", "month", "monthly"], "B": ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY", "bus"], "D": ["DAY", "DLY", "DAILY", "Day", "Dly", "Daily"], - "H": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], - "T": ["minute", "MINUTE", "MINUTELY", "minutely"], - "S": ["sec", "SEC", "SECOND", "SECONDLY", "second"], - "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"], - "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"], - "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"], + "h": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], + "min": ["minute", "MINUTE", "MINUTELY", "minutely"], + "s": ["sec", "SEC", "SECOND", "SECONDLY", "second"], + "ms": ["MILLISECOND", "MILLISECONDLY", "millisecond"], + "us": ["MICROSECOND", "MICROSECONDLY", "microsecond"], + "ns": ["NANOSECOND", "NANOSECONDLY", "nanosecond"], } msg = INVALID_FREQ_ERR_MSG @@ -858,13 +881,13 @@ def test_inner_bounds_start_and_end_time(self, bound, offset, period_property): # GH #13346 period = TestPeriodProperties._period_constructor(bound, -offset) - expected = period.to_timestamp().round(freq="S") - assert getattr(period, period_property).round(freq="S") == expected - expected = (bound - offset * Timedelta(1, unit="S")).floor("S") - assert getattr(period, period_property).floor("S") == expected + expected = period.to_timestamp().round(freq="s") + assert getattr(period, period_property).round(freq="s") == expected + expected = (bound - offset * Timedelta(1, unit="s")).floor("s") + assert getattr(period, period_property).floor("s") == expected def test_start_time(self): - freq_lst = ["A", "Q", "M", "D", "H", "T", "S"] + freq_lst = ["Y", "Q", "M", "D", "h", "min", "s"] xp = datetime(2012, 1, 1) for f in freq_lst: p = Period("2012", freq=f) @@ -874,7 +897,7 @@ assert Period("2012", freq="W").start_time == datetime(2011, 12, 26) def test_end_time(self): - p = Period("2012", freq="A") + p = Period("2012", freq="Y") def _ex(*args): return Timestamp(Timestamp(datetime(*args)).as_unit("ns")._value - 1) @@ -894,7 +917,7 @@ xp = _ex(2012, 1, 2) assert xp == p.end_time - p = Period("2012", freq="H") + p = Period("2012", freq="h") xp = _ex(2012, 1, 1, 1) assert xp == p.end_time @@ -912,11 +935,11 @@ xp = _ex(2012, 1, 16) assert xp == p.end_time - p = Period("2012", freq="1D1H") + p = Period("2012", freq="1D1h") xp = _ex(2012, 1, 2, 1) assert xp == p.end_time - p = 
Period("2012", freq="1H1D") + p = Period("2012", freq="1h1D") xp = _ex(2012, 1, 2, 1) assert xp == p.end_time @@ -939,7 +962,7 @@ def test_properties_annually(self): # Test properties on Periods with annually frequency. - a_date = Period(freq="A", year=2007) + a_date = Period(freq="Y", year=2007) assert a_date.year == 2007 def test_properties_quarterly(self): @@ -1026,8 +1049,8 @@ def test_properties_hourly(self): # Test properties on Periods with hourly frequency. - h_date1 = Period(freq="H", year=2007, month=1, day=1, hour=0) - h_date2 = Period(freq="2H", year=2007, month=1, day=1, hour=0) + h_date1 = Period(freq="h", year=2007, month=1, day=1, hour=0) + h_date2 = Period(freq="2h", year=2007, month=1, day=1, hour=0) for h_date in [h_date1, h_date2]: assert h_date.year == 2007 @@ -1039,7 +1062,7 @@ assert h_date.hour == 0 assert h_date.days_in_month == 31 assert ( - Period(freq="H", year=2012, month=2, day=1, hour=0).days_in_month == 29 + Period(freq="h", year=2012, month=2, day=1, hour=0).days_in_month == 29 ) def test_properties_minutely(self): @@ -1083,70 +1106,7 @@ ) -class TestPeriodField: - def test_get_period_field_array_raises_on_out_of_range(self): - msg = "Buffer dtype mismatch, expected 'const int64_t' but got 'double'" - with pytest.raises(ValueError, match=msg): - libperiod.get_period_field_arr(-1, np.empty(1), 0) - - class TestPeriodComparisons: - def test_comparison_same_period_different_object(self): - # Separate Period objects for the same period - left = Period("2000-01", "M") - right = Period("2000-01", "M") - - assert left == right - assert left >= right - assert left <= right - assert not left < right - assert not left > right - - def test_comparison_same_freq(self): - jan = Period("2000-01", "M") - feb = Period("2000-02", "M") - - assert not jan == feb - assert jan != feb - assert jan < feb - assert jan <= feb - assert not jan > feb - assert not jan >= feb - - def test_comparison_mismatched_freq(self): - jan = Period("2000-01", "M") - day = Period("2012-01-01", "D") - - assert not jan == day - assert jan != day - msg = r"Input has different freq=D from Period\(freq=M\)" - with pytest.raises(IncompatibleFrequency, match=msg): - jan < day - with pytest.raises(IncompatibleFrequency, match=msg): - jan <= day - with pytest.raises(IncompatibleFrequency, match=msg): - jan > day - with pytest.raises(IncompatibleFrequency, match=msg): - jan >= day - - def test_comparison_invalid_type(self): - jan = Period("2000-01", "M") - - assert not jan == 1 - assert jan != 1 - - int_or_per = "'(Period|int)'" - msg = f"not supported between instances of {int_or_per} and {int_or_per}" - for left, right in [(jan, 1), (1, jan)]: - with pytest.raises(TypeError, match=msg): - left > right - with pytest.raises(TypeError, match=msg): - left >= right - with pytest.raises(TypeError, match=msg): - left < right - with pytest.raises(TypeError, match=msg): - left <= right - def test_sort_periods(self): jan = Period("2000-01", "M") feb = Period("2000-02", "M") @@ -1155,422 +1115,6 @@ correctPeriods = [jan, feb, mar] assert sorted(periods) == correctPeriods - def test_period_cmp_nat(self): - p = Period("2011-01-01", freq="D") - - t = Timestamp("2011-01-01") - # confirm Period('NaT') work identical with Timestamp('NaT') - for left, right in [ - (NaT, p), - (p, NaT), - (NaT, t), - (t, NaT), - ]: - assert not left < right - assert not left > right - assert not left == right - assert left != right - assert not left <= right - assert not left >= right - - @pytest.mark.parametrize( - "zerodim_arr, expected", - 
((np.array(0), False), (np.array(Period("2000-01", "M")), True)), - ) - def test_comparison_numpy_zerodim_arr(self, zerodim_arr, expected): - p = Period("2000-01", "M") - - assert (p == zerodim_arr) is expected - assert (zerodim_arr == p) is expected - - -class TestArithmetic: - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "m"]) - def test_add_sub_td64_nat(self, unit): - # GH#47196 - per = Period("2022-06-01", "D") - nat = np.timedelta64("NaT", unit) - - assert per + nat is NaT - assert nat + per is NaT - assert per - nat is NaT - - with pytest.raises(TypeError, match="unsupported operand"): - nat - per - - def test_sub_delta(self): - left, right = Period("2011", freq="A"), Period("2007", freq="A") - result = left - right - assert result == 4 * right.freq - - msg = r"Input has different freq=M from Period\(freq=A-DEC\)" - with pytest.raises(IncompatibleFrequency, match=msg): - left - Period("2007-01", freq="M") - - def test_add_integer(self): - per1 = Period(freq="D", year=2008, month=1, day=1) - per2 = Period(freq="D", year=2008, month=1, day=2) - assert per1 + 1 == per2 - assert 1 + per1 == per2 - - def test_add_sub_nat(self): - # GH#13071 - p = Period("2011-01", freq="M") - assert p + NaT is NaT - assert NaT + p is NaT - assert p - NaT is NaT - assert NaT - p is NaT - - def test_add_invalid(self): - # GH#4731 - per1 = Period(freq="D", year=2008, month=1, day=1) - per2 = Period(freq="D", year=2008, month=1, day=2) - - msg = "|".join( - [ - r"unsupported operand type\(s\)", - "can only concatenate str", - "must be str, not Period", - ] - ) - with pytest.raises(TypeError, match=msg): - per1 + "str" - with pytest.raises(TypeError, match=msg): - "str" + per1 - with pytest.raises(TypeError, match=msg): - per1 + per2 - - boxes = [lambda x: x, lambda x: pd.Series([x]), lambda x: pd.Index([x])] - ids = ["identity", "Series", "Index"] - - @pytest.mark.parametrize("lbox", boxes, ids=ids) - @pytest.mark.parametrize("rbox", boxes, ids=ids) - def test_add_timestamp_raises(self, rbox, lbox): - # GH#17983 - ts = Timestamp("2017") - per = Period("2017", freq="M") - - # We may get a different message depending on which class raises - # the error. 
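The Period arithmetic cases being deleted in this hunk assert behaviour that can be restated compactly; a condensed, illustrative sketch of what they cover, assuming pandas >= 2.2:

from pandas import NaT, Period

per1 = Period(freq="D", year=2008, month=1, day=1)
per2 = Period(freq="D", year=2008, month=1, day=2)

assert per1 + 1 == per2                  # integers step by whole freq units
assert 1 + per1 == per2
assert per1 + NaT is NaT                 # NaT propagates through addition
assert per1 - NaT is NaT                 # and through subtraction
assert per2 - per1 == 1 * per1.freq      # Period - Period yields a multiple of the freq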
- msg = "|".join( - [ - "cannot add", - "unsupported operand", - "can only operate on a", - "incompatible type", - "ufunc add cannot use operands", - ] - ) - with pytest.raises(TypeError, match=msg): - lbox(ts) + rbox(per) - - with pytest.raises(TypeError, match=msg): - lbox(per) + rbox(ts) - - with pytest.raises(TypeError, match=msg): - lbox(per) + rbox(per) - - def test_sub(self): - per1 = Period("2011-01-01", freq="D") - per2 = Period("2011-01-15", freq="D") - - off = per1.freq - assert per1 - per2 == -14 * off - assert per2 - per1 == 14 * off - - msg = r"Input has different freq=M from Period\(freq=D\)" - with pytest.raises(IncompatibleFrequency, match=msg): - per1 - Period("2011-02", freq="M") - - @pytest.mark.parametrize("n", [1, 2, 3, 4]) - def test_sub_n_gt_1_ticks(self, tick_classes, n): - # GH 23878 - p1 = Period("19910905", freq=tick_classes(n)) - p2 = Period("19920406", freq=tick_classes(n)) - - expected = Period(str(p2), freq=p2.freq.base) - Period( - str(p1), freq=p1.freq.base - ) - - assert (p2 - p1) == expected - - @pytest.mark.parametrize("normalize", [True, False]) - @pytest.mark.parametrize("n", [1, 2, 3, 4]) - @pytest.mark.parametrize( - "offset, kwd_name", - [ - (offsets.YearEnd, "month"), - (offsets.QuarterEnd, "startingMonth"), - (offsets.MonthEnd, None), - (offsets.Week, "weekday"), - ], - ) - def test_sub_n_gt_1_offsets(self, offset, kwd_name, n, normalize): - # GH 23878 - kwds = {kwd_name: 3} if kwd_name is not None else {} - p1_d = "19910905" - p2_d = "19920406" - p1 = Period(p1_d, freq=offset(n, normalize, **kwds)) - p2 = Period(p2_d, freq=offset(n, normalize, **kwds)) - - expected = Period(p2_d, freq=p2.freq.base) - Period(p1_d, freq=p1.freq.base) - - assert (p2 - p1) == expected - - def test_add_offset(self): - # freq is DateOffset - for freq in ["A", "2A", "3A"]: - p = Period("2011", freq=freq) - exp = Period("2013", freq=freq) - assert p + offsets.YearEnd(2) == exp - assert offsets.YearEnd(2) + p == exp - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - msg = "Input has different freq|Input cannot be converted to Period" - with pytest.raises(IncompatibleFrequency, match=msg): - p + o - with pytest.raises(IncompatibleFrequency, match=msg): - o + p - - for freq in ["M", "2M", "3M"]: - p = Period("2011-03", freq=freq) - exp = Period("2011-05", freq=freq) - assert p + offsets.MonthEnd(2) == exp - assert offsets.MonthEnd(2) + p == exp - - exp = Period("2012-03", freq=freq) - assert p + offsets.MonthEnd(12) == exp - assert offsets.MonthEnd(12) + p == exp - - msg = "|".join( - [ - "Input has different freq", - "Input cannot be converted to Period", - ] - ) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p + o - with pytest.raises(IncompatibleFrequency, match=msg): - o + p - - # freq is Tick - for freq in ["D", "2D", "3D"]: - p = Period("2011-04-01", freq=freq) - - exp = Period("2011-04-06", freq=freq) - assert p + offsets.Day(5) == exp - assert offsets.Day(5) + p == exp - - exp = Period("2011-04-02", freq=freq) - assert p + offsets.Hour(24) == exp - assert offsets.Hour(24) + p == exp - - exp = Period("2011-04-03", freq=freq) - assert p + np.timedelta64(2, "D") == exp - assert np.timedelta64(2, "D") + p == exp - - exp = Period("2011-04-02", freq=freq) - assert p + np.timedelta64(3600 * 24, "s") == exp - assert np.timedelta64(3600 * 24, 
"s") + p == exp - - exp = Period("2011-03-30", freq=freq) - assert p + timedelta(-2) == exp - assert timedelta(-2) + p == exp - - exp = Period("2011-04-03", freq=freq) - assert p + timedelta(hours=48) == exp - assert timedelta(hours=48) + p == exp - - msg = "|".join( - [ - "Input has different freq", - "Input cannot be converted to Period", - ] - ) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(4, "h"), - timedelta(hours=23), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p + o - with pytest.raises(IncompatibleFrequency, match=msg): - o + p - - for freq in ["H", "2H", "3H"]: - p = Period("2011-04-01 09:00", freq=freq) - - exp = Period("2011-04-03 09:00", freq=freq) - assert p + offsets.Day(2) == exp - assert offsets.Day(2) + p == exp - - exp = Period("2011-04-01 12:00", freq=freq) - assert p + offsets.Hour(3) == exp - assert offsets.Hour(3) + p == exp - - msg = "cannot use operands with types" - exp = Period("2011-04-01 12:00", freq=freq) - assert p + np.timedelta64(3, "h") == exp - assert np.timedelta64(3, "h") + p == exp - - exp = Period("2011-04-01 10:00", freq=freq) - assert p + np.timedelta64(3600, "s") == exp - assert np.timedelta64(3600, "s") + p == exp - - exp = Period("2011-04-01 11:00", freq=freq) - assert p + timedelta(minutes=120) == exp - assert timedelta(minutes=120) + p == exp - - exp = Period("2011-04-05 12:00", freq=freq) - assert p + timedelta(days=4, minutes=180) == exp - assert timedelta(days=4, minutes=180) + p == exp - - msg = "|".join( - [ - "Input has different freq", - "Input cannot be converted to Period", - ] - ) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(3200, "s"), - timedelta(hours=23, minutes=30), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p + o - with pytest.raises(IncompatibleFrequency, match=msg): - o + p - - def test_sub_offset(self): - # freq is DateOffset - msg = "|".join( - [ - "Input has different freq", - "Input cannot be converted to Period", - ] - ) - - for freq in ["A", "2A", "3A"]: - p = Period("2011", freq=freq) - assert p - offsets.YearEnd(2) == Period("2009", freq=freq) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p - o - - for freq in ["M", "2M", "3M"]: - p = Period("2011-03", freq=freq) - assert p - offsets.MonthEnd(2) == Period("2011-01", freq=freq) - assert p - offsets.MonthEnd(12) == Period("2010-03", freq=freq) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(365, "D"), - timedelta(365), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p - o - - # freq is Tick - for freq in ["D", "2D", "3D"]: - p = Period("2011-04-01", freq=freq) - assert p - offsets.Day(5) == Period("2011-03-27", freq=freq) - assert p - offsets.Hour(24) == Period("2011-03-31", freq=freq) - assert p - np.timedelta64(2, "D") == Period("2011-03-30", freq=freq) - assert p - np.timedelta64(3600 * 24, "s") == Period("2011-03-31", freq=freq) - assert p - timedelta(-2) == Period("2011-04-03", freq=freq) - assert p - timedelta(hours=48) == Period("2011-03-30", freq=freq) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(4, "h"), - timedelta(hours=23), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p - o - - for freq in ["H", "2H", "3H"]: - p = 
Period("2011-04-01 09:00", freq=freq) - assert p - offsets.Day(2) == Period("2011-03-30 09:00", freq=freq) - assert p - offsets.Hour(3) == Period("2011-04-01 06:00", freq=freq) - assert p - np.timedelta64(3, "h") == Period("2011-04-01 06:00", freq=freq) - assert p - np.timedelta64(3600, "s") == Period( - "2011-04-01 08:00", freq=freq - ) - assert p - timedelta(minutes=120) == Period("2011-04-01 07:00", freq=freq) - assert p - timedelta(days=4, minutes=180) == Period( - "2011-03-28 06:00", freq=freq - ) - - for o in [ - offsets.YearBegin(2), - offsets.MonthBegin(1), - offsets.Minute(), - np.timedelta64(3200, "s"), - timedelta(hours=23, minutes=30), - ]: - with pytest.raises(IncompatibleFrequency, match=msg): - p - o - - @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) - def test_period_addsub_nat(self, freq): - per = Period("2011-01", freq=freq) - - # For subtraction, NaT is treated as another Period object - assert NaT - per is NaT - assert per - NaT is NaT - - # For addition, NaT is treated as offset-like - assert NaT + per is NaT - assert per + NaT is NaT - - def test_period_ops_offset(self): - p = Period("2011-04-01", freq="D") - result = p + offsets.Day() - exp = Period("2011-04-02", freq="D") - assert result == exp - - result = p - offsets.Day(2) - exp = Period("2011-03-30", freq="D") - assert result == exp - - msg = r"Input cannot be converted to Period\(freq=D\)" - with pytest.raises(IncompatibleFrequency, match=msg): - p + offsets.Hour(2) - - with pytest.raises(IncompatibleFrequency, match=msg): - p - offsets.Hour(2) - def test_period_immutable(): # see gh-17116 @@ -1592,7 +1136,7 @@ def test_negone_ordinals(): - freqs = ["A", "M", "Q", "D", "H", "T", "S"] + freqs = ["Y", "M", "Q", "D", "h", "min", "s"] period = Period(ordinal=-1, freq="D") for freq in freqs: @@ -1608,9 +1152,3 @@ repr(period) period = Period(ordinal=-1, freq="W") repr(period) - - -def test_invalid_frequency_error_message(): - msg = "Invalid frequency: " - with pytest.raises(ValueError, match=msg): - Period("2012-01-02", freq="WOM-1MON") diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/test_nat.py pandas-2.2.2+dfsg/pandas/tests/scalar/test_nat.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/test_nat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/test_nat.py 2024-04-10 17:42:52.000000000 +0000 @@ -33,6 +33,17 @@ ) +class TestNaTFormatting: + def test_repr(self): + assert repr(NaT) == "NaT" + + def test_str(self): + assert str(NaT) == "NaT" + + def test_isoformat(self): + assert NaT.isoformat() == "NaT" + + @pytest.mark.parametrize( "nat,idx", [ @@ -431,7 +442,7 @@ [ DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), DatetimeIndex(["2011-01-01", "2011-01-02"], tz="US/Eastern", name="x"), - DatetimeArray._from_sequence(["2011-01-01", "2011-01-02"]), + DatetimeArray._from_sequence(["2011-01-01", "2011-01-02"], dtype="M8[ns]"), DatetimeArray._from_sequence( ["2011-01-01", "2011-01-02"], dtype=DatetimeTZDtype(tz="US/Pacific") ), @@ -447,6 +458,7 @@ expected = DatetimeIndex(exp_data, tz=value.tz, name=exp_name) else: expected = TimedeltaIndex(exp_data, name=exp_name) + expected = expected.as_unit(value.unit) if not isinstance(value, Index): expected = expected.array @@ -529,6 +541,8 @@ marks=pytest.mark.xfail( not np_version_gte1p24p3, reason="td64 doesn't return NotImplemented, see numpy#17017", + # When this xfail is fixed, test_nat_comparisons_numpy + # can be removed. 
), ), Timestamp(0), diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timedelta/methods/test_as_unit.py pandas-2.2.2+dfsg/pandas/tests/scalar/timedelta/methods/test_as_unit.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timedelta/methods/test_as_unit.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timedelta/methods/test_as_unit.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,80 @@ +import pytest + +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas.errors import OutOfBoundsTimedelta + +from pandas import Timedelta + + +class TestAsUnit: + def test_as_unit(self): + td = Timedelta(days=1) + + assert td.as_unit("ns") is td + + res = td.as_unit("us") + assert res._value == td._value // 1000 + assert res._creso == NpyDatetimeUnit.NPY_FR_us.value + + rt = res.as_unit("ns") + assert rt._value == td._value + assert rt._creso == td._creso + + res = td.as_unit("ms") + assert res._value == td._value // 1_000_000 + assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value + + rt = res.as_unit("ns") + assert rt._value == td._value + assert rt._creso == td._creso + + res = td.as_unit("s") + assert res._value == td._value // 1_000_000_000 + assert res._creso == NpyDatetimeUnit.NPY_FR_s.value + + rt = res.as_unit("ns") + assert rt._value == td._value + assert rt._creso == td._creso + + def test_as_unit_overflows(self): + # microsecond that would be just out of bounds for nano + us = 9223372800000000 + td = Timedelta._from_value_and_reso(us, NpyDatetimeUnit.NPY_FR_us.value) + + msg = "Cannot cast 106752 days 00:00:00 to unit='ns' without overflow" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + td.as_unit("ns") + + res = td.as_unit("ms") + assert res._value == us // 1000 + assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value + + def test_as_unit_rounding(self): + td = Timedelta(microseconds=1500) + res = td.as_unit("ms") + + expected = Timedelta(milliseconds=1) + assert res == expected + + assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value + assert res._value == 1 + + with pytest.raises(ValueError, match="Cannot losslessly convert units"): + td.as_unit("ms", round_ok=False) + + def test_as_unit_non_nano(self): + # case where we are going neither to nor from nano + td = Timedelta(days=1).as_unit("ms") + assert td.days == 1 + assert td._value == 86_400_000 + assert td.components.days == 1 + assert td._d == 1 + assert td.total_seconds() == 86400 + + res = td.as_unit("us") + assert res._value == 86_400_000_000 + assert res.components.days == 1 + assert res.components.hours == 0 + assert res._d == 1 + assert res._h == 0 + assert res.total_seconds() == 86400 diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timedelta/methods/test_round.py pandas-2.2.2+dfsg/pandas/tests/scalar/timedelta/methods/test_round.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timedelta/methods/test_round.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timedelta/methods/test_round.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,187 @@ +from hypothesis import ( + given, + strategies as st, +) +import numpy as np +import pytest + +from pandas._libs import lib +from pandas._libs.tslibs import iNaT +from pandas.errors import OutOfBoundsTimedelta + +from pandas import Timedelta + + +class TestTimedeltaRound: + @pytest.mark.parametrize( + "freq,s1,s2", + [ + # This first case has s1, s2 being the same as t1,t2 below + ( + "ns", + Timedelta("1 days 02:34:56.789123456"), + Timedelta("-1 days 02:34:56.789123456"), + ), + ( + "us", + Timedelta("1 days 
02:34:56.789123000"), + Timedelta("-1 days 02:34:56.789123000"), + ), + ( + "ms", + Timedelta("1 days 02:34:56.789000000"), + Timedelta("-1 days 02:34:56.789000000"), + ), + ("s", Timedelta("1 days 02:34:57"), Timedelta("-1 days 02:34:57")), + ("2s", Timedelta("1 days 02:34:56"), Timedelta("-1 days 02:34:56")), + ("5s", Timedelta("1 days 02:34:55"), Timedelta("-1 days 02:34:55")), + ("min", Timedelta("1 days 02:35:00"), Timedelta("-1 days 02:35:00")), + ("12min", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), + ("h", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), + ("d", Timedelta("1 days"), Timedelta("-1 days")), + ], + ) + def test_round(self, freq, s1, s2): + t1 = Timedelta("1 days 02:34:56.789123456") + t2 = Timedelta("-1 days 02:34:56.789123456") + + r1 = t1.round(freq) + assert r1 == s1 + r2 = t2.round(freq) + assert r2 == s2 + + def test_round_invalid(self): + t1 = Timedelta("1 days 02:34:56.789123456") + + for freq, msg in [ + ("YE", " is a non-fixed frequency"), + ("ME", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ]: + with pytest.raises(ValueError, match=msg): + t1.round(freq) + + @pytest.mark.skip_ubsan + def test_round_implementation_bounds(self): + # See also: analogous test for Timestamp + # GH#38964 + result = Timedelta.min.ceil("s") + expected = Timedelta.min + Timedelta(seconds=1) - Timedelta(145224193) + assert result == expected + + result = Timedelta.max.floor("s") + expected = Timedelta.max - Timedelta(854775807) + assert result == expected + + msg = ( + r"Cannot round -106752 days \+00:12:43.145224193 to freq=s without overflow" + ) + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta.min.floor("s") + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta.min.round("s") + + msg = "Cannot round 106751 days 23:47:16.854775807 to freq=s without overflow" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta.max.ceil("s") + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta.max.round("s") + + @pytest.mark.skip_ubsan + @given(val=st.integers(min_value=iNaT + 1, max_value=lib.i8max)) + @pytest.mark.parametrize( + "method", [Timedelta.round, Timedelta.floor, Timedelta.ceil] + ) + def test_round_sanity(self, val, method): + cls = Timedelta + err_cls = OutOfBoundsTimedelta + + val = np.int64(val) + td = cls(val) + + def checker(ts, nanos, unit): + # First check that we do raise in cases where we should + if nanos == 1: + pass + else: + div, mod = divmod(ts._value, nanos) + diff = int(nanos - mod) + lb = ts._value - mod + assert lb <= ts._value # i.e. no overflows with python ints + ub = ts._value + diff + assert ub > ts._value # i.e. 
no overflows with python ints + + msg = "without overflow" + if mod == 0: + # We should never be raising in this + pass + elif method is cls.ceil: + if ub > cls.max._value: + with pytest.raises(err_cls, match=msg): + method(ts, unit) + return + elif method is cls.floor: + if lb < cls.min._value: + with pytest.raises(err_cls, match=msg): + method(ts, unit) + return + elif mod >= diff: + if ub > cls.max._value: + with pytest.raises(err_cls, match=msg): + method(ts, unit) + return + elif lb < cls.min._value: + with pytest.raises(err_cls, match=msg): + method(ts, unit) + return + + res = method(ts, unit) + + td = res - ts + diff = abs(td._value) + assert diff < nanos + assert res._value % nanos == 0 + + if method is cls.round: + assert diff <= nanos / 2 + elif method is cls.floor: + assert res <= ts + elif method is cls.ceil: + assert res >= ts + + nanos = 1 + checker(td, nanos, "ns") + + nanos = 1000 + checker(td, nanos, "us") + + nanos = 1_000_000 + checker(td, nanos, "ms") + + nanos = 1_000_000_000 + checker(td, nanos, "s") + + nanos = 60 * 1_000_000_000 + checker(td, nanos, "min") + + nanos = 60 * 60 * 1_000_000_000 + checker(td, nanos, "h") + + nanos = 24 * 60 * 60 * 1_000_000_000 + checker(td, nanos, "D") + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_round_non_nano(self, unit): + td = Timedelta("1 days 02:34:57").as_unit(unit) + + res = td.round("min") + assert res == Timedelta("1 days 02:35:00") + assert res._creso == td._creso + + res = td.floor("min") + assert res == Timedelta("1 days 02:34:00") + assert res._creso == td._creso + + res = td.ceil("min") + assert res == Timedelta("1 days 02:35:00") + assert res._creso == td._creso diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timedelta/test_arithmetic.py pandas-2.2.2+dfsg/pandas/tests/scalar/timedelta/test_arithmetic.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timedelta/test_arithmetic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timedelta/test_arithmetic.py 2024-04-10 17:42:52.000000000 +0000 @@ -309,13 +309,13 @@ def test_td_add_sub_dt64_ndarray(self): td = Timedelta("1 day") - other = pd.to_datetime(["2000-01-01"]).values + other = np.array(["2000-01-01"], dtype="M8[ns]") - expected = pd.to_datetime(["2000-01-02"]).values + expected = np.array(["2000-01-02"], dtype="M8[ns]") tm.assert_numpy_array_equal(td + other, expected) tm.assert_numpy_array_equal(other + td, expected) - expected = pd.to_datetime(["1999-12-31"]).values + expected = np.array(["1999-12-31"], dtype="M8[ns]") tm.assert_numpy_array_equal(-td + other, expected) tm.assert_numpy_array_equal(other - td, expected) @@ -966,6 +966,7 @@ class TestTimedeltaComparison: + @pytest.mark.skip_ubsan def test_compare_pytimedelta_bounds(self): # GH#49021 don't overflow on comparison with very large pytimedeltas @@ -1034,7 +1035,7 @@ cls = tick_classes off = cls(4) - td = off.delta + td = off._as_pd_timedelta assert isinstance(td, Timedelta) assert td == off diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timedelta/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/scalar/timedelta/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timedelta/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timedelta/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,11 +8,177 @@ from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas import ( + Index, NaT, Timedelta, + TimedeltaIndex, offsets, to_timedelta, ) +import pandas._testing as tm + + +class 
TestTimedeltaConstructorUnitKeyword: + @pytest.mark.parametrize("unit", ["Y", "y", "M"]) + def test_unit_m_y_raises(self, unit): + msg = "Units 'M', 'Y', and 'y' are no longer supported" + + with pytest.raises(ValueError, match=msg): + Timedelta(10, unit) + + with pytest.raises(ValueError, match=msg): + to_timedelta(10, unit) + + with pytest.raises(ValueError, match=msg): + to_timedelta([1, 2], unit) + + @pytest.mark.parametrize( + "unit,unit_depr", + [ + ("h", "H"), + ("min", "T"), + ("s", "S"), + ("ms", "L"), + ("ns", "N"), + ("us", "U"), + ], + ) + def test_units_H_T_S_L_N_U_deprecated(self, unit, unit_depr): + # GH#52536 + msg = f"'{unit_depr}' is deprecated and will be removed in a future version." + + expected = Timedelta(1, unit=unit) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = Timedelta(1, unit=unit_depr) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize( + "unit, np_unit", + [(value, "W") for value in ["W", "w"]] + + [(value, "D") for value in ["D", "d", "days", "day", "Days", "Day"]] + + [ + (value, "m") + for value in [ + "m", + "minute", + "min", + "minutes", + "Minute", + "Min", + "Minutes", + ] + ] + + [ + (value, "s") + for value in [ + "s", + "seconds", + "sec", + "second", + "Seconds", + "Sec", + "Second", + ] + ] + + [ + (value, "ms") + for value in [ + "ms", + "milliseconds", + "millisecond", + "milli", + "millis", + "MS", + "Milliseconds", + "Millisecond", + "Milli", + "Millis", + ] + ] + + [ + (value, "us") + for value in [ + "us", + "microseconds", + "microsecond", + "micro", + "micros", + "u", + "US", + "Microseconds", + "Microsecond", + "Micro", + "Micros", + "U", + ] + ] + + [ + (value, "ns") + for value in [ + "ns", + "nanoseconds", + "nanosecond", + "nano", + "nanos", + "n", + "NS", + "Nanoseconds", + "Nanosecond", + "Nano", + "Nanos", + "N", + ] + ], + ) + @pytest.mark.parametrize("wrapper", [np.array, list, Index]) + def test_unit_parser(self, unit, np_unit, wrapper): + # validate all units, GH 6855, GH 21762 + # array-likes + expected = TimedeltaIndex( + [np.timedelta64(i, np_unit) for i in np.arange(5).tolist()], + dtype="m8[ns]", + ) + # TODO(2.0): the desired output dtype may have non-nano resolution + msg = f"'{unit}' is deprecated and will be removed in a future version." 
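# Illustrative sketch (not taken from the upstream diff): the constructor
# behaviour TestTimedeltaConstructorUnitKeyword above pins down -- lowercase
# unit aliases are accepted, the old single-letter uppercase ones emit a
# FutureWarning, and 'M'/'Y'/'y' are rejected outright.
import warnings
import pandas as pd

assert pd.Timedelta(1, unit="min") == pd.Timedelta(minutes=1)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pd.Timedelta(1, unit="T")        # deprecated spelling of "min"
assert any(issubclass(w.category, FutureWarning) for w in caught)

try:
    pd.Timedelta(10, unit="Y")
except ValueError as err:
    print(err)                       # Units 'M', 'Y', and 'y' are no longer supported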
+ + if (unit, np_unit) in (("u", "us"), ("U", "us"), ("n", "ns"), ("N", "ns")): + warn = FutureWarning + else: + warn = FutureWarning + msg = "The 'unit' keyword in TimedeltaIndex construction is deprecated" + with tm.assert_produces_warning(warn, match=msg): + result = to_timedelta(wrapper(range(5)), unit=unit) + tm.assert_index_equal(result, expected) + result = TimedeltaIndex(wrapper(range(5)), unit=unit) + tm.assert_index_equal(result, expected) + + str_repr = [f"{x}{unit}" for x in np.arange(5)] + result = to_timedelta(wrapper(str_repr)) + tm.assert_index_equal(result, expected) + result = to_timedelta(wrapper(str_repr)) + tm.assert_index_equal(result, expected) + + # scalar + expected = Timedelta(np.timedelta64(2, np_unit).astype("timedelta64[ns]")) + result = to_timedelta(2, unit=unit) + assert result == expected + result = Timedelta(2, unit=unit) + assert result == expected + + result = to_timedelta(f"2{unit}") + assert result == expected + result = Timedelta(f"2{unit}") + assert result == expected + + +def test_construct_from_kwargs_overflow(): + # GH#55503 + msg = "seconds=86400000000000000000, milliseconds=0, microseconds=0, nanoseconds=0" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta(days=10**6) + msg = "seconds=60000000000000000000, milliseconds=0, microseconds=0, nanoseconds=0" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + Timedelta(minutes=10**9) def test_construct_with_weeks_unit_overflow(): @@ -208,8 +374,8 @@ assert Timedelta(offsets.Second(2)) == Timedelta(seconds=2) # GH#11995: unicode - expected = Timedelta("1H") - result = Timedelta("1H") + expected = Timedelta("1h") + result = Timedelta("1h") assert result == expected assert to_timedelta(offsets.Hour(2)) == Timedelta("0 days, 02:00:00") diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timedelta/test_formats.py pandas-2.2.2+dfsg/pandas/tests/scalar/timedelta/test_formats.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timedelta/test_formats.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timedelta/test_formats.py 2024-04-10 17:42:52.000000000 +0000 @@ -42,3 +42,68 @@ ) def test_isoformat(td, expected_iso): assert td.isoformat() == expected_iso + + +class TestReprBase: + def test_none(self): + delta_1d = Timedelta(1, unit="D") + delta_0d = Timedelta(0, unit="D") + delta_1s = Timedelta(1, unit="s") + delta_500ms = Timedelta(500, unit="ms") + + drepr = lambda x: x._repr_base() + assert drepr(delta_1d) == "1 days" + assert drepr(-delta_1d) == "-1 days" + assert drepr(delta_0d) == "0 days" + assert drepr(delta_1s) == "0 days 00:00:01" + assert drepr(delta_500ms) == "0 days 00:00:00.500000" + assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" + assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" + assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" + assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" + + def test_sub_day(self): + delta_1d = Timedelta(1, unit="D") + delta_0d = Timedelta(0, unit="D") + delta_1s = Timedelta(1, unit="s") + delta_500ms = Timedelta(500, unit="ms") + + drepr = lambda x: x._repr_base(format="sub_day") + assert drepr(delta_1d) == "1 days" + assert drepr(-delta_1d) == "-1 days" + assert drepr(delta_0d) == "00:00:00" + assert drepr(delta_1s) == "00:00:01" + assert drepr(delta_500ms) == "00:00:00.500000" + assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" + assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" + assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" + assert 
drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" + + def test_long(self): + delta_1d = Timedelta(1, unit="D") + delta_0d = Timedelta(0, unit="D") + delta_1s = Timedelta(1, unit="s") + delta_500ms = Timedelta(500, unit="ms") + + drepr = lambda x: x._repr_base(format="long") + assert drepr(delta_1d) == "1 days 00:00:00" + assert drepr(-delta_1d) == "-1 days +00:00:00" + assert drepr(delta_0d) == "0 days 00:00:00" + assert drepr(delta_1s) == "0 days 00:00:01" + assert drepr(delta_500ms) == "0 days 00:00:00.500000" + assert drepr(delta_1d + delta_1s) == "1 days 00:00:01" + assert drepr(-delta_1d + delta_1s) == "-1 days +00:00:01" + assert drepr(delta_1d + delta_500ms) == "1 days 00:00:00.500000" + assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" + + def test_all(self): + delta_1d = Timedelta(1, unit="D") + delta_0d = Timedelta(0, unit="D") + delta_1ns = Timedelta(1, unit="ns") + + drepr = lambda x: x._repr_base(format="all") + assert drepr(delta_1d) == "1 days 00:00:00.000000000" + assert drepr(-delta_1d) == "-1 days +00:00:00.000000000" + assert drepr(delta_0d) == "0 days 00:00:00.000000000" + assert drepr(delta_1ns) == "0 days 00:00:00.000000001" + assert drepr(-delta_1d + delta_1ns) == "-1 days +00:00:00.000000001" diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timedelta/test_timedelta.py pandas-2.2.2+dfsg/pandas/tests/scalar/timedelta/test_timedelta.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timedelta/test_timedelta.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timedelta/test_timedelta.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,89 +17,13 @@ from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas.errors import OutOfBoundsTimedelta -import pandas as pd from pandas import ( Timedelta, - TimedeltaIndex, to_timedelta, ) import pandas._testing as tm -class TestAsUnit: - def test_as_unit(self): - td = Timedelta(days=1) - - assert td.as_unit("ns") is td - - res = td.as_unit("us") - assert res._value == td._value // 1000 - assert res._creso == NpyDatetimeUnit.NPY_FR_us.value - - rt = res.as_unit("ns") - assert rt._value == td._value - assert rt._creso == td._creso - - res = td.as_unit("ms") - assert res._value == td._value // 1_000_000 - assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value - - rt = res.as_unit("ns") - assert rt._value == td._value - assert rt._creso == td._creso - - res = td.as_unit("s") - assert res._value == td._value // 1_000_000_000 - assert res._creso == NpyDatetimeUnit.NPY_FR_s.value - - rt = res.as_unit("ns") - assert rt._value == td._value - assert rt._creso == td._creso - - def test_as_unit_overflows(self): - # microsecond that would be just out of bounds for nano - us = 9223372800000000 - td = Timedelta._from_value_and_reso(us, NpyDatetimeUnit.NPY_FR_us.value) - - msg = "Cannot cast 106752 days 00:00:00 to unit='ns' without overflow" - with pytest.raises(OutOfBoundsTimedelta, match=msg): - td.as_unit("ns") - - res = td.as_unit("ms") - assert res._value == us // 1000 - assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value - - def test_as_unit_rounding(self): - td = Timedelta(microseconds=1500) - res = td.as_unit("ms") - - expected = Timedelta(milliseconds=1) - assert res == expected - - assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value - assert res._value == 1 - - with pytest.raises(ValueError, match="Cannot losslessly convert units"): - td.as_unit("ms", round_ok=False) - - def test_as_unit_non_nano(self): - # case where we are going neither to nor from nano - td = 
Timedelta(days=1).as_unit("ms") - assert td.days == 1 - assert td._value == 86_400_000 - assert td.components.days == 1 - assert td._d == 1 - assert td.total_seconds() == 86400 - - res = td.as_unit("us") - assert res._value == 86_400_000_000 - assert res.components.days == 1 - assert res.components.hours == 0 - assert res._d == 1 - assert res._h == 0 - assert res.total_seconds() == 86400 - - class TestNonNano: @pytest.fixture(params=["s", "ms", "us"]) def unit_str(self, request): @@ -474,11 +398,13 @@ assert tup.microseconds == 999 assert tup.nanoseconds == 0 + # TODO: this is a test of to_timedelta string parsing def test_iso_conversion(self): # GH #21877 expected = Timedelta(1, unit="s") assert to_timedelta("P0DT0H0M1S") == expected + # TODO: this is a test of to_timedelta returning NaT def test_nat_converters(self): result = to_timedelta("nat").to_numpy() assert result.dtype.kind == "M" @@ -488,129 +414,6 @@ assert result.dtype.kind == "M" assert result.astype("int64") == iNaT - @pytest.mark.parametrize( - "unit, np_unit", - [(value, "W") for value in ["W", "w"]] - + [(value, "D") for value in ["D", "d", "days", "day", "Days", "Day"]] - + [ - (value, "m") - for value in [ - "m", - "minute", - "min", - "minutes", - "Minute", - "Min", - "Minutes", - ] - ] - + [ - (value, "s") - for value in [ - "s", - "seconds", - "sec", - "second", - "S", - "Seconds", - "Sec", - "Second", - ] - ] - + [ - (value, "ms") - for value in [ - "ms", - "milliseconds", - "millisecond", - "milli", - "millis", - "MS", - "Milliseconds", - "Millisecond", - "Milli", - "Millis", - ] - ] - + [ - (value, "us") - for value in [ - "us", - "microseconds", - "microsecond", - "micro", - "micros", - "u", - "US", - "Microseconds", - "Microsecond", - "Micro", - "Micros", - "U", - ] - ] - + [ - (value, "ns") - for value in [ - "ns", - "nanoseconds", - "nanosecond", - "nano", - "nanos", - "n", - "NS", - "Nanoseconds", - "Nanosecond", - "Nano", - "Nanos", - "N", - ] - ], - ) - @pytest.mark.parametrize("wrapper", [np.array, list, pd.Index]) - def test_unit_parser(self, unit, np_unit, wrapper): - # validate all units, GH 6855, GH 21762 - # array-likes - expected = TimedeltaIndex( - [np.timedelta64(i, np_unit) for i in np.arange(5).tolist()], - dtype="m8[ns]", - ) - # TODO(2.0): the desired output dtype may have non-nano resolution - result = to_timedelta(wrapper(range(5)), unit=unit) - tm.assert_index_equal(result, expected) - result = TimedeltaIndex(wrapper(range(5)), unit=unit) - tm.assert_index_equal(result, expected) - - str_repr = [f"{x}{unit}" for x in np.arange(5)] - result = to_timedelta(wrapper(str_repr)) - tm.assert_index_equal(result, expected) - result = to_timedelta(wrapper(str_repr)) - tm.assert_index_equal(result, expected) - - # scalar - expected = Timedelta(np.timedelta64(2, np_unit).astype("timedelta64[ns]")) - result = to_timedelta(2, unit=unit) - assert result == expected - result = Timedelta(2, unit=unit) - assert result == expected - - result = to_timedelta(f"2{unit}") - assert result == expected - result = Timedelta(f"2{unit}") - assert result == expected - - @pytest.mark.parametrize("unit", ["Y", "y", "M"]) - def test_unit_m_y_raises(self, unit): - msg = "Units 'M', 'Y', and 'y' are no longer supported" - with pytest.raises(ValueError, match=msg): - Timedelta(10, unit) - - with pytest.raises(ValueError, match=msg): - to_timedelta(10, unit) - - with pytest.raises(ValueError, match=msg): - to_timedelta([1, 2], unit) - def test_numeric_conversions(self): assert Timedelta(0) == np.timedelta64(0, "ns") assert 
Timedelta(10) == np.timedelta64(10, "ns") @@ -642,177 +445,6 @@ with pytest.raises(ValueError, match=msg): td.to_numpy(copy=True) - @pytest.mark.parametrize( - "freq,s1,s2", - [ - # This first case has s1, s2 being the same as t1,t2 below - ( - "N", - Timedelta("1 days 02:34:56.789123456"), - Timedelta("-1 days 02:34:56.789123456"), - ), - ( - "U", - Timedelta("1 days 02:34:56.789123000"), - Timedelta("-1 days 02:34:56.789123000"), - ), - ( - "L", - Timedelta("1 days 02:34:56.789000000"), - Timedelta("-1 days 02:34:56.789000000"), - ), - ("S", Timedelta("1 days 02:34:57"), Timedelta("-1 days 02:34:57")), - ("2S", Timedelta("1 days 02:34:56"), Timedelta("-1 days 02:34:56")), - ("5S", Timedelta("1 days 02:34:55"), Timedelta("-1 days 02:34:55")), - ("T", Timedelta("1 days 02:35:00"), Timedelta("-1 days 02:35:00")), - ("12T", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), - ("H", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), - ("d", Timedelta("1 days"), Timedelta("-1 days")), - ], - ) - def test_round(self, freq, s1, s2): - t1 = Timedelta("1 days 02:34:56.789123456") - t2 = Timedelta("-1 days 02:34:56.789123456") - - r1 = t1.round(freq) - assert r1 == s1 - r2 = t2.round(freq) - assert r2 == s2 - - def test_round_invalid(self): - t1 = Timedelta("1 days 02:34:56.789123456") - - for freq, msg in [ - ("Y", " is a non-fixed frequency"), - ("M", " is a non-fixed frequency"), - ("foobar", "Invalid frequency: foobar"), - ]: - with pytest.raises(ValueError, match=msg): - t1.round(freq) - - def test_round_implementation_bounds(self): - # See also: analogous test for Timestamp - # GH#38964 - result = Timedelta.min.ceil("s") - expected = Timedelta.min + Timedelta(seconds=1) - Timedelta(145224193) - assert result == expected - - result = Timedelta.max.floor("s") - expected = Timedelta.max - Timedelta(854775807) - assert result == expected - - msg = ( - r"Cannot round -106752 days \+00:12:43.145224193 to freq=s without overflow" - ) - with pytest.raises(OutOfBoundsTimedelta, match=msg): - Timedelta.min.floor("s") - with pytest.raises(OutOfBoundsTimedelta, match=msg): - Timedelta.min.round("s") - - msg = "Cannot round 106751 days 23:47:16.854775807 to freq=s without overflow" - with pytest.raises(OutOfBoundsTimedelta, match=msg): - Timedelta.max.ceil("s") - with pytest.raises(OutOfBoundsTimedelta, match=msg): - Timedelta.max.round("s") - - @given(val=st.integers(min_value=iNaT + 1, max_value=lib.i8max)) - @pytest.mark.parametrize( - "method", [Timedelta.round, Timedelta.floor, Timedelta.ceil] - ) - def test_round_sanity(self, val, method): - cls = Timedelta - err_cls = OutOfBoundsTimedelta - - val = np.int64(val) - td = cls(val) - - def checker(ts, nanos, unit): - # First check that we do raise in cases where we should - if nanos == 1: - pass - else: - div, mod = divmod(ts._value, nanos) - diff = int(nanos - mod) - lb = ts._value - mod - assert lb <= ts._value # i.e. no overflows with python ints - ub = ts._value + diff - assert ub > ts._value # i.e. 
no overflows with python ints - - msg = "without overflow" - if mod == 0: - # We should never be raising in this - pass - elif method is cls.ceil: - if ub > cls.max._value: - with pytest.raises(err_cls, match=msg): - method(ts, unit) - return - elif method is cls.floor: - if lb < cls.min._value: - with pytest.raises(err_cls, match=msg): - method(ts, unit) - return - elif mod >= diff: - if ub > cls.max._value: - with pytest.raises(err_cls, match=msg): - method(ts, unit) - return - elif lb < cls.min._value: - with pytest.raises(err_cls, match=msg): - method(ts, unit) - return - - res = method(ts, unit) - - td = res - ts - diff = abs(td._value) - assert diff < nanos - assert res._value % nanos == 0 - - if method is cls.round: - assert diff <= nanos / 2 - elif method is cls.floor: - assert res <= ts - elif method is cls.ceil: - assert res >= ts - - nanos = 1 - checker(td, nanos, "ns") - - nanos = 1000 - checker(td, nanos, "us") - - nanos = 1_000_000 - checker(td, nanos, "ms") - - nanos = 1_000_000_000 - checker(td, nanos, "s") - - nanos = 60 * 1_000_000_000 - checker(td, nanos, "min") - - nanos = 60 * 60 * 1_000_000_000 - checker(td, nanos, "h") - - nanos = 24 * 60 * 60 * 1_000_000_000 - checker(td, nanos, "D") - - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) - def test_round_non_nano(self, unit): - td = Timedelta("1 days 02:34:57").as_unit(unit) - - res = td.round("min") - assert res == Timedelta("1 days 02:35:00") - assert res._creso == td._creso - - res = td.floor("min") - assert res == Timedelta("1 days 02:34:00") - assert res._creso == td._creso - - res = td.ceil("min") - assert res == Timedelta("1 days 02:35:00") - assert res._creso == td._creso - def test_identity(self): td = Timedelta(10, unit="d") assert isinstance(td, Timedelta) @@ -919,6 +551,7 @@ ns_td = Timedelta(1, "ns") assert hash(ns_td) != hash(ns_td.to_pytimedelta()) + @pytest.mark.skip_ubsan @pytest.mark.xfail( reason="pd.Timedelta violates the Python hash invariant (GH#44504).", ) @@ -975,21 +608,21 @@ def test_total_seconds_precision(self): # GH 19458 - assert Timedelta("30S").total_seconds() == 30.0 + assert Timedelta("30s").total_seconds() == 30.0 assert Timedelta("0").total_seconds() == 0.0 - assert Timedelta("-2S").total_seconds() == -2.0 - assert Timedelta("5.324S").total_seconds() == 5.324 - assert (Timedelta("30S").total_seconds() - 30.0) < 1e-20 - assert (30.0 - Timedelta("30S").total_seconds()) < 1e-20 + assert Timedelta("-2s").total_seconds() == -2.0 + assert Timedelta("5.324s").total_seconds() == 5.324 + assert (Timedelta("30s").total_seconds() - 30.0) < 1e-20 + assert (30.0 - Timedelta("30s").total_seconds()) < 1e-20 def test_resolution_string(self): assert Timedelta(days=1).resolution_string == "D" - assert Timedelta(days=1, hours=6).resolution_string == "H" - assert Timedelta(days=1, minutes=6).resolution_string == "T" - assert Timedelta(days=1, seconds=6).resolution_string == "S" - assert Timedelta(days=1, milliseconds=6).resolution_string == "L" - assert Timedelta(days=1, microseconds=6).resolution_string == "U" - assert Timedelta(days=1, nanoseconds=6).resolution_string == "N" + assert Timedelta(days=1, hours=6).resolution_string == "h" + assert Timedelta(days=1, minutes=6).resolution_string == "min" + assert Timedelta(days=1, seconds=6).resolution_string == "s" + assert Timedelta(days=1, milliseconds=6).resolution_string == "ms" + assert Timedelta(days=1, microseconds=6).resolution_string == "us" + assert Timedelta(days=1, nanoseconds=6).resolution_string == "ns" def 
test_resolution_deprecated(self): # GH#21344 @@ -1006,8 +639,8 @@ @pytest.mark.parametrize( "value, expected", [ - (Timedelta("10S"), True), - (Timedelta("-10S"), True), + (Timedelta("10s"), True), + (Timedelta("-10s"), True), (Timedelta(10, unit="ns"), True), (Timedelta(0, unit="ns"), False), (Timedelta(-10, unit="ns"), True), diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_as_unit.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_as_unit.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_as_unit.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_as_unit.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,86 @@ +import pytest + +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas.errors import OutOfBoundsDatetime + +from pandas import Timestamp + + +class TestTimestampAsUnit: + def test_as_unit(self): + ts = Timestamp("1970-01-01").as_unit("ns") + assert ts.unit == "ns" + + assert ts.as_unit("ns") is ts + + res = ts.as_unit("us") + assert res._value == ts._value // 1000 + assert res._creso == NpyDatetimeUnit.NPY_FR_us.value + + rt = res.as_unit("ns") + assert rt._value == ts._value + assert rt._creso == ts._creso + + res = ts.as_unit("ms") + assert res._value == ts._value // 1_000_000 + assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value + + rt = res.as_unit("ns") + assert rt._value == ts._value + assert rt._creso == ts._creso + + res = ts.as_unit("s") + assert res._value == ts._value // 1_000_000_000 + assert res._creso == NpyDatetimeUnit.NPY_FR_s.value + + rt = res.as_unit("ns") + assert rt._value == ts._value + assert rt._creso == ts._creso + + def test_as_unit_overflows(self): + # microsecond that would be just out of bounds for nano + us = 9223372800000000 + ts = Timestamp._from_value_and_reso(us, NpyDatetimeUnit.NPY_FR_us.value, None) + + msg = "Cannot cast 2262-04-12 00:00:00 to unit='ns' without overflow" + with pytest.raises(OutOfBoundsDatetime, match=msg): + ts.as_unit("ns") + + res = ts.as_unit("ms") + assert res._value == us // 1000 + assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value + + def test_as_unit_rounding(self): + ts = Timestamp(1_500_000) # i.e. 1500 microseconds + res = ts.as_unit("ms") + + expected = Timestamp(1_000_000) # i.e. 
1 millisecond + assert res == expected + + assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value + assert res._value == 1 + + with pytest.raises(ValueError, match="Cannot losslessly convert units"): + ts.as_unit("ms", round_ok=False) + + def test_as_unit_non_nano(self): + # case where we are going neither to nor from nano + ts = Timestamp("1970-01-02").as_unit("ms") + assert ts.year == 1970 + assert ts.month == 1 + assert ts.day == 2 + assert ts.hour == ts.minute == ts.second == ts.microsecond == ts.nanosecond == 0 + + res = ts.as_unit("s") + assert res._value == 24 * 3600 + assert res.year == 1970 + assert res.month == 1 + assert res.day == 2 + assert ( + res.hour + == res.minute + == res.second + == res.microsecond + == res.nanosecond + == 0 + ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_normalize.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_normalize.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_normalize.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_normalize.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,22 @@ +import pytest + +from pandas._libs.tslibs import Timestamp +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit + + +class TestTimestampNormalize: + @pytest.mark.parametrize("arg", ["2013-11-30", "2013-11-30 12:00:00"]) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_normalize(self, tz_naive_fixture, arg, unit): + tz = tz_naive_fixture + ts = Timestamp(arg, tz=tz).as_unit(unit) + result = ts.normalize() + expected = Timestamp("2013-11-30", tz=tz) + assert result == expected + assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value + + def test_normalize_pre_epoch_dates(self): + # GH: 36294 + result = Timestamp("1969-01-01 09:00:00").normalize() + expected = Timestamp("1969-01-01 00:00:00") + assert result == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_replace.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_replace.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_replace.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_replace.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,193 @@ +from datetime import datetime + +from dateutil.tz import gettz +import numpy as np +import pytest +import pytz + +from pandas._libs.tslibs import ( + OutOfBoundsDatetime, + Timestamp, + conversion, +) +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +import pandas.util._test_decorators as td + +import pandas._testing as tm + + +class TestTimestampReplace: + def test_replace_out_of_pydatetime_bounds(self): + # GH#50348 + ts = Timestamp("2016-01-01").as_unit("ns") + + msg = "Out of bounds timestamp: 99999-01-01 00:00:00 with frequency 'ns'" + with pytest.raises(OutOfBoundsDatetime, match=msg): + ts.replace(year=99_999) + + ts = ts.as_unit("ms") + result = ts.replace(year=99_999) + assert result.year == 99_999 + assert result._value == Timestamp(np.datetime64("99999-01-01", "ms"))._value + + def test_replace_non_nano(self): + ts = Timestamp._from_value_and_reso( + 91514880000000000, NpyDatetimeUnit.NPY_FR_us.value, None + ) + assert ts.to_pydatetime() == datetime(4869, 12, 28) + + result = ts.replace(year=4900) + assert result._creso == ts._creso + assert result.to_pydatetime() == datetime(4900, 12, 28) + + def test_replace_naive(self): + # GH#14621, GH#7825 + ts = Timestamp("2016-01-01 09:00:00") + 
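# Illustrative sketch (not taken from the upstream diff): the core of the new
# Timestamp.as_unit tests above -- conversions between resolutions keep the
# point in time, and lossy conversions raise when round_ok=False.
import pandas as pd

ts = pd.Timestamp("1970-01-02").as_unit("ms")
assert ts.unit == "ms"
assert ts.as_unit("s") == ts         # lossless round trip

lossy = pd.Timestamp(1_500_000)      # integer input is nanoseconds, i.e. 1500 us
try:
    lossy.as_unit("ms", round_ok=False)
except ValueError as err:
    print(err)                       # Cannot losslessly convert units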
result = ts.replace(hour=0) + expected = Timestamp("2016-01-01 00:00:00") + assert result == expected + + def test_replace_aware(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH#14621, GH#7825 + # replacing datetime components with and w/o presence of a timezone + ts = Timestamp("2016-01-01 09:00:00", tz=tz) + result = ts.replace(hour=0) + expected = Timestamp("2016-01-01 00:00:00", tz=tz) + assert result == expected + + def test_replace_preserves_nanos(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH#14621, GH#7825 + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) + result = ts.replace(hour=0) + expected = Timestamp("2016-01-01 00:00:00.000000123", tz=tz) + assert result == expected + + def test_replace_multiple(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH#14621, GH#7825 + # replacing datetime components with and w/o presence of a timezone + # test all + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) + result = ts.replace( + year=2015, + month=2, + day=2, + hour=0, + minute=5, + second=5, + microsecond=5, + nanosecond=5, + ) + expected = Timestamp("2015-02-02 00:05:05.000005005", tz=tz) + assert result == expected + + def test_replace_invalid_kwarg(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH#14621, GH#7825 + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) + msg = r"replace\(\) got an unexpected keyword argument" + with pytest.raises(TypeError, match=msg): + ts.replace(foo=5) + + def test_replace_integer_args(self, tz_aware_fixture): + tz = tz_aware_fixture + # GH#14621, GH#7825 + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) + msg = "value must be an integer, received for hour" + with pytest.raises(ValueError, match=msg): + ts.replace(hour=0.1) + + def test_replace_tzinfo_equiv_tz_localize_none(self): + # GH#14621, GH#7825 + # assert conversion to naive is the same as replacing tzinfo with None + ts = Timestamp("2013-11-03 01:59:59.999999-0400", tz="US/Eastern") + assert ts.tz_localize(None) == ts.replace(tzinfo=None) + + @td.skip_if_windows + def test_replace_tzinfo(self): + # GH#15683 + dt = datetime(2016, 3, 27, 1) + tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo + + result_dt = dt.replace(tzinfo=tzinfo) + result_pd = Timestamp(dt).replace(tzinfo=tzinfo) + + # datetime.timestamp() converts in the local timezone + with tm.set_timezone("UTC"): + assert result_dt.timestamp() == result_pd.timestamp() + + assert result_dt == result_pd + assert result_dt == result_pd.to_pydatetime() + + result_dt = dt.replace(tzinfo=tzinfo).replace(tzinfo=None) + result_pd = Timestamp(dt).replace(tzinfo=tzinfo).replace(tzinfo=None) + + # datetime.timestamp() converts in the local timezone + with tm.set_timezone("UTC"): + assert result_dt.timestamp() == result_pd.timestamp() + + assert result_dt == result_pd + assert result_dt == result_pd.to_pydatetime() + + @pytest.mark.parametrize( + "tz, normalize", + [ + (pytz.timezone("US/Eastern"), lambda x: x.tzinfo.normalize(x)), + (gettz("US/Eastern"), lambda x: x), + ], + ) + def test_replace_across_dst(self, tz, normalize): + # GH#18319 check that 1) timezone is correctly normalized and + # 2) that hour is not incorrectly changed by this normalization + ts_naive = Timestamp("2017-12-03 16:03:30") + ts_aware = conversion.localize_pydatetime(ts_naive, tz) + + # Preliminary sanity-check + assert ts_aware == normalize(ts_aware) + + # Replace across DST boundary + ts2 = ts_aware.replace(month=6) + + # Check that `replace` preserves hour literal + assert (ts2.hour, ts2.minute) == 
(ts_aware.hour, ts_aware.minute) + + # Check that post-replace object is appropriately normalized + ts2b = normalize(ts2) + assert ts2 == ts2b + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_replace_dst_border(self, unit): + # Gh 7825 + t = Timestamp("2013-11-3", tz="America/Chicago").as_unit(unit) + result = t.replace(hour=3) + expected = Timestamp("2013-11-3 03:00:00", tz="America/Chicago") + assert result == expected + assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value + + @pytest.mark.parametrize("fold", [0, 1]) + @pytest.mark.parametrize("tz", ["dateutil/Europe/London", "Europe/London"]) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_replace_dst_fold(self, fold, tz, unit): + # GH 25017 + d = datetime(2019, 10, 27, 2, 30) + ts = Timestamp(d, tz=tz).as_unit(unit) + result = ts.replace(hour=1, fold=fold) + expected = Timestamp(datetime(2019, 10, 27, 1, 30)).tz_localize( + tz, ambiguous=not fold + ) + assert result == expected + assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value + + @pytest.mark.parametrize("fold", [0, 1]) + def test_replace_preserves_fold(self, fold): + # GH#37610. Check that replace preserves Timestamp fold property + tz = gettz("Europe/Moscow") + + ts = Timestamp( + year=2009, month=10, day=25, hour=2, minute=30, fold=fold, tzinfo=tz + ) + ts_replaced = ts.replace(second=1) + + assert ts_replaced.fold == fold diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_round.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_round.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_round.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_round.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,383 @@ +from hypothesis import ( + given, + strategies as st, +) +import numpy as np +import pytest +import pytz + +from pandas._libs import lib +from pandas._libs.tslibs import ( + NaT, + OutOfBoundsDatetime, + Timedelta, + Timestamp, + iNaT, + to_offset, +) +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG + +import pandas._testing as tm + + +class TestTimestampRound: + def test_round_division_by_zero_raises(self): + ts = Timestamp("2016-01-01") + + msg = "Division by zero in rounding" + with pytest.raises(ValueError, match=msg): + ts.round("0ns") + + @pytest.mark.parametrize( + "timestamp, freq, expected", + [ + ("20130101 09:10:11", "D", "20130101"), + ("20130101 19:10:11", "D", "20130102"), + ("20130201 12:00:00", "D", "20130202"), + ("20130104 12:00:00", "D", "20130105"), + ("2000-01-05 05:09:15.13", "D", "2000-01-05 00:00:00"), + ("2000-01-05 05:09:15.13", "h", "2000-01-05 05:00:00"), + ("2000-01-05 05:09:15.13", "s", "2000-01-05 05:09:15"), + ], + ) + def test_round_frequencies(self, timestamp, freq, expected): + dt = Timestamp(timestamp) + result = dt.round(freq) + expected = Timestamp(expected) + assert result == expected + + def test_round_tzaware(self): + dt = Timestamp("20130101 09:10:11", tz="US/Eastern") + result = dt.round("D") + expected = Timestamp("20130101", tz="US/Eastern") + assert result == expected + + dt = Timestamp("20130101 09:10:11", tz="US/Eastern") + result = dt.round("s") + assert result == dt + + def test_round_30min(self): + # round + dt = Timestamp("20130104 12:32:00") + result = dt.round("30Min") + expected = Timestamp("20130104 12:30:00") + assert result == expected + + def 
test_round_subsecond(self): + # GH#14440 & GH#15578 + result = Timestamp("2016-10-17 12:00:00.0015").round("ms") + expected = Timestamp("2016-10-17 12:00:00.002000") + assert result == expected + + result = Timestamp("2016-10-17 12:00:00.00149").round("ms") + expected = Timestamp("2016-10-17 12:00:00.001000") + assert result == expected + + ts = Timestamp("2016-10-17 12:00:00.0015") + for freq in ["us", "ns"]: + assert ts == ts.round(freq) + + result = Timestamp("2016-10-17 12:00:00.001501031").round("10ns") + expected = Timestamp("2016-10-17 12:00:00.001501030") + assert result == expected + + def test_round_nonstandard_freq(self): + with tm.assert_produces_warning(False): + Timestamp("2016-10-17 12:00:00.001501031").round("1010ns") + + def test_round_invalid_arg(self): + stamp = Timestamp("2000-01-05 05:09:15.13") + with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): + stamp.round("foo") + + @pytest.mark.parametrize( + "test_input, rounder, freq, expected", + [ + ("2117-01-01 00:00:45", "floor", "15s", "2117-01-01 00:00:45"), + ("2117-01-01 00:00:45", "ceil", "15s", "2117-01-01 00:00:45"), + ( + "2117-01-01 00:00:45.000000012", + "floor", + "10ns", + "2117-01-01 00:00:45.000000010", + ), + ( + "1823-01-01 00:00:01.000000012", + "ceil", + "10ns", + "1823-01-01 00:00:01.000000020", + ), + ("1823-01-01 00:00:01", "floor", "1s", "1823-01-01 00:00:01"), + ("1823-01-01 00:00:01", "ceil", "1s", "1823-01-01 00:00:01"), + ("NaT", "floor", "1s", "NaT"), + ("NaT", "ceil", "1s", "NaT"), + ], + ) + def test_ceil_floor_edge(self, test_input, rounder, freq, expected): + dt = Timestamp(test_input) + func = getattr(dt, rounder) + result = func(freq) + + if dt is NaT: + assert result is NaT + else: + expected = Timestamp(expected) + assert result == expected + + @pytest.mark.parametrize( + "test_input, freq, expected", + [ + ("2018-01-01 00:02:06", "2s", "2018-01-01 00:02:06"), + ("2018-01-01 00:02:00", "2min", "2018-01-01 00:02:00"), + ("2018-01-01 00:04:00", "4min", "2018-01-01 00:04:00"), + ("2018-01-01 00:15:00", "15min", "2018-01-01 00:15:00"), + ("2018-01-01 00:20:00", "20min", "2018-01-01 00:20:00"), + ("2018-01-01 03:00:00", "3h", "2018-01-01 03:00:00"), + ], + ) + @pytest.mark.parametrize("rounder", ["ceil", "floor", "round"]) + def test_round_minute_freq(self, test_input, freq, expected, rounder): + # Ensure timestamps that shouldn't round dont! 
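# Illustrative sketch (not taken from the upstream diff): the plain rounding
# behaviour the parametrised cases above assert, using the pandas 2.2
# frequency spellings ("30min", "h", "D").
import pandas as pd

ts = pd.Timestamp("2013-01-04 12:32:00")
assert ts.round("30min") == pd.Timestamp("2013-01-04 12:30:00")
assert ts.ceil("h") == pd.Timestamp("2013-01-04 13:00:00")
assert ts.floor("D") == pd.Timestamp("2013-01-04")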
+ # GH#21262 + + dt = Timestamp(test_input) + expected = Timestamp(expected) + func = getattr(dt, rounder) + result = func(freq) + assert result == expected + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_ceil(self, unit): + dt = Timestamp("20130101 09:10:11").as_unit(unit) + result = dt.ceil("D") + expected = Timestamp("20130102") + assert result == expected + assert result._creso == dt._creso + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_floor(self, unit): + dt = Timestamp("20130101 09:10:11").as_unit(unit) + result = dt.floor("D") + expected = Timestamp("20130101") + assert result == expected + assert result._creso == dt._creso + + @pytest.mark.parametrize("method", ["ceil", "round", "floor"]) + @pytest.mark.parametrize( + "unit", + ["ns", "us", "ms", "s"], + ) + def test_round_dst_border_ambiguous(self, method, unit): + # GH 18946 round near "fall back" DST + ts = Timestamp("2017-10-29 00:00:00", tz="UTC").tz_convert("Europe/Madrid") + ts = ts.as_unit(unit) + # + result = getattr(ts, method)("h", ambiguous=True) + assert result == ts + assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value + + result = getattr(ts, method)("h", ambiguous=False) + expected = Timestamp("2017-10-29 01:00:00", tz="UTC").tz_convert( + "Europe/Madrid" + ) + assert result == expected + assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value + + result = getattr(ts, method)("h", ambiguous="NaT") + assert result is NaT + + msg = "Cannot infer dst time" + with pytest.raises(pytz.AmbiguousTimeError, match=msg): + getattr(ts, method)("h", ambiguous="raise") + + @pytest.mark.parametrize( + "method, ts_str, freq", + [ + ["ceil", "2018-03-11 01:59:00-0600", "5min"], + ["round", "2018-03-11 01:59:00-0600", "5min"], + ["floor", "2018-03-11 03:01:00-0500", "2h"], + ], + ) + @pytest.mark.parametrize( + "unit", + ["ns", "us", "ms", "s"], + ) + def test_round_dst_border_nonexistent(self, method, ts_str, freq, unit): + # GH 23324 round near "spring forward" DST + ts = Timestamp(ts_str, tz="America/Chicago").as_unit(unit) + result = getattr(ts, method)(freq, nonexistent="shift_forward") + expected = Timestamp("2018-03-11 03:00:00", tz="America/Chicago") + assert result == expected + assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value + + result = getattr(ts, method)(freq, nonexistent="NaT") + assert result is NaT + + msg = "2018-03-11 02:00:00" + with pytest.raises(pytz.NonExistentTimeError, match=msg): + getattr(ts, method)(freq, nonexistent="raise") + + @pytest.mark.parametrize( + "timestamp", + [ + "2018-01-01 0:0:0.124999360", + "2018-01-01 0:0:0.125000367", + "2018-01-01 0:0:0.125500", + "2018-01-01 0:0:0.126500", + "2018-01-01 12:00:00", + "2019-01-01 12:00:00", + ], + ) + @pytest.mark.parametrize( + "freq", + [ + "2ns", + "3ns", + "4ns", + "5ns", + "6ns", + "7ns", + "250ns", + "500ns", + "750ns", + "1us", + "19us", + "250us", + "500us", + "750us", + "1s", + "2s", + "3s", + "1D", + ], + ) + def test_round_int64(self, timestamp, freq): + # check that all rounding modes are accurate to int64 precision + # see GH#22591 + dt = Timestamp(timestamp).as_unit("ns") + unit = to_offset(freq).nanos + + # test floor + result = dt.floor(freq) + assert result._value % unit == 0, f"floor not a {freq} multiple" + assert 0 <= dt._value - result._value < unit, "floor error" + + # test ceil + result = dt.ceil(freq) + assert result._value % unit == 0, f"ceil not a {freq} multiple" + assert 0 <= result._value - dt._value < unit, "ceil 
error" + + # test round + result = dt.round(freq) + assert result._value % unit == 0, f"round not a {freq} multiple" + assert abs(result._value - dt._value) <= unit // 2, "round error" + if unit % 2 == 0 and abs(result._value - dt._value) == unit // 2: + # round half to even + assert result._value // unit % 2 == 0, "round half to even error" + + def test_round_implementation_bounds(self): + # See also: analogous test for Timedelta + result = Timestamp.min.ceil("s") + expected = Timestamp(1677, 9, 21, 0, 12, 44) + assert result == expected + + result = Timestamp.max.floor("s") + expected = Timestamp.max - Timedelta(854775807) + assert result == expected + + msg = "Cannot round 1677-09-21 00:12:43.145224193 to freq=" + with pytest.raises(OutOfBoundsDatetime, match=msg): + Timestamp.min.floor("s") + + with pytest.raises(OutOfBoundsDatetime, match=msg): + Timestamp.min.round("s") + + msg = "Cannot round 2262-04-11 23:47:16.854775807 to freq=" + with pytest.raises(OutOfBoundsDatetime, match=msg): + Timestamp.max.ceil("s") + + with pytest.raises(OutOfBoundsDatetime, match=msg): + Timestamp.max.round("s") + + @given(val=st.integers(iNaT + 1, lib.i8max)) + @pytest.mark.parametrize( + "method", [Timestamp.round, Timestamp.floor, Timestamp.ceil] + ) + def test_round_sanity(self, val, method): + cls = Timestamp + err_cls = OutOfBoundsDatetime + + val = np.int64(val) + ts = cls(val) + + def checker(ts, nanos, unit): + # First check that we do raise in cases where we should + if nanos == 1: + pass + else: + div, mod = divmod(ts._value, nanos) + diff = int(nanos - mod) + lb = ts._value - mod + assert lb <= ts._value # i.e. no overflows with python ints + ub = ts._value + diff + assert ub > ts._value # i.e. no overflows with python ints + + msg = "without overflow" + if mod == 0: + # We should never be raising in this + pass + elif method is cls.ceil: + if ub > cls.max._value: + with pytest.raises(err_cls, match=msg): + method(ts, unit) + return + elif method is cls.floor: + if lb < cls.min._value: + with pytest.raises(err_cls, match=msg): + method(ts, unit) + return + elif mod >= diff: + if ub > cls.max._value: + with pytest.raises(err_cls, match=msg): + method(ts, unit) + return + elif lb < cls.min._value: + with pytest.raises(err_cls, match=msg): + method(ts, unit) + return + + res = method(ts, unit) + + td = res - ts + diff = abs(td._value) + assert diff < nanos + assert res._value % nanos == 0 + + if method is cls.round: + assert diff <= nanos / 2 + elif method is cls.floor: + assert res <= ts + elif method is cls.ceil: + assert res >= ts + + nanos = 1 + checker(ts, nanos, "ns") + + nanos = 1000 + checker(ts, nanos, "us") + + nanos = 1_000_000 + checker(ts, nanos, "ms") + + nanos = 1_000_000_000 + checker(ts, nanos, "s") + + nanos = 60 * 1_000_000_000 + checker(ts, nanos, "min") + + nanos = 60 * 60 * 1_000_000_000 + checker(ts, nanos, "h") + + nanos = 24 * 60 * 60 * 1_000_000_000 + checker(ts, nanos, "D") diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_timestamp_method.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,31 @@ +# NB: This is for the Timestamp.timestamp *method* specifically, not +# the Timestamp class in general. 
+ +from pytz import utc + +from pandas._libs.tslibs import Timestamp +import pandas.util._test_decorators as td + +import pandas._testing as tm + + +class TestTimestampMethod: + @td.skip_if_windows + def test_timestamp(self, fixed_now_ts): + # GH#17329 + # tz-naive --> treat it as if it were UTC for purposes of timestamp() + ts = fixed_now_ts + uts = ts.replace(tzinfo=utc) + assert ts.timestamp() == uts.timestamp() + + tsc = Timestamp("2014-10-11 11:00:01.12345678", tz="US/Central") + utsc = tsc.tz_convert("UTC") + + # utsc is a different representation of the same time + assert tsc.timestamp() == utsc.timestamp() + + # datetime.timestamp() converts in the local timezone + with tm.set_timezone("UTC"): + # should agree with datetime.timestamp method + dt = ts.to_pydatetime() + assert dt.timestamp() == ts.timestamp() diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_to_julian_date.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_to_julian_date.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_to_julian_date.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_to_julian_date.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,28 @@ +from pandas import Timestamp + + +class TestTimestampToJulianDate: + def test_compare_1700(self): + ts = Timestamp("1700-06-23") + res = ts.to_julian_date() + assert res == 2_342_145.5 + + def test_compare_2000(self): + ts = Timestamp("2000-04-12") + res = ts.to_julian_date() + assert res == 2_451_646.5 + + def test_compare_2100(self): + ts = Timestamp("2100-08-12") + res = ts.to_julian_date() + assert res == 2_488_292.5 + + def test_compare_hour01(self): + ts = Timestamp("2000-08-12T01:00:00") + res = ts.to_julian_date() + assert res == 2_451_768.5416666666666666 + + def test_compare_hour13(self): + ts = Timestamp("2000-08-12T13:00:00") + res = ts.to_julian_date() + assert res == 2_451_769.0416666666666666 diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_to_pydatetime.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,81 @@ +from datetime import ( + datetime, + timedelta, +) + +import pytz + +from pandas._libs.tslibs.timezones import dateutil_gettz as gettz +import pandas.util._test_decorators as td + +from pandas import Timestamp +import pandas._testing as tm + + +class TestTimestampToPyDatetime: + def test_to_pydatetime_fold(self): + # GH#45087 + tzstr = "dateutil/usr/share/zoneinfo/America/Chicago" + ts = Timestamp(year=2013, month=11, day=3, hour=1, minute=0, fold=1, tz=tzstr) + dt = ts.to_pydatetime() + assert dt.fold == 1 + + def test_to_pydatetime_nonzero_nano(self): + ts = Timestamp("2011-01-01 9:00:00.123456789") + + # Warn the user of data loss (nanoseconds). 
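# Illustrative sketch (not taken from the upstream diff): the nanosecond
# truncation warning asserted in test_to_pydatetime_nonzero_nano -- converting
# to datetime.datetime keeps microseconds but drops the nanoseconds.
import warnings
import pandas as pd

ts = pd.Timestamp("2011-01-01 09:00:00.123456789")
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    dt = ts.to_pydatetime()          # warns about discarding nanoseconds
assert dt.microsecond == 123456
assert any(issubclass(w.category, UserWarning) for w in caught)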
+ with tm.assert_produces_warning(UserWarning): + expected = datetime(2011, 1, 1, 9, 0, 0, 123456) + result = ts.to_pydatetime() + assert result == expected + + def test_timestamp_to_datetime(self): + stamp = Timestamp("20090415", tz="US/Eastern") + dtval = stamp.to_pydatetime() + assert stamp == dtval + assert stamp.tzinfo == dtval.tzinfo + + def test_timestamp_to_pydatetime_dateutil(self): + stamp = Timestamp("20090415", tz="dateutil/US/Eastern") + dtval = stamp.to_pydatetime() + assert stamp == dtval + assert stamp.tzinfo == dtval.tzinfo + + def test_timestamp_to_pydatetime_explicit_pytz(self): + stamp = Timestamp("20090415", tz=pytz.timezone("US/Eastern")) + dtval = stamp.to_pydatetime() + assert stamp == dtval + assert stamp.tzinfo == dtval.tzinfo + + @td.skip_if_windows + def test_timestamp_to_pydatetime_explicit_dateutil(self): + stamp = Timestamp("20090415", tz=gettz("US/Eastern")) + dtval = stamp.to_pydatetime() + assert stamp == dtval + assert stamp.tzinfo == dtval.tzinfo + + def test_to_pydatetime_bijective(self): + # Ensure that converting to datetime and back only loses precision + # by going from nanoseconds to microseconds. + exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning): + pydt_max = Timestamp.max.to_pydatetime() + + assert ( + Timestamp(pydt_max).as_unit("ns")._value / 1000 + == Timestamp.max._value / 1000 + ) + + exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning): + pydt_min = Timestamp.min.to_pydatetime() + + # The next assertion can be enabled once GH#39221 is merged + # assert pydt_min < Timestamp.min # this is bc nanos are dropped + tdus = timedelta(microseconds=1) + assert pydt_min + tdus > Timestamp.min + + assert ( + Timestamp(pydt_min + tdus).as_unit("ns")._value / 1000 + == Timestamp.min._value / 1000 + ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_tz_convert.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_tz_convert.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_tz_convert.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_tz_convert.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,51 @@ +import dateutil +import pytest + +from pandas._libs.tslibs import timezones +import pandas.util._test_decorators as td + +from pandas import Timestamp + + +class TestTimestampTZConvert: + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) + def test_astimezone(self, tzstr): + # astimezone is an alias for tz_convert, so keep it with + # the tz_convert tests + utcdate = Timestamp("3/11/2012 22:00", tz="UTC") + expected = utcdate.tz_convert(tzstr) + result = utcdate.astimezone(tzstr) + assert expected == result + assert isinstance(result, Timestamp) + + @pytest.mark.parametrize( + "stamp", + [ + "2014-02-01 09:00", + "2014-07-08 09:00", + "2014-11-01 17:00", + "2014-11-05 00:00", + ], + ) + def test_tz_convert_roundtrip(self, stamp, tz_aware_fixture): + tz = tz_aware_fixture + + ts = Timestamp(stamp, tz="UTC") + converted = ts.tz_convert(tz) + + reset = converted.tz_convert(None) + assert reset == Timestamp(stamp) + assert reset.tzinfo is None + assert reset == converted.tz_convert("UTC").tz_localize(None) + + @td.skip_if_windows + def test_tz_convert_utc_with_system_utc(self): + # from system utc to real utc + ts = Timestamp("2001-01-05 11:56", tz=timezones.maybe_get_tz("dateutil/UTC")) + # check that the time 
hasn't changed. + assert ts == ts.tz_convert(dateutil.tz.tzutc()) + + # from system utc to real utc + ts = Timestamp("2001-01-05 11:56", tz=timezones.maybe_get_tz("dateutil/UTC")) + # check that the time hasn't changed. + assert ts == ts.tz_convert(dateutil.tz.tzutc()) diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_tz_localize.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_tz_localize.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/methods/test_tz_localize.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/methods/test_tz_localize.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,351 @@ +from datetime import timedelta +import re + +from dateutil.tz import gettz +import pytest +import pytz +from pytz.exceptions import ( + AmbiguousTimeError, + NonExistentTimeError, +) + +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas.errors import OutOfBoundsDatetime + +from pandas import ( + NaT, + Timestamp, +) + +try: + from zoneinfo import ZoneInfo +except ImportError: + # Cannot assign to a type + ZoneInfo = None # type: ignore[misc, assignment] + + +class TestTimestampTZLocalize: + @pytest.mark.skip_ubsan + def test_tz_localize_pushes_out_of_bounds(self): + # GH#12677 + # tz_localize that pushes away from the boundary is OK + msg = ( + f"Converting {Timestamp.min.strftime('%Y-%m-%d %H:%M:%S')} " + f"underflows past {Timestamp.min}" + ) + pac = Timestamp.min.tz_localize("US/Pacific") + assert pac._value > Timestamp.min._value + pac.tz_convert("Asia/Tokyo") # tz_convert doesn't change value + with pytest.raises(OutOfBoundsDatetime, match=msg): + Timestamp.min.tz_localize("Asia/Tokyo") + + # tz_localize that pushes away from the boundary is OK + msg = ( + f"Converting {Timestamp.max.strftime('%Y-%m-%d %H:%M:%S')} " + f"overflows past {Timestamp.max}" + ) + tokyo = Timestamp.max.tz_localize("Asia/Tokyo") + assert tokyo._value < Timestamp.max._value + tokyo.tz_convert("US/Pacific") # tz_convert doesn't change value + with pytest.raises(OutOfBoundsDatetime, match=msg): + Timestamp.max.tz_localize("US/Pacific") + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_tz_localize_ambiguous_bool(self, unit): + # make sure that we are correctly accepting bool values as ambiguous + # GH#14402 + ts = Timestamp("2015-11-01 01:00:03").as_unit(unit) + expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") + expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") + + msg = "Cannot infer dst time from 2015-11-01 01:00:03" + with pytest.raises(pytz.AmbiguousTimeError, match=msg): + ts.tz_localize("US/Central") + + with pytest.raises(pytz.AmbiguousTimeError, match=msg): + ts.tz_localize("dateutil/US/Central") + + if ZoneInfo is not None: + try: + tz = ZoneInfo("US/Central") + except KeyError: + # no tzdata + pass + else: + with pytest.raises(pytz.AmbiguousTimeError, match=msg): + ts.tz_localize(tz) + + result = ts.tz_localize("US/Central", ambiguous=True) + assert result == expected0 + assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value + + result = ts.tz_localize("US/Central", ambiguous=False) + assert result == expected1 + assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value + + def test_tz_localize_ambiguous(self): + ts = Timestamp("2014-11-02 01:00") + ts_dst = ts.tz_localize("US/Eastern", ambiguous=True) + ts_no_dst = ts.tz_localize("US/Eastern", ambiguous=False) + + assert ts_no_dst._value - ts_dst._value == 3600 + msg = 
re.escape( + "'ambiguous' parameter must be one of: " + "True, False, 'NaT', 'raise' (default)" + ) + with pytest.raises(ValueError, match=msg): + ts.tz_localize("US/Eastern", ambiguous="infer") + + # GH#8025 + msg = "Cannot localize tz-aware Timestamp, use tz_convert for conversions" + with pytest.raises(TypeError, match=msg): + Timestamp("2011-01-01", tz="US/Eastern").tz_localize("Asia/Tokyo") + + msg = "Cannot convert tz-naive Timestamp, use tz_localize to localize" + with pytest.raises(TypeError, match=msg): + Timestamp("2011-01-01").tz_convert("Asia/Tokyo") + + @pytest.mark.parametrize( + "stamp, tz", + [ + ("2015-03-08 02:00", "US/Eastern"), + ("2015-03-08 02:30", "US/Pacific"), + ("2015-03-29 02:00", "Europe/Paris"), + ("2015-03-29 02:30", "Europe/Belgrade"), + ], + ) + def test_tz_localize_nonexistent(self, stamp, tz): + # GH#13057 + ts = Timestamp(stamp) + with pytest.raises(NonExistentTimeError, match=stamp): + ts.tz_localize(tz) + # GH 22644 + with pytest.raises(NonExistentTimeError, match=stamp): + ts.tz_localize(tz, nonexistent="raise") + assert ts.tz_localize(tz, nonexistent="NaT") is NaT + + @pytest.mark.parametrize( + "stamp, tz, forward_expected, backward_expected", + [ + ( + "2015-03-29 02:00:00", + "Europe/Warsaw", + "2015-03-29 03:00:00", + "2015-03-29 01:59:59", + ), # utc+1 -> utc+2 + ( + "2023-03-12 02:00:00", + "America/Los_Angeles", + "2023-03-12 03:00:00", + "2023-03-12 01:59:59", + ), # utc-8 -> utc-7 + ( + "2023-03-26 01:00:00", + "Europe/London", + "2023-03-26 02:00:00", + "2023-03-26 00:59:59", + ), # utc+0 -> utc+1 + ( + "2023-03-26 00:00:00", + "Atlantic/Azores", + "2023-03-26 01:00:00", + "2023-03-25 23:59:59", + ), # utc-1 -> utc+0 + ], + ) + def test_tz_localize_nonexistent_shift( + self, stamp, tz, forward_expected, backward_expected + ): + ts = Timestamp(stamp) + forward_ts = ts.tz_localize(tz, nonexistent="shift_forward") + assert forward_ts == Timestamp(forward_expected, tz=tz) + backward_ts = ts.tz_localize(tz, nonexistent="shift_backward") + assert backward_ts == Timestamp(backward_expected, tz=tz) + + def test_tz_localize_ambiguous_raise(self): + # GH#13057 + ts = Timestamp("2015-11-1 01:00") + msg = "Cannot infer dst time from 2015-11-01 01:00:00," + with pytest.raises(AmbiguousTimeError, match=msg): + ts.tz_localize("US/Pacific", ambiguous="raise") + + def test_tz_localize_nonexistent_invalid_arg(self, warsaw): + # GH 22644 + tz = warsaw + ts = Timestamp("2015-03-29 02:00:00") + msg = ( + "The nonexistent argument must be one of 'raise', 'NaT', " + "'shift_forward', 'shift_backward' or a timedelta object" + ) + with pytest.raises(ValueError, match=msg): + ts.tz_localize(tz, nonexistent="foo") + + @pytest.mark.parametrize( + "stamp", + [ + "2014-02-01 09:00", + "2014-07-08 09:00", + "2014-11-01 17:00", + "2014-11-05 00:00", + ], + ) + def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): + tz = tz_aware_fixture + ts = Timestamp(stamp) + localized = ts.tz_localize(tz) + assert localized == Timestamp(stamp, tz=tz) + + msg = "Cannot localize tz-aware Timestamp" + with pytest.raises(TypeError, match=msg): + localized.tz_localize(tz) + + reset = localized.tz_localize(None) + assert reset == ts + assert reset.tzinfo is None + + def test_tz_localize_ambiguous_compat(self): + # validate that pytz and dateutil are compat for dst + # when the transition happens + naive = Timestamp("2013-10-27 01:00:00") + + pytz_zone = "Europe/London" + dateutil_zone = "dateutil/Europe/London" + result_pytz = naive.tz_localize(pytz_zone, ambiguous=False) + 
result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=False) + assert result_pytz._value == result_dateutil._value + assert result_pytz._value == 1382835600 + + # fixed ambiguous behavior + # see gh-14621, GH#45087 + assert result_pytz.to_pydatetime().tzname() == "GMT" + assert result_dateutil.to_pydatetime().tzname() == "GMT" + assert str(result_pytz) == str(result_dateutil) + + # 1 hour difference + result_pytz = naive.tz_localize(pytz_zone, ambiguous=True) + result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=True) + assert result_pytz._value == result_dateutil._value + assert result_pytz._value == 1382832000 + + # see gh-14621 + assert str(result_pytz) == str(result_dateutil) + assert ( + result_pytz.to_pydatetime().tzname() + == result_dateutil.to_pydatetime().tzname() + ) + + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + "US/Eastern", + "dateutil/US/Eastern", + ], + ) + def test_timestamp_tz_localize(self, tz): + stamp = Timestamp("3/11/2012 04:00") + + result = stamp.tz_localize(tz) + expected = Timestamp("3/11/2012 04:00", tz=tz) + assert result.hour == expected.hour + assert result == expected + + @pytest.mark.parametrize( + "start_ts, tz, end_ts, shift", + [ + ["2015-03-29 02:20:00", "Europe/Warsaw", "2015-03-29 03:00:00", "forward"], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:59:59.999999999", + "backward", + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 03:20:00", + timedelta(hours=1), + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:20:00", + timedelta(hours=-1), + ], + ["2018-03-11 02:33:00", "US/Pacific", "2018-03-11 03:00:00", "forward"], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:59:59.999999999", + "backward", + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 03:33:00", + timedelta(hours=1), + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:33:00", + timedelta(hours=-1), + ], + ], + ) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_timestamp_tz_localize_nonexistent_shift( + self, start_ts, tz, end_ts, shift, tz_type, unit + ): + # GH 8917, 24466 + tz = tz_type + tz + if isinstance(shift, str): + shift = "shift_" + shift + ts = Timestamp(start_ts).as_unit(unit) + result = ts.tz_localize(tz, nonexistent=shift) + expected = Timestamp(end_ts).tz_localize(tz) + + if unit == "us": + assert result == expected.replace(nanosecond=0) + elif unit == "ms": + micros = expected.microsecond - expected.microsecond % 1000 + assert result == expected.replace(microsecond=micros, nanosecond=0) + elif unit == "s": + assert result == expected.replace(microsecond=0, nanosecond=0) + else: + assert result == expected + assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value + + @pytest.mark.parametrize("offset", [-1, 1]) + def test_timestamp_tz_localize_nonexistent_shift_invalid(self, offset, warsaw): + # GH 8917, 24466 + tz = warsaw + ts = Timestamp("2015-03-29 02:20:00") + msg = "The provided timedelta will relocalize on a nonexistent time" + with pytest.raises(ValueError, match=msg): + ts.tz_localize(tz, nonexistent=timedelta(seconds=offset)) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_timestamp_tz_localize_nonexistent_NaT(self, warsaw, unit): + # GH 8917 + tz = warsaw + ts = Timestamp("2015-03-29 02:20:00").as_unit(unit) + result = ts.tz_localize(tz, nonexistent="NaT") + assert result is NaT + + 
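Aside (illustrative sketch only, not part of the patch): the tz_localize tests above exercise the ambiguous/nonexistent handling sketched below, assuming pandas 2.2 semantics:

    from pandas import NaT, Timestamp

    # Nonexistent wall time: 02:20 falls in the spring-forward gap in Europe/Warsaw.
    ts = Timestamp("2015-03-29 02:20:00")
    print(ts.tz_localize("Europe/Warsaw", nonexistent="shift_forward"))  # 2015-03-29 03:00:00+02:00
    print(ts.tz_localize("Europe/Warsaw", nonexistent="NaT") is NaT)     # True

    # Ambiguous wall time: 01:00 occurs twice on 2014-11-02 in US/Eastern.
    amb = Timestamp("2014-11-02 01:00")
    dst = amb.tz_localize("US/Eastern", ambiguous=True)    # first occurrence (EDT)
    std = amb.tz_localize("US/Eastern", ambiguous=False)   # second occurrence (EST)
    print(std - dst)                                       # 0 days 01:00:00
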
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_timestamp_tz_localize_nonexistent_raise(self, warsaw, unit): + # GH 8917 + tz = warsaw + ts = Timestamp("2015-03-29 02:20:00").as_unit(unit) + msg = "2015-03-29 02:20:00" + with pytest.raises(pytz.NonExistentTimeError, match=msg): + ts.tz_localize(tz, nonexistent="raise") + msg = ( + "The nonexistent argument must be one of 'raise', 'NaT', " + "'shift_forward', 'shift_backward' or a timedelta object" + ) + with pytest.raises(ValueError, match=msg): + ts.tz_localize(tz, nonexistent="foo") diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_arithmetic.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_arithmetic.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_arithmetic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_arithmetic.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,8 +4,10 @@ timezone, ) +from dateutil.tz import gettz import numpy as np import pytest +import pytz from pandas._libs.tslibs import ( OutOfBoundsDatetime, @@ -40,17 +42,12 @@ stamp = Timestamp("2017-01-13 00:00:00").as_unit("ns") offset_overflow = 20169940 * offsets.Day(1) - msg = ( - "the add operation between " - r"\<-?\d+ \* Days\> and \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} " - "will overflow" - ) lmsg2 = r"Cannot cast -?20169940 days \+?00:00:00 to unit='ns' without overflow" with pytest.raises(OutOfBoundsTimedelta, match=lmsg2): stamp + offset_overflow - with pytest.raises(OverflowError, match=msg): + with pytest.raises(OutOfBoundsTimedelta, match=lmsg2): offset_overflow + stamp with pytest.raises(OutOfBoundsTimedelta, match=lmsg2): @@ -68,7 +65,7 @@ with pytest.raises(OutOfBoundsTimedelta, match=lmsg3): stamp + offset_overflow - with pytest.raises(OverflowError, match=msg): + with pytest.raises(OutOfBoundsTimedelta, match=lmsg3): offset_overflow + stamp with pytest.raises(OutOfBoundsTimedelta, match=lmsg3): @@ -293,3 +290,45 @@ result = ts1 - ts2 expected = Timedelta(0) assert result == expected + + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + "US/Eastern", + "dateutil/US/Eastern", + ], + ) + def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): + # GH#1389 + + # 4 hours before DST transition + stamp = Timestamp("3/10/2012 22:00", tz=tz) + + result = stamp + timedelta(hours=6) + + # spring forward, + "7" hours + expected = Timestamp("3/11/2012 05:00", tz=tz) + + assert result == expected + + +class SubDatetime(datetime): + pass + + +@pytest.mark.parametrize( + "lh,rh", + [ + (SubDatetime(2000, 1, 1), Timedelta(hours=1)), + (Timedelta(hours=1), SubDatetime(2000, 1, 1)), + ], +) +def test_dt_subclass_add_timedelta(lh, rh): + # GH#25851 + # ensure that subclassed datetime works for + # Timedelta operations + result = lh + rh + expected = SubDatetime(2000, 1, 1, 1) + assert result == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,7 +8,11 @@ import zoneinfo import dateutil.tz -from dateutil.tz import tzutc +from dateutil.tz import ( + gettz, + tzoffset, + tzutc, +) import numpy as np import pytest import pytz @@ -26,28 +30,397 @@ ) -class TestTimestampConstructors: +class 
TestTimestampConstructorUnitKeyword: + @pytest.mark.parametrize("typ", [int, float]) + def test_constructor_int_float_with_YM_unit(self, typ): + # GH#47266 avoid the conversions in cast_from_unit + val = typ(150) + + ts = Timestamp(val, unit="Y") + expected = Timestamp("2120-01-01") + assert ts == expected + + ts = Timestamp(val, unit="M") + expected = Timestamp("1982-07-01") + assert ts == expected + + @pytest.mark.parametrize("typ", [int, float]) + def test_construct_from_int_float_with_unit_out_of_bound_raises(self, typ): + # GH#50870 make sure we get a OutOfBoundsDatetime instead of OverflowError + val = typ(150000000000000) + + msg = f"cannot convert input {val} with the unit 'D'" + with pytest.raises(OutOfBoundsDatetime, match=msg): + Timestamp(val, unit="D") + + def test_constructor_float_not_round_with_YM_unit_raises(self): + # GH#47267 avoid the conversions in cast_from-unit + + msg = "Conversion of non-round float with unit=[MY] is ambiguous" + with pytest.raises(ValueError, match=msg): + Timestamp(150.5, unit="Y") + + with pytest.raises(ValueError, match=msg): + Timestamp(150.5, unit="M") + + @pytest.mark.parametrize( + "value, check_kwargs", + [ + [946688461000000000, {}], + [946688461000000000 / 1000, {"unit": "us"}], + [946688461000000000 / 1_000_000, {"unit": "ms"}], + [946688461000000000 / 1_000_000_000, {"unit": "s"}], + [10957, {"unit": "D", "h": 0}], + [ + (946688461000000000 + 500000) / 1000000000, + {"unit": "s", "us": 499, "ns": 964}, + ], + [ + (946688461000000000 + 500000000) / 1000000000, + {"unit": "s", "us": 500000}, + ], + [(946688461000000000 + 500000) / 1000000, {"unit": "ms", "us": 500}], + [(946688461000000000 + 500000) / 1000, {"unit": "us", "us": 500}], + [(946688461000000000 + 500000000) / 1000000, {"unit": "ms", "us": 500000}], + [946688461000000000 / 1000.0 + 5, {"unit": "us", "us": 5}], + [946688461000000000 / 1000.0 + 5000, {"unit": "us", "us": 5000}], + [946688461000000000 / 1000000.0 + 0.5, {"unit": "ms", "us": 500}], + [946688461000000000 / 1000000.0 + 0.005, {"unit": "ms", "us": 5, "ns": 5}], + [946688461000000000 / 1000000000.0 + 0.5, {"unit": "s", "us": 500000}], + [10957 + 0.5, {"unit": "D", "h": 12}], + ], + ) + def test_construct_with_unit(self, value, check_kwargs): + def check(value, unit=None, h=1, s=1, us=0, ns=0): + stamp = Timestamp(value, unit=unit) + assert stamp.year == 2000 + assert stamp.month == 1 + assert stamp.day == 1 + assert stamp.hour == h + if unit != "D": + assert stamp.minute == 1 + assert stamp.second == s + assert stamp.microsecond == us + else: + assert stamp.minute == 0 + assert stamp.second == 0 + assert stamp.microsecond == 0 + assert stamp.nanosecond == ns + + check(value, **check_kwargs) + + +class TestTimestampConstructorFoldKeyword: + def test_timestamp_constructor_invalid_fold_raise(self): + # Test for GH#25057 + # Valid fold values are only [None, 0, 1] + msg = "Valid values for the fold argument are None, 0, or 1." + with pytest.raises(ValueError, match=msg): + Timestamp(123, fold=2) + + def test_timestamp_constructor_pytz_fold_raise(self): + # Test for GH#25057 + # pytz doesn't support fold. Check that we raise + # if fold is passed with pytz + msg = "pytz timezones do not support fold. Please use dateutil timezones." 
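Aside (illustrative sketch only, not part of the patch): the unit-keyword constructor tests grouped into TestTimestampConstructorUnitKeyword above assert behaviour along these lines, assuming pandas 2.2:

    from pandas import Timestamp
    from pandas.errors import OutOfBoundsDatetime

    print(Timestamp(150, unit="Y"))    # 2120-01-01 00:00:00  (years since the 1970 epoch)
    print(Timestamp(150, unit="M"))    # 1982-07-01 00:00:00  (months since the 1970 epoch)
    print(Timestamp(10957, unit="D"))  # 2000-01-01 00:00:00  (days since the 1970 epoch)

    # Non-round floats are ambiguous for year/month units and are rejected:
    try:
        Timestamp(150.5, unit="Y")
    except ValueError as exc:
        print(exc)  # Conversion of non-round float with unit=Y is ambiguous

    # Inputs too large for the unit raise OutOfBoundsDatetime rather than OverflowError:
    try:
        Timestamp(150000000000000, unit="D")
    except OutOfBoundsDatetime as exc:
        print(exc)  # cannot convert input 150000000000000 with the unit 'D'
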
+ tz = pytz.timezone("Europe/London") + with pytest.raises(ValueError, match=msg): + Timestamp(datetime(2019, 10, 27, 0, 30, 0, 0), tz=tz, fold=0) + + @pytest.mark.parametrize("fold", [0, 1]) + @pytest.mark.parametrize( + "ts_input", + [ + 1572136200000000000, + 1572136200000000000.0, + np.datetime64(1572136200000000000, "ns"), + "2019-10-27 01:30:00+01:00", + datetime(2019, 10, 27, 0, 30, 0, 0, tzinfo=timezone.utc), + ], + ) + def test_timestamp_constructor_fold_conflict(self, ts_input, fold): + # Test for GH#25057 + # Check that we raise on fold conflict + msg = ( + "Cannot pass fold with possibly unambiguous input: int, float, " + "numpy.datetime64, str, or timezone-aware datetime-like. " + "Pass naive datetime-like or build Timestamp from components." + ) + with pytest.raises(ValueError, match=msg): + Timestamp(ts_input=ts_input, fold=fold) + + @pytest.mark.parametrize("tz", ["dateutil/Europe/London", None]) + @pytest.mark.parametrize("fold", [0, 1]) + def test_timestamp_constructor_retain_fold(self, tz, fold): + # Test for GH#25057 + # Check that we retain fold + ts = Timestamp(year=2019, month=10, day=27, hour=1, minute=30, tz=tz, fold=fold) + result = ts.fold + expected = fold + assert result == expected + + try: + _tzs = [ + "dateutil/Europe/London", + zoneinfo.ZoneInfo("Europe/London"), + ] + except zoneinfo.ZoneInfoNotFoundError: + _tzs = ["dateutil/Europe/London"] + + @pytest.mark.parametrize("tz", _tzs) + @pytest.mark.parametrize( + "ts_input,fold_out", + [ + (1572136200000000000, 0), + (1572139800000000000, 1), + ("2019-10-27 01:30:00+01:00", 0), + ("2019-10-27 01:30:00+00:00", 1), + (datetime(2019, 10, 27, 1, 30, 0, 0, fold=0), 0), + (datetime(2019, 10, 27, 1, 30, 0, 0, fold=1), 1), + ], + ) + def test_timestamp_constructor_infer_fold_from_value(self, tz, ts_input, fold_out): + # Test for GH#25057 + # Check that we infer fold correctly based on timestamps since utc + # or strings + ts = Timestamp(ts_input, tz=tz) + result = ts.fold + expected = fold_out + assert result == expected + + @pytest.mark.parametrize("tz", ["dateutil/Europe/London"]) + @pytest.mark.parametrize( + "ts_input,fold,value_out", + [ + (datetime(2019, 10, 27, 1, 30, 0, 0), 0, 1572136200000000), + (datetime(2019, 10, 27, 1, 30, 0, 0), 1, 1572139800000000), + ], + ) + def test_timestamp_constructor_adjust_value_for_fold( + self, tz, ts_input, fold, value_out + ): + # Test for GH#25057 + # Check that we adjust value for fold correctly + # based on timestamps since utc + ts = Timestamp(ts_input, tz=tz, fold=fold) + result = ts._value + expected = value_out + assert result == expected + + +class TestTimestampConstructorPositionalAndKeywordSupport: + def test_constructor_positional(self): + # see GH#10758 + msg = ( + "'NoneType' object cannot be interpreted as an integer" + if PY310 + else "an integer is required" + ) + with pytest.raises(TypeError, match=msg): + Timestamp(2000, 1) + + msg = "month must be in 1..12" + with pytest.raises(ValueError, match=msg): + Timestamp(2000, 0, 1) + with pytest.raises(ValueError, match=msg): + Timestamp(2000, 13, 1) + + msg = "day is out of range for month" + with pytest.raises(ValueError, match=msg): + Timestamp(2000, 1, 0) + with pytest.raises(ValueError, match=msg): + Timestamp(2000, 1, 32) + + # see gh-11630 + assert repr(Timestamp(2015, 11, 12)) == repr(Timestamp("20151112")) + assert repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)) == repr( + Timestamp("2015-11-12 01:02:03.999999") + ) + + def test_constructor_keyword(self): + # GH#10758 + msg = "function missing required 
argument 'day'|Required argument 'day'" + with pytest.raises(TypeError, match=msg): + Timestamp(year=2000, month=1) + + msg = "month must be in 1..12" + with pytest.raises(ValueError, match=msg): + Timestamp(year=2000, month=0, day=1) + with pytest.raises(ValueError, match=msg): + Timestamp(year=2000, month=13, day=1) + + msg = "day is out of range for month" + with pytest.raises(ValueError, match=msg): + Timestamp(year=2000, month=1, day=0) + with pytest.raises(ValueError, match=msg): + Timestamp(year=2000, month=1, day=32) + + assert repr(Timestamp(year=2015, month=11, day=12)) == repr( + Timestamp("20151112") + ) + + assert repr( + Timestamp( + year=2015, + month=11, + day=12, + hour=1, + minute=2, + second=3, + microsecond=999999, + ) + ) == repr(Timestamp("2015-11-12 01:02:03.999999")) + + @pytest.mark.parametrize( + "arg", + [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", + "nanosecond", + ], + ) + def test_invalid_date_kwarg_with_string_input(self, arg): + kwarg = {arg: 1} + msg = "Cannot pass a date attribute keyword argument" + with pytest.raises(ValueError, match=msg): + Timestamp("2010-10-10 12:59:59.999999999", **kwarg) + + @pytest.mark.parametrize("kwargs", [{}, {"year": 2020}, {"year": 2020, "month": 1}]) + def test_constructor_missing_keyword(self, kwargs): + # GH#31200 + + # The exact error message of datetime() depends on its version + msg1 = r"function missing required argument '(year|month|day)' \(pos [123]\)" + msg2 = r"Required argument '(year|month|day)' \(pos [123]\) not found" + msg = "|".join([msg1, msg2]) + + with pytest.raises(TypeError, match=msg): + Timestamp(**kwargs) + + def test_constructor_positional_with_tzinfo(self): + # GH#31929 + ts = Timestamp(2020, 12, 31, tzinfo=timezone.utc) + expected = Timestamp("2020-12-31", tzinfo=timezone.utc) + assert ts == expected + + @pytest.mark.parametrize("kwd", ["nanosecond", "microsecond", "second", "minute"]) + def test_constructor_positional_keyword_mixed_with_tzinfo(self, kwd, request): + # TODO: if we passed microsecond with a keyword we would mess up + # xref GH#45307 + if kwd != "nanosecond": + # nanosecond is keyword-only as of 2.0, others are not + mark = pytest.mark.xfail(reason="GH#45307") + request.applymarker(mark) + + kwargs = {kwd: 4} + ts = Timestamp(2020, 12, 31, tzinfo=timezone.utc, **kwargs) + + td_kwargs = {kwd + "s": 4} + td = Timedelta(**td_kwargs) + expected = Timestamp("2020-12-31", tz=timezone.utc) + td + assert ts == expected + + +class TestTimestampClassMethodConstructors: + # Timestamp constructors other than __new__ + + def test_constructor_strptime(self): + # GH#25016 + # Test support for Timestamp.strptime + fmt = "%Y%m%d-%H%M%S-%f%z" + ts = "20190129-235348-000001+0000" + msg = r"Timestamp.strptime\(\) is not implemented" + with pytest.raises(NotImplementedError, match=msg): + Timestamp.strptime(ts, fmt) + + def test_constructor_fromisocalendar(self): + # GH#30395 + expected_timestamp = Timestamp("2000-01-03 00:00:00") + expected_stdlib = datetime.fromisocalendar(2000, 1, 1) + result = Timestamp.fromisocalendar(2000, 1, 1) + assert result == expected_timestamp + assert result == expected_stdlib + assert isinstance(result, Timestamp) + + def test_constructor_fromordinal(self): + base = datetime(2000, 1, 1) + + ts = Timestamp.fromordinal(base.toordinal()) + assert base == ts + assert base.toordinal() == ts.toordinal() + + ts = Timestamp.fromordinal(base.toordinal(), tz="US/Eastern") + assert Timestamp("2000-01-01", tz="US/Eastern") == ts + assert 
base.toordinal() == ts.toordinal() + + # GH#3042 + dt = datetime(2011, 4, 16, 0, 0) + ts = Timestamp.fromordinal(dt.toordinal()) + assert ts.to_pydatetime() == dt + + # with a tzinfo + stamp = Timestamp("2011-4-16", tz="US/Eastern") + dt_tz = stamp.to_pydatetime() + ts = Timestamp.fromordinal(dt_tz.toordinal(), tz="US/Eastern") + assert ts.to_pydatetime() == dt_tz + + def test_now(self): + # GH#9000 + ts_from_string = Timestamp("now") + ts_from_method = Timestamp.now() + ts_datetime = datetime.now() + + ts_from_string_tz = Timestamp("now", tz="US/Eastern") + ts_from_method_tz = Timestamp.now(tz="US/Eastern") + + # Check that the delta between the times is less than 1s (arbitrarily + # small) + delta = Timedelta(seconds=1) + assert abs(ts_from_method - ts_from_string) < delta + assert abs(ts_datetime - ts_from_method) < delta + assert abs(ts_from_method_tz - ts_from_string_tz) < delta + assert ( + abs( + ts_from_string_tz.tz_localize(None) + - ts_from_method_tz.tz_localize(None) + ) + < delta + ) + + def test_today(self): + ts_from_string = Timestamp("today") + ts_from_method = Timestamp.today() + ts_datetime = datetime.today() + + ts_from_string_tz = Timestamp("today", tz="US/Eastern") + ts_from_method_tz = Timestamp.today(tz="US/Eastern") + + # Check that the delta between the times is less than 1s (arbitrarily + # small) + delta = Timedelta(seconds=1) + assert abs(ts_from_method - ts_from_string) < delta + assert abs(ts_datetime - ts_from_method) < delta + assert abs(ts_from_method_tz - ts_from_string_tz) < delta + assert ( + abs( + ts_from_string_tz.tz_localize(None) + - ts_from_method_tz.tz_localize(None) + ) + < delta + ) + + +class TestTimestampResolutionInference: def test_construct_from_time_unit(self): # GH#54097 only passing a time component, no date ts = Timestamp("01:01:01.111") assert ts.unit == "ms" - def test_weekday_but_no_day_raises(self): - # GH#52659 - msg = "Parsing datetimes with weekday but no day information is not supported" - with pytest.raises(ValueError, match=msg): - Timestamp("2023 Sept Thu") - - def test_construct_from_string_invalid_raises(self): - # dateutil (weirdly) parses "200622-12-31" as - # datetime(2022, 6, 20, 12, 0, tzinfo=tzoffset(None, -111600) - # which besides being mis-parsed, is a tzoffset that will cause - # str(ts) to raise ValueError. Ensure we raise in the constructor - # instead. 
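Aside (illustrative sketch only, not part of the patch): the resolution-inference tests consolidated into TestTimestampResolutionInference in this hunk assert string-dependent units, roughly as sketched here, assuming pandas 2.2:

    from pandas import Timestamp

    # The string form determines the inferred resolution of the result:
    print(Timestamp("01:01:01.111").unit)                        # 'ms'
    print(Timestamp("2016/01/02 03:04:05.001000 UTC").unit)      # 'us'
    print(Timestamp("2020-01-01 00:00+00:00").unit)              # 's'
    print(Timestamp("01-01-2013T00:00:00.000000000+0000").unit)  # 'ns' (trailing zeros kept)
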
- # see test_to_datetime_malformed_raise for analogous to_datetime test - with pytest.raises(ValueError, match="gives an invalid tzoffset"): - Timestamp("200622-12-31") - def test_constructor_str_infer_reso(self): # non-iso8601 path @@ -72,6 +445,50 @@ ts = Timestamp("300 June 1:30:01.300") assert ts.unit == "ms" + # dateutil path -> don't drop trailing zeros + ts = Timestamp("01-01-2013T00:00:00.000000000+0000") + assert ts.unit == "ns" + + ts = Timestamp("2016/01/02 03:04:05.001000 UTC") + assert ts.unit == "us" + + # higher-than-nanosecond -> we drop the trailing bits + ts = Timestamp("01-01-2013T00:00:00.000000002100+0000") + assert ts == Timestamp("01-01-2013T00:00:00.000000002+0000") + assert ts.unit == "ns" + + # GH#56208 minute reso through the ISO8601 path with tz offset + ts = Timestamp("2020-01-01 00:00+00:00") + assert ts.unit == "s" + + ts = Timestamp("2020-01-01 00+00:00") + assert ts.unit == "s" + + @pytest.mark.parametrize("method", ["now", "today"]) + def test_now_today_unit(self, method): + # GH#55879 + ts_from_method = getattr(Timestamp, method)() + ts_from_string = Timestamp(method) + assert ts_from_method.unit == ts_from_string.unit == "us" + + +class TestTimestampConstructors: + def test_weekday_but_no_day_raises(self): + # GH#52659 + msg = "Parsing datetimes with weekday but no day information is not supported" + with pytest.raises(ValueError, match=msg): + Timestamp("2023 Sept Thu") + + def test_construct_from_string_invalid_raises(self): + # dateutil (weirdly) parses "200622-12-31" as + # datetime(2022, 6, 20, 12, 0, tzinfo=tzoffset(None, -111600) + # which besides being mis-parsed, is a tzoffset that will cause + # str(ts) to raise ValueError. Ensure we raise in the constructor + # instead. + # see test_to_datetime_malformed_raise for analogous to_datetime test + with pytest.raises(ValueError, match="gives an invalid tzoffset"): + Timestamp("200622-12-31") + def test_constructor_from_iso8601_str_with_offset_reso(self): # GH#49737 ts = Timestamp("2016-01-01 04:05:06-01:00") @@ -93,38 +510,6 @@ ts = Timestamp(obj) assert ts.unit == "s" - @pytest.mark.parametrize("typ", [int, float]) - def test_construct_from_int_float_with_unit_out_of_bound_raises(self, typ): - # GH#50870 make sure we get a OutOfBoundsDatetime instead of OverflowError - val = typ(150000000000000) - - msg = f"cannot convert input {val} with the unit 'D'" - with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp(val, unit="D") - - @pytest.mark.parametrize("typ", [int, float]) - def test_constructor_int_float_with_YM_unit(self, typ): - # GH#47266 avoid the conversions in cast_from_unit - val = typ(150) - - ts = Timestamp(val, unit="Y") - expected = Timestamp("2120-01-01") - assert ts == expected - - ts = Timestamp(val, unit="M") - expected = Timestamp("1982-07-01") - assert ts == expected - - def test_constructor_float_not_round_with_YM_unit_deprecated(self): - # GH#47267 avoid the conversions in cast_from-unit - - msg = "Conversion of non-round float with unit=[MY] is ambiguous" - with pytest.raises(ValueError, match=msg): - Timestamp(150.5, unit="Y") - - with pytest.raises(ValueError, match=msg): - Timestamp(150.5, unit="M") - def test_constructor_datetime64_with_tz(self): # GH#42288, GH#24559 dt = np.datetime64("1970-01-01 05:00:00") @@ -319,15 +704,6 @@ # interpreted as `year` Timestamp("2012-01-01", "US/Pacific") - def test_constructor_strptime(self): - # GH25016 - # Test support for Timestamp.strptime - fmt = "%Y%m%d-%H%M%S-%f%z" - ts = "20190129-235348-000001+0000" - msg = 
r"Timestamp.strptime\(\) is not implemented" - with pytest.raises(NotImplementedError, match=msg): - Timestamp.strptime(ts, fmt) - def test_constructor_tz_or_tzinfo(self): # GH#17943, GH#17690, GH#5168 stamps = [ @@ -340,113 +716,6 @@ ] assert all(ts == stamps[0] for ts in stamps) - def test_constructor_positional_with_tzinfo(self): - # GH#31929 - ts = Timestamp(2020, 12, 31, tzinfo=timezone.utc) - expected = Timestamp("2020-12-31", tzinfo=timezone.utc) - assert ts == expected - - @pytest.mark.parametrize("kwd", ["nanosecond", "microsecond", "second", "minute"]) - def test_constructor_positional_keyword_mixed_with_tzinfo(self, kwd, request): - # TODO: if we passed microsecond with a keyword we would mess up - # xref GH#45307 - if kwd != "nanosecond": - # nanosecond is keyword-only as of 2.0, others are not - mark = pytest.mark.xfail(reason="GH#45307") - request.node.add_marker(mark) - - kwargs = {kwd: 4} - ts = Timestamp(2020, 12, 31, tzinfo=timezone.utc, **kwargs) - - td_kwargs = {kwd + "s": 4} - td = Timedelta(**td_kwargs) - expected = Timestamp("2020-12-31", tz=timezone.utc) + td - assert ts == expected - - def test_constructor_positional(self): - # see gh-10758 - msg = ( - "'NoneType' object cannot be interpreted as an integer" - if PY310 - else "an integer is required" - ) - with pytest.raises(TypeError, match=msg): - Timestamp(2000, 1) - - msg = "month must be in 1..12" - with pytest.raises(ValueError, match=msg): - Timestamp(2000, 0, 1) - with pytest.raises(ValueError, match=msg): - Timestamp(2000, 13, 1) - - msg = "day is out of range for month" - with pytest.raises(ValueError, match=msg): - Timestamp(2000, 1, 0) - with pytest.raises(ValueError, match=msg): - Timestamp(2000, 1, 32) - - # see gh-11630 - assert repr(Timestamp(2015, 11, 12)) == repr(Timestamp("20151112")) - assert repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)) == repr( - Timestamp("2015-11-12 01:02:03.999999") - ) - - def test_constructor_keyword(self): - # GH 10758 - msg = "function missing required argument 'day'|Required argument 'day'" - with pytest.raises(TypeError, match=msg): - Timestamp(year=2000, month=1) - - msg = "month must be in 1..12" - with pytest.raises(ValueError, match=msg): - Timestamp(year=2000, month=0, day=1) - with pytest.raises(ValueError, match=msg): - Timestamp(year=2000, month=13, day=1) - - msg = "day is out of range for month" - with pytest.raises(ValueError, match=msg): - Timestamp(year=2000, month=1, day=0) - with pytest.raises(ValueError, match=msg): - Timestamp(year=2000, month=1, day=32) - - assert repr(Timestamp(year=2015, month=11, day=12)) == repr( - Timestamp("20151112") - ) - - assert repr( - Timestamp( - year=2015, - month=11, - day=12, - hour=1, - minute=2, - second=3, - microsecond=999999, - ) - ) == repr(Timestamp("2015-11-12 01:02:03.999999")) - - def test_constructor_fromordinal(self): - base = datetime(2000, 1, 1) - - ts = Timestamp.fromordinal(base.toordinal()) - assert base == ts - assert base.toordinal() == ts.toordinal() - - ts = Timestamp.fromordinal(base.toordinal(), tz="US/Eastern") - assert Timestamp("2000-01-01", tz="US/Eastern") == ts - assert base.toordinal() == ts.toordinal() - - # GH#3042 - dt = datetime(2011, 4, 16, 0, 0) - ts = Timestamp.fromordinal(dt.toordinal()) - assert ts.to_pydatetime() == dt - - # with a tzinfo - stamp = Timestamp("2011-4-16", tz="US/Eastern") - dt_tz = stamp.to_pydatetime() - ts = Timestamp.fromordinal(dt_tz.toordinal(), tz="US/Eastern") - assert ts.to_pydatetime() == dt_tz - @pytest.mark.parametrize( "result", [ @@ -490,25 +759,6 @@ 
with pytest.raises(ValueError, match=msg): Timestamp(f"2014-11-02 01:00{z}") - @pytest.mark.parametrize( - "arg", - [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", - ], - ) - def test_invalid_date_kwarg_with_string_input(self, arg): - kwarg = {arg: 1} - msg = "Cannot pass a date attribute keyword argument" - with pytest.raises(ValueError, match=msg): - Timestamp("2010-10-10 12:59:59.999999999", **kwarg) - def test_out_of_bounds_integer_value(self): # GH#26651 check that we raise OutOfBoundsDatetime, not OverflowError msg = str(Timestamp.max._value * 2) @@ -572,6 +822,7 @@ with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp("2262-04-11 23:47:16.854775808") + @pytest.mark.skip_ubsan def test_bounds_with_different_units(self): out_of_bounds_dates = ("1677-09-21", "2262-04-12") @@ -590,7 +841,7 @@ # With more extreme cases, we can't even fit inside second resolution info = np.iinfo(np.int64) - msg = "Out of bounds nanosecond timestamp:" + msg = "Out of bounds second timestamp:" for value in [info.min + 1, info.max]: for unit in ["D", "h", "m"]: dt64 = np.datetime64(value, unit) @@ -623,59 +874,6 @@ # Ensure that Timestamp.max is a valid Timestamp Timestamp(Timestamp.max) - def test_now(self): - # GH#9000 - ts_from_string = Timestamp("now") - ts_from_method = Timestamp.now() - ts_datetime = datetime.now() - - ts_from_string_tz = Timestamp("now", tz="US/Eastern") - ts_from_method_tz = Timestamp.now(tz="US/Eastern") - - # Check that the delta between the times is less than 1s (arbitrarily - # small) - delta = Timedelta(seconds=1) - assert abs(ts_from_method - ts_from_string) < delta - assert abs(ts_datetime - ts_from_method) < delta - assert abs(ts_from_method_tz - ts_from_string_tz) < delta - assert ( - abs( - ts_from_string_tz.tz_localize(None) - - ts_from_method_tz.tz_localize(None) - ) - < delta - ) - - def test_today(self): - ts_from_string = Timestamp("today") - ts_from_method = Timestamp.today() - ts_datetime = datetime.today() - - ts_from_string_tz = Timestamp("today", tz="US/Eastern") - ts_from_method_tz = Timestamp.today(tz="US/Eastern") - - # Check that the delta between the times is less than 1s (arbitrarily - # small) - delta = Timedelta(seconds=1) - assert abs(ts_from_method - ts_from_string) < delta - assert abs(ts_datetime - ts_from_method) < delta - assert abs(ts_from_method_tz - ts_from_string_tz) < delta - assert ( - abs( - ts_from_string_tz.tz_localize(None) - - ts_from_method_tz.tz_localize(None) - ) - < delta - ) - - @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")]) - def test_disallow_setting_tz(self, tz): - # GH 3746 - ts = Timestamp("2010") - msg = "Cannot directly set timezone" - with pytest.raises(AttributeError, match=msg): - ts.tz = tz - @pytest.mark.parametrize("offset", ["+0300", "+0200"]) def test_construct_timestamp_near_dst(self, offset): # GH 20854 @@ -720,14 +918,88 @@ expected = Timestamp(2000, 1, 1) assert result == expected - def test_constructor_fromisocalendar(self): - # GH 30395 - expected_timestamp = Timestamp("2000-01-03 00:00:00") - expected_stdlib = datetime.fromisocalendar(2000, 1, 1) - result = Timestamp.fromisocalendar(2000, 1, 1) - assert result == expected_timestamp - assert result == expected_stdlib - assert isinstance(result, Timestamp) + def test_timestamp_constructor_tz_utc(self): + utc_stamp = Timestamp("3/11/2012 05:00", tz="utc") + assert utc_stamp.tzinfo is timezone.utc + assert utc_stamp.hour == 5 + + utc_stamp = Timestamp("3/11/2012 05:00").tz_localize("utc") 
+ assert utc_stamp.hour == 5 + + def test_timestamp_to_datetime_tzoffset(self): + tzinfo = tzoffset(None, 7200) + expected = Timestamp("3/11/2012 04:00", tz=tzinfo) + result = Timestamp(expected.to_pydatetime()) + assert expected == result + + def test_timestamp_constructor_near_dst_boundary(self): + # GH#11481 & GH#15777 + # Naive string timestamps were being localized incorrectly + # with tz_convert_from_utc_single instead of tz_localize_to_utc + + for tz in ["Europe/Brussels", "Europe/Prague"]: + result = Timestamp("2015-10-25 01:00", tz=tz) + expected = Timestamp("2015-10-25 01:00").tz_localize(tz) + assert result == expected + + msg = "Cannot infer dst time from 2015-10-25 02:00:00" + with pytest.raises(pytz.AmbiguousTimeError, match=msg): + Timestamp("2015-10-25 02:00", tz=tz) + + result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 01:00").tz_localize("Europe/Paris") + assert result == expected + + msg = "2017-03-26 02:00" + with pytest.raises(pytz.NonExistentTimeError, match=msg): + Timestamp("2017-03-26 02:00", tz="Europe/Paris") + + # GH#11708 + naive = Timestamp("2015-11-18 10:00:00") + result = naive.tz_localize("UTC").tz_convert("Asia/Kolkata") + expected = Timestamp("2015-11-18 15:30:00+0530", tz="Asia/Kolkata") + assert result == expected + + # GH#15823 + result = Timestamp("2017-03-26 00:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 00:00:00+0100", tz="Europe/Paris") + assert result == expected + + result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 01:00:00+0100", tz="Europe/Paris") + assert result == expected + + msg = "2017-03-26 02:00" + with pytest.raises(pytz.NonExistentTimeError, match=msg): + Timestamp("2017-03-26 02:00", tz="Europe/Paris") + + result = Timestamp("2017-03-26 02:00:00+0100", tz="Europe/Paris") + naive = Timestamp(result.as_unit("ns")._value) + expected = naive.tz_localize("UTC").tz_convert("Europe/Paris") + assert result == expected + + result = Timestamp("2017-03-26 03:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 03:00:00+0200", tz="Europe/Paris") + assert result == expected + + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + "US/Eastern", + "dateutil/US/Eastern", + ], + ) + def test_timestamp_constructed_by_date_and_tz(self, tz): + # GH#2993, Timestamp cannot be constructed by datetime.date + # and tz correctly + + result = Timestamp(date(2012, 3, 11), tz=tz) + + expected = Timestamp("3/11/2012", tz=tz) + assert result.hour == expected.hour + assert result == expected def test_constructor_ambiguous_dst(): @@ -761,19 +1033,6 @@ assert result is expected -@pytest.mark.parametrize("kwargs", [{}, {"year": 2020}, {"year": 2020, "month": 1}]) -def test_constructor_missing_keyword(kwargs): - # GH 31200 - - # The exact error message of datetime() depends on its version - msg1 = r"function missing required argument '(year|month|day)' \(pos [123]\)" - msg2 = r"Required argument '(year|month|day)' \(pos [123]\) not found" - msg = "|".join([msg1, msg2]) - - with pytest.raises(TypeError, match=msg): - Timestamp(**kwargs) - - @pytest.mark.parametrize("nano", [-1, 1000]) def test_timestamp_nano_range(nano): # GH 48255 @@ -801,107 +1060,6 @@ assert result == -52700112000 -def test_timestamp_constructor_invalid_fold_raise(): - # Test forGH #25057 - # Valid fold values are only [None, 0, 1] - msg = "Valid values for the fold argument are None, 0, or 1." 
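Aside (illustrative sketch only, not part of the patch): a short sketch of the constructor behaviour near DST transitions and with a datetime.date plus tz that the tests above cover, assuming pandas 2.2:

    from datetime import date

    import pytz

    from pandas import Timestamp

    # A datetime.date plus tz gives localized midnight in that zone (GH#2993):
    print(Timestamp(date(2012, 3, 11), tz="US/Eastern"))     # 2012-03-11 00:00:00-05:00

    # Naive strings passed with tz= are localized, so a nonexistent wall time raises:
    print(Timestamp("2017-03-26 01:00", tz="Europe/Paris"))  # 2017-03-26 01:00:00+01:00
    try:
        Timestamp("2017-03-26 02:00", tz="Europe/Paris")     # inside the spring-forward gap
    except pytz.NonExistentTimeError as exc:
        print(exc)
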
- with pytest.raises(ValueError, match=msg): - Timestamp(123, fold=2) - - -def test_timestamp_constructor_pytz_fold_raise(): - # Test for GH#25057 - # pytz doesn't support fold. Check that we raise - # if fold is passed with pytz - msg = "pytz timezones do not support fold. Please use dateutil timezones." - tz = pytz.timezone("Europe/London") - with pytest.raises(ValueError, match=msg): - Timestamp(datetime(2019, 10, 27, 0, 30, 0, 0), tz=tz, fold=0) - - -@pytest.mark.parametrize("fold", [0, 1]) -@pytest.mark.parametrize( - "ts_input", - [ - 1572136200000000000, - 1572136200000000000.0, - np.datetime64(1572136200000000000, "ns"), - "2019-10-27 01:30:00+01:00", - datetime(2019, 10, 27, 0, 30, 0, 0, tzinfo=timezone.utc), - ], -) -def test_timestamp_constructor_fold_conflict(ts_input, fold): - # Test for GH#25057 - # Check that we raise on fold conflict - msg = ( - "Cannot pass fold with possibly unambiguous input: int, float, " - "numpy.datetime64, str, or timezone-aware datetime-like. " - "Pass naive datetime-like or build Timestamp from components." - ) - with pytest.raises(ValueError, match=msg): - Timestamp(ts_input=ts_input, fold=fold) - - -@pytest.mark.parametrize("tz", ["dateutil/Europe/London", None]) -@pytest.mark.parametrize("fold", [0, 1]) -def test_timestamp_constructor_retain_fold(tz, fold): - # Test for GH#25057 - # Check that we retain fold - ts = Timestamp(year=2019, month=10, day=27, hour=1, minute=30, tz=tz, fold=fold) - result = ts.fold - expected = fold - assert result == expected - - -try: - _tzs = [ - "dateutil/Europe/London", - zoneinfo.ZoneInfo("Europe/London"), - ] -except zoneinfo.ZoneInfoNotFoundError: - _tzs = ["dateutil/Europe/London"] - - -@pytest.mark.parametrize("tz", _tzs) -@pytest.mark.parametrize( - "ts_input,fold_out", - [ - (1572136200000000000, 0), - (1572139800000000000, 1), - ("2019-10-27 01:30:00+01:00", 0), - ("2019-10-27 01:30:00+00:00", 1), - (datetime(2019, 10, 27, 1, 30, 0, 0, fold=0), 0), - (datetime(2019, 10, 27, 1, 30, 0, 0, fold=1), 1), - ], -) -def test_timestamp_constructor_infer_fold_from_value(tz, ts_input, fold_out): - # Test for GH#25057 - # Check that we infer fold correctly based on timestamps since utc - # or strings - ts = Timestamp(ts_input, tz=tz) - result = ts.fold - expected = fold_out - assert result == expected - - -@pytest.mark.parametrize("tz", ["dateutil/Europe/London"]) -@pytest.mark.parametrize( - "ts_input,fold,value_out", - [ - (datetime(2019, 10, 27, 1, 30, 0, 0), 0, 1572136200000000), - (datetime(2019, 10, 27, 1, 30, 0, 0), 1, 1572139800000000), - ], -) -def test_timestamp_constructor_adjust_value_for_fold(tz, ts_input, fold, value_out): - # Test for GH#25057 - # Check that we adjust value for fold correctly - # based on timestamps since utc - ts = Timestamp(ts_input, tz=tz, fold=fold) - result = ts._value - expected = value_out - assert result == expected - - @pytest.mark.parametrize("na_value", [None, np.nan, np.datetime64("NaT"), NaT, NA]) def test_timestamp_constructor_na_value(na_value): # GH45481 diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_formats.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_formats.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_formats.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_formats.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,9 @@ +from datetime import datetime +import pprint + +import dateutil.tz import pytest +import pytz # a test below uses pytz but only inside a `eval` call from pandas 
import Timestamp @@ -80,3 +85,117 @@ ) def test_isoformat(ts, timespec, expected_iso): assert ts.isoformat(timespec=timespec) == expected_iso + + +class TestTimestampRendering: + timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/America/Los_Angeles"] + + @pytest.mark.parametrize("tz", timezones) + @pytest.mark.parametrize("freq", ["D", "M", "S", "N"]) + @pytest.mark.parametrize( + "date", ["2014-03-07", "2014-01-01 09:00", "2014-01-01 00:00:00.000000001"] + ) + def test_repr(self, date, freq, tz): + # avoid to match with timezone name + freq_repr = f"'{freq}'" + if tz.startswith("dateutil"): + tz_repr = tz.replace("dateutil", "") + else: + tz_repr = tz + + date_only = Timestamp(date) + assert date in repr(date_only) + assert tz_repr not in repr(date_only) + assert freq_repr not in repr(date_only) + assert date_only == eval(repr(date_only)) + + date_tz = Timestamp(date, tz=tz) + assert date in repr(date_tz) + assert tz_repr in repr(date_tz) + assert freq_repr not in repr(date_tz) + assert date_tz == eval(repr(date_tz)) + + def test_repr_utcoffset(self): + # This can cause the tz field to be populated, but it's redundant to + # include this information in the date-string. + date_with_utc_offset = Timestamp("2014-03-13 00:00:00-0400", tz=None) + assert "2014-03-13 00:00:00-0400" in repr(date_with_utc_offset) + assert "tzoffset" not in repr(date_with_utc_offset) + assert "UTC-04:00" in repr(date_with_utc_offset) + expr = repr(date_with_utc_offset) + assert date_with_utc_offset == eval(expr) + + def test_timestamp_repr_pre1900(self): + # pre-1900 + stamp = Timestamp("1850-01-01", tz="US/Eastern") + repr(stamp) + + iso8601 = "1850-01-01 01:23:45.012345" + stamp = Timestamp(iso8601, tz="US/Eastern") + result = repr(stamp) + assert iso8601 in result + + def test_pprint(self): + # GH#12622 + nested_obj = {"foo": 1, "bar": [{"w": {"a": Timestamp("2011-01-01")}}] * 10} + result = pprint.pformat(nested_obj, width=50) + expected = r"""{'bar': [{'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, + {'w': {'a': Timestamp('2011-01-01 00:00:00')}}], + 'foo': 1}""" + assert result == expected + + def test_to_timestamp_repr_is_code(self): + zs = [ + Timestamp("99-04-17 00:00:00", tz="UTC"), + Timestamp("2001-04-17 00:00:00", tz="UTC"), + Timestamp("2001-04-17 00:00:00", tz="America/Los_Angeles"), + Timestamp("2001-04-17 00:00:00", tz=None), + ] + for z in zs: + assert eval(repr(z)) == z + + def test_repr_matches_pydatetime_no_tz(self): + dt_date = datetime(2013, 1, 2) + assert str(dt_date) == str(Timestamp(dt_date)) + + dt_datetime = datetime(2013, 1, 2, 12, 1, 3) + assert str(dt_datetime) == str(Timestamp(dt_datetime)) + + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45) + assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) + + ts_nanos_only = Timestamp(200) + assert str(ts_nanos_only) == "1970-01-01 00:00:00.000000200" + + ts_nanos_micros = Timestamp(1200) + assert str(ts_nanos_micros) == "1970-01-01 00:00:00.000001200" + + def test_repr_matches_pydatetime_tz_pytz(self): + dt_date = datetime(2013, 1, 2, tzinfo=pytz.utc) + assert str(dt_date) == str(Timestamp(dt_date)) + + dt_datetime = 
datetime(2013, 1, 2, 12, 1, 3, tzinfo=pytz.utc) + assert str(dt_datetime) == str(Timestamp(dt_datetime)) + + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=pytz.utc) + assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) + + def test_repr_matches_pydatetime_tz_dateutil(self): + utc = dateutil.tz.tzutc() + + dt_date = datetime(2013, 1, 2, tzinfo=utc) + assert str(dt_date) == str(Timestamp(dt_date)) + + dt_datetime = datetime(2013, 1, 2, 12, 1, 3, tzinfo=utc) + assert str(dt_datetime) == str(Timestamp(dt_datetime)) + + dt_datetime_us = datetime(2013, 1, 2, 12, 1, 3, 45, tzinfo=utc) + assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_rendering.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_rendering.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_rendering.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_rendering.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,82 +0,0 @@ -import pprint - -import pytest -import pytz # noqa: F401 # a test below uses pytz but only inside a `eval` call - -from pandas import Timestamp - - -class TestTimestampRendering: - timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] - - @pytest.mark.parametrize("tz", timezones) - @pytest.mark.parametrize("freq", ["D", "M", "S", "N"]) - @pytest.mark.parametrize( - "date", ["2014-03-07", "2014-01-01 09:00", "2014-01-01 00:00:00.000000001"] - ) - def test_repr(self, date, freq, tz): - # avoid to match with timezone name - freq_repr = f"'{freq}'" - if tz.startswith("dateutil"): - tz_repr = tz.replace("dateutil", "") - else: - tz_repr = tz - - date_only = Timestamp(date) - assert date in repr(date_only) - assert tz_repr not in repr(date_only) - assert freq_repr not in repr(date_only) - assert date_only == eval(repr(date_only)) - - date_tz = Timestamp(date, tz=tz) - assert date in repr(date_tz) - assert tz_repr in repr(date_tz) - assert freq_repr not in repr(date_tz) - assert date_tz == eval(repr(date_tz)) - - def test_repr_utcoffset(self): - # This can cause the tz field to be populated, but it's redundant to - # include this information in the date-string. 
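Aside (illustrative sketch only, not part of the patch): the rendering tests moved into test_formats.py assert that repr() round-trips through eval() and that str() matches the stdlib datetime; a minimal sketch, assuming pandas 2.2:

    from datetime import datetime

    from pandas import Timestamp

    ts = Timestamp("2014-01-01 09:00", tz="Asia/Tokyo")
    assert eval(repr(ts)) == ts  # the repr is valid constructor code
    print(repr(ts))              # e.g. Timestamp('2014-01-01 09:00:00+0900', tz='Asia/Tokyo')

    # str() of a tz-naive Timestamp matches the stdlib datetime rendering:
    assert str(Timestamp(datetime(2013, 1, 2, 12, 1, 3))) == str(datetime(2013, 1, 2, 12, 1, 3))
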
- date_with_utc_offset = Timestamp("2014-03-13 00:00:00-0400", tz=None) - assert "2014-03-13 00:00:00-0400" in repr(date_with_utc_offset) - assert "tzoffset" not in repr(date_with_utc_offset) - assert "UTC-04:00" in repr(date_with_utc_offset) - expr = repr(date_with_utc_offset) - assert date_with_utc_offset == eval(expr) - - def test_timestamp_repr_pre1900(self): - # pre-1900 - stamp = Timestamp("1850-01-01", tz="US/Eastern") - repr(stamp) - - iso8601 = "1850-01-01 01:23:45.012345" - stamp = Timestamp(iso8601, tz="US/Eastern") - result = repr(stamp) - assert iso8601 in result - - def test_pprint(self): - # GH#12622 - nested_obj = {"foo": 1, "bar": [{"w": {"a": Timestamp("2011-01-01")}}] * 10} - result = pprint.pformat(nested_obj, width=50) - expected = r"""{'bar': [{'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, - {'w': {'a': Timestamp('2011-01-01 00:00:00')}}], - 'foo': 1}""" - assert result == expected - - def test_to_timestamp_repr_is_code(self): - zs = [ - Timestamp("99-04-17 00:00:00", tz="UTC"), - Timestamp("2001-04-17 00:00:00", tz="UTC"), - Timestamp("2001-04-17 00:00:00", tz="America/Los_Angeles"), - Timestamp("2001-04-17 00:00:00", tz=None), - ] - for z in zs: - assert eval(repr(z)) == z diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_timestamp.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_timestamp.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_timestamp.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_timestamp.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,8 +31,6 @@ tz_compare, ) from pandas.compat import IS64 -from pandas.errors import OutOfBoundsDatetime -import pandas.util._test_decorators as td from pandas import ( NaT, @@ -262,6 +260,14 @@ class TestTimestamp: + @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")]) + def test_disallow_setting_tz(self, tz): + # GH#3746 + ts = Timestamp("2010") + msg = "Cannot directly set timezone" + with pytest.raises(AttributeError, match=msg): + ts.tz = tz + def test_default_to_stdlib_utc(self): assert Timestamp.utcnow().tz is timezone.utc assert Timestamp.now("UTC").tz is timezone.utc @@ -299,12 +305,13 @@ assert Timestamp("nat").asm8.view("i8") == np.datetime64("nat", "ns").view("i8") - def test_class_ops_pytz(self): + def test_class_ops(self): def compare(x, y): assert int((Timestamp(x)._value - Timestamp(y)._value) / 1e9) == 0 compare(Timestamp.now(), datetime.now()) compare(Timestamp.now("UTC"), datetime.now(pytz.timezone("UTC"))) + compare(Timestamp.now("UTC"), datetime.now(tzutc())) compare(Timestamp.utcnow(), datetime.now(timezone.utc)) compare(Timestamp.today(), datetime.today()) current_time = calendar.timegm(datetime.now().utctimetuple()) @@ -332,36 +339,6 @@ datetime.combine(date_component, time_component), ) - def test_class_ops_dateutil(self): - def compare(x, y): - assert ( - int( - np.round(Timestamp(x)._value / 1e9) - - np.round(Timestamp(y)._value / 1e9) - ) - == 0 - ) - - compare(Timestamp.now(), datetime.now()) - compare(Timestamp.now("UTC"), datetime.now(tzutc())) - compare(Timestamp.utcnow(), datetime.now(timezone.utc)) - 
compare(Timestamp.today(), datetime.today()) - current_time = calendar.timegm(datetime.now().utctimetuple()) - - ts_utc = Timestamp.utcfromtimestamp(current_time) - assert ts_utc.timestamp() == current_time - - compare( - Timestamp.fromtimestamp(current_time), datetime.fromtimestamp(current_time) - ) - - date_component = datetime.now(timezone.utc) - time_component = (date_component + timedelta(minutes=10)).time() - compare( - Timestamp.combine(date_component, time_component), - datetime.combine(date_component, time_component), - ) - def test_basics_nanos(self): val = np.int64(946_684_800_000_000_000).view("M8[ns]") stamp = Timestamp(val.view("i8") + 500) @@ -379,52 +356,6 @@ assert stamp.microsecond == 145224 assert stamp.nanosecond == 192 - @pytest.mark.parametrize( - "value, check_kwargs", - [ - [946688461000000000, {}], - [946688461000000000 / 1000, {"unit": "us"}], - [946688461000000000 / 1_000_000, {"unit": "ms"}], - [946688461000000000 / 1_000_000_000, {"unit": "s"}], - [10957, {"unit": "D", "h": 0}], - [ - (946688461000000000 + 500000) / 1000000000, - {"unit": "s", "us": 499, "ns": 964}, - ], - [ - (946688461000000000 + 500000000) / 1000000000, - {"unit": "s", "us": 500000}, - ], - [(946688461000000000 + 500000) / 1000000, {"unit": "ms", "us": 500}], - [(946688461000000000 + 500000) / 1000, {"unit": "us", "us": 500}], - [(946688461000000000 + 500000000) / 1000000, {"unit": "ms", "us": 500000}], - [946688461000000000 / 1000.0 + 5, {"unit": "us", "us": 5}], - [946688461000000000 / 1000.0 + 5000, {"unit": "us", "us": 5000}], - [946688461000000000 / 1000000.0 + 0.5, {"unit": "ms", "us": 500}], - [946688461000000000 / 1000000.0 + 0.005, {"unit": "ms", "us": 5, "ns": 5}], - [946688461000000000 / 1000000000.0 + 0.5, {"unit": "s", "us": 500000}], - [10957 + 0.5, {"unit": "D", "h": 12}], - ], - ) - def test_unit(self, value, check_kwargs): - def check(value, unit=None, h=1, s=1, us=0, ns=0): - stamp = Timestamp(value, unit=unit) - assert stamp.year == 2000 - assert stamp.month == 1 - assert stamp.day == 1 - assert stamp.hour == h - if unit != "D": - assert stamp.minute == 1 - assert stamp.second == s - assert stamp.microsecond == us - else: - assert stamp.minute == 0 - assert stamp.second == 0 - assert stamp.microsecond == 0 - assert stamp.nanosecond == ns - - check(value, **check_kwargs) - def test_roundtrip(self): # test value to string and back conversions # further test accessors @@ -545,28 +476,6 @@ assert t.nanosecond == 10 -class TestTimestampToJulianDate: - def test_compare_1700(self): - r = Timestamp("1700-06-23").to_julian_date() - assert r == 2_342_145.5 - - def test_compare_2000(self): - r = Timestamp("2000-04-12").to_julian_date() - assert r == 2_451_646.5 - - def test_compare_2100(self): - r = Timestamp("2100-08-12").to_julian_date() - assert r == 2_488_292.5 - - def test_compare_hour01(self): - r = Timestamp("2000-08-12T01:00:00").to_julian_date() - assert r == 2_451_768.5416666666666666 - - def test_compare_hour13(self): - r = Timestamp("2000-08-12T13:00:00").to_julian_date() - assert r == 2_451_769.0416666666666666 - - class TestTimestampConversion: def test_conversion(self): # GH#9255 @@ -583,73 +492,6 @@ assert type(result) == type(expected) assert result.dtype == expected.dtype - def test_to_pydatetime_fold(self): - # GH#45087 - tzstr = "dateutil/usr/share/zoneinfo/America/Chicago" - ts = Timestamp(year=2013, month=11, day=3, hour=1, minute=0, fold=1, tz=tzstr) - dt = ts.to_pydatetime() - assert dt.fold == 1 - - def test_to_pydatetime_nonzero_nano(self): - ts = 
Timestamp("2011-01-01 9:00:00.123456789") - - # Warn the user of data loss (nanoseconds). - with tm.assert_produces_warning(UserWarning): - expected = datetime(2011, 1, 1, 9, 0, 0, 123456) - result = ts.to_pydatetime() - assert result == expected - - def test_timestamp_to_datetime(self): - stamp = Timestamp("20090415", tz="US/Eastern") - dtval = stamp.to_pydatetime() - assert stamp == dtval - assert stamp.tzinfo == dtval.tzinfo - - def test_timestamp_to_datetime_dateutil(self): - stamp = Timestamp("20090415", tz="dateutil/US/Eastern") - dtval = stamp.to_pydatetime() - assert stamp == dtval - assert stamp.tzinfo == dtval.tzinfo - - def test_timestamp_to_datetime_explicit_pytz(self): - stamp = Timestamp("20090415", tz=pytz.timezone("US/Eastern")) - dtval = stamp.to_pydatetime() - assert stamp == dtval - assert stamp.tzinfo == dtval.tzinfo - - @td.skip_if_windows - def test_timestamp_to_datetime_explicit_dateutil(self): - stamp = Timestamp("20090415", tz=gettz("US/Eastern")) - dtval = stamp.to_pydatetime() - assert stamp == dtval - assert stamp.tzinfo == dtval.tzinfo - - def test_to_datetime_bijective(self): - # Ensure that converting to datetime and back only loses precision - # by going from nanoseconds to microseconds. - exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning - with tm.assert_produces_warning(exp_warning): - pydt_max = Timestamp.max.to_pydatetime() - - assert ( - Timestamp(pydt_max).as_unit("ns")._value / 1000 - == Timestamp.max._value / 1000 - ) - - exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning - with tm.assert_produces_warning(exp_warning): - pydt_min = Timestamp.min.to_pydatetime() - - # The next assertion can be enabled once GH#39221 is merged - # assert pydt_min < Timestamp.min # this is bc nanos are dropped - tdus = timedelta(microseconds=1) - assert pydt_min + tdus > Timestamp.min - - assert ( - Timestamp(pydt_min + tdus).as_unit("ns")._value / 1000 - == Timestamp.min._value / 1000 - ) - def test_to_period_tz_warning(self): # GH#21333 make sure a warning is issued when timezone # info is lost @@ -671,26 +513,6 @@ ts.to_numpy(copy=True) -class SubDatetime(datetime): - pass - - -@pytest.mark.parametrize( - "lh,rh", - [ - (SubDatetime(2000, 1, 1), Timedelta(hours=1)), - (Timedelta(hours=1), SubDatetime(2000, 1, 1)), - ], -) -def test_dt_subclass_add_timedelta(lh, rh): - # GH#25851 - # ensure that subclassed datetime works for - # Timedelta operations - result = lh + rh - expected = SubDatetime(2000, 1, 1, 1) - assert result == expected - - class TestNonNano: @pytest.fixture(params=["s", "ms", "us"]) def reso(self, request): @@ -1060,86 +882,6 @@ assert Timestamp.resolution._creso == NpyDatetimeUnit.NPY_FR_ns.value -class TestAsUnit: - def test_as_unit(self): - ts = Timestamp("1970-01-01").as_unit("ns") - assert ts.unit == "ns" - - assert ts.as_unit("ns") is ts - - res = ts.as_unit("us") - assert res._value == ts._value // 1000 - assert res._creso == NpyDatetimeUnit.NPY_FR_us.value - - rt = res.as_unit("ns") - assert rt._value == ts._value - assert rt._creso == ts._creso - - res = ts.as_unit("ms") - assert res._value == ts._value // 1_000_000 - assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value - - rt = res.as_unit("ns") - assert rt._value == ts._value - assert rt._creso == ts._creso - - res = ts.as_unit("s") - assert res._value == ts._value // 1_000_000_000 - assert res._creso == NpyDatetimeUnit.NPY_FR_s.value - - rt = res.as_unit("ns") - assert rt._value == ts._value - assert rt._creso == ts._creso - - def 
test_as_unit_overflows(self): - # microsecond that would be just out of bounds for nano - us = 9223372800000000 - ts = Timestamp._from_value_and_reso(us, NpyDatetimeUnit.NPY_FR_us.value, None) - - msg = "Cannot cast 2262-04-12 00:00:00 to unit='ns' without overflow" - with pytest.raises(OutOfBoundsDatetime, match=msg): - ts.as_unit("ns") - - res = ts.as_unit("ms") - assert res._value == us // 1000 - assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value - - def test_as_unit_rounding(self): - ts = Timestamp(1_500_000) # i.e. 1500 microseconds - res = ts.as_unit("ms") - - expected = Timestamp(1_000_000) # i.e. 1 millisecond - assert res == expected - - assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value - assert res._value == 1 - - with pytest.raises(ValueError, match="Cannot losslessly convert units"): - ts.as_unit("ms", round_ok=False) - - def test_as_unit_non_nano(self): - # case where we are going neither to nor from nano - ts = Timestamp("1970-01-02").as_unit("ms") - assert ts.year == 1970 - assert ts.month == 1 - assert ts.day == 2 - assert ts.hour == ts.minute == ts.second == ts.microsecond == ts.nanosecond == 0 - - res = ts.as_unit("s") - assert res._value == 24 * 3600 - assert res.year == 1970 - assert res.month == 1 - assert res.day == 2 - assert ( - res.hour - == res.minute - == res.second - == res.microsecond - == res.nanosecond - == 0 - ) - - def test_delimited_date(): # https://github.com/pandas-dev/pandas/issues/50231 with tm.assert_produces_warning(None): diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_timezones.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_timezones.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_timezones.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_timezones.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,485 +1,15 @@ """ Tests for Timestamp timezone-related methods """ -from datetime import ( - date, - datetime, - timedelta, - timezone, -) -import re - -import dateutil -from dateutil.tz import ( - gettz, - tzoffset, -) -import pytest -import pytz -from pytz.exceptions import ( - AmbiguousTimeError, - NonExistentTimeError, -) +from datetime import datetime from pandas._libs.tslibs import timezones -from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas.errors import OutOfBoundsDatetime -import pandas.util._test_decorators as td - -from pandas import ( - NaT, - Timestamp, -) -try: - from zoneinfo import ZoneInfo -except ImportError: - # Cannot assign to a type - ZoneInfo = None # type: ignore[misc, assignment] +from pandas import Timestamp class TestTimestampTZOperations: - # -------------------------------------------------------------- - # Timestamp.tz_localize - - def test_tz_localize_pushes_out_of_bounds(self): - # GH#12677 - # tz_localize that pushes away from the boundary is OK - msg = ( - f"Converting {Timestamp.min.strftime('%Y-%m-%d %H:%M:%S')} " - f"underflows past {Timestamp.min}" - ) - pac = Timestamp.min.tz_localize("US/Pacific") - assert pac._value > Timestamp.min._value - pac.tz_convert("Asia/Tokyo") # tz_convert doesn't change value - with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp.min.tz_localize("Asia/Tokyo") - - # tz_localize that pushes away from the boundary is OK - msg = ( - f"Converting {Timestamp.max.strftime('%Y-%m-%d %H:%M:%S')} " - f"overflows past {Timestamp.max}" - ) - tokyo = Timestamp.max.tz_localize("Asia/Tokyo") - assert tokyo._value < Timestamp.max._value - tokyo.tz_convert("US/Pacific") # tz_convert doesn't 
change value - with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp.max.tz_localize("US/Pacific") - - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) - def test_tz_localize_ambiguous_bool(self, unit): - # make sure that we are correctly accepting bool values as ambiguous - # GH#14402 - ts = Timestamp("2015-11-01 01:00:03").as_unit(unit) - expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") - expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") - - msg = "Cannot infer dst time from 2015-11-01 01:00:03" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize("US/Central") - - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize("dateutil/US/Central") - - if ZoneInfo is not None: - try: - tz = ZoneInfo("US/Central") - except KeyError: - # no tzdata - pass - else: - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - ts.tz_localize(tz) - - result = ts.tz_localize("US/Central", ambiguous=True) - assert result == expected0 - assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - - result = ts.tz_localize("US/Central", ambiguous=False) - assert result == expected1 - assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - - def test_tz_localize_ambiguous(self): - ts = Timestamp("2014-11-02 01:00") - ts_dst = ts.tz_localize("US/Eastern", ambiguous=True) - ts_no_dst = ts.tz_localize("US/Eastern", ambiguous=False) - - assert ts_no_dst._value - ts_dst._value == 3600 - msg = re.escape( - "'ambiguous' parameter must be one of: " - "True, False, 'NaT', 'raise' (default)" - ) - with pytest.raises(ValueError, match=msg): - ts.tz_localize("US/Eastern", ambiguous="infer") - - # GH#8025 - msg = "Cannot localize tz-aware Timestamp, use tz_convert for conversions" - with pytest.raises(TypeError, match=msg): - Timestamp("2011-01-01", tz="US/Eastern").tz_localize("Asia/Tokyo") - - msg = "Cannot convert tz-naive Timestamp, use tz_localize to localize" - with pytest.raises(TypeError, match=msg): - Timestamp("2011-01-01").tz_convert("Asia/Tokyo") - - @pytest.mark.parametrize( - "stamp, tz", - [ - ("2015-03-08 02:00", "US/Eastern"), - ("2015-03-08 02:30", "US/Pacific"), - ("2015-03-29 02:00", "Europe/Paris"), - ("2015-03-29 02:30", "Europe/Belgrade"), - ], - ) - def test_tz_localize_nonexistent(self, stamp, tz): - # GH#13057 - ts = Timestamp(stamp) - with pytest.raises(NonExistentTimeError, match=stamp): - ts.tz_localize(tz) - # GH 22644 - with pytest.raises(NonExistentTimeError, match=stamp): - ts.tz_localize(tz, nonexistent="raise") - assert ts.tz_localize(tz, nonexistent="NaT") is NaT - - def test_tz_localize_ambiguous_raise(self): - # GH#13057 - ts = Timestamp("2015-11-1 01:00") - msg = "Cannot infer dst time from 2015-11-01 01:00:00," - with pytest.raises(AmbiguousTimeError, match=msg): - ts.tz_localize("US/Pacific", ambiguous="raise") - - def test_tz_localize_nonexistent_invalid_arg(self, warsaw): - # GH 22644 - tz = warsaw - ts = Timestamp("2015-03-29 02:00:00") - msg = ( - "The nonexistent argument must be one of 'raise', 'NaT', " - "'shift_forward', 'shift_backward' or a timedelta object" - ) - with pytest.raises(ValueError, match=msg): - ts.tz_localize(tz, nonexistent="foo") - - @pytest.mark.parametrize( - "stamp", - [ - "2014-02-01 09:00", - "2014-07-08 09:00", - "2014-11-01 17:00", - "2014-11-05 00:00", - ], - ) - def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): - tz = tz_aware_fixture - ts = Timestamp(stamp) - localized = ts.tz_localize(tz) - assert localized 
== Timestamp(stamp, tz=tz) - - msg = "Cannot localize tz-aware Timestamp" - with pytest.raises(TypeError, match=msg): - localized.tz_localize(tz) - - reset = localized.tz_localize(None) - assert reset == ts - assert reset.tzinfo is None - - def test_tz_localize_ambiguous_compat(self): - # validate that pytz and dateutil are compat for dst - # when the transition happens - naive = Timestamp("2013-10-27 01:00:00") - - pytz_zone = "Europe/London" - dateutil_zone = "dateutil/Europe/London" - result_pytz = naive.tz_localize(pytz_zone, ambiguous=False) - result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=False) - assert result_pytz._value == result_dateutil._value - assert result_pytz._value == 1382835600 - - # fixed ambiguous behavior - # see gh-14621, GH#45087 - assert result_pytz.to_pydatetime().tzname() == "GMT" - assert result_dateutil.to_pydatetime().tzname() == "GMT" - assert str(result_pytz) == str(result_dateutil) - - # 1 hour difference - result_pytz = naive.tz_localize(pytz_zone, ambiguous=True) - result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=True) - assert result_pytz._value == result_dateutil._value - assert result_pytz._value == 1382832000 - - # see gh-14621 - assert str(result_pytz) == str(result_dateutil) - assert ( - result_pytz.to_pydatetime().tzname() - == result_dateutil.to_pydatetime().tzname() - ) - - @pytest.mark.parametrize( - "tz", - [ - pytz.timezone("US/Eastern"), - gettz("US/Eastern"), - "US/Eastern", - "dateutil/US/Eastern", - ], - ) - def test_timestamp_tz_localize(self, tz): - stamp = Timestamp("3/11/2012 04:00") - - result = stamp.tz_localize(tz) - expected = Timestamp("3/11/2012 04:00", tz=tz) - assert result.hour == expected.hour - assert result == expected - - @pytest.mark.parametrize( - "start_ts, tz, end_ts, shift", - [ - ["2015-03-29 02:20:00", "Europe/Warsaw", "2015-03-29 03:00:00", "forward"], - [ - "2015-03-29 02:20:00", - "Europe/Warsaw", - "2015-03-29 01:59:59.999999999", - "backward", - ], - [ - "2015-03-29 02:20:00", - "Europe/Warsaw", - "2015-03-29 03:20:00", - timedelta(hours=1), - ], - [ - "2015-03-29 02:20:00", - "Europe/Warsaw", - "2015-03-29 01:20:00", - timedelta(hours=-1), - ], - ["2018-03-11 02:33:00", "US/Pacific", "2018-03-11 03:00:00", "forward"], - [ - "2018-03-11 02:33:00", - "US/Pacific", - "2018-03-11 01:59:59.999999999", - "backward", - ], - [ - "2018-03-11 02:33:00", - "US/Pacific", - "2018-03-11 03:33:00", - timedelta(hours=1), - ], - [ - "2018-03-11 02:33:00", - "US/Pacific", - "2018-03-11 01:33:00", - timedelta(hours=-1), - ], - ], - ) - @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) - def test_timestamp_tz_localize_nonexistent_shift( - self, start_ts, tz, end_ts, shift, tz_type, unit - ): - # GH 8917, 24466 - tz = tz_type + tz - if isinstance(shift, str): - shift = "shift_" + shift - ts = Timestamp(start_ts).as_unit(unit) - result = ts.tz_localize(tz, nonexistent=shift) - expected = Timestamp(end_ts).tz_localize(tz) - - if unit == "us": - assert result == expected.replace(nanosecond=0) - elif unit == "ms": - micros = expected.microsecond - expected.microsecond % 1000 - assert result == expected.replace(microsecond=micros, nanosecond=0) - elif unit == "s": - assert result == expected.replace(microsecond=0, nanosecond=0) - else: - assert result == expected - assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - - @pytest.mark.parametrize("offset", [-1, 1]) - def test_timestamp_tz_localize_nonexistent_shift_invalid(self, 
offset, warsaw): - # GH 8917, 24466 - tz = warsaw - ts = Timestamp("2015-03-29 02:20:00") - msg = "The provided timedelta will relocalize on a nonexistent time" - with pytest.raises(ValueError, match=msg): - ts.tz_localize(tz, nonexistent=timedelta(seconds=offset)) - - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) - def test_timestamp_tz_localize_nonexistent_NaT(self, warsaw, unit): - # GH 8917 - tz = warsaw - ts = Timestamp("2015-03-29 02:20:00").as_unit(unit) - result = ts.tz_localize(tz, nonexistent="NaT") - assert result is NaT - - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) - def test_timestamp_tz_localize_nonexistent_raise(self, warsaw, unit): - # GH 8917 - tz = warsaw - ts = Timestamp("2015-03-29 02:20:00").as_unit(unit) - msg = "2015-03-29 02:20:00" - with pytest.raises(pytz.NonExistentTimeError, match=msg): - ts.tz_localize(tz, nonexistent="raise") - msg = ( - "The nonexistent argument must be one of 'raise', 'NaT', " - "'shift_forward', 'shift_backward' or a timedelta object" - ) - with pytest.raises(ValueError, match=msg): - ts.tz_localize(tz, nonexistent="foo") - - # ------------------------------------------------------------------ - # Timestamp.tz_convert - - @pytest.mark.parametrize( - "stamp", - [ - "2014-02-01 09:00", - "2014-07-08 09:00", - "2014-11-01 17:00", - "2014-11-05 00:00", - ], - ) - def test_tz_convert_roundtrip(self, stamp, tz_aware_fixture): - tz = tz_aware_fixture - - ts = Timestamp(stamp, tz="UTC") - converted = ts.tz_convert(tz) - - reset = converted.tz_convert(None) - assert reset == Timestamp(stamp) - assert reset.tzinfo is None - assert reset == converted.tz_convert("UTC").tz_localize(None) - - @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) - def test_astimezone(self, tzstr): - # astimezone is an alias for tz_convert, so keep it with - # the tz_convert tests - utcdate = Timestamp("3/11/2012 22:00", tz="UTC") - expected = utcdate.tz_convert(tzstr) - result = utcdate.astimezone(tzstr) - assert expected == result - assert isinstance(result, Timestamp) - - @td.skip_if_windows - def test_tz_convert_utc_with_system_utc(self): - # from system utc to real utc - ts = Timestamp("2001-01-05 11:56", tz=timezones.maybe_get_tz("dateutil/UTC")) - # check that the time hasn't changed. - assert ts == ts.tz_convert(dateutil.tz.tzutc()) - - # from system utc to real utc - ts = Timestamp("2001-01-05 11:56", tz=timezones.maybe_get_tz("dateutil/UTC")) - # check that the time hasn't changed. 
- assert ts == ts.tz_convert(dateutil.tz.tzutc()) - # ------------------------------------------------------------------ - # Timestamp.__init__ with tz str or tzinfo - - def test_timestamp_constructor_tz_utc(self): - utc_stamp = Timestamp("3/11/2012 05:00", tz="utc") - assert utc_stamp.tzinfo is timezone.utc - assert utc_stamp.hour == 5 - - utc_stamp = Timestamp("3/11/2012 05:00").tz_localize("utc") - assert utc_stamp.hour == 5 - - def test_timestamp_to_datetime_tzoffset(self): - tzinfo = tzoffset(None, 7200) - expected = Timestamp("3/11/2012 04:00", tz=tzinfo) - result = Timestamp(expected.to_pydatetime()) - assert expected == result - - def test_timestamp_constructor_near_dst_boundary(self): - # GH#11481 & GH#15777 - # Naive string timestamps were being localized incorrectly - # with tz_convert_from_utc_single instead of tz_localize_to_utc - - for tz in ["Europe/Brussels", "Europe/Prague"]: - result = Timestamp("2015-10-25 01:00", tz=tz) - expected = Timestamp("2015-10-25 01:00").tz_localize(tz) - assert result == expected - - msg = "Cannot infer dst time from 2015-10-25 02:00:00" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - Timestamp("2015-10-25 02:00", tz=tz) - - result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") - expected = Timestamp("2017-03-26 01:00").tz_localize("Europe/Paris") - assert result == expected - - msg = "2017-03-26 02:00" - with pytest.raises(pytz.NonExistentTimeError, match=msg): - Timestamp("2017-03-26 02:00", tz="Europe/Paris") - - # GH#11708 - naive = Timestamp("2015-11-18 10:00:00") - result = naive.tz_localize("UTC").tz_convert("Asia/Kolkata") - expected = Timestamp("2015-11-18 15:30:00+0530", tz="Asia/Kolkata") - assert result == expected - - # GH#15823 - result = Timestamp("2017-03-26 00:00", tz="Europe/Paris") - expected = Timestamp("2017-03-26 00:00:00+0100", tz="Europe/Paris") - assert result == expected - - result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") - expected = Timestamp("2017-03-26 01:00:00+0100", tz="Europe/Paris") - assert result == expected - - msg = "2017-03-26 02:00" - with pytest.raises(pytz.NonExistentTimeError, match=msg): - Timestamp("2017-03-26 02:00", tz="Europe/Paris") - - result = Timestamp("2017-03-26 02:00:00+0100", tz="Europe/Paris") - naive = Timestamp(result.as_unit("ns")._value) - expected = naive.tz_localize("UTC").tz_convert("Europe/Paris") - assert result == expected - - result = Timestamp("2017-03-26 03:00", tz="Europe/Paris") - expected = Timestamp("2017-03-26 03:00:00+0200", tz="Europe/Paris") - assert result == expected - - @pytest.mark.parametrize( - "tz", - [ - pytz.timezone("US/Eastern"), - gettz("US/Eastern"), - "US/Eastern", - "dateutil/US/Eastern", - ], - ) - def test_timestamp_constructed_by_date_and_tz(self, tz): - # GH#2993, Timestamp cannot be constructed by datetime.date - # and tz correctly - - result = Timestamp(date(2012, 3, 11), tz=tz) - - expected = Timestamp("3/11/2012", tz=tz) - assert result.hour == expected.hour - assert result == expected - - @pytest.mark.parametrize( - "tz", - [ - pytz.timezone("US/Eastern"), - gettz("US/Eastern"), - "US/Eastern", - "dateutil/US/Eastern", - ], - ) - def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): - # GH#1389 - - # 4 hours before DST transition - stamp = Timestamp("3/10/2012 22:00", tz=tz) - - result = stamp + timedelta(hours=6) - - # spring forward, + "7" hours - expected = Timestamp("3/11/2012 05:00", tz=tz) - - assert result == expected def test_timestamp_timetz_equivalent_with_datetime_tz(self, tz_naive_fixture): # 
GH21358 diff -Nru pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_unary_ops.py pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_unary_ops.py --- pandas-2.1.4+dfsg/pandas/tests/scalar/timestamp/test_unary_ops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/scalar/timestamp/test_unary_ops.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,609 +0,0 @@ -from datetime import datetime - -from dateutil.tz import gettz -from hypothesis import ( - given, - strategies as st, -) -import numpy as np -import pytest -import pytz -from pytz import utc - -from pandas._libs import lib -from pandas._libs.tslibs import ( - NaT, - OutOfBoundsDatetime, - Timedelta, - Timestamp, - conversion, - iNaT, - to_offset, -) -from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG -import pandas.util._test_decorators as td - -import pandas._testing as tm - - -class TestTimestampUnaryOps: - # -------------------------------------------------------------- - def test_round_division_by_zero_raises(self): - ts = Timestamp("2016-01-01") - - msg = "Division by zero in rounding" - with pytest.raises(ValueError, match=msg): - ts.round("0ns") - - # Timestamp.round - @pytest.mark.parametrize( - "timestamp, freq, expected", - [ - ("20130101 09:10:11", "D", "20130101"), - ("20130101 19:10:11", "D", "20130102"), - ("20130201 12:00:00", "D", "20130202"), - ("20130104 12:00:00", "D", "20130105"), - ("2000-01-05 05:09:15.13", "D", "2000-01-05 00:00:00"), - ("2000-01-05 05:09:15.13", "H", "2000-01-05 05:00:00"), - ("2000-01-05 05:09:15.13", "S", "2000-01-05 05:09:15"), - ], - ) - def test_round_frequencies(self, timestamp, freq, expected): - dt = Timestamp(timestamp) - result = dt.round(freq) - expected = Timestamp(expected) - assert result == expected - - def test_round_tzaware(self): - dt = Timestamp("20130101 09:10:11", tz="US/Eastern") - result = dt.round("D") - expected = Timestamp("20130101", tz="US/Eastern") - assert result == expected - - dt = Timestamp("20130101 09:10:11", tz="US/Eastern") - result = dt.round("s") - assert result == dt - - def test_round_30min(self): - # round - dt = Timestamp("20130104 12:32:00") - result = dt.round("30Min") - expected = Timestamp("20130104 12:30:00") - assert result == expected - - def test_round_subsecond(self): - # GH#14440 & GH#15578 - result = Timestamp("2016-10-17 12:00:00.0015").round("ms") - expected = Timestamp("2016-10-17 12:00:00.002000") - assert result == expected - - result = Timestamp("2016-10-17 12:00:00.00149").round("ms") - expected = Timestamp("2016-10-17 12:00:00.001000") - assert result == expected - - ts = Timestamp("2016-10-17 12:00:00.0015") - for freq in ["us", "ns"]: - assert ts == ts.round(freq) - - result = Timestamp("2016-10-17 12:00:00.001501031").round("10ns") - expected = Timestamp("2016-10-17 12:00:00.001501030") - assert result == expected - - def test_round_nonstandard_freq(self): - with tm.assert_produces_warning(False): - Timestamp("2016-10-17 12:00:00.001501031").round("1010ns") - - def test_round_invalid_arg(self): - stamp = Timestamp("2000-01-05 05:09:15.13") - with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - stamp.round("foo") - - @pytest.mark.parametrize( - "test_input, rounder, freq, expected", - [ - ("2117-01-01 00:00:45", "floor", "15s", "2117-01-01 00:00:45"), - ("2117-01-01 00:00:45", "ceil", "15s", "2117-01-01 00:00:45"), - ( - "2117-01-01 00:00:45.000000012", - "floor", - "10ns", - "2117-01-01 00:00:45.000000010", - ), - ( - "1823-01-01 
00:00:01.000000012", - "ceil", - "10ns", - "1823-01-01 00:00:01.000000020", - ), - ("1823-01-01 00:00:01", "floor", "1s", "1823-01-01 00:00:01"), - ("1823-01-01 00:00:01", "ceil", "1s", "1823-01-01 00:00:01"), - ("NaT", "floor", "1s", "NaT"), - ("NaT", "ceil", "1s", "NaT"), - ], - ) - def test_ceil_floor_edge(self, test_input, rounder, freq, expected): - dt = Timestamp(test_input) - func = getattr(dt, rounder) - result = func(freq) - - if dt is NaT: - assert result is NaT - else: - expected = Timestamp(expected) - assert result == expected - - @pytest.mark.parametrize( - "test_input, freq, expected", - [ - ("2018-01-01 00:02:06", "2s", "2018-01-01 00:02:06"), - ("2018-01-01 00:02:00", "2T", "2018-01-01 00:02:00"), - ("2018-01-01 00:04:00", "4T", "2018-01-01 00:04:00"), - ("2018-01-01 00:15:00", "15T", "2018-01-01 00:15:00"), - ("2018-01-01 00:20:00", "20T", "2018-01-01 00:20:00"), - ("2018-01-01 03:00:00", "3H", "2018-01-01 03:00:00"), - ], - ) - @pytest.mark.parametrize("rounder", ["ceil", "floor", "round"]) - def test_round_minute_freq(self, test_input, freq, expected, rounder): - # Ensure timestamps that shouldn't round dont! - # GH#21262 - - dt = Timestamp(test_input) - expected = Timestamp(expected) - func = getattr(dt, rounder) - result = func(freq) - assert result == expected - - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) - def test_ceil(self, unit): - dt = Timestamp("20130101 09:10:11").as_unit(unit) - result = dt.ceil("D") - expected = Timestamp("20130102") - assert result == expected - assert result._creso == dt._creso - - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) - def test_floor(self, unit): - dt = Timestamp("20130101 09:10:11").as_unit(unit) - result = dt.floor("D") - expected = Timestamp("20130101") - assert result == expected - assert result._creso == dt._creso - - @pytest.mark.parametrize("method", ["ceil", "round", "floor"]) - @pytest.mark.parametrize( - "unit", - ["ns", "us", "ms", "s"], - ) - def test_round_dst_border_ambiguous(self, method, unit): - # GH 18946 round near "fall back" DST - ts = Timestamp("2017-10-29 00:00:00", tz="UTC").tz_convert("Europe/Madrid") - ts = ts.as_unit(unit) - # - result = getattr(ts, method)("H", ambiguous=True) - assert result == ts - assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - - result = getattr(ts, method)("H", ambiguous=False) - expected = Timestamp("2017-10-29 01:00:00", tz="UTC").tz_convert( - "Europe/Madrid" - ) - assert result == expected - assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - - result = getattr(ts, method)("H", ambiguous="NaT") - assert result is NaT - - msg = "Cannot infer dst time" - with pytest.raises(pytz.AmbiguousTimeError, match=msg): - getattr(ts, method)("H", ambiguous="raise") - - @pytest.mark.parametrize( - "method, ts_str, freq", - [ - ["ceil", "2018-03-11 01:59:00-0600", "5min"], - ["round", "2018-03-11 01:59:00-0600", "5min"], - ["floor", "2018-03-11 03:01:00-0500", "2H"], - ], - ) - @pytest.mark.parametrize( - "unit", - ["ns", "us", "ms", "s"], - ) - def test_round_dst_border_nonexistent(self, method, ts_str, freq, unit): - # GH 23324 round near "spring forward" DST - ts = Timestamp(ts_str, tz="America/Chicago").as_unit(unit) - result = getattr(ts, method)(freq, nonexistent="shift_forward") - expected = Timestamp("2018-03-11 03:00:00", tz="America/Chicago") - assert result == expected - assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - - result = getattr(ts, method)(freq, nonexistent="NaT") - 
assert result is NaT - - msg = "2018-03-11 02:00:00" - with pytest.raises(pytz.NonExistentTimeError, match=msg): - getattr(ts, method)(freq, nonexistent="raise") - - @pytest.mark.parametrize( - "timestamp", - [ - "2018-01-01 0:0:0.124999360", - "2018-01-01 0:0:0.125000367", - "2018-01-01 0:0:0.125500", - "2018-01-01 0:0:0.126500", - "2018-01-01 12:00:00", - "2019-01-01 12:00:00", - ], - ) - @pytest.mark.parametrize( - "freq", - [ - "2ns", - "3ns", - "4ns", - "5ns", - "6ns", - "7ns", - "250ns", - "500ns", - "750ns", - "1us", - "19us", - "250us", - "500us", - "750us", - "1s", - "2s", - "3s", - "1D", - ], - ) - def test_round_int64(self, timestamp, freq): - # check that all rounding modes are accurate to int64 precision - # see GH#22591 - dt = Timestamp(timestamp).as_unit("ns") - unit = to_offset(freq).nanos - - # test floor - result = dt.floor(freq) - assert result._value % unit == 0, f"floor not a {freq} multiple" - assert 0 <= dt._value - result._value < unit, "floor error" - - # test ceil - result = dt.ceil(freq) - assert result._value % unit == 0, f"ceil not a {freq} multiple" - assert 0 <= result._value - dt._value < unit, "ceil error" - - # test round - result = dt.round(freq) - assert result._value % unit == 0, f"round not a {freq} multiple" - assert abs(result._value - dt._value) <= unit // 2, "round error" - if unit % 2 == 0 and abs(result._value - dt._value) == unit // 2: - # round half to even - assert result._value // unit % 2 == 0, "round half to even error" - - def test_round_implementation_bounds(self): - # See also: analogous test for Timedelta - result = Timestamp.min.ceil("s") - expected = Timestamp(1677, 9, 21, 0, 12, 44) - assert result == expected - - result = Timestamp.max.floor("s") - expected = Timestamp.max - Timedelta(854775807) - assert result == expected - - msg = "Cannot round 1677-09-21 00:12:43.145224193 to freq=" - with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp.min.floor("s") - - with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp.min.round("s") - - msg = "Cannot round 2262-04-11 23:47:16.854775807 to freq=" - with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp.max.ceil("s") - - with pytest.raises(OutOfBoundsDatetime, match=msg): - Timestamp.max.round("s") - - @given(val=st.integers(iNaT + 1, lib.i8max)) - @pytest.mark.parametrize( - "method", [Timestamp.round, Timestamp.floor, Timestamp.ceil] - ) - def test_round_sanity(self, val, method): - cls = Timestamp - err_cls = OutOfBoundsDatetime - - val = np.int64(val) - ts = cls(val) - - def checker(ts, nanos, unit): - # First check that we do raise in cases where we should - if nanos == 1: - pass - else: - div, mod = divmod(ts._value, nanos) - diff = int(nanos - mod) - lb = ts._value - mod - assert lb <= ts._value # i.e. no overflows with python ints - ub = ts._value + diff - assert ub > ts._value # i.e. 
no overflows with python ints - - msg = "without overflow" - if mod == 0: - # We should never be raising in this - pass - elif method is cls.ceil: - if ub > cls.max._value: - with pytest.raises(err_cls, match=msg): - method(ts, unit) - return - elif method is cls.floor: - if lb < cls.min._value: - with pytest.raises(err_cls, match=msg): - method(ts, unit) - return - elif mod >= diff: - if ub > cls.max._value: - with pytest.raises(err_cls, match=msg): - method(ts, unit) - return - elif lb < cls.min._value: - with pytest.raises(err_cls, match=msg): - method(ts, unit) - return - - res = method(ts, unit) - - td = res - ts - diff = abs(td._value) - assert diff < nanos - assert res._value % nanos == 0 - - if method is cls.round: - assert diff <= nanos / 2 - elif method is cls.floor: - assert res <= ts - elif method is cls.ceil: - assert res >= ts - - nanos = 1 - checker(ts, nanos, "ns") - - nanos = 1000 - checker(ts, nanos, "us") - - nanos = 1_000_000 - checker(ts, nanos, "ms") - - nanos = 1_000_000_000 - checker(ts, nanos, "s") - - nanos = 60 * 1_000_000_000 - checker(ts, nanos, "min") - - nanos = 60 * 60 * 1_000_000_000 - checker(ts, nanos, "h") - - nanos = 24 * 60 * 60 * 1_000_000_000 - checker(ts, nanos, "D") - - # -------------------------------------------------------------- - # Timestamp.replace - - def test_replace_out_of_pydatetime_bounds(self): - # GH#50348 - ts = Timestamp("2016-01-01").as_unit("ns") - - msg = "Out of bounds nanosecond timestamp: 99999-01-01 00:00:00" - with pytest.raises(OutOfBoundsDatetime, match=msg): - ts.replace(year=99_999) - - ts = ts.as_unit("ms") - result = ts.replace(year=99_999) - assert result.year == 99_999 - assert result._value == Timestamp(np.datetime64("99999-01-01", "ms"))._value - - def test_replace_non_nano(self): - ts = Timestamp._from_value_and_reso( - 91514880000000000, NpyDatetimeUnit.NPY_FR_us.value, None - ) - assert ts.to_pydatetime() == datetime(4869, 12, 28) - - result = ts.replace(year=4900) - assert result._creso == ts._creso - assert result.to_pydatetime() == datetime(4900, 12, 28) - - def test_replace_naive(self): - # GH#14621, GH#7825 - ts = Timestamp("2016-01-01 09:00:00") - result = ts.replace(hour=0) - expected = Timestamp("2016-01-01 00:00:00") - assert result == expected - - def test_replace_aware(self, tz_aware_fixture): - tz = tz_aware_fixture - # GH#14621, GH#7825 - # replacing datetime components with and w/o presence of a timezone - ts = Timestamp("2016-01-01 09:00:00", tz=tz) - result = ts.replace(hour=0) - expected = Timestamp("2016-01-01 00:00:00", tz=tz) - assert result == expected - - def test_replace_preserves_nanos(self, tz_aware_fixture): - tz = tz_aware_fixture - # GH#14621, GH#7825 - ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) - result = ts.replace(hour=0) - expected = Timestamp("2016-01-01 00:00:00.000000123", tz=tz) - assert result == expected - - def test_replace_multiple(self, tz_aware_fixture): - tz = tz_aware_fixture - # GH#14621, GH#7825 - # replacing datetime components with and w/o presence of a timezone - # test all - ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) - result = ts.replace( - year=2015, - month=2, - day=2, - hour=0, - minute=5, - second=5, - microsecond=5, - nanosecond=5, - ) - expected = Timestamp("2015-02-02 00:05:05.000005005", tz=tz) - assert result == expected - - def test_replace_invalid_kwarg(self, tz_aware_fixture): - tz = tz_aware_fixture - # GH#14621, GH#7825 - ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) - msg = r"replace\(\) got an unexpected keyword 
argument" - with pytest.raises(TypeError, match=msg): - ts.replace(foo=5) - - def test_replace_integer_args(self, tz_aware_fixture): - tz = tz_aware_fixture - # GH#14621, GH#7825 - ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) - msg = "value must be an integer, received for hour" - with pytest.raises(ValueError, match=msg): - ts.replace(hour=0.1) - - def test_replace_tzinfo_equiv_tz_localize_none(self): - # GH#14621, GH#7825 - # assert conversion to naive is the same as replacing tzinfo with None - ts = Timestamp("2013-11-03 01:59:59.999999-0400", tz="US/Eastern") - assert ts.tz_localize(None) == ts.replace(tzinfo=None) - - @td.skip_if_windows - def test_replace_tzinfo(self): - # GH#15683 - dt = datetime(2016, 3, 27, 1) - tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo - - result_dt = dt.replace(tzinfo=tzinfo) - result_pd = Timestamp(dt).replace(tzinfo=tzinfo) - - # datetime.timestamp() converts in the local timezone - with tm.set_timezone("UTC"): - assert result_dt.timestamp() == result_pd.timestamp() - - assert result_dt == result_pd - assert result_dt == result_pd.to_pydatetime() - - result_dt = dt.replace(tzinfo=tzinfo).replace(tzinfo=None) - result_pd = Timestamp(dt).replace(tzinfo=tzinfo).replace(tzinfo=None) - - # datetime.timestamp() converts in the local timezone - with tm.set_timezone("UTC"): - assert result_dt.timestamp() == result_pd.timestamp() - - assert result_dt == result_pd - assert result_dt == result_pd.to_pydatetime() - - @pytest.mark.parametrize( - "tz, normalize", - [ - (pytz.timezone("US/Eastern"), lambda x: x.tzinfo.normalize(x)), - (gettz("US/Eastern"), lambda x: x), - ], - ) - def test_replace_across_dst(self, tz, normalize): - # GH#18319 check that 1) timezone is correctly normalized and - # 2) that hour is not incorrectly changed by this normalization - ts_naive = Timestamp("2017-12-03 16:03:30") - ts_aware = conversion.localize_pydatetime(ts_naive, tz) - - # Preliminary sanity-check - assert ts_aware == normalize(ts_aware) - - # Replace across DST boundary - ts2 = ts_aware.replace(month=6) - - # Check that `replace` preserves hour literal - assert (ts2.hour, ts2.minute) == (ts_aware.hour, ts_aware.minute) - - # Check that post-replace object is appropriately normalized - ts2b = normalize(ts2) - assert ts2 == ts2b - - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) - def test_replace_dst_border(self, unit): - # Gh 7825 - t = Timestamp("2013-11-3", tz="America/Chicago").as_unit(unit) - result = t.replace(hour=3) - expected = Timestamp("2013-11-3 03:00:00", tz="America/Chicago") - assert result == expected - assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - - @pytest.mark.parametrize("fold", [0, 1]) - @pytest.mark.parametrize("tz", ["dateutil/Europe/London", "Europe/London"]) - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) - def test_replace_dst_fold(self, fold, tz, unit): - # GH 25017 - d = datetime(2019, 10, 27, 2, 30) - ts = Timestamp(d, tz=tz).as_unit(unit) - result = ts.replace(hour=1, fold=fold) - expected = Timestamp(datetime(2019, 10, 27, 1, 30)).tz_localize( - tz, ambiguous=not fold - ) - assert result == expected - assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - - # -------------------------------------------------------------- - # Timestamp.normalize - - @pytest.mark.parametrize("arg", ["2013-11-30", "2013-11-30 12:00:00"]) - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) - def test_normalize(self, tz_naive_fixture, arg, unit): - tz = 
tz_naive_fixture - ts = Timestamp(arg, tz=tz).as_unit(unit) - result = ts.normalize() - expected = Timestamp("2013-11-30", tz=tz) - assert result == expected - assert result._creso == getattr(NpyDatetimeUnit, f"NPY_FR_{unit}").value - - def test_normalize_pre_epoch_dates(self): - # GH: 36294 - result = Timestamp("1969-01-01 09:00:00").normalize() - expected = Timestamp("1969-01-01 00:00:00") - assert result == expected - - # -------------------------------------------------------------- - - @td.skip_if_windows - def test_timestamp(self, fixed_now_ts): - # GH#17329 - # tz-naive --> treat it as if it were UTC for purposes of timestamp() - ts = fixed_now_ts - uts = ts.replace(tzinfo=utc) - assert ts.timestamp() == uts.timestamp() - - tsc = Timestamp("2014-10-11 11:00:01.12345678", tz="US/Central") - utsc = tsc.tz_convert("UTC") - - # utsc is a different representation of the same time - assert tsc.timestamp() == utsc.timestamp() - - # datetime.timestamp() converts in the local timezone - with tm.set_timezone("UTC"): - # should agree with datetime.timestamp method - dt = ts.to_pydatetime() - assert dt.timestamp() == ts.timestamp() - - -@pytest.mark.parametrize("fold", [0, 1]) -def test_replace_preserves_fold(fold): - # GH 37610. Check that replace preserves Timestamp fold property - tz = gettz("Europe/Moscow") - - ts = Timestamp(year=2009, month=10, day=25, hour=2, minute=30, fold=fold, tzinfo=tz) - ts_replaced = ts.replace(second=1) - - assert ts_replaced.fold == fold diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/accessors/test_dt_accessor.py pandas-2.2.2+dfsg/pandas/tests/series/accessors/test_dt_accessor.py --- pandas-2.1.4+dfsg/pandas/tests/series/accessors/test_dt_accessor.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/accessors/test_dt_accessor.py 2024-04-10 17:42:52.000000000 +0000 @@ -253,15 +253,15 @@ tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) # tzaware - ser = Series(date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") + ser = Series(date_range("2015-01-01", "2016-01-01", freq="min"), name="xxx") ser = ser.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") results = get_dir(ser) tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) # Period - ser = Series( - period_range("20130101", periods=5, freq="D", name="xxx").astype(object) - ) + idx = period_range("20130101", periods=5, freq="D", name="xxx").astype(object) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + ser = Series(idx) results = get_dir(ser) tm.assert_almost_equal( results, sorted(set(ok_for_period + ok_for_period_methods)) @@ -270,18 +270,18 @@ def test_dt_accessor_ambiguous_freq_conversions(self): # GH#11295 # ambiguous time error on the conversions - ser = Series(date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") + ser = Series(date_range("2015-01-01", "2016-01-01", freq="min"), name="xxx") ser = ser.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") exp_values = date_range( - "2015-01-01", "2016-01-01", freq="T", tz="UTC" + "2015-01-01", "2016-01-01", freq="min", tz="UTC" ).tz_convert("America/Chicago") # freq not preserved by tz_localize above exp_values = exp_values._with_freq(None) expected = Series(exp_values, name="xxx") tm.assert_series_equal(ser, expected) - def test_dt_accessor_not_writeable(self, using_copy_on_write): + def test_dt_accessor_not_writeable(self, using_copy_on_write, warn_copy_on_write): # no setting allowed ser = Series(date_range("20130101", 
periods=5, freq="D"), name="xxx") with pytest.raises(ValueError, match="modifications"): @@ -293,6 +293,11 @@ if using_copy_on_write: with tm.raises_chained_assignment_error(): ser.dt.hour[0] = 5 + elif warn_copy_on_write: + with tm.assert_produces_warning( + FutureWarning, match="ChainedAssignmentError" + ): + ser.dt.hour[0] = 5 else: with pytest.raises(SettingWithCopyError, match=msg): ser.dt.hour[0] = 5 @@ -345,30 +350,30 @@ ) df1["date"] = df1["date"].dt.tz_convert("Europe/Madrid") # infer - result = getattr(df1.date.dt, method)("H", ambiguous="infer") + result = getattr(df1.date.dt, method)("h", ambiguous="infer") expected = df1["date"] tm.assert_series_equal(result, expected) # bool-array - result = getattr(df1.date.dt, method)("H", ambiguous=[True, False, False]) + result = getattr(df1.date.dt, method)("h", ambiguous=[True, False, False]) tm.assert_series_equal(result, expected) # NaT - result = getattr(df1.date.dt, method)("H", ambiguous="NaT") + result = getattr(df1.date.dt, method)("h", ambiguous="NaT") expected = df1["date"].copy() expected.iloc[0:2] = pd.NaT tm.assert_series_equal(result, expected) # raise with tm.external_error_raised(pytz.AmbiguousTimeError): - getattr(df1.date.dt, method)("H", ambiguous="raise") + getattr(df1.date.dt, method)("h", ambiguous="raise") @pytest.mark.parametrize( "method, ts_str, freq", [ ["ceil", "2018-03-11 01:59:00-0600", "5min"], ["round", "2018-03-11 01:59:00-0600", "5min"], - ["floor", "2018-03-11 03:01:00-0500", "2H"], + ["floor", "2018-03-11 03:01:00-0500", "2h"], ], ) def test_dt_round_tz_nonexistent(self, method, ts_str, freq): @@ -385,7 +390,7 @@ with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): getattr(ser.dt, method)(freq, nonexistent="raise") - @pytest.mark.parametrize("freq", ["ns", "U", "1000U"]) + @pytest.mark.parametrize("freq", ["ns", "us", "1000us"]) def test_dt_round_nonnano_higher_resolution_no_op(self, freq): # GH 52761 ser = Series( @@ -499,7 +504,7 @@ ser = pd.concat([ser, Series([pd.NaT])]) assert np.isnan(ser.dt.day_name(locale=time_locale).iloc[-1]) - ser = Series(date_range(freq="M", start="2012", end="2013")) + ser = Series(date_range(freq="ME", start="2012", end="2013")) result = ser.dt.month_name(locale=time_locale) expected = Series([month.capitalize() for month in expected_months]) @@ -582,13 +587,15 @@ # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) - def test_strftime_period_days(self): + def test_strftime_period_days(self, using_infer_string): period_index = period_range("20150301", periods=5) result = period_index.strftime("%Y/%m/%d") expected = Index( ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], dtype="=U10", ) + if using_infer_string: + expected = expected.astype("string[pyarrow_numpy]") tm.assert_index_equal(result, expected) def test_strftime_dt64_microsecond_resolution(self): @@ -598,7 +605,7 @@ tm.assert_series_equal(result, expected) def test_strftime_period_hours(self): - ser = Series(period_range("20130101", periods=4, freq="H")) + ser = Series(period_range("20130101", periods=4, freq="h")) result = ser.dt.strftime("%Y/%m/%d %H:%M:%S") expected = Series( [ @@ -611,7 +618,7 @@ tm.assert_series_equal(result, expected) def test_strftime_period_minutes(self): - ser = Series(period_range("20130101", periods=4, freq="L")) + ser = Series(period_range("20130101", periods=4, freq="ms")) result = ser.dt.strftime("%Y/%m/%d %H:%M:%S.%l") expected = Series( [ @@ -776,16 +783,16 @@ [Period("2016-01", freq="M"), 
Period("2016-02", freq="M")], [Period("2016-01-01", freq="D"), Period("2016-01-02", freq="D")], [ - Period("2016-01-01 00:00:00", freq="H"), - Period("2016-01-01 01:00:00", freq="H"), + Period("2016-01-01 00:00:00", freq="h"), + Period("2016-01-01 01:00:00", freq="h"), ], [ Period("2016-01-01 00:00:00", freq="M"), Period("2016-01-01 00:01:00", freq="M"), ], [ - Period("2016-01-01 00:00:00", freq="S"), - Period("2016-01-01 00:00:01", freq="S"), + Period("2016-01-01 00:00:00", freq="s"), + Period("2016-01-01 00:00:01", freq="s"), ], ], ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/accessors/test_list_accessor.py pandas-2.2.2+dfsg/pandas/tests/series/accessors/test_list_accessor.py --- pandas-2.1.4+dfsg/pandas/tests/series/accessors/test_list_accessor.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/accessors/test_list_accessor.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,129 @@ +import re + +import pytest + +from pandas import ( + ArrowDtype, + Series, +) +import pandas._testing as tm + +pa = pytest.importorskip("pyarrow") + +from pandas.compat import pa_version_under11p0 + + +@pytest.mark.parametrize( + "list_dtype", + ( + pa.list_(pa.int64()), + pa.list_(pa.int64(), list_size=3), + pa.large_list(pa.int64()), + ), +) +def test_list_getitem(list_dtype): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(list_dtype), + ) + actual = ser.list[1] + expected = Series([2, None, None], dtype="int64[pyarrow]") + tm.assert_series_equal(actual, expected) + + +def test_list_getitem_slice(): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + if pa_version_under11p0: + with pytest.raises( + NotImplementedError, match="List slice not supported by pyarrow " + ): + ser.list[1:None:None] + else: + actual = ser.list[1:None:None] + expected = Series( + [[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())) + ) + tm.assert_series_equal(actual, expected) + + +def test_list_len(): + ser = Series( + [[1, 2, 3], [4, None], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + actual = ser.list.len() + expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32())) + tm.assert_series_equal(actual, expected) + + +def test_list_flatten(): + ser = Series( + [[1, 2, 3], [4, None], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + actual = ser.list.flatten() + expected = Series([1, 2, 3, 4, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(actual, expected) + + +def test_list_getitem_slice_invalid(): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + if pa_version_under11p0: + with pytest.raises( + NotImplementedError, match="List slice not supported by pyarrow " + ): + ser.list[1:None:0] + else: + with pytest.raises(pa.lib.ArrowInvalid, match=re.escape("`step` must be >= 1")): + ser.list[1:None:0] + + +def test_list_accessor_non_list_dtype(): + ser = Series( + [1, 2, 4], + dtype=ArrowDtype(pa.int64()), + ) + with pytest.raises( + AttributeError, + match=re.escape( + "Can only use the '.list' accessor with 'list[pyarrow]' dtype, " + "not int64[pyarrow]." 
+ ), + ): + ser.list[1:None:0] + + +@pytest.mark.parametrize( + "list_dtype", + ( + pa.list_(pa.int64()), + pa.list_(pa.int64(), list_size=3), + pa.large_list(pa.int64()), + ), +) +def test_list_getitem_invalid_index(list_dtype): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(list_dtype), + ) + with pytest.raises(pa.lib.ArrowInvalid, match="Index -1 is out of bounds"): + ser.list[-1] + with pytest.raises(pa.lib.ArrowInvalid, match="Index 5 is out of bounds"): + ser.list[5] + with pytest.raises(ValueError, match="key must be an int or slice, got str"): + ser.list["abc"] + + +def test_list_accessor_not_iterable(): + ser = Series( + [[1, 2, 3], [4, None], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + with pytest.raises(TypeError, match="'ListAccessor' object is not iterable"): + iter(ser.list) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/accessors/test_struct_accessor.py pandas-2.2.2+dfsg/pandas/tests/series/accessors/test_struct_accessor.py --- pandas-2.1.4+dfsg/pandas/tests/series/accessors/test_struct_accessor.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/accessors/test_struct_accessor.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,196 @@ +import re + +import pytest + +from pandas.compat.pyarrow import ( + pa_version_under11p0, + pa_version_under13p0, +) + +from pandas import ( + ArrowDtype, + DataFrame, + Index, + Series, +) +import pandas._testing as tm + +pa = pytest.importorskip("pyarrow") +pc = pytest.importorskip("pyarrow.compute") + + +def test_struct_accessor_dtypes(): + ser = Series( + [], + dtype=ArrowDtype( + pa.struct( + [ + ("int_col", pa.int64()), + ("string_col", pa.string()), + ( + "struct_col", + pa.struct( + [ + ("int_col", pa.int64()), + ("float_col", pa.float64()), + ] + ), + ), + ] + ) + ), + ) + actual = ser.struct.dtypes + expected = Series( + [ + ArrowDtype(pa.int64()), + ArrowDtype(pa.string()), + ArrowDtype( + pa.struct( + [ + ("int_col", pa.int64()), + ("float_col", pa.float64()), + ] + ) + ), + ], + index=Index(["int_col", "string_col", "struct_col"]), + ) + tm.assert_series_equal(actual, expected) + + +@pytest.mark.skipif(pa_version_under13p0, reason="pyarrow>=13.0.0 required") +def test_struct_accessor_field(): + index = Index([-100, 42, 123]) + ser = Series( + [ + {"rice": 1.0, "maize": -1, "wheat": "a"}, + {"rice": 2.0, "maize": 0, "wheat": "b"}, + {"rice": 3.0, "maize": 1, "wheat": "c"}, + ], + dtype=ArrowDtype( + pa.struct( + [ + ("rice", pa.float64()), + ("maize", pa.int64()), + ("wheat", pa.string()), + ] + ) + ), + index=index, + ) + by_name = ser.struct.field("maize") + by_name_expected = Series( + [-1, 0, 1], + dtype=ArrowDtype(pa.int64()), + index=index, + name="maize", + ) + tm.assert_series_equal(by_name, by_name_expected) + + by_index = ser.struct.field(2) + by_index_expected = Series( + ["a", "b", "c"], + dtype=ArrowDtype(pa.string()), + index=index, + name="wheat", + ) + tm.assert_series_equal(by_index, by_index_expected) + + +def test_struct_accessor_field_with_invalid_name_or_index(): + ser = Series([], dtype=ArrowDtype(pa.struct([("field", pa.int64())]))) + + with pytest.raises(ValueError, match="name_or_index must be an int, str,"): + ser.struct.field(1.1) + + +@pytest.mark.skipif(pa_version_under11p0, reason="pyarrow>=11.0.0 required") +def test_struct_accessor_explode(): + index = Index([-100, 42, 123]) + ser = Series( + [ + {"painted": 1, "snapping": {"sea": "green"}}, + {"painted": 2, "snapping": {"sea": "leatherback"}}, + {"painted": 3, "snapping": {"sea": 
"hawksbill"}}, + ], + dtype=ArrowDtype( + pa.struct( + [ + ("painted", pa.int64()), + ("snapping", pa.struct([("sea", pa.string())])), + ] + ) + ), + index=index, + ) + actual = ser.struct.explode() + expected = DataFrame( + { + "painted": Series([1, 2, 3], index=index, dtype=ArrowDtype(pa.int64())), + "snapping": Series( + [{"sea": "green"}, {"sea": "leatherback"}, {"sea": "hawksbill"}], + index=index, + dtype=ArrowDtype(pa.struct([("sea", pa.string())])), + ), + }, + ) + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize( + "invalid", + [ + pytest.param(Series([1, 2, 3], dtype="int64"), id="int64"), + pytest.param( + Series(["a", "b", "c"], dtype="string[pyarrow]"), id="string-pyarrow" + ), + ], +) +def test_struct_accessor_api_for_invalid(invalid): + with pytest.raises( + AttributeError, + match=re.escape( + "Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, " + f"not {invalid.dtype}." + ), + ): + invalid.struct + + +@pytest.mark.parametrize( + ["indices", "name"], + [ + (0, "int_col"), + ([1, 2], "str_col"), + (pc.field("int_col"), "int_col"), + ("int_col", "int_col"), + (b"string_col", b"string_col"), + ([b"string_col"], "string_col"), + ], +) +@pytest.mark.skipif(pa_version_under13p0, reason="pyarrow>=13.0.0 required") +def test_struct_accessor_field_expanded(indices, name): + arrow_type = pa.struct( + [ + ("int_col", pa.int64()), + ( + "struct_col", + pa.struct( + [ + ("int_col", pa.int64()), + ("float_col", pa.float64()), + ("str_col", pa.string()), + ] + ), + ), + (b"string_col", pa.string()), + ] + ) + + data = pa.array([], type=arrow_type) + ser = Series(data, dtype=ArrowDtype(arrow_type)) + expected = pc.struct_field(data, indices) + result = ser.struct.field(indices) + tm.assert_equal(result.array._pa_array.combine_chunks(), expected) + assert result.name == name diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_datetime.py pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_datetime.py --- pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_datetime.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_datetime.py 2024-04-10 17:42:52.000000000 +0000 @@ -76,7 +76,7 @@ N = 50 # testing with timezone, GH #2785 - rng = date_range("1/1/1990", periods=N, freq="H", tz=tzget("US/Eastern")) + rng = date_range("1/1/1990", periods=N, freq="h", tz=tzget("US/Eastern")) ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) # also test Timestamp tz handling, GH #2789 @@ -107,7 +107,7 @@ def test_getitem_setitem_datetimeindex(): N = 50 # testing with timezone, GH #2785 - rng = date_range("1/1/1990", periods=N, freq="H", tz="US/Eastern") + rng = date_range("1/1/1990", periods=N, freq="h", tz="US/Eastern") ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) result = ts["1990-01-01 04:00:00"] @@ -213,7 +213,7 @@ def test_getitem_setitem_periodindex(): N = 50 - rng = period_range("1/1/1990", periods=N, freq="H") + rng = period_range("1/1/1990", periods=N, freq="h") ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) result = ts["1990-01-01 04"] @@ -355,7 +355,7 @@ monkeypatch.setattr(libindex, "_SIZE_CUTOFF", 1000) n = 1100 - idx = period_range("1/1/2000", freq="T", periods=n) + idx = period_range("1/1/2000", freq="min", periods=n) assert idx._engine.over_size_threshold s = Series(np.random.default_rng(2).standard_normal(len(idx)), index=idx) @@ -411,7 +411,7 @@ def test_indexing_unordered2(): # diff freq - rng = date_range(datetime(2005, 1, 1), periods=20, 
freq="M") + rng = date_range(datetime(2005, 1, 1), periods=20, freq="ME") ts = Series(np.arange(len(rng)), index=rng) ts = ts.take(np.random.default_rng(2).permutation(20)) @@ -421,16 +421,16 @@ def test_indexing(): - idx = date_range("2001-1-1", periods=20, freq="M") + idx = date_range("2001-1-1", periods=20, freq="ME") ts = Series(np.random.default_rng(2).random(len(idx)), index=idx) # getting # GH 3070, make sure semantics work on Series/Frame - expected = ts["2001"] - expected.name = "A" + result = ts["2001"] + tm.assert_series_equal(result, ts.iloc[:12]) - df = DataFrame({"A": ts}) + df = DataFrame({"A": ts.copy()}) # GH#36179 pre-2.0 df["2001"] operated as slicing on rows. in 2.0 it behaves # like any other key, so raises @@ -438,24 +438,26 @@ df["2001"] # setting + ts = Series(np.random.default_rng(2).random(len(idx)), index=idx) + expected = ts.copy() + expected.iloc[:12] = 1 ts["2001"] = 1 - expected = ts["2001"] - expected.name = "A" + tm.assert_series_equal(ts, expected) + expected = df.copy() + expected.iloc[:12, 0] = 1 df.loc["2001", "A"] = 1 - - with pytest.raises(KeyError, match="2001"): - df["2001"] + tm.assert_frame_equal(df, expected) def test_getitem_str_month_with_datetimeindex(): # GH3546 (not including times on the last day) - idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:00", freq="H") + idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:00", freq="h") ts = Series(range(len(idx)), index=idx) expected = ts["2013-05"] tm.assert_series_equal(expected, ts) - idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:59", freq="S") + idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:59", freq="s") ts = Series(range(len(idx)), index=idx) expected = ts["2013-05"] tm.assert_series_equal(expected, ts) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_delitem.py pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_delitem.py --- pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_delitem.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_delitem.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,19 +31,16 @@ del s[0] tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) - def test_delitem_object_index(self): + def test_delitem_object_index(self, using_infer_string): # Index(dtype=object) - s = Series(1, index=["a"]) + dtype = "string[pyarrow_numpy]" if using_infer_string else object + s = Series(1, index=Index(["a"], dtype=dtype)) del s["a"] - tm.assert_series_equal( - s, Series(dtype="int64", index=Index([], dtype="object")) - ) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) s["a"] = 1 - tm.assert_series_equal(s, Series(1, index=["a"])) + tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype))) del s["a"] - tm.assert_series_equal( - s, Series(dtype="int64", index=Index([], dtype="object")) - ) + tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype))) def test_delitem_missing_key(self): # empty diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_get.py pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_get.py --- pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_get.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_get.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,8 +3,10 @@ import pandas as pd from pandas import ( + DatetimeIndex, Index, Series, + date_range, ) import pandas._testing as tm @@ -168,7 +170,9 @@ "arr", [ 
np.random.default_rng(2).standard_normal(10), - tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern"), + DatetimeIndex(date_range("2020-01-01", periods=10), name="a").tz_localize( + tz="US/Eastern" + ), ], ) def test_get_with_ea(arr): diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_getitem.py pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_getitem.py --- pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_getitem.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_getitem.py 2024-04-10 17:42:52.000000000 +0000 @@ -71,7 +71,7 @@ def test_getitem_negative_out_of_bounds(self): ser = Series(["a"] * 10, index=["a"] * 10) - msg = "index -11 is out of bounds for axis 0 with size 10" + msg = "index -11 is out of bounds for axis 0 with size 10|index out of bounds" warn_msg = "Series.__getitem__ treating keys as positions is deprecated" with pytest.raises(IndexError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): @@ -137,7 +137,7 @@ tz = timezones.maybe_get_tz(tzstr) index = date_range( - start="2012-12-24 16:00", end="2012-12-24 18:00", freq="H", tz=tzstr + start="2012-12-24 16:00", end="2012-12-24 18:00", freq="h", tz=tzstr ) ts = Series(index=index, data=index.hour) time_pandas = Timestamp("2012-12-24 17:00", tz=tzstr) @@ -363,7 +363,9 @@ key = Series(["C"], dtype=object) key = box(key) - msg = r"None of \[Index\(\['C'\], dtype='object'\)\] are in the \[index\]" + msg = ( + r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]" + ) with pytest.raises(KeyError, match=msg): ser[key] @@ -437,7 +439,7 @@ # GH#5877 # indexing with empty series - ser = Series(["A", "B"]) + ser = Series(["A", "B"], dtype=object) expected = Series(dtype=object, index=Index([], dtype="int64")) result = ser[Series([], dtype=object)] tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_indexing.py pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_indexing.py --- pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_indexing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_indexing.py 2024-04-10 17:42:52.000000000 +0000 @@ -101,14 +101,16 @@ assert result == expected -def test_getitem_setitem_ellipsis(): +def test_getitem_setitem_ellipsis(using_copy_on_write, warn_copy_on_write): s = Series(np.random.default_rng(2).standard_normal(10)) result = s[...] tm.assert_series_equal(result, s) - s[...] = 5 - assert (result == 5).all() + with tm.assert_cow_warning(warn_copy_on_write): + s[...] = 5 + if not using_copy_on_write: + assert (result == 5).all() @pytest.mark.parametrize( @@ -239,7 +241,7 @@ datetime_series[[5, [None, None]]] = 2 -def test_slice(string_series, object_series, using_copy_on_write): +def test_slice(string_series, object_series, using_copy_on_write, warn_copy_on_write): original = string_series.copy() numSlice = string_series[10:20] numSliceEnd = string_series[-10:] @@ -252,11 +254,12 @@ assert string_series[numSlice.index[0]] == numSlice[numSlice.index[0]] assert numSlice.index[1] == string_series.index[11] - assert tm.equalContents(numSliceEnd, np.array(string_series)[-10:]) + tm.assert_numpy_array_equal(np.array(numSliceEnd), np.array(string_series)[-10:]) # Test return view. 
sl = string_series[10:20] - sl[:] = 0 + with tm.assert_cow_warning(warn_copy_on_write): + sl[:] = 0 if using_copy_on_write: # Doesn't modify parent (CoW) @@ -294,7 +297,8 @@ df["val"].update(s) expected = df_original else: - df["val"].update(s) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["val"].update(s) expected = DataFrame( {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3], "val": [0, 1, 0]} ) @@ -327,7 +331,7 @@ [ date_range("2014-01-01", periods=20, freq="MS"), period_range("2014-01", periods=20, freq="M"), - timedelta_range("0", periods=20, freq="H"), + timedelta_range("0", periods=20, freq="h"), ], ) def test_slice_with_negative_step(index): @@ -487,7 +491,7 @@ np.datetime64("NaT"), np.timedelta64("NaT"), ] - _indexers = [0, [0], slice(0, 1), [True, False, False]] + _indexers = [0, [0], slice(0, 1), [True, False, False], slice(None, None, None)] @pytest.mark.parametrize( "invalid", _invalid_scalars + [1, 1.0, np.int64(1), np.float64(1)] @@ -501,7 +505,7 @@ @pytest.mark.parametrize("indexer", _indexers) def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer): ser = Series([1, 2, 3], dtype=any_int_numpy_dtype) - if isna(invalid) and invalid is not NaT: + if isna(invalid) and invalid is not NaT and not np.isnat(invalid): warn = None else: warn = FutureWarning diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_setitem.py pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_setitem.py --- pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_setitem.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_setitem.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,10 +2,12 @@ date, datetime, ) +from decimal import Decimal import numpy as np import pytest +from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import IndexingError from pandas.core.dtypes.common import is_list_like @@ -79,7 +81,7 @@ @pytest.mark.parametrize("tz", ["US/Eastern", "UTC", "Asia/Tokyo"]) def test_setitem_with_tz(self, tz, indexer_sli): - orig = Series(date_range("2016-01-01", freq="H", periods=3, tz=tz)) + orig = Series(date_range("2016-01-01", freq="h", periods=3, tz=tz)) assert orig.dtype == f"datetime64[ns, {tz}]" exp = Series( @@ -87,7 +89,8 @@ Timestamp("2016-01-01 00:00", tz=tz), Timestamp("2011-01-01 00:00", tz=tz), Timestamp("2016-01-01 02:00", tz=tz), - ] + ], + dtype=orig.dtype, ) # scalar @@ -99,6 +102,7 @@ vals = Series( [Timestamp("2011-01-01", tz=tz), Timestamp("2012-01-01", tz=tz)], index=[1, 2], + dtype=orig.dtype, ) assert vals.dtype == f"datetime64[ns, {tz}]" @@ -107,7 +111,8 @@ Timestamp("2016-01-01 00:00", tz=tz), Timestamp("2011-01-01 00:00", tz=tz), Timestamp("2012-01-01 00:00", tz=tz), - ] + ], + dtype=orig.dtype, ) ser = orig.copy() @@ -117,7 +122,7 @@ def test_setitem_with_tz_dst(self, indexer_sli): # GH#14146 trouble setting values near DST boundary tz = "US/Eastern" - orig = Series(date_range("2016-11-06", freq="H", periods=3, tz=tz)) + orig = Series(date_range("2016-11-06", freq="h", periods=3, tz=tz)) assert orig.dtype == f"datetime64[ns, {tz}]" exp = Series( @@ -125,7 +130,8 @@ Timestamp("2016-11-06 00:00-04:00", tz=tz), Timestamp("2011-01-01 00:00-05:00", tz=tz), Timestamp("2016-11-06 01:00-05:00", tz=tz), - ] + ], + dtype=orig.dtype, ) # scalar @@ -137,6 +143,7 @@ vals = Series( [Timestamp("2011-01-01", tz=tz), Timestamp("2012-01-01", tz=tz)], index=[1, 2], + dtype=orig.dtype, ) assert vals.dtype == f"datetime64[ns, {tz}]" @@ -145,7 +152,8 @@ Timestamp("2016-11-06 
00:00", tz=tz), Timestamp("2011-01-01 00:00", tz=tz), Timestamp("2012-01-01 00:00", tz=tz), - ] + ], + dtype=orig.dtype, ) ser = orig.copy() @@ -175,7 +183,8 @@ def test_setitem_negative_out_of_bounds(self): ser = Series(["a"] * 10, index=["a"] * 10) - msg = "index -11 is out of bounds for axis 0 with size 10" + # string index falls back to positional + msg = "index -11|-1 is out of bounds for axis 0 with size 10" warn_msg = "Series.__setitem__ treating keys as positions is deprecated" with pytest.raises(IndexError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): @@ -232,7 +241,9 @@ def test_setitem_slicestep(self): # caught this bug when writing tests - series = Series(tm.makeIntIndex(20).astype(float), index=tm.makeIntIndex(20)) + series = Series( + np.arange(20, dtype=np.float64), index=np.arange(20, dtype=np.int64) + ) series[::2] = 0 assert (series[::2] == 0).all() @@ -500,10 +511,11 @@ def test_setitem_empty_series_datetimeindex_preserves_freq(self): # GH#33573 our index should retain its freq - series = Series([], DatetimeIndex([], freq="D"), dtype=object) + dti = DatetimeIndex([], freq="D", dtype="M8[ns]") + series = Series([], index=dti, dtype=object) key = Timestamp("2012-01-01") series[key] = 47 - expected = Series(47, DatetimeIndex([key], freq="D")) + expected = Series(47, DatetimeIndex([key], freq="D").as_unit("ns")) tm.assert_series_equal(series, expected) assert series.index.freq == expected.index.freq @@ -527,8 +539,12 @@ Timedelta("9 days").to_pytimedelta(), ], ) - def test_append_timedelta_does_not_cast(self, td): + def test_append_timedelta_does_not_cast(self, td, using_infer_string, request): # GH#22717 inserting a Timedelta should _not_ cast to int64 + if using_infer_string and not isinstance(td, Timedelta): + # TODO: GH#56010 + request.applymarker(pytest.mark.xfail(reason="inferred as string")) + expected = Series(["x", td], index=[0, "td"], dtype=object) ser = Series(["x"]) @@ -595,13 +611,21 @@ expected = Series(expected_values, dtype=target_dtype) tm.assert_series_equal(ser, expected) - def test_setitem_enlargement_object_none(self, nulls_fixture): + def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string): # GH#48665 ser = Series(["a", "b"]) ser[3] = nulls_fixture - expected = Series(["a", "b", nulls_fixture], index=[0, 1, 3]) + dtype = ( + "string[pyarrow_numpy]" + if using_infer_string and not isinstance(nulls_fixture, Decimal) + else object + ) + expected = Series(["a", "b", nulls_fixture], index=[0, 1, 3], dtype=dtype) tm.assert_series_equal(ser, expected) - assert ser[3] is nulls_fixture + if using_infer_string: + ser[3] is np.nan + else: + assert ser[3] is nulls_fixture def test_setitem_scalar_into_readonly_backing_data(): @@ -845,20 +869,28 @@ self._check_inplace(is_inplace, orig, arr, obj) - def test_index_where(self, obj, key, expected, warn, val): + def test_index_where(self, obj, key, expected, warn, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - res = Index(obj).where(~mask, val) - expected_idx = Index(expected, dtype=expected.dtype) - tm.assert_index_equal(res, expected_idx) + if using_infer_string and obj.dtype == object: + with pytest.raises(TypeError, match="Scalar must"): + Index(obj).where(~mask, val) + else: + res = Index(obj).where(~mask, val) + expected_idx = Index(expected, dtype=expected.dtype) + tm.assert_index_equal(res, expected_idx) - def test_index_putmask(self, obj, key, expected, warn, val): + def test_index_putmask(self, obj, key, expected, 
warn, val, using_infer_string): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - res = Index(obj).putmask(mask, val) - tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) + if using_infer_string and obj.dtype == object: + with pytest.raises(TypeError, match="Scalar must"): + Index(obj).putmask(mask, val) + else: + res = Index(obj).putmask(mask, val) + tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) @pytest.mark.parametrize( @@ -1409,6 +1441,10 @@ np.float32, None, marks=pytest.mark.xfail( + ( + not np_version_gte1p24 + or (np_version_gte1p24 and np._get_promotion_state() != "weak") + ), reason="np.float32(1.1) ends up as 1.100000023841858, so " "np_can_hold_element raises and we cast to float64", ), @@ -1433,7 +1469,7 @@ def test_slice_key(self, obj, key, expected, warn, val, indexer_sli, is_inplace): super().test_slice_key(obj, key, expected, warn, val, indexer_sli, is_inplace) - if type(val) is float: + if isinstance(val, float): # the xfail would xpass bc test_slice_key short-circuits raise AssertionError("xfail not relevant for this test.") @@ -1746,7 +1782,7 @@ # GH#42530 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - result = df.pop("b") + result = df.pop("b").copy() result[[True, False, False]] = 9 expected = Series(data=[9, 5, 6], name="b") tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_where.py pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_where.py --- pandas-2.1.4+dfsg/pandas/tests/series/indexing/test_where.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/indexing/test_where.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.core.dtypes.common import is_integer import pandas as pd @@ -230,6 +232,7 @@ tm.assert_series_equal(out, expected) +@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set ints into string") def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment @@ -393,16 +396,21 @@ expected = Series([10, 10]) mask = np.array([False, False]) - rs = ser.where(mask, [10, 10]) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.where(mask, [10, 10]) tm.assert_series_equal(rs, expected) - rs = ser.where(mask, 10) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.where(mask, 10) tm.assert_series_equal(rs, expected) - rs = ser.where(mask, 10.0) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.where(mask, 10.0) tm.assert_series_equal(rs, expected) - rs = ser.where(mask, [10.0, 10.0]) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.where(mask, [10.0, 10.0]) tm.assert_series_equal(rs, expected) rs = ser.where(mask, [10.0, np.nan]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_align.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_align.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_align.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_align.py 2024-04-10 17:42:52.000000000 +0000 @@ -193,7 +193,7 @@ def test_align_dt64tzindex_mismatched_tzs(): - idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") + idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern") ser = Series(np.random.default_rng(2).standard_normal(len(idx1)), 
index=idx1) ser_central = ser.tz_convert("US/Central") # different timezones convert to UTC @@ -204,7 +204,7 @@ def test_align_periodindex(join_type): - rng = period_range("1/1/2000", "1/1/2010", freq="A") + rng = period_range("1/1/2000", "1/1/2010", freq="Y") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) # TODO: assert something? @@ -240,10 +240,10 @@ result_left, result_right = left.align(right) expected_left = Series( - [2], index=pd.MultiIndex.from_tuples([(1, 3, 4, 2)], names=["a", "c", "d", "b"]) + [2], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"]) ) expected_right = Series( - [1], index=pd.MultiIndex.from_tuples([(1, 3, 4, 2)], names=["a", "c", "d", "b"]) + [1], index=pd.MultiIndex.from_tuples([(1, 4, 3, 2)], names=["a", "d", "c", "b"]) ) tm.assert_series_equal(result_left, expected_left) tm.assert_series_equal(result_right, expected_right) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_argsort.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_argsort.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_argsort.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_argsort.py 2024-04-10 17:42:52.000000000 +0000 @@ -42,14 +42,17 @@ argsorted = datetime_series.argsort() assert issubclass(argsorted.dtype.type, np.integer) + def test_argsort_dt64(self, unit): # GH#2967 (introduced bug in 0.11-dev I think) - s = Series([Timestamp(f"201301{i:02d}") for i in range(1, 6)]) - assert s.dtype == "datetime64[ns]" - shifted = s.shift(-1) - assert shifted.dtype == "datetime64[ns]" + ser = Series( + [Timestamp(f"201301{i:02d}") for i in range(1, 6)], dtype=f"M8[{unit}]" + ) + assert ser.dtype == f"datetime64[{unit}]" + shifted = ser.shift(-1) + assert shifted.dtype == f"datetime64[{unit}]" assert isna(shifted[4]) - result = s.argsort() + result = ser.argsort() expected = Series(range(5), dtype=np.intp) tm.assert_series_equal(result, expected) @@ -60,12 +63,12 @@ tm.assert_series_equal(result, expected) def test_argsort_stable(self): - s = Series(np.random.default_rng(2).integers(0, 100, size=10000)) - mindexer = s.argsort(kind="mergesort") - qindexer = s.argsort() + ser = Series(np.random.default_rng(2).integers(0, 100, size=10000)) + mindexer = ser.argsort(kind="mergesort") + qindexer = ser.argsort() - mexpected = np.argsort(s.values, kind="mergesort") - qexpected = np.argsort(s.values, kind="quicksort") + mexpected = np.argsort(ser.values, kind="mergesort") + qexpected = np.argsort(ser.values, kind="quicksort") tm.assert_series_equal(mindexer.astype(np.intp), Series(mexpected)) tm.assert_series_equal(qindexer.astype(np.intp), Series(qexpected)) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_asof.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_asof.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_asof.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_asof.py 2024-04-10 17:42:52.000000000 +0000 @@ -118,7 +118,7 @@ def test_periodindex(self): # array or list or dates N = 50 - rng = period_range("1/1/1990", periods=N, freq="H") + rng = period_range("1/1/1990", periods=N, freq="h") ts = Series(np.random.default_rng(2).standard_normal(N), index=rng) ts.iloc[15:30] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="37min") @@ -133,7 +133,7 @@ lb = ts.index[14] ub = ts.index[30] - pix = PeriodIndex(result.index.values, freq="H") + pix = PeriodIndex(result.index.values, freq="h") mask = (pix >= lb) & 
(pix < ub) rs = result[mask] assert (rs == ts[lb]).all() diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_astype.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_astype.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_astype.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_astype.py 2024-04-10 17:42:52.000000000 +0000 @@ -25,6 +25,7 @@ Timestamp, cut, date_range, + to_datetime, ) import pandas._testing as tm @@ -75,7 +76,7 @@ dt1 = dtype_class({"abc": str}) result = ser.astype(dt1) - expected = Series(["0", "2", "4", "6", "8"], name="abc") + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) tm.assert_series_equal(result, expected) dt2 = dtype_class({"abc": "float64"}) @@ -107,6 +108,32 @@ class TestAstype: + @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"]) + def test_astype_object_to_dt64_non_nano(self, tz): + # GH#55756, GH#54620 + ts = Timestamp("2999-01-01") + dtype = "M8[us]" + if tz is not None: + dtype = f"M8[us, {tz}]" + vals = [ts, "2999-01-02 03:04:05.678910", 2500] + ser = Series(vals, dtype=object) + result = ser.astype(dtype) + + # The 2500 is interpreted as microseconds, consistent with what + # we would get if we created DatetimeIndexes from vals[:2] and vals[2:] + # and concated the results. + pointwise = [ + vals[0].tz_localize(tz), + Timestamp(vals[1], tz=tz), + to_datetime(vals[2], unit="us", utc=True).tz_convert(tz), + ] + exp_vals = [x.as_unit("us").asm8 for x in pointwise] + exp_arr = np.array(exp_vals, dtype="M8[us]") + expected = Series(exp_arr, dtype="M8[us]") + if tz is not None: + expected = expected.dt.tz_localize("UTC").dt.tz_convert(tz) + tm.assert_series_equal(result, expected) + def test_astype_mixed_object_to_dt64tz(self): # pre-2.0 this raised ValueError bc of tz mismatch # xref GH#32581 @@ -143,10 +170,12 @@ Series([string.digits * 10, rand_str(63), rand_str(64), np.nan, 1.0]), ], ) - def test_astype_str_map(self, dtype, series): + def test_astype_str_map(self, dtype, series, using_infer_string): # see GH#4405 result = series.astype(dtype) expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_astype_float_to_period(self): @@ -170,7 +199,7 @@ if np.dtype(dtype).name not in ["timedelta64", "datetime64"]: mark = pytest.mark.xfail(reason="GH#33890 Is assigned ns unit") - request.node.add_marker(mark) + request.applymarker(mark) msg = ( rf"The '{dtype.__name__}' dtype has no unit\. 
" @@ -200,8 +229,8 @@ ) tm.assert_series_equal(result, expected) - def test_astype_datetime(self): - ser = Series(iNaT, dtype="M8[ns]", index=range(5)) + def test_astype_datetime(self, unit): + ser = Series(iNaT, dtype=f"M8[{unit}]", index=range(5)) ser = ser.astype("O") assert ser.dtype == np.object_ @@ -211,10 +240,12 @@ ser = ser.astype("O") assert ser.dtype == np.object_ - ser = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)]) + ser = Series( + [datetime(2001, 1, 2, 0, 0) for i in range(3)], dtype=f"M8[{unit}]" + ) ser[1] = np.nan - assert ser.dtype == "M8[ns]" + assert ser.dtype == f"M8[{unit}]" ser = ser.astype("O") assert ser.dtype == np.object_ @@ -254,13 +285,13 @@ ts = Series([Timestamp("2010-01-04 00:00:00")]) res = ts.astype(str) - expected = Series(["2010-01-04"]) + expected = Series(["2010-01-04"], dtype=object) tm.assert_series_equal(res, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) res = ts.astype(str) - expected = Series(["2010-01-04 00:00:00-05:00"]) + expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) tm.assert_series_equal(res, expected) def test_astype_str_cast_td64(self): @@ -269,7 +300,7 @@ td = Series([Timedelta(1, unit="d")]) ser = td.astype(str) - expected = Series(["1 days"]) + expected = Series(["1 days"], dtype=object) tm.assert_series_equal(ser, expected) def test_dt64_series_astype_object(self): @@ -316,7 +347,7 @@ # https://github.com/pandas-dev/pandas/issues/36451 ser = Series([0.1], dtype=dtype) result = ser.astype(str) - expected = Series(["0.1"]) + expected = Series(["0.1"], dtype=object) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -387,7 +418,7 @@ tm.assert_series_equal(result, Series(np.arange(1, 5))) - def test_astype_unicode(self): + def test_astype_unicode(self, using_infer_string): # see GH#7758: A bit of magic is required to set # default encoding to utf-8 digits = string.digits @@ -404,12 +435,14 @@ item = "野菜食べないとやばい" ser = Series([item.encode()]) result = ser.astype(np.str_) - expected = Series([item]) + expected = Series([item], dtype=object) tm.assert_series_equal(result, expected) for ser in test_series: res = ser.astype(np.str_) expec = ser.map(str) + if using_infer_string: + expec = expec.astype(object) tm.assert_series_equal(res, expec) # Restore the former encoding @@ -485,7 +518,7 @@ mark = pytest.mark.xfail( reason="TODO StringArray.astype() with missing values #GH40566" ) - request.node.add_marker(mark) + request.applymarker(mark) # GH-40351 ser = Series(data, dtype=dtype) @@ -505,12 +538,12 @@ expected = ser tm.assert_series_equal(ser.astype("category"), expected) tm.assert_series_equal(ser.astype(CategoricalDtype()), expected) - msg = r"Cannot cast object dtype to float64" + msg = r"Cannot cast object|string dtype to float64" with pytest.raises(ValueError, match=msg): ser.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) - exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object) tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype("int") @@ -640,3 +673,11 @@ result = Series([Timedelta(1), np.nan], dtype="timedelta64[ns]") expected = Series([Timedelta(1), NaT], dtype="timedelta64[ns]") tm.assert_series_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_astype_int_na_string(self): + # GH#57418 + ser = Series([12, NA], dtype="Int64[pyarrow]") + result = 
ser.astype("string[pyarrow]") + expected = Series(["12", NA], dtype="string[pyarrow]") + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_between.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_between.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_between.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_between.py 2024-04-10 17:42:52.000000000 +0000 @@ -20,7 +20,7 @@ tm.assert_series_equal(result, expected) def test_between_datetime_object_dtype(self): - ser = Series(bdate_range("1/1/2000", periods=20).astype(object)) + ser = Series(bdate_range("1/1/2000", periods=20), dtype=object) ser[::2] = np.nan result = ser[ser.between(ser[3], ser[17])] diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_case_when.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_case_when.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_case_when.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_case_when.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,148 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, + array as pd_array, + date_range, +) +import pandas._testing as tm + + +@pytest.fixture +def df(): + """ + base dataframe for testing + """ + return DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + +def test_case_when_caselist_is_not_a_list(df): + """ + Raise ValueError if caselist is not a list. + """ + msg = "The caselist argument should be a list; " + msg += "instead got.+" + with pytest.raises(TypeError, match=msg): # GH39154 + df["a"].case_when(caselist=()) + + +def test_case_when_no_caselist(df): + """ + Raise ValueError if no caselist is provided. + """ + msg = "provide at least one boolean condition, " + msg += "with a corresponding replacement." + with pytest.raises(ValueError, match=msg): # GH39154 + df["a"].case_when([]) + + +def test_case_when_odd_caselist(df): + """ + Raise ValueError if no of caselist is odd. + """ + msg = "Argument 0 must have length 2; " + msg += "a condition and replacement; instead got length 3." + + with pytest.raises(ValueError, match=msg): + df["a"].case_when([(df["a"].eq(1), 1, df.a.gt(1))]) + + +def test_case_when_raise_error_from_mask(df): + """ + Raise Error from within Series.mask + """ + msg = "Failed to apply condition0 and replacement0." + with pytest.raises(ValueError, match=msg): + df["a"].case_when([(df["a"].eq(1), [1, 2])]) + + +def test_case_when_single_condition(df): + """ + Test output on a single condition. 
+ """ + result = Series([np.nan, np.nan, np.nan]).case_when([(df.a.eq(1), 1)]) + expected = Series([1, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions(df): + """ + Test output when booleans are derived from a computation + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [(df.a.eq(1), 1), (Series([False, True, False]), 2)] + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_list(df): + """ + Test output when replacement is a list + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [([True, False, False], 1), (df["a"].gt(1) & df["b"].eq(5), [1, 2, 3])] + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_extension_dtype(df): + """ + Test output when replacement has an extension dtype + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [ + ([True, False, False], 1), + (df["a"].gt(1) & df["b"].eq(5), pd_array([1, 2, 3], dtype="Int64")), + ], + ) + expected = Series([1, 2, np.nan], dtype="Float64") + tm.assert_series_equal(result, expected) + + +def test_case_when_multiple_conditions_replacement_series(df): + """ + Test output when replacement is a Series + """ + result = Series([np.nan, np.nan, np.nan]).case_when( + [ + (np.array([True, False, False]), 1), + (df["a"].gt(1) & df["b"].eq(5), Series([1, 2, 3])), + ], + ) + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(result, expected) + + +def test_case_when_non_range_index(): + """ + Test output if index is not RangeIndex + """ + rng = np.random.default_rng(seed=123) + dates = date_range("1/1/2000", periods=8) + df = DataFrame( + rng.standard_normal(size=(8, 4)), index=dates, columns=["A", "B", "C", "D"] + ) + result = Series(5, index=df.index, name="A").case_when([(df.A.gt(0), df.B)]) + expected = df.A.mask(df.A.gt(0), df.B).where(df.A.gt(0), 5) + tm.assert_series_equal(result, expected) + + +def test_case_when_callable(): + """ + Test output on a callable + """ + # https://numpy.org/doc/stable/reference/generated/numpy.piecewise.html + x = np.linspace(-2.5, 2.5, 6) + ser = Series(x) + result = ser.case_when( + caselist=[ + (lambda df: df < 0, lambda df: -df), + (lambda df: df >= 0, lambda df: df), + ] + ) + expected = np.piecewise(x, [x < 0, x >= 0], [lambda x: -x, lambda x: x]) + tm.assert_series_equal(result, Series(expected)) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_clip.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_clip.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_clip.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_clip.py 2024-04-10 17:42:52.000000000 +0000 @@ -69,8 +69,15 @@ tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) # GH#19992 - tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, 3])) - tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, 2, 1])) + msg = "Downcasting behavior in Series and DataFrame methods 'where'" + # TODO: avoid this warning here? seems like we should never be upcasting + # in the first place? 
+ with tm.assert_produces_warning(FutureWarning, match=msg): + res = s.clip(lower=[0, 4, np.nan]) + tm.assert_series_equal(res, Series([1, 4, 3])) + with tm.assert_produces_warning(FutureWarning, match=msg): + res = s.clip(upper=[1, np.nan, 1]) + tm.assert_series_equal(res, Series([1, 2, 1])) # GH#40420 s = Series([1, 2, 3]) @@ -128,11 +135,12 @@ ) tm.assert_series_equal(result, expected) - def test_clip_with_timestamps_and_oob_datetimes(self): + @pytest.mark.parametrize("dtype", [object, "M8[us]"]) + def test_clip_with_timestamps_and_oob_datetimes(self, dtype): # GH-42794 - ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)]) + ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)], dtype=dtype) result = ser.clip(lower=Timestamp.min, upper=Timestamp.max) - expected = Series([Timestamp.min, Timestamp.max], dtype="object") + expected = Series([Timestamp.min, Timestamp.max], dtype=dtype) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_combine_first.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_combine_first.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_combine_first.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_combine_first.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,7 +16,7 @@ class TestCombineFirst: def test_combine_first_period_datetime(self): # GH#3367 - didx = date_range(start="1950-01-31", end="1950-07-31", freq="M") + didx = date_range(start="1950-01-31", end="1950-07-31", freq="ME") pidx = period_range(start=Period("1950-1"), end=Period("1950-7"), freq="M") # check to be consistent with DatetimeIndex for idx in [didx, pidx]: @@ -32,8 +32,8 @@ assert result.name == datetime_series.name def test_combine_first(self): - values = tm.makeIntIndex(20).values.astype(float) - series = Series(values, index=tm.makeIntIndex(20)) + values = np.arange(20, dtype=np.float64) + series = Series(values, index=np.arange(20, dtype=np.int64)) series_copy = series * 2 series_copy[::2] = np.nan @@ -51,9 +51,9 @@ tm.assert_series_equal(combined[1::2], series_copy[1::2]) # mixed types - index = tm.makeStringIndex(20) + index = pd.Index([str(i) for i in range(20)]) floats = Series(np.random.default_rng(2).standard_normal(20), index=index) - strings = Series(tm.makeStringIndex(10), index=index[::2]) + strings = Series([str(i) for i in range(10)], index=index[::2], dtype=object) combined = strings.combine_first(floats) @@ -69,14 +69,14 @@ ser.index = ser.index.astype("O") tm.assert_series_equal(ser, result) - def test_combine_first_dt64(self): - s0 = to_datetime(Series(["2010", np.nan])) - s1 = to_datetime(Series([np.nan, "2011"])) + def test_combine_first_dt64(self, unit): + s0 = to_datetime(Series(["2010", np.nan])).dt.as_unit(unit) + s1 = to_datetime(Series([np.nan, "2011"])).dt.as_unit(unit) rs = s0.combine_first(s1) - xp = to_datetime(Series(["2010", "2011"])) + xp = to_datetime(Series(["2010", "2011"])).dt.as_unit(unit) tm.assert_series_equal(rs, xp) - s0 = to_datetime(Series(["2010", np.nan])) + s0 = to_datetime(Series(["2010", np.nan])).dt.as_unit(unit) s1 = Series([np.nan, "2011"]) rs = s0.combine_first(s1) @@ -107,7 +107,7 @@ time_index = date_range( datetime(2021, 1, 1, 1), datetime(2021, 1, 1, 10), - freq="H", + freq="h", tz="Europe/Rome", ) s1 = Series(range(10), index=time_index) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_convert_dtypes.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_convert_dtypes.py --- 
pandas-2.1.4+dfsg/pandas/tests/series/methods/test_convert_dtypes.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_convert_dtypes.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._libs import lib + import pandas as pd import pandas._testing as tm @@ -125,19 +127,37 @@ ), (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}), ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("s"), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ms"), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("us"), + pd.DatetimeTZDtype(tz="UTC"), + pd.DatetimeTZDtype(tz="UTC"), + {}, + ), + ( + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), pd.DatetimeTZDtype(tz="UTC"), pd.DatetimeTZDtype(tz="UTC"), {}, ), ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), "datetime64[ns]", np.dtype("datetime64[ns]"), {}, ), ( - pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), + pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]).as_unit("ns"), object, np.dtype("datetime64[ns]"), {("infer_objects", False): np.dtype("object")}, @@ -166,11 +186,12 @@ self, test_cases, params, + using_infer_string, ): data, maindtype, expected_default, expected_other = test_cases if ( hasattr(data, "dtype") - and data.dtype == "M8[ns]" + and lib.is_np_dtype(data.dtype, "M") and isinstance(maindtype, pd.DatetimeTZDtype) ): # this astype is deprecated in favor of tz_localize @@ -199,6 +220,16 @@ for spec, dtype in expected_other.items(): if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])): expected_dtype = dtype + if ( + using_infer_string + and expected_default == "string" + and expected_dtype == object + and params[0] + and not params[1] + ): + # If we would convert with convert strings then infer_objects converts + # with the option + expected_dtype = "string[pyarrow_numpy]" expected = pd.Series(data, dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -265,3 +296,11 @@ result = ser.convert_dtypes(dtype_backend="numpy_nullable") expected = pd.Series(range(2), dtype="Int32") tm.assert_series_equal(result, expected) + + def test_convert_dtypes_pyarrow_null(self): + # GH#55346 + pa = pytest.importorskip("pyarrow") + ser = pd.Series([None, None]) + result = ser.convert_dtypes(dtype_backend="pyarrow") + expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_copy.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_copy.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_copy.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_copy.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,7 +10,7 @@ class TestCopy: @pytest.mark.parametrize("deep", ["default", None, False, True]) - def test_copy(self, deep, using_copy_on_write): + def test_copy(self, deep, using_copy_on_write, warn_copy_on_write): ser = Series(np.arange(10), dtype="float64") # default deep is True @@ -27,7 +27,8 @@ else: assert not np.may_share_memory(ser.values, ser2.values) - ser2[::2] = np.nan + with 
tm.assert_cow_warning(warn_copy_on_write and deep is False): + ser2[::2] = np.nan if deep is not False or using_copy_on_write: # Did not modify original Series @@ -38,6 +39,7 @@ assert np.isnan(ser2[0]) assert np.isnan(ser[0]) + @pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning") @pytest.mark.parametrize("deep", ["default", None, False, True]) def test_copy_tzaware(self, deep, using_copy_on_write): # GH#11794 diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_cov_corr.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_cov_corr.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_cov_corr.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_cov_corr.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,6 +6,7 @@ import pandas as pd from pandas import ( Series, + date_range, isna, ) import pandas._testing as tm @@ -81,8 +82,12 @@ cp[:] = np.nan assert isna(cp.corr(cp)) - A = tm.makeTimeSeries() - B = tm.makeTimeSeries() + A = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + B = A.copy() result = A.corr(B) expected, _ = stats.pearsonr(A, B) tm.assert_almost_equal(result, expected) @@ -91,9 +96,13 @@ stats = pytest.importorskip("scipy.stats") # kendall and spearman - A = tm.makeTimeSeries() - B = tm.makeTimeSeries() - A[-5:] = A[:5] + A = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) + B = A.copy() + A[-5:] = A[:5].copy() result = A.corr(B, method="kendall") expected = stats.kendalltau(A, B)[0] tm.assert_almost_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_diff.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_diff.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_diff.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_diff.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,7 +31,11 @@ def test_diff_tz(self): # Combined datetime diff, normal diff and boolean diff test - ts = tm.makeTimeSeries(name="ts") + ts = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10), + name="ts", + ) ts.diff() # neg n diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_dropna.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_dropna.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_dropna.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_dropna.py 2024-04-10 17:42:52.000000000 +0000 @@ -68,7 +68,7 @@ tm.assert_series_equal(result, expected) - def test_datetime64_tz_dropna(self): + def test_datetime64_tz_dropna(self, unit): # DatetimeLikeBlock ser = Series( [ @@ -76,20 +76,23 @@ NaT, Timestamp("2011-01-03 10:00"), NaT, - ] + ], + dtype=f"M8[{unit}]", ) result = ser.dropna() expected = Series( - [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-03 10:00")], index=[0, 2] + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-03 10:00")], + index=[0, 2], + dtype=f"M8[{unit}]", ) tm.assert_series_equal(result, expected) # DatetimeTZBlock idx = DatetimeIndex( ["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz="Asia/Tokyo" - ) + ).as_unit(unit) ser = Series(idx) - assert ser.dtype == "datetime64[ns, Asia/Tokyo]" + assert ser.dtype == f"datetime64[{unit}, Asia/Tokyo]" result = ser.dropna() expected = Series( [ @@ -97,8 +100,9 @@ Timestamp("2011-01-03 10:00", tz="Asia/Tokyo"), ], index=[0, 2], + dtype=f"datetime64[{unit}, Asia/Tokyo]", ) - 
assert result.dtype == "datetime64[ns, Asia/Tokyo]" + assert result.dtype == f"datetime64[{unit}, Asia/Tokyo]" tm.assert_series_equal(result, expected) @pytest.mark.parametrize("val", [1, 1.5]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_equals.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_equals.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_equals.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_equals.py 2024-04-10 17:42:52.000000000 +0000 @@ -82,13 +82,15 @@ left = Series([np.datetime64("NaT")], dtype=object) right = Series([np.datetime64("NaT")], dtype=object) assert left.equals(right) - assert Index(left).equals(Index(right)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.timedelta64("NaT")], dtype=object) right = Series([np.timedelta64("NaT")], dtype=object) assert left.equals(right) - assert Index(left).equals(Index(right)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.float64("NaN")], dtype=object) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_explode.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_explode.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_explode.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_explode.py 2024-04-10 17:42:52.000000000 +0000 @@ -163,3 +163,13 @@ dtype=pd.ArrowDtype(pa.int64()), ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ignore_index", [True, False]) +def test_explode_pyarrow_non_list_type(ignore_index): + pa = pytest.importorskip("pyarrow") + data = [1, 2, 3] + ser = pd.Series(data, dtype=pd.ArrowDtype(pa.int64())) + result = ser.explode(ignore_index=ignore_index) + expected = pd.Series([1, 2, 3], dtype="int64[pyarrow]", index=[0, 1, 2]) + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_fillna.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_fillna.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_fillna.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_fillna.py 2024-04-10 17:42:52.000000000 +0000 @@ -19,6 +19,7 @@ Timestamp, date_range, isna, + timedelta_range, ) import pandas._testing as tm from pandas.core.arrays import period_array @@ -71,7 +72,9 @@ datetime_series.fillna(value=0, method="ffill") def test_fillna(self): - ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) + ts = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("2020-01-01", periods=5) + ) tm.assert_series_equal(ts, ts.fillna(method="ffill")) @@ -237,7 +240,7 @@ expected = Series([0, 1, 2.5, 4, 4], dtype=np.float64) tm.assert_series_equal(res, expected) - def test_timedelta_fillna(self, frame_or_series): + def test_timedelta_fillna(self, frame_or_series, unit): # GH#3371 ser = Series( [ @@ -245,10 +248,11 @@ Timestamp("20130101"), Timestamp("20130102"), Timestamp("20130103 9:01:01"), - ] + ], + dtype=f"M8[{unit}]", ) td = ser.diff() - obj = frame_or_series(td) + obj = frame_or_series(td).copy() # reg fillna result = obj.fillna(Timedelta(seconds=0)) @@ -258,7 +262,8 @@ timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] + ], + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) 
tm.assert_equal(result, expected) @@ -277,7 +282,8 @@ timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] + ], + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -289,7 +295,8 @@ timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] + ], + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -301,7 +308,8 @@ timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] + ], + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) @@ -314,14 +322,14 @@ timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), ], - dtype="m8[ns]", + dtype=f"m8[{unit}]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) # ffill td[2] = np.nan - obj = frame_or_series(td) + obj = frame_or_series(td).copy() result = obj.ffill() expected = td.fillna(Timedelta(seconds=0)) expected[0] = np.nan @@ -373,6 +381,72 @@ ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "scalar", + [ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + reason="GH#56410 scalar case not yet addressed" + ), + ), + ], + ) + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_datetime64_fillna_mismatched_reso_no_rounding(self, tz, scalar): + # GH#56410 + dti = date_range("2016-01-01", periods=3, unit="s", tz=tz) + item = Timestamp("2016-02-03 04:05:06.789", tz=tz) + vec = date_range(item, periods=3, unit="ms") + + exp_dtype = "M8[ms]" if tz is None else "M8[ms, UTC]" + expected = Series([item, dti[1], dti[2]], dtype=exp_dtype) + + ser = Series(dti) + ser[0] = NaT + ser2 = ser.copy() + + res = ser.fillna(item) + res2 = ser2.fillna(Series(vec)) + + if scalar: + tm.assert_series_equal(res, expected) + else: + tm.assert_series_equal(res2, expected) + + @pytest.mark.parametrize( + "scalar", + [ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + reason="GH#56410 scalar case not yet addressed" + ), + ), + ], + ) + def test_timedelta64_fillna_mismatched_reso_no_rounding(self, scalar): + # GH#56410 + tdi = date_range("2016-01-01", periods=3, unit="s") - Timestamp("1970-01-01") + item = Timestamp("2016-02-03 04:05:06.789") - Timestamp("1970-01-01") + vec = timedelta_range(item, periods=3, unit="ms") + + expected = Series([item, tdi[1], tdi[2]], dtype="m8[ms]") + + ser = Series(tdi) + ser[0] = NaT + ser2 = ser.copy() + + res = ser.fillna(item) + res2 = ser2.fillna(Series(vec)) + + if scalar: + tm.assert_series_equal(res, expected) + else: + tm.assert_series_equal(res2, expected) + def test_datetime64_fillna_backfill(self): # GH#6587 # make sure that we are treating as integer when filling @@ -390,7 +464,7 @@ tm.assert_series_equal(result, expected) @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) - def test_datetime64_tz_fillna(self, tz): + def test_datetime64_tz_fillna(self, tz, unit): # DatetimeLikeBlock ser = Series( [ @@ -398,7 +472,8 @@ NaT, Timestamp("2011-01-03 10:00"), NaT, - ] + ], + dtype=f"M8[{unit}]", ) null_loc = Series([False, True, False, True]) @@ -409,7 +484,8 @@ Timestamp("2011-01-02 10:00"), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-02 10:00"), - ] + ], + dtype=f"M8[{unit}]", ) tm.assert_series_equal(expected, result) # check s is not changed @@ -466,15 +542,18 @@ Timestamp("2011-01-02 10:00"), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-04 10:00"), - ] + ], + dtype=f"M8[{unit}]", ) tm.assert_series_equal(expected, result) 
tm.assert_series_equal(isna(ser), null_loc) # DatetimeTZBlock - idx = DatetimeIndex(["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz=tz) + idx = DatetimeIndex( + ["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz=tz + ).as_unit(unit) ser = Series(idx) - assert ser.dtype == f"datetime64[ns, {tz}]" + assert ser.dtype == f"datetime64[{unit}, {tz}]" tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna(Timestamp("2011-01-02 10:00")) @@ -498,7 +577,7 @@ "2011-01-02 10:00", ], tz=tz, - ) + ).as_unit(unit) expected = Series(idx) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) @@ -512,7 +591,7 @@ "2011-01-02 10:00", ], tz=tz, - ) + ).as_unit(unit) expected = Series(idx) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) @@ -560,7 +639,7 @@ Timestamp("2011-01-03 10:00", tz=tz), Timestamp("2011-01-04 10:00", tz=tz), ] - ) + ).dt.as_unit(unit) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) @@ -587,7 +666,7 @@ Timestamp("2011-01-03 10:00", tz=tz), Timestamp("2013-01-01", tz="US/Pacific").tz_convert(tz), ] - ) + ).dt.as_unit(unit) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) @@ -880,7 +959,9 @@ tm.assert_series_equal(filled, expected) def test_ffill(self): - ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) + ts = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("2020-01-01", periods=5) + ) ts.iloc[2] = np.nan tm.assert_series_equal(ts.ffill(), ts.fillna(method="ffill")) @@ -891,7 +972,9 @@ tm.assert_series_equal(series, result) def test_bfill(self): - ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) + ts = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("2020-01-01", periods=5) + ) ts.iloc[2] = np.nan tm.assert_series_equal(ts.bfill(), ts.fillna(method="bfill")) @@ -997,3 +1080,76 @@ ser = Series([1, 2, 3]) with tm.assert_produces_warning(FutureWarning): getattr(ser, func)() + + +@pytest.mark.parametrize( + "data, expected_data, method, kwargs", + ( + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan], + "ffill", + {"limit_area": "inside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan], + "ffill", + {"limit_area": "inside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0], + "ffill", + {"limit_area": "outside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan], + "ffill", + {"limit_area": "outside", "limit": 1}, + ), + ( + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + "ffill", + {"limit_area": "outside", "limit": 1}, + ), + ( + range(5), + range(5), + "ffill", + {"limit_area": "outside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "inside"}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan], + "bfill", + {"limit_area": "inside", "limit": 1}, + ), + ( + [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan], + [3.0, 3.0, 3.0, np.nan, 
np.nan, np.nan, 7.0, np.nan, np.nan],
+            "bfill",
+            {"limit_area": "outside"},
+        ),
+        (
+            [np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
+            [np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
+            "bfill",
+            {"limit_area": "outside", "limit": 1},
+        ),
+    ),
+)
+def test_ffill_bfill_limit_area(data, expected_data, method, kwargs):
+    # GH#56492
+    s = Series(data)
+    expected = Series(expected_data)
+    result = getattr(s, method)(**kwargs)
+    tm.assert_series_equal(result, expected)
diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_get_numeric_data.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_get_numeric_data.py
--- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_get_numeric_data.py 2023-12-08 14:17:35.000000000 +0000
+++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_get_numeric_data.py 2024-04-10 17:42:52.000000000 +0000
@@ -7,14 +7,17 @@
 
 
 class TestGetNumericData:
-    def test_get_numeric_data_preserve_dtype(self, using_copy_on_write):
+    def test_get_numeric_data_preserve_dtype(
+        self, using_copy_on_write, warn_copy_on_write
+    ):
         # get the numeric data
         obj = Series([1, 2, 3])
         result = obj._get_numeric_data()
         tm.assert_series_equal(result, obj)
 
         # returned object is a shallow copy
-        result.iloc[0] = 0
+        with tm.assert_cow_warning(warn_copy_on_write):
+            result.iloc[0] = 0
         if using_copy_on_write:
             assert obj.iloc[0] == 1
         else:
diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_infer_objects.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_infer_objects.py
--- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_infer_objects.py 2023-12-08 14:17:35.000000000 +0000
+++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_infer_objects.py 2024-04-10 17:42:52.000000000 +0000
@@ -31,7 +31,7 @@
         expected = index_or_series([1.0, 2.0, 3.0, np.nan])
         tm.assert_equal(actual, expected)
 
-        # only soft conversions, unconvertable pass thru unchanged
+        # only soft conversions, unconvertible pass thru unchanged
         obj = index_or_series(np.array([1, 2, 3, None, "a"], dtype="O"))
         actual = obj.infer_objects()
 
diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_info.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_info.py
--- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_info.py 1970-01-01 00:00:00.000000000 +0000
+++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_info.py 2024-04-10 17:42:52.000000000 +0000
@@ -0,0 +1,181 @@
+from io import StringIO
+from string import ascii_uppercase
+import textwrap
+
+import numpy as np
+import pytest
+
+from pandas.compat import PYPY
+
+from pandas import (
+    CategoricalIndex,
+    MultiIndex,
+    Series,
+    date_range,
+)
+
+
+def test_info_categorical_column_just_works():
+    n = 2500
+    data = np.array(list("abcdefghij")).take(
+        np.random.default_rng(2).integers(0, 10, size=n, dtype=int)
+    )
+    s = Series(data).astype("category")
+    s.isna()
+    buf = StringIO()
+    s.info(buf=buf)
+
+    s2 = s[s == "d"]
+    buf = StringIO()
+    s2.info(buf=buf)
+
+
+def test_info_categorical():
+    # GH14298
+    idx = CategoricalIndex(["a", "b"])
+    s = Series(np.zeros(2), index=idx)
+    buf = StringIO()
+    s.info(buf=buf)
+
+
+@pytest.mark.parametrize("verbose", [True, False])
+def test_info_series(lexsorted_two_level_string_multiindex, verbose):
+    index = lexsorted_two_level_string_multiindex
+    ser = Series(range(len(index)), index=index, name="sth")
+    buf = StringIO()
+    ser.info(verbose=verbose, buf=buf)
+    result = buf.getvalue()
+
+    expected = textwrap.dedent(
+        """\
+        <class 'pandas.core.series.Series'>
+        MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three')
+        """
+    )
+    if verbose:
+        expected += textwrap.dedent(
+            """\
+            Series name: sth
+            Non-Null Count  Dtype
+            --------------  -----
+            10 non-null     int64
+            """
+        )
+    expected += textwrap.dedent(
+        f"""\
+        dtypes: int64(1)
+        memory usage: {ser.memory_usage()}.0+ bytes
+        """
+    )
+    assert result == expected
+
+
+def test_info_memory():
+    s = Series([1, 2], dtype="i8")
+    buf = StringIO()
+    s.info(buf=buf)
+    result = buf.getvalue()
+    memory_bytes = float(s.memory_usage())
+    expected = textwrap.dedent(
+        f"""\
+        <class 'pandas.core.series.Series'>
+        RangeIndex: 2 entries, 0 to 1
+        Series name: None
+        Non-Null Count  Dtype
+        --------------  -----
+        2 non-null      int64
+        dtypes: int64(1)
+        memory usage: {memory_bytes} bytes
+        """
+    )
+    assert result == expected
+
+
+def test_info_wide():
+    s = Series(np.random.default_rng(2).standard_normal(101))
+    msg = "Argument `max_cols` can only be passed in DataFrame.info, not Series.info"
+    with pytest.raises(ValueError, match=msg):
+        s.info(max_cols=1)
+
+
+def test_info_shows_dtypes():
+    dtypes = [
+        "int64",
+        "float64",
+        "datetime64[ns]",
+        "timedelta64[ns]",
+        "complex128",
+        "object",
+        "bool",
+    ]
+    n = 10
+    for dtype in dtypes:
+        s = Series(np.random.default_rng(2).integers(2, size=n).astype(dtype))
+        buf = StringIO()
+        s.info(buf=buf)
+        res = buf.getvalue()
+        name = f"{n:d} non-null     {dtype}"
+        assert name in res
+
+
+@pytest.mark.xfail(PYPY, reason="on PyPy deep=True doesn't change result")
+def test_info_memory_usage_deep_not_pypy():
+    s_with_object_index = Series({"a": [1]}, index=["foo"])
+    assert s_with_object_index.memory_usage(
+        index=True, deep=True
+    ) > s_with_object_index.memory_usage(index=True)
+
+    s_object = Series({"a": ["a"]})
+    assert s_object.memory_usage(deep=True) > s_object.memory_usage()
+
+
+@pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result")
+def test_info_memory_usage_deep_pypy():
+    s_with_object_index = Series({"a": [1]}, index=["foo"])
+    assert s_with_object_index.memory_usage(
+        index=True, deep=True
+    ) == s_with_object_index.memory_usage(index=True)
+
+    s_object = Series({"a": ["a"]})
+    assert s_object.memory_usage(deep=True) == s_object.memory_usage()
+
+
+@pytest.mark.parametrize(
+    "series, plus",
+    [
+        (Series(1, index=[1, 2, 3]), False),
+        (Series(1, index=list("ABC")), True),
+        (Series(1, index=MultiIndex.from_product([range(3), range(3)])), False),
+        (
+            Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])),
+            True,
+        ),
+    ],
+)
+def test_info_memory_usage_qualified(series, plus):
+    buf = StringIO()
+    series.info(buf=buf)
+    if plus:
+        assert "+" in buf.getvalue()
+    else:
+        assert "+" not in buf.getvalue()
+
+
+def test_info_memory_usage_bug_on_multiindex():
+    # GH 14308
+    # memory usage introspection should not materialize .values
+    N = 100
+    M = len(ascii_uppercase)
+    index = MultiIndex.from_product(
+        [list(ascii_uppercase), date_range("20160101", periods=N)],
+        names=["id", "date"],
+    )
+    s = Series(np.random.default_rng(2).standard_normal(N * M), index=index)
+
+    unstacked = s.unstack("id")
+    assert s.values.nbytes == unstacked.values.nbytes
+    assert s.memory_usage(deep=True) > unstacked.memory_usage(deep=True).sum()
+
+    # high upper bound
+    diff = unstacked.memory_usage(deep=True).sum() - s.memory_usage(deep=True)
+    assert diff < 2000
diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_interpolate.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_interpolate.py
--- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_interpolate.py 2023-12-08 14:17:35.000000000 +0000
+++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_interpolate.py 2024-04-10 17:42:52.000000000 +0000
@@ -205,7 +205,7 @@
         [
             {},
             pytest.param(
-                {"method": "polynomial", "order": 1}, marks=td.skip_if_no_scipy
+                {"method": "polynomial", "order": 1}, marks=td.skip_if_no("scipy")
             ),
         ],
     )
@@ -253,7 +253,7 @@
         [
             {},
             pytest.param(
-                {"method": "polynomial", "order": 1}, marks=td.skip_if_no_scipy
+                {"method": "polynomial", "order": 1}, marks=td.skip_if_no("scipy")
             ),
         ],
     )
@@ -628,7 +628,7 @@
         tm.assert_series_equal(result, s)
 
     @pytest.mark.parametrize(
-        "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)]
+        "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))]
     )
     def test_interp_multiIndex(self, check_scipy):
         idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")])
@@ -780,7 +780,7 @@
 
         exp = ts.reindex(new_index).interpolate(method="time")
 
-        index = date_range("1/1/2012", periods=4, freq="12H")
+        index = date_range("1/1/2012", periods=4, freq="12h")
         ts = Series([0, 12, 24, 36], index)
         new_index = index.append(index + pd.DateOffset(hours=1)).sort_values()
         result = ts.reindex(new_index).interpolate(method="time")
@@ -831,7 +831,7 @@
         method, kwargs = interp_methods_ind
 
         if method in {"cubic", "zero"}:
-            request.node.add_marker(
+            request.applymarker(
                 pytest.mark.xfail(
                     reason=f"{method} interpolation is not supported for TimedeltaIndex"
                 )
diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_isin.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_isin.py
--- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_isin.py 2023-12-08 14:17:35.000000000 +0000
+++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_isin.py 2024-04-10 17:42:52.000000000 +0000
@@ -7,6 +7,7 @@
     date_range,
 )
 import pandas._testing as tm
+from pandas.core import algorithms
 from pandas.core.arrays import PeriodArray
 
 
@@ -197,13 +198,16 @@
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.slow
-def test_isin_large_series_mixed_dtypes_and_nan():
+def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch):
     # https://github.com/pandas-dev/pandas/issues/37094
-    # combination of object dtype for the values and > 1_000_000 elements
-    ser = Series([1, 2, np.nan] * 1_000_000)
-    result = ser.isin({"foo", "bar"})
-    expected = Series([False] * 3 * 1_000_000)
+    # combination of object dtype for the values
+    # and > _MINIMUM_COMP_ARR_LEN elements
+    min_isin_comp = 5
+    ser = Series([1, 2, np.nan] * min_isin_comp)
+    with monkeypatch.context() as m:
+        m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp)
+        result = ser.isin({"foo", "bar"})
+    expected = Series([False] * 3 * min_isin_comp)
     tm.assert_series_equal(result, expected)
 
 
diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_map.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_map.py
--- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_map.py 2023-12-08 14:17:35.000000000 +0000
+++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_map.py 2024-04-10 17:42:52.000000000 +0000
@@ -14,6 +14,8 @@
     Index,
     MultiIndex,
     Series,
+    bdate_range,
+    date_range,
     isna,
     timedelta_range,
 )
@@ -73,7 +75,7 @@
 
 def test_series_map_box_timestamps():
     # GH#2689, GH#2627
-    ser = Series(pd.date_range("1/1/2000", periods=3))
+    ser = Series(date_range("1/1/2000", periods=3))
 
     def func(x):
         return (x.hour, x.day, x.month)
@@ -83,7 +85,7 @@
     tm.assert_series_equal(result, expected)
 
 
-def test_map_series_stringdtype(any_string_dtype):
+def test_map_series_stringdtype(any_string_dtype, using_infer_string):
     # map test on StringDType, GH#40823
     ser1 = Series(
data=["cat", "dog", "rabbit"], @@ -98,6 +100,8 @@ item = np.nan expected = Series(data=["rabbit", "dog", "cat", item], dtype=any_string_dtype) + if using_infer_string and any_string_dtype == "object": + expected = expected.astype("string[pyarrow_numpy]") tm.assert_series_equal(result, expected) @@ -106,7 +110,7 @@ "data, expected_dtype", [(["1-1", "1-1", np.nan], "category"), (["1-1", "1-2", np.nan], object)], ) -def test_map_categorical_with_nan_values(data, expected_dtype): +def test_map_categorical_with_nan_values(data, expected_dtype, using_infer_string): # GH 20714 bug fixed in: GH 24275 def func(val): return val.split("-")[0] @@ -114,6 +118,8 @@ s = Series(data, dtype="category") result = s.map(func, na_action="ignore") + if using_infer_string and expected_dtype == object: + expected_dtype = "string[pyarrow_numpy]" expected = Series(["1", "1", np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -127,17 +133,21 @@ def test_map_empty_integer_series_with_datetime_index(): # GH 21245 - s = Series([], index=pd.date_range(start="2018-01-01", periods=0), dtype=int) + s = Series([], index=date_range(start="2018-01-01", periods=0), dtype=int) result = s.map(lambda x: x) tm.assert_series_equal(result, s) @pytest.mark.parametrize("func", [str, lambda x: str(x)]) -def test_map_simple_str_callables_same_as_astype(string_series, func): +def test_map_simple_str_callables_same_as_astype( + string_series, func, using_infer_string +): # test that we are evaluating row-by-row first # before vectorized evaluation result = string_series.map(func) - expected = string_series.astype(str) + expected = string_series.astype( + str if not using_infer_string else "string[pyarrow_numpy]" + ) tm.assert_series_equal(result, expected) @@ -146,8 +156,13 @@ string_series.map([lambda x: x]) -def test_map(datetime_series): - index, data = tm.getMixedTypeDict() +def test_map(): + data = { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + } source = Series(data["B"], index=data["C"]) target = Series(data["C"][:4], index=data["D"][:4]) @@ -163,10 +178,14 @@ for k, v in merged.items(): assert v == source[target[k]] + +def test_map_datetime(datetime_series): # function result = datetime_series.map(lambda x: x * 2) tm.assert_series_equal(result, datetime_series * 2) + +def test_map_category(): # GH 10324 a = Series([1, 2, 3, 4]) b = Series(["even", "odd", "even", "odd"], dtype="category") @@ -177,6 +196,8 @@ exp = Series(["odd", "even", "odd", np.nan]) tm.assert_series_equal(a.map(c), exp) + +def test_map_category_numeric(): a = Series(["a", "b", "c", "d"]) b = Series([1, 2, 3, 4], index=pd.CategoricalIndex(["b", "c", "d", "e"])) c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"])) @@ -186,6 +207,8 @@ exp = Series([np.nan, 1, 2, 3]) tm.assert_series_equal(a.map(c), exp) + +def test_map_category_string(): a = Series(["a", "b", "c", "d"]) b = Series( ["B", "C", "D", "E"], @@ -204,7 +227,7 @@ def test_map_empty(request, index): if isinstance(index, MultiIndex): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( reason="Initializing a Series from a MultiIndex is not supported" ) @@ -418,44 +441,50 @@ tm.assert_series_equal(result, expected) -def test_map_box(): +def test_map_box_dt64(unit): vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] - s = Series(vals) - assert s.dtype == "datetime64[ns]" + ser = Series(vals).dt.as_unit(unit) + assert ser.dtype == 
f"datetime64[{unit}]" # boxed value must be Timestamp instance - res = s.map(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + res = ser.map(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") exp = Series(["Timestamp_1_None", "Timestamp_2_None"]) tm.assert_series_equal(res, exp) + +def test_map_box_dt64tz(unit): vals = [ pd.Timestamp("2011-01-01", tz="US/Eastern"), pd.Timestamp("2011-01-02", tz="US/Eastern"), ] - s = Series(vals) - assert s.dtype == "datetime64[ns, US/Eastern]" - res = s.map(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") + ser = Series(vals).dt.as_unit(unit) + assert ser.dtype == f"datetime64[{unit}, US/Eastern]" + res = ser.map(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) tm.assert_series_equal(res, exp) + +def test_map_box_td64(unit): # timedelta vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] - s = Series(vals) - assert s.dtype == "timedelta64[ns]" - res = s.map(lambda x: f"{type(x).__name__}_{x.days}") + ser = Series(vals).dt.as_unit(unit) + assert ser.dtype == f"timedelta64[{unit}]" + res = ser.map(lambda x: f"{type(x).__name__}_{x.days}") exp = Series(["Timedelta_1", "Timedelta_2"]) tm.assert_series_equal(res, exp) + +def test_map_box_period(): # period vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] - s = Series(vals) - assert s.dtype == "Period[M]" - res = s.map(lambda x: f"{type(x).__name__}_{x.freqstr}") + ser = Series(vals) + assert ser.dtype == "Period[M]" + res = ser.map(lambda x: f"{type(x).__name__}_{x.freqstr}") exp = Series(["Period_M", "Period_M"]) tm.assert_series_equal(res, exp) @pytest.mark.parametrize("na_action", [None, "ignore"]) -def test_map_categorical(na_action): +def test_map_categorical(na_action, using_infer_string): values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) s = Series(values, name="XX", index=list("abcdefg")) @@ -468,7 +497,7 @@ result = s.map(lambda x: "A", na_action=na_action) exp = Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == object + assert result.dtype == object if not using_infer_string else "string" @pytest.mark.parametrize( @@ -494,14 +523,12 @@ def test_map_datetimetz(): - values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( - "Asia/Tokyo" - ) + values = date_range("2011-01-01", "2011-01-02", freq="h").tz_localize("Asia/Tokyo") s = Series(values, name="XX") # keep tz result = s.map(lambda x: x + pd.offsets.Day()) - exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + exp_values = date_range("2011-01-02", "2011-01-03", freq="h").tz_localize( "Asia/Tokyo" ) exp = Series(exp_values, name="XX") @@ -530,20 +557,26 @@ (list(range(3)), {0: 42}, [42] + [np.nan] * 3), ], ) -def test_map_missing_mixed(vals, mapping, exp): +def test_map_missing_mixed(vals, mapping, exp, using_infer_string): # GH20495 s = Series(vals + [np.nan]) result = s.map(mapping) - - tm.assert_series_equal(result, Series(exp)) + exp = Series(exp) + if using_infer_string and mapping == {np.nan: "not NaN"}: + exp.iloc[-1] = np.nan + tm.assert_series_equal(result, exp) def test_map_scalar_on_date_time_index_aware_series(): # GH 25959 # Calling map on a localized time series should not cause an error - series = tm.makeTimeSeries(nper=30).tz_localize("UTC") + series = Series( + np.arange(10, dtype=np.float64), + index=date_range("2020-01-01", periods=10, tz="UTC"), + name="ts", + ) result = Series(series.index).map(lambda 
x: 1) - tm.assert_series_equal(result, Series(np.ones(30), dtype="int64")) + tm.assert_series_equal(result, Series(np.ones(len(series)), dtype="int64")) def test_map_float_to_string_precision(): diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_pct_change.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_pct_change.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_pct_change.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_pct_change.py 2024-04-10 17:42:52.000000000 +0000 @@ -118,3 +118,11 @@ result = ser.pct_change() expected = Series([np.nan, np.nan, np.nan, 1, 0.5]) tm.assert_series_equal(result, expected) + + +def test_pct_change_empty(): + # GH 57056 + ser = Series([], dtype="float64") + expected = ser.copy() + result = ser.pct_change(periods=0) + tm.assert_series_equal(expected, result) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_quantile.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_quantile.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_quantile.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_quantile.py 2024-04-10 17:42:52.000000000 +0000 @@ -48,7 +48,8 @@ with pytest.raises(ValueError, match=msg): s.quantile(percentile_array) - def test_quantile_multi(self, datetime_series): + def test_quantile_multi(self, datetime_series, unit): + datetime_series.index = datetime_series.index.as_unit(unit) qs = [0.1, 0.9] result = datetime_series.quantile(qs) expected = Series( @@ -68,6 +69,7 @@ [Timestamp("2000-01-10 19:12:00"), Timestamp("2000-01-10 19:12:00")], index=[0.2, 0.2], name="xxx", + dtype=f"M8[{unit}]", ) tm.assert_series_equal(result, expected) @@ -103,8 +105,8 @@ def test_quantile_nan(self): # GH 13098 - s = Series([1, 2, 3, 4, np.nan]) - result = s.quantile(0.5) + ser = Series([1, 2, 3, 4, np.nan]) + result = ser.quantile(0.5) expected = 2.5 assert result == expected @@ -112,14 +114,14 @@ s1 = Series([], dtype=object) cases = [s1, Series([np.nan, np.nan])] - for s in cases: - res = s.quantile(0.5) + for ser in cases: + res = ser.quantile(0.5) assert np.isnan(res) - res = s.quantile([0.5]) + res = ser.quantile([0.5]) tm.assert_series_equal(res, Series([np.nan], index=[0.5])) - res = s.quantile([0.2, 0.3]) + res = ser.quantile([0.2, 0.3]) tm.assert_series_equal(res, Series([np.nan, np.nan], index=[0.2, 0.3])) @pytest.mark.parametrize( @@ -158,11 +160,11 @@ ], ) def test_quantile_box(self, case): - s = Series(case, name="XXX") - res = s.quantile(0.5) + ser = Series(case, name="XXX") + res = ser.quantile(0.5) assert res == case[1] - res = s.quantile([0.5]) + res = ser.quantile([0.5]) exp = Series([case[1]], index=[0.5], name="XXX") tm.assert_series_equal(res, exp) @@ -188,35 +190,37 @@ expected = Series(np.asarray(ser)).quantile([0.5]).astype("Sparse[float]") tm.assert_series_equal(result, expected) - def test_quantile_empty(self): + def test_quantile_empty_float64(self): # floats - s = Series([], dtype="float64") + ser = Series([], dtype="float64") - res = s.quantile(0.5) + res = ser.quantile(0.5) assert np.isnan(res) - res = s.quantile([0.5]) + res = ser.quantile([0.5]) exp = Series([np.nan], index=[0.5]) tm.assert_series_equal(res, exp) + def test_quantile_empty_int64(self): # int - s = Series([], dtype="int64") + ser = Series([], dtype="int64") - res = s.quantile(0.5) + res = ser.quantile(0.5) assert np.isnan(res) - res = s.quantile([0.5]) + res = ser.quantile([0.5]) exp = Series([np.nan], index=[0.5]) tm.assert_series_equal(res, 
exp) + def test_quantile_empty_dt64(self): # datetime - s = Series([], dtype="datetime64[ns]") + ser = Series([], dtype="datetime64[ns]") - res = s.quantile(0.5) + res = ser.quantile(0.5) assert res is pd.NaT - res = s.quantile([0.5]) - exp = Series([pd.NaT], index=[0.5]) + res = ser.quantile([0.5]) + exp = Series([pd.NaT], index=[0.5], dtype=ser.dtype) tm.assert_series_equal(res, exp) @pytest.mark.parametrize("dtype", [int, float, "Int64"]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_reindex.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_reindex.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_reindex.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_reindex.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -22,15 +24,13 @@ import pandas._testing as tm +@pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="share memory doesn't work for arrow" +) def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) - # __array_interface__ is not defined for older numpies - # and on some pythons - try: - assert np.may_share_memory(string_series.index, identity.index) - except AttributeError: - pass + assert np.may_share_memory(string_series.index, identity.index) assert identity.index.is_(string_series.index) assert identity.index.identical(string_series.index) @@ -157,16 +157,20 @@ # inference of new dtype s = Series([True, False, False, True], index=list("abcd")) new_index = "agc" - result = s.reindex(list(new_index)).ffill() + msg = "Downcasting object dtype arrays on" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.reindex(list(new_index)).ffill() expected = Series([True, True, False], index=list(new_index)) tm.assert_series_equal(result, expected) def test_reindex_downcasting(): # GH4618 shifted series downcasting - s = Series(False, index=range(0, 5)) - result = s.shift(1).bfill() - expected = Series(False, index=range(0, 5)) + s = Series(False, index=range(5)) + msg = "Downcasting object dtype arrays on" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.shift(1).bfill() + expected = Series(False, index=range(5)) tm.assert_series_equal(result, expected) @@ -330,7 +334,7 @@ def test_reindex_datetimeindexes_tz_naive_and_aware(): # GH 8306 idx = date_range("20131101", tz="America/Chicago", periods=7) - newidx = date_range("20131103", periods=10, freq="H") + newidx = date_range("20131103", periods=10, freq="h") s = Series(range(7), index=idx) msg = ( r"Cannot compare dtypes datetime64\[ns, America/Chicago\] " diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_rename.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_rename.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_rename.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_rename.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,7 @@ Index, MultiIndex, Series, + array, ) import pandas._testing as tm @@ -45,22 +46,28 @@ expected = Series(range(5), index=[0, 10, 20, 3, 4], name="foo") tm.assert_series_equal(result, expected) - def test_rename_set_name(self): + def test_rename_set_name(self, using_infer_string): ser = Series(range(4), index=list("abcd")) for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: result = ser.rename(name) 
assert result.name == name - tm.assert_numpy_array_equal(result.index.values, ser.index.values) + if using_infer_string: + tm.assert_extension_array_equal(result.index.values, ser.index.values) + else: + tm.assert_numpy_array_equal(result.index.values, ser.index.values) assert ser.name is None - def test_rename_set_name_inplace(self): + def test_rename_set_name_inplace(self, using_infer_string): ser = Series(range(3), index=list("abc")) for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: ser.rename(name, inplace=True) assert ser.name == name - exp = np.array(["a", "b", "c"], dtype=np.object_) - tm.assert_numpy_array_equal(ser.index.values, exp) + if using_infer_string: + exp = array(exp, dtype="string[pyarrow_numpy]") + tm.assert_extension_array_equal(ser.index.values, exp) + else: + tm.assert_numpy_array_equal(ser.index.values, exp) def test_rename_axis_supported(self): # Supporting axis for compatibility, detailed in GH-18589 @@ -162,12 +169,13 @@ with pytest.raises(KeyError, match=match): ser.rename({2: 9}, errors="raise") - def test_rename_copy_false(self, using_copy_on_write): + def test_rename_copy_false(self, using_copy_on_write, warn_copy_on_write): # GH 46889 ser = Series(["foo", "bar"]) ser_orig = ser.copy() shallow_copy = ser.rename({1: 9}, copy=False) - ser[0] = "foobar" + with tm.assert_cow_warning(warn_copy_on_write): + ser[0] = "foobar" if using_copy_on_write: assert ser_orig[0] == shallow_copy[0] assert ser_orig[1] == shallow_copy[9] diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_replace.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_replace.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_replace.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_replace.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -50,7 +52,7 @@ assert res.dtype == object def test_replace(self): - N = 100 + N = 50 ser = pd.Series(np.random.default_rng(2).standard_normal(N)) ser[0:4] = np.nan ser[6:10] = 0 @@ -68,7 +70,7 @@ ser = pd.Series( np.fabs(np.random.default_rng(2).standard_normal(N)), - tm.makeDateIndex(N), + pd.date_range("2020-01-01", periods=N), dtype=object, ) ser[:5] = np.nan @@ -76,7 +78,9 @@ ser[20:30] = "bar" # replace list with a single value - rs = ser.replace([np.nan, "foo", "bar"], -1) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -84,7 +88,8 @@ assert (pd.isna(ser[:5])).all() # replace with different values - rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -92,11 +97,13 @@ assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = ser.replace([np.nan, "foo", "bar"], -1, 
inplace=True) assert return_value is None assert (ser[:5] == -1).all() @@ -283,10 +290,10 @@ tm.assert_series_equal(result, expected) def test_replace2(self): - N = 100 + N = 50 ser = pd.Series( np.fabs(np.random.default_rng(2).standard_normal(N)), - tm.makeDateIndex(N), + pd.date_range("2020-01-01", periods=N), dtype=object, ) ser[:5] = np.nan @@ -294,7 +301,9 @@ ser[20:30] = "bar" # replace list with a single value - rs = ser.replace([np.nan, "foo", "bar"], -1) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -302,7 +311,8 @@ assert (pd.isna(ser[:5])).all() # replace with different values - rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -310,11 +320,13 @@ assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) + with tm.assert_produces_warning(FutureWarning, match=msg): + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert return_value is None assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() @@ -373,10 +385,13 @@ def test_replace_mixed_types_with_string(self): # Testing mixed s = pd.Series([1, 2, 3, "4", 4, 5]) - result = s.replace([2, "4"], np.nan) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s.replace([2, "4"], np.nan) expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") @pytest.mark.parametrize( "categorical, numeric", [ @@ -387,7 +402,10 @@ def test_replace_categorical(self, categorical, numeric): # GH 24971, GH#23305 ser = pd.Series(categorical) - result = ser.replace({"A": 1, "B": 2}) + msg = "Downcasting behavior in `replace`" + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.replace({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") if 2 not in expected.cat.categories: # i.e. 
categories should be [1, 2] even if there are no "B"s present @@ -401,7 +419,9 @@ def test_replace_categorical_inplace(self, data, data_exp): # GH 53358 result = pd.Series(data, dtype="category") - result.replace(to_replace="a", value="b", inplace=True) + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result.replace(to_replace="a", value="b", inplace=True) expected = pd.Series(data_exp, dtype="category") tm.assert_series_equal(result, expected) @@ -417,16 +437,22 @@ expected = expected.cat.remove_unused_categories() assert c[2] != "foo" - result = c.replace(c[2], "foo") + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = c.replace(c[2], "foo") tm.assert_series_equal(expected, result) assert c[2] != "foo" # ensure non-inplace call does not alter original - return_value = c.replace(c[2], "foo", inplace=True) + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = c.replace(c[2], "foo", inplace=True) assert return_value is None tm.assert_series_equal(expected, c) first_value = c[0] - return_value = c.replace(c[1], c[0], inplace=True) + msg = "with CategoricalDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = c.replace(c[1], c[0], inplace=True) assert return_value is None assert c[0] == c[1] == first_value # test replacing with existing value @@ -705,12 +731,15 @@ with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 1 in string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 series = pd.Series(["0"]) expected = pd.Series([1]) - result = series.replace(to_replace="0", value=1, regex=regex) + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = series.replace(to_replace="0", value=1, regex=regex) tm.assert_series_equal(result, expected) def test_replace_different_int_types(self, any_int_numpy_dtype): @@ -732,10 +761,12 @@ expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - def test_replace_change_dtype_series(self): + def test_replace_change_dtype_series(self, using_infer_string): # GH#25797 df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) - df["Test"] = df["Test"].replace([True], [np.nan]) + warn = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warn, match="Downcasting"): + df["Test"] = df["Test"].replace([True], [np.nan]) expected = pd.DataFrame.from_dict({"Test": ["0.5", np.nan, "0.6"]}) tm.assert_frame_equal(df, expected) @@ -768,3 +799,15 @@ ser.replace(to_replace=1, value=pd.NA, inplace=True) tm.assert_series_equal(ser, expected) + + def test_replace_ea_float_with_bool(self): + # GH#55398 + ser = pd.Series([0.0], dtype="Float64") + expected = ser.copy() + result = ser.replace(False, 1.0) + tm.assert_series_equal(result, expected) + + ser = pd.Series([False], dtype="boolean") + expected = ser.copy() + result = ser.replace(0.0, True) + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_reset_index.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_reset_index.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_reset_index.py 2023-12-08 14:17:35.000000000 +0000 +++ 
pandas-2.2.2+dfsg/pandas/tests/series/methods/test_reset_index.py 2024-04-10 17:42:52.000000000 +0000 @@ -34,7 +34,11 @@ assert df.reset_index()["Date"].iloc[0] == stamp def test_reset_index(self): - df = tm.makeDataFrame()[:5] + df = DataFrame( + 1.1 * np.arange(120).reshape((30, 4)), + columns=Index(list("ABCD"), dtype=object), + index=Index([f"i-{i}" for i in range(30)], dtype=object), + )[:5] ser = df.stack(future_stack=True) ser.index.names = ["hash", "category"] @@ -137,8 +141,16 @@ with pytest.raises(KeyError, match="not found"): s.reset_index("wrong", drop=True) - def test_reset_index_with_drop(self, series_with_multilevel_index): - ser = series_with_multilevel_index + def test_reset_index_with_drop(self): + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + data = np.random.default_rng(2).standard_normal(8) + ser = Series(data, index=index) + ser.iloc[3] = np.nan deleveled = ser.reset_index() assert isinstance(deleveled, DataFrame) @@ -175,12 +187,20 @@ ), ], ) -def test_reset_index_dtypes_on_empty_series_with_multiindex(array, dtype): +def test_reset_index_dtypes_on_empty_series_with_multiindex( + array, dtype, using_infer_string +): # GH 19602 - Preserve dtype on empty Series with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = Series(dtype=object, index=idx)[:0].reset_index().dtypes + exp = "string" if using_infer_string else object expected = Series( - {"level_0": np.int64, "level_1": np.float64, "level_2": dtype, 0: object} + { + "level_0": np.int64, + "level_1": np.float64, + "level_2": exp if dtype == object else dtype, + 0: object, + } ) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_round.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_round.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_round.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_round.py 2024-04-10 17:42:52.000000000 +0000 @@ -56,9 +56,19 @@ @pytest.mark.parametrize("method", ["round", "floor", "ceil"]) @pytest.mark.parametrize("freq", ["s", "5s", "min", "5min", "h", "5h"]) - def test_round_nat(self, method, freq): - # GH14940 - ser = Series([pd.NaT]) - expected = Series(pd.NaT) + def test_round_nat(self, method, freq, unit): + # GH14940, GH#56158 + ser = Series([pd.NaT], dtype=f"M8[{unit}]") + expected = Series(pd.NaT, dtype=f"M8[{unit}]") round_method = getattr(ser.dt, method) - tm.assert_series_equal(round_method(freq), expected) + result = round_method(freq) + tm.assert_series_equal(result, expected) + + def test_round_ea_boolean(self): + # GH#55936 + ser = Series([True, False], dtype="boolean") + expected = ser.copy() + result = ser.round(2) + tm.assert_series_equal(result, expected) + result.iloc[0] = False + tm.assert_series_equal(ser, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_sort_index.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_sort_index.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_sort_index.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_sort_index.py 2024-04-10 17:42:52.000000000 +0000 @@ -317,3 +317,21 @@ result = s.sort_index(key=lambda x: x.month_name()) expected = s.iloc[[2, 1, 0]] tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "ascending", + [ + [True, False], + [False, 
True], + ], + ) + def test_sort_index_multi_already_monotonic(self, ascending): + # GH 56049 + mi = MultiIndex.from_product([[1, 2], [3, 4]]) + ser = Series(range(len(mi)), index=mi) + result = ser.sort_index(ascending=ascending) + if ascending == [True, False]: + expected = ser.take([1, 0, 3, 2]) + elif ascending == [False, True]: + expected = ser.take([2, 3, 0, 1]) + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_to_csv.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_to_csv.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_to_csv.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_to_csv.py 2024-04-10 17:42:52.000000000 +0000 @@ -165,7 +165,7 @@ pd.read_csv(fh, index_col=0, encoding=encoding).squeeze("columns"), ) - def test_to_csv_interval_index(self): + def test_to_csv_interval_index(self, using_infer_string): # GH 28210 s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3)) @@ -175,6 +175,8 @@ # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = s.copy() - expected.index = expected.index.astype(str) - + if using_infer_string: + expected.index = expected.index.astype("string[pyarrow_numpy]") + else: + expected.index = expected.index.astype(str) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_to_dict.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_to_dict.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_to_dict.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_to_dict.py 2024-04-10 17:42:52.000000000 +0000 @@ -13,12 +13,12 @@ ) def test_to_dict(self, mapping, datetime_series): # GH#16122 - result = Series(datetime_series.to_dict(mapping), name="ts") + result = Series(datetime_series.to_dict(into=mapping), name="ts") expected = datetime_series.copy() expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected) - from_method = Series(datetime_series.to_dict(collections.Counter)) + from_method = Series(datetime_series.to_dict(into=collections.Counter)) from_constructor = Series(collections.Counter(datetime_series.items())) tm.assert_series_equal(from_method, from_constructor) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_to_numpy.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_to_numpy.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_to_numpy.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_to_numpy.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,9 +1,12 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( NA, Series, + Timedelta, ) import pandas._testing as tm @@ -23,3 +26,24 @@ result = ser.to_numpy(dtype=np.float64, na_value=np.nan) expected = np.array([1.0]) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_to_numpy_arrow_dtype_given(): + # GH#57121 + ser = Series([1, NA], dtype="int64[pyarrow]") + result = ser.to_numpy(dtype="float64") + expected = np.array([1.0, np.nan]) + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_ea_int_to_td_ts(): + # GH#57093 + ser = Series([1, None], dtype="Int64") + result = ser.astype("m8[ns]") + expected = Series([1, Timedelta("nat")], dtype="m8[ns]") + tm.assert_series_equal(result, expected) + + result = ser.astype("M8[ns]") + expected = Series([1, Timedelta("nat")], dtype="M8[ns]") + 
tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_tz_localize.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_tz_localize.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_tz_localize.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_tz_localize.py 2024-04-10 17:42:52.000000000 +0000 @@ -70,11 +70,11 @@ ["foo", "invalid"], ], ) - def test_tz_localize_nonexistent(self, warsaw, method, exp): + def test_tz_localize_nonexistent(self, warsaw, method, exp, unit): # GH 8917 tz = warsaw n = 60 - dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") + dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min", unit=unit) ser = Series(1, index=dti) df = ser.to_frame() @@ -101,7 +101,7 @@ else: result = ser.tz_localize(tz, nonexistent=method) - expected = Series(1, index=DatetimeIndex([exp] * n, tz=tz)) + expected = Series(1, index=DatetimeIndex([exp] * n, tz=tz).as_unit(unit)) tm.assert_series_equal(result, expected) result = df.tz_localize(tz, nonexistent=method) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_unstack.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_unstack.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_unstack.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_unstack.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,8 +4,10 @@ import pandas as pd from pandas import ( DataFrame, + Index, MultiIndex, Series, + date_range, ) import pandas._testing as tm @@ -92,7 +94,7 @@ expected = DataFrame( [[1, 1, 1], [1, 1, 1], [1, 1, 1]], columns=MultiIndex.from_tuples([("a",), ("b",), ("c",)], names=[("A", "a")]), - index=pd.Index([1, 2, 3], name=("B", "b")), + index=Index([1, 2, 3], name=("B", "b")), ) tm.assert_frame_equal(result, expected) @@ -109,7 +111,7 @@ ( (("A", "a"), "B"), [[1, 1, 1, 1], [1, 1, 1, 1]], - pd.Index([3, 4], name="C"), + Index([3, 4], name="C"), MultiIndex.from_tuples( [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=[("A", "a"), "B"] ), @@ -133,9 +135,12 @@ def test_unstack_multi_index_categorical_values(): - mi = ( - tm.makeTimeDataFrame().stack(future_stack=True).index.rename(["major", "minor"]) + df = DataFrame( + np.random.default_rng(2).standard_normal((10, 4)), + columns=Index(list("ABCD"), dtype=object), + index=date_range("2000-01-01", periods=10, freq="B"), ) + mi = df.stack(future_stack=True).index.rename(["major", "minor"]) ser = Series(["foo"] * len(mi), index=mi, name="category", dtype="category") result = ser.unstack() @@ -144,7 +149,7 @@ c = pd.Categorical(["foo"] * len(dti)) expected = DataFrame( {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()}, - columns=pd.Index(list("ABCD"), name="minor"), + columns=Index(list("ABCD"), name="minor"), index=dti.rename("major"), ) tm.assert_frame_equal(result, expected) @@ -158,7 +163,7 @@ result = ser.unstack("x") expected = DataFrame( [[1], [2]], - columns=pd.Index(["a"], name="x"), + columns=Index(["a"], name="x"), index=MultiIndex.from_tuples([(1, "red"), (2, "blue")], names=[0, "y"]), ) tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_update.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_update.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_update.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_update.py 2024-04-10 17:42:52.000000000 +0000 @@ -34,10 +34,12 @@ df["c"].update(Series(["foo"], 
index=[0])) expected = df_orig else: - df["c"].update(Series(["foo"], index=[0])) + with tm.assert_produces_warning(FutureWarning, match="inplace method"): + df["c"].update(Series(["foo"], index=[0])) expected = DataFrame( [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] ) + expected["c"] = expected["c"].astype(object) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_value_counts.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_value_counts.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_value_counts.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_value_counts.py 2024-04-10 17:42:52.000000000 +0000 @@ -12,7 +12,7 @@ class TestSeriesValueCounts: - def test_value_counts_datetime(self): + def test_value_counts_datetime(self, unit): # most dtypes are tested in tests/base values = [ pd.Timestamp("2011-01-01 09:00"), @@ -26,13 +26,13 @@ exp_idx = pd.DatetimeIndex( ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], name="xxx", - ) + ).as_unit(unit) exp = Series([3, 2, 1], index=exp_idx, name="count") - ser = Series(values, name="xxx") + ser = Series(values, name="xxx").dt.as_unit(unit) tm.assert_series_equal(ser.value_counts(), exp) # check DatetimeIndex outputs the same result - idx = pd.DatetimeIndex(values, name="xxx") + idx = pd.DatetimeIndex(values, name="xxx").as_unit(unit) tm.assert_series_equal(idx.value_counts(), exp) # normalize @@ -40,7 +40,7 @@ tm.assert_series_equal(ser.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) - def test_value_counts_datetime_tz(self): + def test_value_counts_datetime_tz(self, unit): values = [ pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), pd.Timestamp("2011-01-01 10:00", tz="US/Eastern"), @@ -54,12 +54,12 @@ ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], tz="US/Eastern", name="xxx", - ) + ).as_unit(unit) exp = Series([3, 2, 1], index=exp_idx, name="count") - ser = Series(values, name="xxx") + ser = Series(values, name="xxx").dt.as_unit(unit) tm.assert_series_equal(ser.value_counts(), exp) - idx = pd.DatetimeIndex(values, name="xxx") + idx = pd.DatetimeIndex(values, name="xxx").as_unit(unit) tm.assert_series_equal(idx.value_counts(), exp) exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion") @@ -250,3 +250,22 @@ # GH 17927 result = Series(input_array).value_counts() tm.assert_series_equal(result, expected) + + def test_value_counts_masked(self): + # GH#54984 + dtype = "Int64" + ser = Series([1, 2, None, 2, None, 3], dtype=dtype) + result = ser.value_counts(dropna=False) + expected = Series( + [2, 2, 1, 1], + index=Index([2, None, 1, 3], dtype=dtype), + dtype=dtype, + name="count", + ) + tm.assert_series_equal(result, expected) + + result = ser.value_counts(dropna=True) + expected = Series( + [2, 1, 1], index=Index([2, 1, 3], dtype=dtype), dtype=dtype, name="count" + ) + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/methods/test_view.py pandas-2.2.2+dfsg/pandas/tests/series/methods/test_view.py --- pandas-2.1.4+dfsg/pandas/tests/series/methods/test_view.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/methods/test_view.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,10 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Series.view is deprecated and will be removed in a future version.:FutureWarning" # 
noqa: E501 +) + class TestView: def test_view_i8_to_datetimelike(self): diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/test_api.py pandas-2.2.2+dfsg/pandas/tests/series/test_api.py --- pandas-2.1.4+dfsg/pandas/tests/series/test_api.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/test_api.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,6 +10,8 @@ Index, Series, date_range, + period_range, + timedelta_range, ) import pandas._testing as tm @@ -68,16 +70,15 @@ @pytest.mark.parametrize( "index", [ - tm.makeStringIndex(10), - tm.makeCategoricalIndex(10), + Index(list("ab") * 5, dtype="category"), + Index([str(i) for i in range(10)]), Index(["foo", "bar", "baz"] * 2), - tm.makeDateIndex(10), - tm.makePeriodIndex(10), - tm.makeTimedeltaIndex(10), - tm.makeIntIndex(10), - tm.makeUIntIndex(10), - tm.makeIntIndex(10), - tm.makeFloatIndex(10), + date_range("2020-01-01", periods=10), + period_range("2020-01-01", periods=10, freq="D"), + timedelta_range("1 day", periods=10), + Index(np.arange(10), dtype=np.uint64), + Index(np.arange(10), dtype=np.int64), + Index(np.arange(10), dtype=np.float64), Index([True, False]), Index([f"a{i}" for i in range(101)]), pd.MultiIndex.from_tuples(zip("ABCD", "EFGH")), @@ -140,7 +141,9 @@ def test_ndarray_compat_ravel(self): # ravel s = Series(np.random.default_rng(2).standard_normal(10)) - tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F")) + with tm.assert_produces_warning(FutureWarning, match="ravel is deprecated"): + result = s.ravel(order="F") + tm.assert_almost_equal(result, s.values.ravel(order="F")) def test_empty_method(self): s_empty = Series(dtype=object) @@ -176,7 +179,7 @@ def test_unknown_attribute(self): # GH#9680 - tdi = pd.timedelta_range(start=0, periods=10, freq="1s") + tdi = timedelta_range(start=0, periods=10, freq="1s") ser = Series(np.random.default_rng(2).normal(size=10), index=tdi) assert "foo" not in ser.__dict__ msg = "'Series' object has no attribute 'foo'" @@ -203,6 +206,7 @@ with pytest.raises(AttributeError, match=msg): ser.weekday + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize( "kernel, has_numeric_only", [ diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/test_arithmetic.py pandas-2.2.2+dfsg/pandas/tests/series/test_arithmetic.py --- pandas-2.1.4+dfsg/pandas/tests/series/test_arithmetic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/test_arithmetic.py 2024-04-10 17:42:52.000000000 +0000 @@ -30,11 +30,10 @@ @pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"]) -def switch_numexpr_min_elements(request): - _MIN_ELEMENTS = expr._MIN_ELEMENTS - expr._MIN_ELEMENTS = request.param - yield request.param - expr._MIN_ELEMENTS = _MIN_ELEMENTS +def switch_numexpr_min_elements(request, monkeypatch): + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", request.param) + yield def _permute(obj): @@ -48,7 +47,11 @@ (lambda x: x, lambda x: x * 2, False), (lambda x: x, lambda x: x[::2], False), (lambda x: x, lambda x: 5, True), - (lambda x: tm.makeFloatSeries(), lambda x: tm.makeFloatSeries(), True), + ( + lambda x: Series(range(10), dtype=np.float64), + lambda x: Series(range(10), dtype=np.float64), + True, + ), ], ) @pytest.mark.parametrize( @@ -56,7 +59,11 @@ ) def test_flex_method_equivalence(self, opname, ts): # check that Series.{opname} behaves like Series.__{opname}__, - tser = tm.makeTimeSeries().rename("ts") + tser = Series( + np.arange(20, dtype=np.float64), + 
index=date_range("2020-01-01", periods=20), + name="ts", + ) series = ts[0](tser) other = ts[1](tser) @@ -154,7 +161,7 @@ # Some of these may end up in tests/arithmetic, but are not yet sorted def test_add_series_with_period_index(self): - rng = pd.period_range("1/1/2000", "1/1/2010", freq="A") + rng = pd.period_range("1/1/2000", "1/1/2010", freq="Y") ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) result = ts + ts[::2] @@ -165,7 +172,7 @@ result = ts + _permute(ts[::2]) tm.assert_series_equal(result, expected) - msg = "Input has different freq=D from Period\\(freq=A-DEC\\)" + msg = "Input has different freq=D from Period\\(freq=Y-DEC\\)" with pytest.raises(IncompatibleFrequency, match=msg): ts + ts.asfreq("D", how="end") @@ -205,9 +212,9 @@ s1 = Series(range(1, 10)) s2 = Series("foo", index=index) - msg = "not all arguments converted during string formatting" + msg = "not all arguments converted during string formatting|mod not" - with pytest.raises(TypeError, match=msg): + with pytest.raises((TypeError, NotImplementedError), match=msg): s2 % s1 def test_add_with_duplicate_index(self): @@ -458,7 +465,7 @@ def test_ser_cmp_result_names(self, names, comparison_op): # datetime64 dtype op = comparison_op - dti = date_range("1949-06-07 03:00:00", freq="H", periods=5, name=names[0]) + dti = date_range("1949-06-07 03:00:00", freq="h", periods=5, name=names[0]) ser = Series(dti).rename(names[1]) result = op(ser, dti) assert result.name == names[2] @@ -492,14 +499,27 @@ result = op(ser, cidx) assert result.name == names[2] - def test_comparisons(self): + def test_comparisons(self, using_infer_string): s = Series(["a", "b", "c"]) s2 = Series([False, True, False]) # it works! exp = Series([False, False, False]) - tm.assert_series_equal(s == s2, exp) - tm.assert_series_equal(s2 == s, exp) + if using_infer_string: + import pyarrow as pa + + msg = "has no kernel" + # TODO(3.0) GH56008 + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + s == s2 + with tm.assert_produces_warning( + DeprecationWarning, match="comparison", check_stacklevel=False + ): + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + s2 == s + else: + tm.assert_series_equal(s == s2, exp) + tm.assert_series_equal(s2 == s, exp) # ----------------------------------------------------------------- # Categorical Dtype Comparisons @@ -639,18 +659,20 @@ result = comparison_op(ser, val) expected = comparison_op(ser.dropna(), val).reindex(ser.index) - if comparison_op is operator.ne: - expected = expected.fillna(True).astype(bool) - else: - expected = expected.fillna(False).astype(bool) + msg = "Downcasting object dtype arrays" + with tm.assert_produces_warning(FutureWarning, match=msg): + if comparison_op is operator.ne: + expected = expected.fillna(True).astype(bool) + else: + expected = expected.fillna(False).astype(bool) tm.assert_series_equal(result, expected) def test_ne(self): ts = Series([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float) - expected = [True, True, False, True, True] - assert tm.equalContents(ts.index != 5, expected) - assert tm.equalContents(~(ts.index == 5), expected) + expected = np.array([True, True, False, True, True]) + tm.assert_numpy_array_equal(ts.index != 5, expected) + tm.assert_numpy_array_equal(~(ts.index == 5), expected) @pytest.mark.parametrize( "left, right", @@ -712,7 +734,7 @@ class TestTimeSeriesArithmetic: def test_series_add_tz_mismatch_converts_to_utc(self): - rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") + rng = 
date_range("1/1/2011", periods=100, freq="h", tz="utc") perm = np.random.default_rng(2).permutation(100)[:90] ser1 = Series( @@ -732,11 +754,14 @@ uts2 = ser2.tz_convert("utc") expected = uts1 + uts2 + # sort since input indexes are not equal + expected = expected.sort_index() + assert result.index.tz is timezone.utc tm.assert_series_equal(result, expected) def test_series_add_aware_naive_raises(self): - rng = date_range("1/1/2011", periods=10, freq="H") + rng = date_range("1/1/2011", periods=10, freq="h") ser = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng) ser_utc = ser.tz_localize("utc") @@ -748,13 +773,17 @@ with pytest.raises(Exception, match=msg): ser_utc + ser - def test_datetime_understood(self): + # TODO: belongs in tests/arithmetic? + def test_datetime_understood(self, unit): # Ensures it doesn't fail to create the right series # reported in issue#16726 - series = Series(date_range("2012-01-01", periods=3)) + series = Series(date_range("2012-01-01", periods=3, unit=unit)) offset = pd.offsets.DateOffset(days=6) result = series - offset - expected = Series(pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"])) + exp_dti = pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"]).as_unit( + unit + ) + expected = Series(exp_dti) tm.assert_series_equal(result, expected) def test_align_date_objects_with_datetimeindex(self): @@ -777,7 +806,7 @@ @pytest.mark.parametrize("box", [list, tuple, np.array, Index, Series, pd.array]) @pytest.mark.parametrize("flex", [True, False]) def test_series_ops_name_retention(self, flex, box, names, all_binary_operators): - # GH#33930 consistent name renteiton + # GH#33930 consistent name-retention op = all_binary_operators left = Series(range(10), name=names[0]) @@ -880,7 +909,7 @@ series = series_with_simple_index if len(series) < 1: - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="Test doesn't make sense on empty data") ) @@ -940,8 +969,8 @@ expected = Series( [1000, 2001, 3002, 4003], index=pd.MultiIndex.from_tuples( - [("x", 1, "a"), ("x", 2, "a"), ("y", 1, "a"), ("y", 2, "a")], - names=["xy", "num", "ab"], + [("a", "x", 1), ("a", "x", 2), ("a", "y", 1), ("a", "y", 2)], + names=["ab", "xy", "num"], ), ) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/test_constructors.py pandas-2.2.2+dfsg/pandas/tests/series/test_constructors.py --- pandas-2.1.4+dfsg/pandas/tests/series/test_constructors.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/test_constructors.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,6 +14,7 @@ iNaT, lib, ) +from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -151,7 +152,7 @@ assert ser.dtype == ea_dtype tm.assert_series_equal(ser, expected) - def test_constructor(self, datetime_series): + def test_constructor(self, datetime_series, using_infer_string): empty_series = Series() assert datetime_series.index._is_all_dates @@ -159,13 +160,13 @@ derived = Series(datetime_series) assert derived.index._is_all_dates - assert tm.equalContents(derived.index, datetime_series.index) + tm.assert_index_equal(derived.index, datetime_series.index) # Ensure new index is not created assert id(datetime_series.index) == id(derived.index) # Mixed type Series mixed = Series(["hello", np.nan], index=[0, 1]) - assert mixed.dtype == np.object_ + assert mixed.dtype == np.object_ if not using_infer_string else "string" assert np.isnan(mixed[1]) assert 
not empty_series.index._is_all_dates @@ -196,7 +197,7 @@ Series([1, 3, 2], index=df) @pytest.mark.parametrize("input_class", [list, dict, OrderedDict]) - def test_constructor_empty(self, input_class): + def test_constructor_empty(self, input_class, using_infer_string): empty = Series() empty2 = Series(input_class()) @@ -227,7 +228,10 @@ # GH 19853 : with empty string, index and dtype str empty = Series("", dtype=str, index=range(3)) - empty2 = Series("", index=range(3)) + if using_infer_string: + empty2 = Series("", index=range(3), dtype=object) + else: + empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) @pytest.mark.parametrize("input_arg", [np.nan, float("nan")]) @@ -334,8 +338,8 @@ [ ([1, 2]), (["1", "2"]), - (list(date_range("1/1/2011", periods=2, freq="H"))), - (list(date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + (list(date_range("1/1/2011", periods=2, freq="h"))), + (list(date_range("1/1/2011", periods=2, freq="h", tz="US/Eastern"))), ([Interval(left=0, right=5)]), ], ) @@ -405,8 +409,6 @@ s = Series(factor, name="A") assert s.dtype == "category" assert len(s) == len(factor) - str(s.values) - str(s) # in a frame df = DataFrame({"A": factor}) @@ -415,15 +417,11 @@ result = df.iloc[:, 0] tm.assert_series_equal(result, s) assert len(df) == len(factor) - str(df.values) - str(df) df = DataFrame({"A": s}) result = df["A"] tm.assert_series_equal(result, s) assert len(df) == len(factor) - str(df.values) - str(df) # multiples df = DataFrame({"A": s, "B": s, "C": 1}) @@ -433,8 +431,6 @@ tm.assert_series_equal(result2, s, check_names=False) assert result2.name == "B" assert len(df) == len(factor) - str(df.values) - str(df) def test_constructor_categorical_with_coercion2(self): # GH8623 @@ -570,7 +566,10 @@ data[1] = 1 result = Series(data, index=index) expected = Series([0, 1, 2], index=index, dtype=int) - tm.assert_series_equal(result, expected) + with pytest.raises(AssertionError, match="Series classes are different"): + # TODO should this be raising at all? + # https://github.com/pandas-dev/pandas/issues/56131 + tm.assert_series_equal(result, expected) data = ma.masked_all((3,), dtype=bool) result = Series(data) @@ -587,7 +586,10 @@ data[1] = True result = Series(data, index=index) expected = Series([True, True, False], index=index, dtype=bool) - tm.assert_series_equal(result, expected) + with pytest.raises(AssertionError, match="Series classes are different"): + # TODO should this be raising at all? 
+ # https://github.com/pandas-dev/pandas/issues/56131 + tm.assert_series_equal(result, expected) data = ma.masked_all((3,), dtype="M8[ns]") result = Series(data) @@ -671,7 +673,7 @@ Series(["foo"], index=["a", "b", "c"]) def test_constructor_corner(self): - df = tm.makeTimeDataFrame() + df = DataFrame(range(5), index=date_range("2020-01-01", periods=5)) objs = [df, df] s = Series(objs, index=[0, 1]) assert isinstance(s, Series) @@ -772,11 +774,16 @@ def test_constructor_signed_int_overflow_raises(self): # GH#41734 disallow silent overflow, enforced in 2.0 - msg = "Values are too large to be losslessly converted" - with pytest.raises(ValueError, match=msg): + if np_version_gt2: + msg = "The elements provided in the data cannot all be casted to the dtype" + err = OverflowError + else: + msg = "Values are too large to be losslessly converted" + err = ValueError + with pytest.raises(err, match=msg): Series([1, 200, 923442], dtype="int8") - with pytest.raises(ValueError, match=msg): + with pytest.raises(err, match=msg): Series([1, 200, 923442], dtype="uint8") @pytest.mark.parametrize( @@ -800,7 +807,13 @@ def test_constructor_unsigned_dtype_overflow(self, any_unsigned_int_numpy_dtype): # see gh-15832 - msg = "Trying to coerce negative values to unsigned integers" + if np_version_gt2: + msg = ( + f"The elements provided in the data cannot " + f"all be casted to the dtype {any_unsigned_int_numpy_dtype}" + ) + else: + msg = "Trying to coerce negative values to unsigned integers" with pytest.raises(OverflowError, match=msg): Series([-1], dtype=any_unsigned_int_numpy_dtype) @@ -880,12 +893,14 @@ with pytest.raises(IntCastingNaNError, match=msg): Series(np.array(vals), dtype=any_int_numpy_dtype) - def test_constructor_dtype_no_cast(self, using_copy_on_write): + def test_constructor_dtype_no_cast(self, using_copy_on_write, warn_copy_on_write): # see gh-1572 s = Series([1, 2, 3]) s2 = Series(s, dtype=np.int64) - s2[1] = 5 + warn = FutureWarning if warn_copy_on_write else None + with tm.assert_produces_warning(warn): + s2[1] = 5 if using_copy_on_write: assert s[1] == 2 else: @@ -965,7 +980,7 @@ # GH3414 related expected = Series(pydates, dtype="datetime64[ms]") - result = Series(Series(dates).view(np.int64) / 1000000, dtype="M8[ms]") + result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") tm.assert_series_equal(result, expected) result = Series(dates, dtype="datetime64[ms]") @@ -1015,7 +1030,7 @@ def test_constructor_dtype_datetime64_7(self): # GH6529 # coerce datetime64 non-ns properly - dates = date_range("01-Jan-2015", "01-Dec-2015", freq="M") + dates = date_range("01-Jan-2015", "01-Dec-2015", freq="ME") values2 = dates.view(np.ndarray).astype("datetime64[ns]") expected = Series(values2, index=dates) @@ -1071,24 +1086,24 @@ def test_constructor_dtype_datetime64_4(self): # non-convertible - s = Series([1479596223000, -1479590, NaT]) - assert s.dtype == "object" - assert s[2] is NaT - assert "NaT" in str(s) + ser = Series([1479596223000, -1479590, NaT]) + assert ser.dtype == "object" + assert ser[2] is NaT + assert "NaT" in str(ser) def test_constructor_dtype_datetime64_3(self): # if we passed a NaT it remains - s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), NaT]) - assert s.dtype == "object" - assert s[2] is NaT - assert "NaT" in str(s) + ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), NaT]) + assert ser.dtype == "object" + assert ser[2] is NaT + assert "NaT" in str(ser) def test_constructor_dtype_datetime64_2(self): # if we passed a nan it remains - s = 
Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) - assert s.dtype == "object" - assert s[2] is np.nan - assert "NaN" in str(s) + ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) + assert ser.dtype == "object" + assert ser[2] is np.nan + assert "NaN" in str(ser) def test_constructor_with_datetime_tz(self): # 8260 @@ -1134,39 +1149,41 @@ assert "datetime64[ns, US/Eastern]" in str(result) assert "NaT" in str(result) - # long str - t = Series(date_range("20130101", periods=1000, tz="US/Eastern")) - assert "datetime64[ns, US/Eastern]" in str(t) - result = DatetimeIndex(s, freq="infer") tm.assert_index_equal(result, dr) + def test_constructor_with_datetime_tz5(self): + # long str + ser = Series(date_range("20130101", periods=1000, tz="US/Eastern")) + assert "datetime64[ns, US/Eastern]" in str(ser) + def test_constructor_with_datetime_tz4(self): # inference - s = Series( + ser = Series( [ Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), ] ) - assert s.dtype == "datetime64[ns, US/Pacific]" - assert lib.infer_dtype(s, skipna=True) == "datetime64" + assert ser.dtype == "datetime64[ns, US/Pacific]" + assert lib.infer_dtype(ser, skipna=True) == "datetime64" def test_constructor_with_datetime_tz3(self): - s = Series( + ser = Series( [ Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), Timestamp("2013-01-02 14:00:00-0800", tz="US/Eastern"), ] ) - assert s.dtype == "object" - assert lib.infer_dtype(s, skipna=True) == "datetime" + assert ser.dtype == "object" + assert lib.infer_dtype(ser, skipna=True) == "datetime" def test_constructor_with_datetime_tz2(self): # with all NaT - s = Series(NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") - expected = Series(DatetimeIndex(["NaT", "NaT"], tz="US/Eastern")) - tm.assert_series_equal(s, expected) + ser = Series(NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") + dti = DatetimeIndex(["NaT", "NaT"], tz="US/Eastern").as_unit("ns") + expected = Series(dti) + tm.assert_series_equal(ser, expected) def test_constructor_no_partial_datetime_casting(self): # GH#40111 @@ -1299,7 +1316,7 @@ assert isna(series[2]) def test_constructor_period_incompatible_frequency(self): - data = [Period("2000", "D"), Period("2001", "A")] + data = [Period("2000", "D"), Period("2001", "Y")] result = Series(data) assert result.dtype == object assert result.tolist() == data @@ -1311,7 +1328,8 @@ pi = period_range("20130101", periods=5, freq="D") s = Series(pi) assert s.dtype == "Period[D]" - expected = Series(pi.astype(object)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + expected = Series(pi.astype(object)) tm.assert_series_equal(s, expected) def test_constructor_dict(self): @@ -1325,7 +1343,7 @@ expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) tm.assert_series_equal(result, expected) - pidx = tm.makePeriodIndex(100) + pidx = period_range("2020-01-01", periods=10, freq="D") d = {pidx[0]: 0, pidx[1]: 1} result = Series(d, index=pidx) expected = Series(np.nan, pidx, dtype=np.float64) @@ -1355,7 +1373,7 @@ reason="Construction from dict goes through " "maybe_convert_objects which casts to nano" ) - request.node.add_marker(mark) + request.applymarker(mark) d = {"a": ea_scalar} result = Series(d, index=["a"]) expected = Series(ea_scalar, index=["a"], dtype=ea_dtype) @@ -1438,7 +1456,7 @@ # https://github.com/pandas-dev/pandas/issues/22698 @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning") - def test_fromDict(self): + def 
test_fromDict(self, using_infer_string): data = {"a": 0, "b": 1, "c": 2, "d": 3} series = Series(data) @@ -1450,19 +1468,19 @@ data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) - assert series.dtype == np.object_ + assert series.dtype == np.object_ if not using_infer_string else "string" data = {"a": "0", "b": "1"} series = Series(data, dtype=float) assert series.dtype == np.float64 - def test_fromValue(self, datetime_series): + def test_fromValue(self, datetime_series, using_infer_string): nans = Series(np.nan, index=datetime_series.index, dtype=np.float64) assert nans.dtype == np.float64 assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) - assert strings.dtype == np.object_ + assert strings.dtype == np.object_ if not using_infer_string else "string" assert len(strings) == len(datetime_series) d = datetime.now() @@ -1575,7 +1593,7 @@ def test_NaT_cast(self): # GH10747 result = Series([np.nan]).astype("M8[ns]") - expected = Series([NaT]) + expected = Series([NaT], dtype="M8[ns]") tm.assert_series_equal(result, expected) def test_constructor_name_hashable(self): @@ -1681,7 +1699,7 @@ if np.dtype(dtype).name not in ["timedelta64", "datetime64"]: mark = pytest.mark.xfail(reason="GH#33890 Is assigned ns unit") - request.node.add_marker(mark) + request.applymarker(mark) with pytest.raises(ValueError, match=msg): Series([], dtype=dtype) @@ -1940,9 +1958,15 @@ def test_constructor_raise_on_lossy_conversion_of_strings(self): # GH#44923 - with pytest.raises( - ValueError, match="string values cannot be losslessly cast to int8" - ): + if not np_version_gt2: + raises = pytest.raises( + ValueError, match="string values cannot be losslessly cast to int8" + ) + else: + raises = pytest.raises( + OverflowError, match="The elements provided in the data" + ) + with raises: Series(["128"], dtype="int8") def test_constructor_dtype_timedelta_alternative_construct(self): @@ -2132,10 +2156,24 @@ result = Series([pd.NA, "b"]) tm.assert_series_equal(result, expected) + def test_inference_on_pandas_objects(self): + # GH#56012 + ser = Series([Timestamp("2019-12-31")], dtype=object) + with tm.assert_produces_warning(None): + # This doesn't do inference + result = Series(ser) + assert result.dtype == np.object_ + + idx = Index([Timestamp("2019-12-31")], dtype=object) + + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = Series(idx) + assert result.dtype != np.object_ + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): - idx = tm.makeDateIndex(10000) + idx = date_range("2020-01-01", periods=5) ser = Series( np.random.default_rng(2).standard_normal(len(idx)), idx.astype(object) ) @@ -2153,6 +2191,25 @@ multi = Series(data, index=indexes) assert isinstance(multi.index, MultiIndex) + # TODO: make this not cast to object in pandas 3.0 + @pytest.mark.skipif( + not np_version_gt2, reason="StringDType only available in numpy 2 and above" + ) + @pytest.mark.parametrize( + "data", + [ + ["a", "b", "c"], + ["a", "b", np.nan], + ], + ) + def test_np_string_array_object_cast(self, data): + from numpy.dtypes import StringDType + + arr = np.array(data, dtype=StringDType()) + res = Series(arr) + assert res.dtype == np.object_ + assert (res == data).all() + class TestSeriesConstructorInternals: def test_constructor_no_pandas_array(self, using_array_manager): @@ -2165,7 +2222,7 @@ @td.skip_array_manager_invalid_test def test_from_array(self): - result = Series(pd.array(["1H", "2H"], 
dtype="timedelta64[ns]")) + result = Series(pd.array(["1h", "2h"], dtype="timedelta64[ns]")) assert result._mgr.blocks[0].is_extension is False result = Series(pd.array(["2015"], dtype="datetime64[ns]")) @@ -2173,7 +2230,7 @@ @td.skip_array_manager_invalid_test def test_from_list_dtype(self): - result = Series(["1H", "2H"], dtype="timedelta64[ns]") + result = Series(["1h", "2h"], dtype="timedelta64[ns]") assert result._mgr.blocks[0].is_extension is False result = Series(["2015"], dtype="datetime64[ns]") diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/test_formats.py pandas-2.2.2+dfsg/pandas/tests/series/test_formats.py --- pandas-2.1.4+dfsg/pandas/tests/series/test_formats.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/test_formats.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,577 @@ +from datetime import ( + datetime, + timedelta, +) + +import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype + +import pandas as pd +from pandas import ( + Categorical, + DataFrame, + Index, + Series, + date_range, + option_context, + period_range, + timedelta_range, +) +import pandas._testing as tm + + +class TestSeriesRepr: + def test_multilevel_name_print_0(self): + # GH#55415 None does not get printed, but 0 does + # (matching DataFrame and flat index behavior) + mi = pd.MultiIndex.from_product([range(2, 3), range(3, 4)], names=[0, None]) + ser = Series(1.5, index=mi) + + res = repr(ser) + expected = "0 \n2 3 1.5\ndtype: float64" + assert res == expected + + def test_multilevel_name_print(self, lexsorted_two_level_string_multiindex): + index = lexsorted_two_level_string_multiindex + ser = Series(range(len(index)), index=index, name="sth") + expected = [ + "first second", + "foo one 0", + " two 1", + " three 2", + "bar one 3", + " two 4", + "baz two 5", + " three 6", + "qux one 7", + " two 8", + " three 9", + "Name: sth, dtype: int64", + ] + expected = "\n".join(expected) + assert repr(ser) == expected + + def test_small_name_printing(self): + # Test small Series. + s = Series([0, 1, 2]) + + s.name = "test" + assert "Name: test" in repr(s) + + s.name = None + assert "Name:" not in repr(s) + + def test_big_name_printing(self): + # Test big Series (diff code path). 
+ s = Series(range(1000)) + + s.name = "test" + assert "Name: test" in repr(s) + + s.name = None + assert "Name:" not in repr(s) + + def test_empty_name_printing(self): + s = Series(index=date_range("20010101", "20020101"), name="test", dtype=object) + assert "Name: test" in repr(s) + + @pytest.mark.parametrize("args", [(), (0, -1)]) + def test_float_range(self, args): + str( + Series( + np.random.default_rng(2).standard_normal(1000), + index=np.arange(1000, *args), + ) + ) + + def test_empty_object(self): + # empty + str(Series(dtype=object)) + + def test_string(self, string_series): + str(string_series) + str(string_series.astype(int)) + + # with NaNs + string_series[5:7] = np.nan + str(string_series) + + def test_object(self, object_series): + str(object_series) + + def test_datetime(self, datetime_series): + str(datetime_series) + # with Nones + ots = datetime_series.astype("O") + ots[::2] = None + repr(ots) + + @pytest.mark.parametrize( + "name", + [ + "", + 1, + 1.2, + "foo", + "\u03B1\u03B2\u03B3", + "loooooooooooooooooooooooooooooooooooooooooooooooooooong", + ("foo", "bar", "baz"), + (1, 2), + ("foo", 1, 2.3), + ("\u03B1", "\u03B2", "\u03B3"), + ("\u03B1", "bar"), + ], + ) + def test_various_names(self, name, string_series): + # various names + string_series.name = name + repr(string_series) + + def test_tuple_name(self): + biggie = Series( + np.random.default_rng(2).standard_normal(1000), + index=np.arange(1000), + name=("foo", "bar", "baz"), + ) + repr(biggie) + + @pytest.mark.parametrize("arg", [100, 1001]) + def test_tidy_repr_name_0(self, arg): + # tidy repr + ser = Series(np.random.default_rng(2).standard_normal(arg), name=0) + rep_str = repr(ser) + assert "Name: 0" in rep_str + + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="TODO: investigate why this is failing" + ) + def test_newline(self): + ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) + assert "\t" not in repr(ser) + assert "\r" not in repr(ser) + assert "a\n" not in repr(ser) + + @pytest.mark.parametrize( + "name, expected", + [ + ["foo", "Series([], Name: foo, dtype: int64)"], + [None, "Series([], dtype: int64)"], + ], + ) + def test_empty_int64(self, name, expected): + # with empty series (#4651) + s = Series([], dtype=np.int64, name=name) + assert repr(s) == expected + + def test_repr_bool_fails(self, capsys): + s = Series( + [ + DataFrame(np.random.default_rng(2).standard_normal((2, 2))) + for i in range(5) + ] + ) + + # It works (with no Cython exception barf)! + repr(s) + + captured = capsys.readouterr() + assert captured.err == "" + + def test_repr_name_iterable_indexable(self): + s = Series([1, 2, 3], name=np.int64(3)) + + # it works! 
+ repr(s) + + s.name = ("\u05d0",) * 2 + repr(s) + + def test_repr_max_rows(self): + # GH 6863 + with option_context("display.max_rows", None): + str(Series(range(1001))) # should not raise exception + + def test_unicode_string_with_unicode(self): + df = Series(["\u05d0"], name="\u05d1") + str(df) + + ser = Series(["\u03c3"] * 10) + repr(ser) + + ser2 = Series(["\u05d0"] * 1000) + ser2.name = "title1" + repr(ser2) + + def test_str_to_bytes_raises(self): + # GH 26447 + df = Series(["abc"], name="abc") + msg = "^'str' object cannot be interpreted as an integer$" + with pytest.raises(TypeError, match=msg): + bytes(df) + + def test_timeseries_repr_object_dtype(self): + index = Index( + [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], dtype=object + ) + ts = Series(np.random.default_rng(2).standard_normal(len(index)), index) + repr(ts) + + ts = Series( + np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) + ) + assert repr(ts).splitlines()[-1].startswith("Freq:") + + ts2 = ts.iloc[np.random.default_rng(2).integers(0, len(ts) - 1, 400)] + repr(ts2).splitlines()[-1] + + def test_latex_repr(self): + pytest.importorskip("jinja2") # uses Styler implementation + result = r"""\begin{tabular}{ll} +\toprule + & 0 \\ +\midrule +0 & $\alpha$ \\ +1 & b \\ +2 & c \\ +\bottomrule +\end{tabular} +""" + with option_context( + "styler.format.escape", None, "styler.render.repr", "latex" + ): + s = Series([r"$\alpha$", "b", "c"]) + assert result == s._repr_latex_() + + assert s._repr_latex_() is None + + def test_index_repr_in_frame_with_nan(self): + # see gh-25061 + i = Index([1, np.nan]) + s = Series([1, 2], index=i) + exp = """1.0 1\nNaN 2\ndtype: int64""" + + assert repr(s) == exp + + def test_format_pre_1900_dates(self): + rng = date_range("1/1/1850", "1/1/1950", freq="YE-DEC") + msg = "DatetimeIndex.format is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + rng.format() + ts = Series(1, index=rng) + repr(ts) + + def test_series_repr_nat(self): + series = Series([0, 1000, 2000, pd.NaT._value], dtype="M8[ns]") + + result = repr(series) + expected = ( + "0 1970-01-01 00:00:00.000000\n" + "1 1970-01-01 00:00:00.000001\n" + "2 1970-01-01 00:00:00.000002\n" + "3 NaT\n" + "dtype: datetime64[ns]" + ) + assert result == expected + + def test_float_repr(self): + # GH#35603 + # check float format when cast to object + ser = Series([1.0]).astype(object) + expected = "0 1.0\ndtype: object" + assert repr(ser) == expected + + def test_different_null_objects(self): + # GH#45263 + ser = Series([1, 2, 3, 4], [True, None, np.nan, pd.NaT]) + result = repr(ser) + expected = "True 1\nNone 2\nNaN 3\nNaT 4\ndtype: int64" + assert result == expected + + +class TestCategoricalRepr: + def test_categorical_repr_unicode(self): + # see gh-21002 + + class County: + name = "San Sebastián" + state = "PR" + + def __repr__(self) -> str: + return self.name + ", " + self.state + + cat = Categorical([County() for _ in range(61)]) + idx = Index(cat) + ser = idx.to_series() + + repr(ser) + str(ser) + + def test_categorical_repr(self, using_infer_string): + a = Series(Categorical([1, 2, 3, 4])) + exp = ( + "0 1\n1 2\n2 3\n3 4\n" + "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" + ) + + assert exp == a.__str__() + + a = Series(Categorical(["a", "b"] * 25)) + if using_infer_string: + exp = ( + "0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" + "Length: 50, dtype: category\nCategories (2, string): [a, b]" + ) + else: + exp = ( + "0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" + "Length: 50, dtype: 
category\nCategories (2, object): ['a', 'b']" + ) + with option_context("display.max_rows", 5): + assert exp == repr(a) + + levs = list("abcdefghijklmnopqrstuvwxyz") + a = Series(Categorical(["a", "b"], categories=levs, ordered=True)) + if using_infer_string: + exp = ( + "0 a\n1 b\n" + "dtype: category\n" + "Categories (26, string): [a < b < c < d ... w < x < y < z]" + ) + else: + exp = ( + "0 a\n1 b\n" + "dtype: category\n" + "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... " + "'w' < 'x' < 'y' < 'z']" + ) + assert exp == a.__str__() + + def test_categorical_series_repr(self): + s = Series(Categorical([1, 2, 3])) + exp = """0 1 +1 2 +2 3 +dtype: category +Categories (3, int64): [1, 2, 3]""" + + assert repr(s) == exp + + s = Series(Categorical(np.arange(10))) + exp = f"""0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +dtype: category +Categories (10, {np.dtype(int)}): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" + + assert repr(s) == exp + + def test_categorical_series_repr_ordered(self): + s = Series(Categorical([1, 2, 3], ordered=True)) + exp = """0 1 +1 2 +2 3 +dtype: category +Categories (3, int64): [1 < 2 < 3]""" + + assert repr(s) == exp + + s = Series(Categorical(np.arange(10), ordered=True)) + exp = f"""0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +dtype: category +Categories (10, {np.dtype(int)}): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]""" + + assert repr(s) == exp + + def test_categorical_series_repr_datetime(self): + idx = date_range("2011-01-01 09:00", freq="h", periods=5) + s = Series(Categorical(idx)) + exp = """0 2011-01-01 09:00:00 +1 2011-01-01 10:00:00 +2 2011-01-01 11:00:00 +3 2011-01-01 12:00:00 +4 2011-01-01 13:00:00 +dtype: category +Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, + 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" # noqa: E501 + + assert repr(s) == exp + + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") + s = Series(Categorical(idx)) + exp = """0 2011-01-01 09:00:00-05:00 +1 2011-01-01 10:00:00-05:00 +2 2011-01-01 11:00:00-05:00 +3 2011-01-01 12:00:00-05:00 +4 2011-01-01 13:00:00-05:00 +dtype: category +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, + 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, + 2011-01-01 13:00:00-05:00]""" # noqa: E501 + + assert repr(s) == exp + + def test_categorical_series_repr_datetime_ordered(self): + idx = date_range("2011-01-01 09:00", freq="h", periods=5) + s = Series(Categorical(idx, ordered=True)) + exp = """0 2011-01-01 09:00:00 +1 2011-01-01 10:00:00 +2 2011-01-01 11:00:00 +3 2011-01-01 12:00:00 +4 2011-01-01 13:00:00 +dtype: category +Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < + 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa: E501 + + assert repr(s) == exp + + idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern") + s = Series(Categorical(idx, ordered=True)) + exp = """0 2011-01-01 09:00:00-05:00 +1 2011-01-01 10:00:00-05:00 +2 2011-01-01 11:00:00-05:00 +3 2011-01-01 12:00:00-05:00 +4 2011-01-01 13:00:00-05:00 +dtype: category +Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < + 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < + 2011-01-01 13:00:00-05:00]""" # noqa: E501 + + assert repr(s) == exp + + def test_categorical_series_repr_period(self): + idx = period_range("2011-01-01 09:00", freq="h", periods=5) + s = Series(Categorical(idx)) + exp = """0 
2011-01-01 09:00 +1 2011-01-01 10:00 +2 2011-01-01 11:00 +3 2011-01-01 12:00 +4 2011-01-01 13:00 +dtype: category +Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" # noqa: E501 + + assert repr(s) == exp + + idx = period_range("2011-01", freq="M", periods=5) + s = Series(Categorical(idx)) + exp = """0 2011-01 +1 2011-02 +2 2011-03 +3 2011-04 +4 2011-05 +dtype: category +Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" + + assert repr(s) == exp + + def test_categorical_series_repr_period_ordered(self): + idx = period_range("2011-01-01 09:00", freq="h", periods=5) + s = Series(Categorical(idx, ordered=True)) + exp = """0 2011-01-01 09:00 +1 2011-01-01 10:00 +2 2011-01-01 11:00 +3 2011-01-01 12:00 +4 2011-01-01 13:00 +dtype: category +Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" # noqa: E501 + + assert repr(s) == exp + + idx = period_range("2011-01", freq="M", periods=5) + s = Series(Categorical(idx, ordered=True)) + exp = """0 2011-01 +1 2011-02 +2 2011-03 +3 2011-04 +4 2011-05 +dtype: category +Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" + + assert repr(s) == exp + + def test_categorical_series_repr_timedelta(self): + idx = timedelta_range("1 days", periods=5) + s = Series(Categorical(idx)) + exp = """0 1 days +1 2 days +2 3 days +3 4 days +4 5 days +dtype: category +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" + + assert repr(s) == exp + + idx = timedelta_range("1 hours", periods=10) + s = Series(Categorical(idx)) + exp = """0 0 days 01:00:00 +1 1 days 01:00:00 +2 2 days 01:00:00 +3 3 days 01:00:00 +4 4 days 01:00:00 +5 5 days 01:00:00 +6 6 days 01:00:00 +7 7 days 01:00:00 +8 8 days 01:00:00 +9 9 days 01:00:00 +dtype: category +Categories (10, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, + 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, + 8 days 01:00:00, 9 days 01:00:00]""" # noqa: E501 + + assert repr(s) == exp + + def test_categorical_series_repr_timedelta_ordered(self): + idx = timedelta_range("1 days", periods=5) + s = Series(Categorical(idx, ordered=True)) + exp = """0 1 days +1 2 days +2 3 days +3 4 days +4 5 days +dtype: category +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" + + assert repr(s) == exp + + idx = timedelta_range("1 hours", periods=10) + s = Series(Categorical(idx, ordered=True)) + exp = """0 0 days 01:00:00 +1 1 days 01:00:00 +2 2 days 01:00:00 +3 3 days 01:00:00 +4 4 days 01:00:00 +5 5 days 01:00:00 +6 6 days 01:00:00 +7 7 days 01:00:00 +8 8 days 01:00:00 +9 9 days 01:00:00 +dtype: category +Categories (10, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < + 3 days 01:00:00 ... 
6 days 01:00:00 < 7 days 01:00:00 < + 8 days 01:00:00 < 9 days 01:00:00]""" # noqa: E501 + + assert repr(s) == exp diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/test_logical_ops.py pandas-2.2.2+dfsg/pandas/tests/series/test_logical_ops.py --- pandas-2.1.4+dfsg/pandas/tests/series/test_logical_ops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/test_logical_ops.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,6 +15,7 @@ class TestSeriesLogicalOps: + @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") @pytest.mark.parametrize("bool_op", [operator.and_, operator.or_, operator.xor]) def test_bool_operators_with_nas(self, bool_op): # boolean &, |, ^ should work with object arrays and propagate NAs @@ -39,11 +40,11 @@ s_empty = Series([], dtype=object) res = s_tft & s_empty - expected = s_fff + expected = s_fff.sort_index() tm.assert_series_equal(res, expected) res = s_tft | s_empty - expected = s_tft + expected = s_tft.sort_index() tm.assert_series_equal(res, expected) def test_logical_operators_int_dtype_with_int_dtype(self): @@ -145,7 +146,7 @@ expected = Series([False, True, True, True]) tm.assert_series_equal(result, expected) - def test_logical_operators_int_dtype_with_object(self): + def test_logical_operators_int_dtype_with_object(self, using_infer_string): # GH#9016: support bitwise op for integer types s_0123 = Series(range(4), dtype="int64") @@ -154,8 +155,14 @@ tm.assert_series_equal(result, expected) s_abNd = Series(["a", "b", np.nan, "d"]) - with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): - s_0123 & s_abNd + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + s_0123 & s_abNd + else: + with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): + s_0123 & s_abNd def test_logical_operators_bool_dtype_with_int(self): index = list("bca") @@ -353,7 +360,7 @@ result = op(ser, idx) tm.assert_series_equal(result, expected) - def test_logical_ops_label_based(self): + def test_logical_ops_label_based(self, using_infer_string): # GH#4947 # logical ops should be label based @@ -390,11 +397,11 @@ empty = Series([], dtype=object) result = a & empty.copy() - expected = Series([False, False, False], list("bca")) + expected = Series([False, False, False], list("abc")) tm.assert_series_equal(result, expected) result = a | empty.copy() - expected = Series([True, False, True], list("bca")) + expected = Series([True, True, False], list("abc")) tm.assert_series_equal(result, expected) # vs non-matching @@ -421,7 +428,17 @@ tm.assert_series_equal(result, a[a]) for e in [Series(["z"])]: - result = a[a | e] + warn = FutureWarning if using_infer_string else None + if using_infer_string: + import pyarrow as pa + + with tm.assert_produces_warning(warn, match="Operation between non"): + with pytest.raises( + pa.lib.ArrowNotImplementedError, match="has no kernel" + ): + result = a[a | e] + else: + result = a[a | e] tm.assert_series_equal(result, a[a]) # vs scalars @@ -513,3 +530,19 @@ result = ser1 ^ ser2 tm.assert_series_equal(result, expected) + + def test_pyarrow_numpy_string_invalid(self): + # GH#56008 + pytest.importorskip("pyarrow") + ser = Series([False, True]) + ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") + result = ser == ser2 + expected = Series(False, index=ser.index) + tm.assert_series_equal(result, expected) + + result = ser != ser2 + expected = Series(True, index=ser.index) + tm.assert_series_equal(result, expected) 
+ + with pytest.raises(TypeError, match="Invalid comparison"): + ser > ser2 diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/test_npfuncs.py pandas-2.2.2+dfsg/pandas/tests/series/test_npfuncs.py --- pandas-2.1.4+dfsg/pandas/tests/series/test_npfuncs.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/test_npfuncs.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,6 +5,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import Series import pandas._testing as tm @@ -33,3 +35,12 @@ expected = np.array([[3], [4]], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_log_arrow_backed_missing_value(): + # GH#56285 + ser = Series([1, 2, None], dtype="float64[pyarrow]") + result = np.log(ser) + expected = np.log(Series([1, 2, None], dtype="float64")) + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/test_reductions.py pandas-2.2.2+dfsg/pandas/tests/series/test_reductions.py --- pandas-2.1.4+dfsg/pandas/tests/series/test_reductions.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/test_reductions.py 2024-04-10 17:42:52.000000000 +0000 @@ -29,6 +29,28 @@ tm.assert_series_equal(res, ser) +def test_mode_nullable_dtype(any_numeric_ea_dtype): + # GH#55340 + ser = Series([1, 3, 2, pd.NA, 3, 2, pd.NA], dtype=any_numeric_ea_dtype) + result = ser.mode(dropna=False) + expected = Series([2, 3, pd.NA], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + result = ser.mode(dropna=True) + expected = Series([2, 3], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + ser[-1] = pd.NA + + result = ser.mode(dropna=True) + expected = Series([2, 3], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + result = ser.mode(dropna=False) + expected = Series([pd.NA], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(result, expected) + + def test_mode_infer_string(): # GH#56183 pytest.importorskip("pyarrow") @@ -60,7 +82,7 @@ def test_td64_summation_overflow(): # GH#9442 - ser = Series(pd.date_range("20130101", periods=100000, freq="H")) + ser = Series(pd.date_range("20130101", periods=100000, freq="h")) ser[0] += pd.Timedelta("1s 1ms") # mean @@ -141,17 +163,22 @@ np.sum(ser, keepdims=True) -def test_mean_with_convertible_string_raises(using_array_manager): +def test_mean_with_convertible_string_raises(using_array_manager, using_infer_string): # GH#44008 ser = Series(["1", "2"]) - assert ser.sum() == "12" - msg = "Could not convert string '12' to numeric" + if using_infer_string: + msg = "does not support" + with pytest.raises(TypeError, match=msg): + ser.sum() + else: + assert ser.sum() == "12" + msg = "Could not convert string '12' to numeric|does not support" with pytest.raises(TypeError, match=msg): ser.mean() df = ser.to_frame() if not using_array_manager: - msg = r"Could not convert \['12'\] to numeric" + msg = r"Could not convert \['12'\] to numeric|does not support" with pytest.raises(TypeError, match=msg): df.mean() @@ -162,29 +189,30 @@ if using_array_manager: msg = "Could not convert string 'J' to numeric" else: - msg = r"Could not convert \['J'\] to numeric" + msg = r"Could not convert \['J'\] to numeric|does not support" with pytest.raises(TypeError, match=msg): df.mean() with pytest.raises(TypeError, match=msg): df.agg("mean") - msg = "Could not convert string 'J' to numeric" + msg = "Could not convert string 'J' to numeric|does not support" with 
pytest.raises(TypeError, match=msg): df["db"].mean() + msg = "Could not convert string 'J' to numeric|ufunc 'divide'" with pytest.raises(TypeError, match=msg): np.mean(df["db"].astype("string").array) def test_median_with_convertible_string_raises(using_array_manager): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 - msg = r"Cannot convert \['1' '2' '3'\] to numeric" + msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support" ser = Series(["1", "2", "3"]) with pytest.raises(TypeError, match=msg): ser.median() if not using_array_manager: - msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric" + msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support" df = ser.to_frame() with pytest.raises(TypeError, match=msg): df.median() diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/test_repr.py pandas-2.2.2+dfsg/pandas/tests/series/test_repr.py --- pandas-2.1.4+dfsg/pandas/tests/series/test_repr.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/test_repr.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,551 +0,0 @@ -from datetime import ( - datetime, - timedelta, -) - -import numpy as np -import pytest - -import pandas as pd -from pandas import ( - Categorical, - DataFrame, - Index, - Series, - date_range, - option_context, - period_range, - timedelta_range, -) -import pandas._testing as tm - - -class TestSeriesRepr: - def test_multilevel_name_print(self, lexsorted_two_level_string_multiindex): - index = lexsorted_two_level_string_multiindex - ser = Series(range(len(index)), index=index, name="sth") - expected = [ - "first second", - "foo one 0", - " two 1", - " three 2", - "bar one 3", - " two 4", - "baz two 5", - " three 6", - "qux one 7", - " two 8", - " three 9", - "Name: sth, dtype: int64", - ] - expected = "\n".join(expected) - assert repr(ser) == expected - - def test_small_name_printing(self): - # Test small Series. - s = Series([0, 1, 2]) - - s.name = "test" - assert "Name: test" in repr(s) - - s.name = None - assert "Name:" not in repr(s) - - def test_big_name_printing(self): - # Test big Series (diff code path). 
- s = Series(range(1000)) - - s.name = "test" - assert "Name: test" in repr(s) - - s.name = None - assert "Name:" not in repr(s) - - def test_empty_name_printing(self): - s = Series(index=date_range("20010101", "20020101"), name="test", dtype=object) - assert "Name: test" in repr(s) - - @pytest.mark.parametrize("args", [(), (0, -1)]) - def test_float_range(self, args): - str( - Series( - np.random.default_rng(2).standard_normal(1000), - index=np.arange(1000, *args), - ) - ) - - def test_empty_object(self): - # empty - str(Series(dtype=object)) - - def test_string(self, string_series): - str(string_series) - str(string_series.astype(int)) - - # with NaNs - string_series[5:7] = np.nan - str(string_series) - - def test_object(self, object_series): - str(object_series) - - def test_datetime(self, datetime_series): - str(datetime_series) - # with Nones - ots = datetime_series.astype("O") - ots[::2] = None - repr(ots) - - @pytest.mark.parametrize( - "name", - [ - "", - 1, - 1.2, - "foo", - "\u03B1\u03B2\u03B3", - "loooooooooooooooooooooooooooooooooooooooooooooooooooong", - ("foo", "bar", "baz"), - (1, 2), - ("foo", 1, 2.3), - ("\u03B1", "\u03B2", "\u03B3"), - ("\u03B1", "bar"), - ], - ) - def test_various_names(self, name, string_series): - # various names - string_series.name = name - repr(string_series) - - def test_tuple_name(self): - biggie = Series( - np.random.default_rng(2).standard_normal(1000), - index=np.arange(1000), - name=("foo", "bar", "baz"), - ) - repr(biggie) - - @pytest.mark.parametrize("arg", [100, 1001]) - def test_tidy_repr_name_0(self, arg): - # tidy repr - ser = Series(np.random.default_rng(2).standard_normal(arg), name=0) - rep_str = repr(ser) - assert "Name: 0" in rep_str - - def test_newline(self): - ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) - assert "\t" not in repr(ser) - assert "\r" not in repr(ser) - assert "a\n" not in repr(ser) - - @pytest.mark.parametrize( - "name, expected", - [ - ["foo", "Series([], Name: foo, dtype: int64)"], - [None, "Series([], dtype: int64)"], - ], - ) - def test_empty_int64(self, name, expected): - # with empty series (#4651) - s = Series([], dtype=np.int64, name=name) - assert repr(s) == expected - - def test_tidy_repr(self): - a = Series(["\u05d0"] * 1000) - a.name = "title1" - repr(a) # should not raise exception - - def test_repr_bool_fails(self, capsys): - s = Series( - [ - DataFrame(np.random.default_rng(2).standard_normal((2, 2))) - for i in range(5) - ] - ) - - # It works (with no Cython exception barf)! - repr(s) - - captured = capsys.readouterr() - assert captured.err == "" - - def test_repr_name_iterable_indexable(self): - s = Series([1, 2, 3], name=np.int64(3)) - - # it works! - repr(s) - - s.name = ("\u05d0",) * 2 - repr(s) - - def test_repr_should_return_str(self): - # https://docs.python.org/3/reference/datamodel.html#object.__repr__ - # ...The return value must be a string object. 
- - # (str on py2.x, str (unicode) on py3) - - data = [8, 5, 3, 5] - index1 = ["\u03c3", "\u03c4", "\u03c5", "\u03c6"] - df = Series(data, index=index1) - assert type(df.__repr__() == str) # both py2 / 3 - - def test_repr_max_rows(self): - # GH 6863 - with option_context("display.max_rows", None): - str(Series(range(1001))) # should not raise exception - - def test_unicode_string_with_unicode(self): - df = Series(["\u05d0"], name="\u05d1") - str(df) - - def test_str_to_bytes_raises(self): - # GH 26447 - df = Series(["abc"], name="abc") - msg = "^'str' object cannot be interpreted as an integer$" - with pytest.raises(TypeError, match=msg): - bytes(df) - - def test_timeseries_repr_object_dtype(self): - index = Index( - [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], dtype=object - ) - ts = Series(np.random.default_rng(2).standard_normal(len(index)), index) - repr(ts) - - ts = tm.makeTimeSeries(1000) - assert repr(ts).splitlines()[-1].startswith("Freq:") - - ts2 = ts.iloc[np.random.default_rng(2).integers(0, len(ts) - 1, 400)] - repr(ts2).splitlines()[-1] - - def test_latex_repr(self): - pytest.importorskip("jinja2") # uses Styler implementation - result = r"""\begin{tabular}{ll} -\toprule - & 0 \\ -\midrule -0 & $\alpha$ \\ -1 & b \\ -2 & c \\ -\bottomrule -\end{tabular} -""" - with option_context( - "styler.format.escape", None, "styler.render.repr", "latex" - ): - s = Series([r"$\alpha$", "b", "c"]) - assert result == s._repr_latex_() - - assert s._repr_latex_() is None - - def test_index_repr_in_frame_with_nan(self): - # see gh-25061 - i = Index([1, np.nan]) - s = Series([1, 2], index=i) - exp = """1.0 1\nNaN 2\ndtype: int64""" - - assert repr(s) == exp - - def test_format_pre_1900_dates(self): - rng = date_range("1/1/1850", "1/1/1950", freq="A-DEC") - rng.format() - ts = Series(1, index=rng) - repr(ts) - - def test_series_repr_nat(self): - series = Series([0, 1000, 2000, pd.NaT._value], dtype="M8[ns]") - - result = repr(series) - expected = ( - "0 1970-01-01 00:00:00.000000\n" - "1 1970-01-01 00:00:00.000001\n" - "2 1970-01-01 00:00:00.000002\n" - "3 NaT\n" - "dtype: datetime64[ns]" - ) - assert result == expected - - def test_float_repr(self): - # GH#35603 - # check float format when cast to object - ser = Series([1.0]).astype(object) - expected = "0 1.0\ndtype: object" - assert repr(ser) == expected - - def test_different_null_objects(self): - # GH#45263 - ser = Series([1, 2, 3, 4], [True, None, np.nan, pd.NaT]) - result = repr(ser) - expected = "True 1\nNone 2\nNaN 3\nNaT 4\ndtype: int64" - assert result == expected - - -class TestCategoricalRepr: - def test_categorical_repr_unicode(self): - # see gh-21002 - - class County: - name = "San Sebastián" - state = "PR" - - def __repr__(self) -> str: - return self.name + ", " + self.state - - cat = Categorical([County() for _ in range(61)]) - idx = Index(cat) - ser = idx.to_series() - - repr(ser) - str(ser) - - def test_categorical_repr(self): - a = Series(Categorical([1, 2, 3, 4])) - exp = ( - "0 1\n1 2\n2 3\n3 4\n" - "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" - ) - - assert exp == a.__str__() - - a = Series(Categorical(["a", "b"] * 25)) - exp = ( - "0 a\n1 b\n" - " ..\n" - "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']" - ) - with option_context("display.max_rows", 5): - assert exp == repr(a) - - levs = list("abcdefghijklmnopqrstuvwxyz") - a = Series(Categorical(["a", "b"], categories=levs, ordered=True)) - exp = ( - "0 a\n1 b\n" - "dtype: category\n" - "Categories (26, object): 
['a' < 'b' < 'c' < 'd' ... 'w' < 'x' < 'y' < 'z']" - ) - assert exp == a.__str__() - - def test_categorical_series_repr(self): - s = Series(Categorical([1, 2, 3])) - exp = """0 1 -1 2 -2 3 -dtype: category -Categories (3, int64): [1, 2, 3]""" - - assert repr(s) == exp - - s = Series(Categorical(np.arange(10))) - exp = f"""0 0 -1 1 -2 2 -3 3 -4 4 -5 5 -6 6 -7 7 -8 8 -9 9 -dtype: category -Categories (10, {np.dtype(int)}): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" - - assert repr(s) == exp - - def test_categorical_series_repr_ordered(self): - s = Series(Categorical([1, 2, 3], ordered=True)) - exp = """0 1 -1 2 -2 3 -dtype: category -Categories (3, int64): [1 < 2 < 3]""" - - assert repr(s) == exp - - s = Series(Categorical(np.arange(10), ordered=True)) - exp = f"""0 0 -1 1 -2 2 -3 3 -4 4 -5 5 -6 6 -7 7 -8 8 -9 9 -dtype: category -Categories (10, {np.dtype(int)}): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]""" - - assert repr(s) == exp - - def test_categorical_series_repr_datetime(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) - s = Series(Categorical(idx)) - exp = """0 2011-01-01 09:00:00 -1 2011-01-01 10:00:00 -2 2011-01-01 11:00:00 -3 2011-01-01 12:00:00 -4 2011-01-01 13:00:00 -dtype: category -Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, - 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" # noqa: E501 - - assert repr(s) == exp - - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") - s = Series(Categorical(idx)) - exp = """0 2011-01-01 09:00:00-05:00 -1 2011-01-01 10:00:00-05:00 -2 2011-01-01 11:00:00-05:00 -3 2011-01-01 12:00:00-05:00 -4 2011-01-01 13:00:00-05:00 -dtype: category -Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, - 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, - 2011-01-01 13:00:00-05:00]""" # noqa: E501 - - assert repr(s) == exp - - def test_categorical_series_repr_datetime_ordered(self): - idx = date_range("2011-01-01 09:00", freq="H", periods=5) - s = Series(Categorical(idx, ordered=True)) - exp = """0 2011-01-01 09:00:00 -1 2011-01-01 10:00:00 -2 2011-01-01 11:00:00 -3 2011-01-01 12:00:00 -4 2011-01-01 13:00:00 -dtype: category -Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < - 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa: E501 - - assert repr(s) == exp - - idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") - s = Series(Categorical(idx, ordered=True)) - exp = """0 2011-01-01 09:00:00-05:00 -1 2011-01-01 10:00:00-05:00 -2 2011-01-01 11:00:00-05:00 -3 2011-01-01 12:00:00-05:00 -4 2011-01-01 13:00:00-05:00 -dtype: category -Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < - 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < - 2011-01-01 13:00:00-05:00]""" # noqa: E501 - - assert repr(s) == exp - - def test_categorical_series_repr_period(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) - s = Series(Categorical(idx)) - exp = """0 2011-01-01 09:00 -1 2011-01-01 10:00 -2 2011-01-01 11:00 -3 2011-01-01 12:00 -4 2011-01-01 13:00 -dtype: category -Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, - 2011-01-01 13:00]""" # noqa: E501 - - assert repr(s) == exp - - idx = period_range("2011-01", freq="M", periods=5) - s = Series(Categorical(idx)) - exp = """0 2011-01 -1 2011-02 -2 2011-03 -3 2011-04 -4 2011-05 -dtype: category -Categories (5, 
period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" - - assert repr(s) == exp - - def test_categorical_series_repr_period_ordered(self): - idx = period_range("2011-01-01 09:00", freq="H", periods=5) - s = Series(Categorical(idx, ordered=True)) - exp = """0 2011-01-01 09:00 -1 2011-01-01 10:00 -2 2011-01-01 11:00 -3 2011-01-01 12:00 -4 2011-01-01 13:00 -dtype: category -Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < - 2011-01-01 13:00]""" # noqa: E501 - - assert repr(s) == exp - - idx = period_range("2011-01", freq="M", periods=5) - s = Series(Categorical(idx, ordered=True)) - exp = """0 2011-01 -1 2011-02 -2 2011-03 -3 2011-04 -4 2011-05 -dtype: category -Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" - - assert repr(s) == exp - - def test_categorical_series_repr_timedelta(self): - idx = timedelta_range("1 days", periods=5) - s = Series(Categorical(idx)) - exp = """0 1 days -1 2 days -2 3 days -3 4 days -4 5 days -dtype: category -Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" - - assert repr(s) == exp - - idx = timedelta_range("1 hours", periods=10) - s = Series(Categorical(idx)) - exp = """0 0 days 01:00:00 -1 1 days 01:00:00 -2 2 days 01:00:00 -3 3 days 01:00:00 -4 4 days 01:00:00 -5 5 days 01:00:00 -6 6 days 01:00:00 -7 7 days 01:00:00 -8 8 days 01:00:00 -9 9 days 01:00:00 -dtype: category -Categories (10, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, - 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, - 8 days 01:00:00, 9 days 01:00:00]""" # noqa: E501 - - assert repr(s) == exp - - def test_categorical_series_repr_timedelta_ordered(self): - idx = timedelta_range("1 days", periods=5) - s = Series(Categorical(idx, ordered=True)) - exp = """0 1 days -1 2 days -2 3 days -3 4 days -4 5 days -dtype: category -Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" - - assert repr(s) == exp - - idx = timedelta_range("1 hours", periods=10) - s = Series(Categorical(idx, ordered=True)) - exp = """0 0 days 01:00:00 -1 1 days 01:00:00 -2 2 days 01:00:00 -3 3 days 01:00:00 -4 4 days 01:00:00 -5 5 days 01:00:00 -6 6 days 01:00:00 -7 7 days 01:00:00 -8 8 days 01:00:00 -9 9 days 01:00:00 -dtype: category -Categories (10, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < - 3 days 01:00:00 ... 
6 days 01:00:00 < 7 days 01:00:00 < - 8 days 01:00:00 < 9 days 01:00:00]""" # noqa: E501 - - assert repr(s) == exp diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/test_ufunc.py pandas-2.2.2+dfsg/pandas/tests/series/test_ufunc.py --- pandas-2.1.4+dfsg/pandas/tests/series/test_ufunc.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/test_ufunc.py 2024-04-10 17:42:52.000000000 +0000 @@ -274,7 +274,7 @@ if isinstance(values, pd.core.arrays.SparseArray): mark = pytest.mark.xfail(reason="SparseArray has no 'prod'") - request.node.add_marker(mark) + request.applymarker(mark) if values.dtype.kind in "iuf": result = np.multiply.reduce(obj) @@ -413,7 +413,7 @@ ser = pd.Series([1, 2, 3]) obj = np.array([1, 2, 3]) - with pytest.raises(NotImplementedError, match=tm.EMPTY_STRING_PATTERN): + with pytest.raises(NotImplementedError, match=""): np.subtract.outer(ser, obj) diff -Nru pandas-2.1.4+dfsg/pandas/tests/series/test_unary.py pandas-2.2.2+dfsg/pandas/tests/series/test_unary.py --- pandas-2.1.4+dfsg/pandas/tests/series/test_unary.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/series/test_unary.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,13 +8,11 @@ # __neg__, __pos__, __invert__ def test_neg(self): - ser = tm.makeStringSeries() - ser.name = "series" + ser = Series(range(5), dtype="float64", name="series") tm.assert_series_equal(-ser, -1 * ser) def test_invert(self): - ser = tm.makeStringSeries() - ser.name = "series" + ser = Series(range(5), dtype="float64", name="series") tm.assert_series_equal(-(ser < 0), ~(ser < 0)) @pytest.mark.parametrize( diff -Nru pandas-2.1.4+dfsg/pandas/tests/strings/conftest.py pandas-2.2.2+dfsg/pandas/tests/strings/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/strings/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/strings/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,3 @@ -import numpy as np import pytest from pandas import Series @@ -13,6 +12,10 @@ ("decode", ("UTF-8",), {}), ("encode", ("UTF-8",), {}), ("endswith", ("a",), {}), + ("endswith", ((),), {}), + ("endswith", (("a",),), {}), + ("endswith", (("a", "b"),), {}), + ("endswith", (("a", "MISSING"),), {}), ("endswith", ("a",), {"na": True}), ("endswith", ("a",), {"na": False}), ("extract", ("([a-z]*)",), {"expand": False}), @@ -44,6 +47,10 @@ ("split", (" ",), {"expand": False}), ("split", (" ",), {"expand": True}), ("startswith", ("a",), {}), + ("startswith", (("a",),), {}), + ("startswith", (("a", "b"),), {}), + ("startswith", (("a", "MISSING"),), {}), + ("startswith", ((),), {}), ("startswith", ("a",), {"na": True}), ("startswith", ("a",), {"na": False}), ("removeprefix", ("a",), {}), @@ -123,53 +130,3 @@ ... 
method(*args, **kwargs) """ return request.param - - -# subset of the full set from pandas/conftest.py -_any_allowed_skipna_inferred_dtype = [ - ("string", ["a", np.nan, "c"]), - ("bytes", [b"a", np.nan, b"c"]), - ("empty", [np.nan, np.nan, np.nan]), - ("empty", []), - ("mixed-integer", ["a", np.nan, 2]), -] -ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id - - -@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids) -def any_allowed_skipna_inferred_dtype(request): - """ - Fixture for all (inferred) dtypes allowed in StringMethods.__init__ - - The covered (inferred) types are: - * 'string' - * 'empty' - * 'bytes' - * 'mixed' - * 'mixed-integer' - - Returns - ------- - inferred_dtype : str - The string for the inferred dtype from _libs.lib.infer_dtype - values : np.ndarray - An array of object dtype that will be inferred to have - `inferred_dtype` - - Examples - -------- - >>> from pandas._libs import lib - >>> - >>> def test_something(any_allowed_skipna_inferred_dtype): - ... inferred_dtype, values = any_allowed_skipna_inferred_dtype - ... # will pass - ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype - ... - ... # constructor for .str-accessor will also pass - ... Series(values).str - """ - inferred_dtype, values = request.param - values = np.array(values, dtype=object) # object dtype to avoid casting - - # correctness of inference tested in tests/dtypes/test_inference.py - return inferred_dtype, values diff -Nru pandas-2.1.4+dfsg/pandas/tests/strings/test_api.py pandas-2.2.2+dfsg/pandas/tests/strings/test_api.py --- pandas-2.1.4+dfsg/pandas/tests/strings/test_api.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/strings/test_api.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,14 +1,66 @@ +import numpy as np import pytest from pandas import ( + CategoricalDtype, DataFrame, Index, MultiIndex, Series, _testing as tm, + option_context, ) from pandas.core.strings.accessor import StringMethods +# subset of the full set from pandas/conftest.py +_any_allowed_skipna_inferred_dtype = [ + ("string", ["a", np.nan, "c"]), + ("bytes", [b"a", np.nan, b"c"]), + ("empty", [np.nan, np.nan, np.nan]), + ("empty", []), + ("mixed-integer", ["a", np.nan, 2]), +] +ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id + + +@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids) +def any_allowed_skipna_inferred_dtype(request): + """ + Fixture for all (inferred) dtypes allowed in StringMethods.__init__ + + The covered (inferred) types are: + * 'string' + * 'empty' + * 'bytes' + * 'mixed' + * 'mixed-integer' + + Returns + ------- + inferred_dtype : str + The string for the inferred dtype from _libs.lib.infer_dtype + values : np.ndarray + An array of object dtype that will be inferred to have + `inferred_dtype` + + Examples + -------- + >>> from pandas._libs import lib + >>> + >>> def test_something(any_allowed_skipna_inferred_dtype): + ... inferred_dtype, values = any_allowed_skipna_inferred_dtype + ... # will pass + ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype + ... + ... # constructor for .str-accessor will also pass + ... 
Series(values).str + """ + inferred_dtype, values = request.param + values = np.array(values, dtype=object) # object dtype to avoid casting + + # correctness of inference tested in tests/dtypes/test_inference.py + return inferred_dtype, values + def test_api(any_string_dtype): # GH 6106, GH 9322 @@ -92,7 +144,7 @@ if reason is not None: mark = pytest.mark.xfail(raises=raises, reason=reason) - request.node.add_marker(mark) + request.applymarker(mark) t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) @@ -112,7 +164,8 @@ if inferred_dtype in allowed_types: # xref GH 23555, GH 23556 - method(*args, **kwargs) # works! + with option_context("future.no_silent_downcasting", True): + method(*args, **kwargs) # works! else: # GH 23011, GH 23163 msg = ( @@ -128,6 +181,7 @@ s = Series(list("aabb"), dtype=any_string_dtype) s = s + " " + s c = s.astype("category") + c = c.astype(CategoricalDtype(c.dtype.categories.astype("object"))) assert isinstance(c.str, StringMethods) method_name, args, kwargs = any_string_method diff -Nru pandas-2.1.4+dfsg/pandas/tests/strings/test_case_justify.py pandas-2.2.2+dfsg/pandas/tests/strings/test_case_justify.py --- pandas-2.1.4+dfsg/pandas/tests/strings/test_case_justify.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/strings/test_case_justify.py 2024-04-10 17:42:52.000000000 +0000 @@ -21,7 +21,8 @@ s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]) result = s.str.title() expected = Series( - ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan] + ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan], + dtype=object, ) tm.assert_almost_equal(result, expected) @@ -41,11 +42,15 @@ s = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) result = s.str.upper() - expected = Series(["A", np.nan, "B", np.nan, np.nan, "FOO", None, np.nan, np.nan]) + expected = Series( + ["A", np.nan, "B", np.nan, np.nan, "FOO", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) result = s.str.lower() - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) + expected = Series( + ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -71,7 +76,8 @@ s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]) result = s.str.capitalize() expected = Series( - ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan] + ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -87,7 +93,8 @@ s = Series(["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0]) result = s.str.swapcase() expected = Series( - ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan] + ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -138,19 +145,22 @@ result = s.str.pad(5, side="left") expected = Series( - [" a", np.nan, " b", np.nan, np.nan, " ee", None, np.nan, np.nan] + [" a", np.nan, " b", np.nan, np.nan, " ee", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) result = s.str.pad(5, side="right") expected = Series( - ["a ", np.nan, "b ", np.nan, np.nan, "ee ", None, np.nan, np.nan] + ["a ", np.nan, "b ", np.nan, np.nan, "ee ", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, 
expected) result = s.str.pad(5, side="both") expected = Series( - [" a ", np.nan, " b ", np.nan, np.nan, " ee ", None, np.nan, np.nan] + [" a ", np.nan, " b ", np.nan, np.nan, " ee ", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -238,7 +248,8 @@ None, np.nan, np.nan, - ] + ], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -255,7 +266,8 @@ None, np.nan, np.nan, - ] + ], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -272,7 +284,8 @@ None, np.nan, np.nan, - ] + ], + dtype=object, ) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/strings/test_cat.py pandas-2.2.2+dfsg/pandas/tests/strings/test_cat.py --- pandas-2.1.4+dfsg/pandas/tests/strings/test_cat.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/strings/test_cat.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, @@ -10,6 +12,7 @@ Series, _testing as tm, concat, + option_context, ) @@ -26,45 +29,49 @@ assert result.name == "name" -def test_str_cat(index_or_series): - box = index_or_series - # test_cat above tests "str_cat" from ndarray; - # here testing "str.cat" from Series/Index to ndarray/list - s = box(["a", "a", "b", "b", "c", np.nan]) - - # single array - result = s.str.cat() - expected = "aabbc" - assert result == expected - - result = s.str.cat(na_rep="-") - expected = "aabbc-" - assert result == expected - - result = s.str.cat(sep="_", na_rep="NA") - expected = "a_a_b_b_c_NA" - assert result == expected - - t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) - expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) - - # Series/Index with array - result = s.str.cat(t, na_rep="-") - tm.assert_equal(result, expected) - - # Series/Index with list - result = s.str.cat(list(t), na_rep="-") - tm.assert_equal(result, expected) +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) +def test_str_cat(index_or_series, infer_string): + with option_context("future.infer_string", infer_string): + box = index_or_series + # test_cat above tests "str_cat" from ndarray; + # here testing "str.cat" from Series/Index to ndarray/list + s = box(["a", "a", "b", "b", "c", np.nan]) + + # single array + result = s.str.cat() + expected = "aabbc" + assert result == expected + + result = s.str.cat(na_rep="-") + expected = "aabbc-" + assert result == expected + + result = s.str.cat(sep="_", na_rep="NA") + expected = "a_a_b_b_c_NA" + assert result == expected + + t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) + expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) + + # Series/Index with array + result = s.str.cat(t, na_rep="-") + tm.assert_equal(result, expected) + + # Series/Index with list + result = s.str.cat(list(t), na_rep="-") + tm.assert_equal(result, expected) + + # errors for incorrect lengths + rgx = r"If `others` contains arrays or lists \(or other list-likes.*" + z = Series(["1", "2", "3"]) - # errors for incorrect lengths - rgx = r"If `others` contains arrays or lists \(or other list-likes.*" - z = Series(["1", "2", "3"]) + with pytest.raises(ValueError, match=rgx): + s.str.cat(z.values) - with pytest.raises(ValueError, match=rgx): - s.str.cat(z.values) - - with pytest.raises(ValueError, match=rgx): - s.str.cat(list(z)) + with pytest.raises(ValueError, match=rgx): + s.str.cat(list(z)) def 
test_str_cat_raises_intuitive_error(index_or_series): @@ -78,39 +85,65 @@ s.str.cat(" ") +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) @pytest.mark.parametrize("sep", ["", None]) @pytest.mark.parametrize("dtype_target", ["object", "category"]) @pytest.mark.parametrize("dtype_caller", ["object", "category"]) -def test_str_cat_categorical(index_or_series, dtype_caller, dtype_target, sep): +def test_str_cat_categorical( + index_or_series, dtype_caller, dtype_target, sep, infer_string +): box = index_or_series - s = Index(["a", "a", "b", "a"], dtype=dtype_caller) - s = s if box == Index else Series(s, index=s) - t = Index(["b", "a", "b", "c"], dtype=dtype_target) - - expected = Index(["ab", "aa", "bb", "ac"]) - expected = expected if box == Index else Series(expected, index=s) - - # Series/Index with unaligned Index -> t.values - result = s.str.cat(t.values, sep=sep) - tm.assert_equal(result, expected) + with option_context("future.infer_string", infer_string): + s = Index(["a", "a", "b", "a"], dtype=dtype_caller) + s = s if box == Index else Series(s, index=s, dtype=s.dtype) + t = Index(["b", "a", "b", "c"], dtype=dtype_target) + + expected = Index( + ["ab", "aa", "bb", "ac"], dtype=object if dtype_caller == "object" else None + ) + expected = ( + expected + if box == Index + else Series( + expected, index=Index(s, dtype=dtype_caller), dtype=expected.dtype + ) + ) + + # Series/Index with unaligned Index -> t.values + result = s.str.cat(t.values, sep=sep) + tm.assert_equal(result, expected) + + # Series/Index with Series having matching Index + t = Series(t.values, index=Index(s, dtype=dtype_caller)) + result = s.str.cat(t, sep=sep) + tm.assert_equal(result, expected) + + # Series/Index with Series.values + result = s.str.cat(t.values, sep=sep) + tm.assert_equal(result, expected) + + # Series/Index with Series having different Index + t = Series(t.values, index=t.values) + expected = Index( + ["aa", "aa", "bb", "bb", "aa"], + dtype=object if dtype_caller == "object" else None, + ) + dtype = object if dtype_caller == "object" else s.dtype.categories.dtype + expected = ( + expected + if box == Index + else Series( + expected, + index=Index(expected.str[:1], dtype=dtype), + dtype=expected.dtype, + ) + ) - # Series/Index with Series having matching Index - t = Series(t.values, index=s) - result = s.str.cat(t, sep=sep) - tm.assert_equal(result, expected) - - # Series/Index with Series.values - result = s.str.cat(t.values, sep=sep) - tm.assert_equal(result, expected) - - # Series/Index with Series having different Index - t = Series(t.values, index=t.values) - expected = Index(["aa", "aa", "aa", "bb", "bb"]) - expected = expected if box == Index else Series(expected, index=expected.str[:1]) - - result = s.str.cat(t, sep=sep) - tm.assert_equal(result, expected) + result = s.str.cat(t, sep=sep) + tm.assert_equal(result, expected) @pytest.mark.parametrize( @@ -321,8 +354,9 @@ # all-NA target if box == Series: - expected = Series([np.nan] * 4, index=s.index, dtype=object) + expected = Series([np.nan] * 4, index=s.index, dtype=s.dtype) else: # box == Index + # TODO: Strimg option, this should return string dtype expected = Index([np.nan] * 4, dtype=object) result = s.str.cat(t, join="left") tm.assert_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/strings/test_extract.py pandas-2.2.2+dfsg/pandas/tests/strings/test_extract.py --- pandas-2.1.4+dfsg/pandas/tests/strings/test_extract.py 2023-12-08 14:17:35.000000000 +0000 +++ 
pandas-2.2.2+dfsg/pandas/tests/strings/test_extract.py 2024-04-10 17:42:52.000000000 +0000 @@ -47,13 +47,16 @@ # two groups result = ser.str.extract(".*(BAD[_]+).*(BAD)", expand=False) er = [np.nan, np.nan] # empty row - expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) + expected = DataFrame( + [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object + ) tm.assert_frame_equal(result, expected) # single group result = ser.str.extract(".*(BAD[_]+).*BAD", expand=False) expected = Series( - ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan] + ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -238,7 +241,9 @@ ) result = mixed.str.extract(".*(BAD[_]+).*(BAD)", expand=True) - expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) + expected = DataFrame( + [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object + ) tm.assert_frame_equal(result, expected) @@ -360,7 +365,7 @@ data = ["A1", "B2", "C"] if len(index) < len(data): - pytest.skip("Index too short") + pytest.skip(f"Index needs more than {len(data)} values") index = index[: len(data)] s = Series(data, index=index, dtype=any_string_dtype) @@ -603,8 +608,8 @@ # index.name doesn't affect to the result if any_string_dtype == "object": for idx in [ - Index(["a1a2", "b1", "c1"]), - Index(["a1a2", "b1", "c1"], name="xxx"), + Index(["a1a2", "b1", "c1"], dtype=object), + Index(["a1a2", "b1", "c1"], name="xxx", dtype=object), ]: result = idx.str.extractall(r"[ab](?P<digit>\d)") tm.assert_frame_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/strings/test_find_replace.py pandas-2.2.2+dfsg/pandas/tests/strings/test_find_replace.py --- pandas-2.1.4+dfsg/pandas/tests/strings/test_find_replace.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/strings/test_find_replace.py 2024-04-10 17:42:52.000000000 +0000 @@ -242,7 +242,7 @@ @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) -@pytest.mark.parametrize("dtype", [None, "category"]) +@pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) def test_startswith(pat, dtype, null_value, na): @@ -254,10 +254,10 @@ result = values.str.startswith(pat) exp = Series([False, np.nan, True, False, False, np.nan, True]) - if dtype is None and null_value is pd.NA: + if dtype == "object" and null_value is pd.NA: # GH#18463 exp = exp.fillna(null_value) - elif dtype is None and null_value is None: + elif dtype == "object" and null_value is None: exp[exp.isna()] = None tm.assert_series_equal(result, exp) @@ -300,7 +300,7 @@ @pytest.mark.parametrize("pat", ["foo", ("foo", "baz")]) -@pytest.mark.parametrize("dtype", [None, "category"]) +@pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) def test_endswith(pat, dtype, null_value, na): @@ -312,10 +312,10 @@ result = values.str.endswith(pat) exp = Series([False, np.nan, False, False, True, np.nan, True]) - if dtype is None and null_value is pd.NA: + if dtype == "object" and null_value is pd.NA: # GH#18463 - exp = exp.fillna(pd.NA) - elif dtype is None and null_value is None: + exp = exp.fillna(null_value) + elif dtype == "object" and null_value is None: exp[exp.isna()] = None tm.assert_series_equal(result, exp) @@
-382,7 +382,9 @@ ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) result = Series(ser).str.replace("BAD[_]*", "", regex=True) - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) + expected = Series( + ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -469,7 +471,9 @@ ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] ) result = Series(ser).str.replace(pat, "", regex=True) - expected = Series(["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]) + expected = Series( + ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -726,6 +730,15 @@ tm.assert_series_equal(result, expected) +def test_fullmatch_dollar_literal(any_string_dtype): + # GH 56652 + ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) + result = ser.str.fullmatch("foo\\$") + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" + expected = Series([False, False, np.nan, True], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + def test_fullmatch_na_kwarg(any_string_dtype): ser = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype @@ -913,7 +926,7 @@ # Series with non-string values s = Series(["a", "b", "c", 1.2]) table = str.maketrans("abc", "cde") - expected = Series(["c", "d", "e", np.nan]) + expected = Series(["c", "d", "e", np.nan], dtype=object) result = s.str.translate(table) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/strings/test_split_partition.py pandas-2.2.2+dfsg/pandas/tests/strings/test_split_partition.py --- pandas-2.1.4+dfsg/pandas/tests/strings/test_split_partition.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/strings/test_split_partition.py 2024-04-10 17:42:52.000000000 +0000 @@ -681,14 +681,16 @@ def test_get(): ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = ser.str.split("_").str.get(1) - expected = Series(["b", "d", np.nan, "g"]) + expected = Series(["b", "d", np.nan, "g"], dtype=object) tm.assert_series_equal(result, expected) def test_get_mixed_object(): ser = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0]) result = ser.str.split("_").str.get(1) - expected = Series(["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan]) + expected = Series( + ["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan], dtype=object + ) tm.assert_series_equal(result, expected) @@ -696,7 +698,7 @@ def test_get_bounds(idx): ser = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"]) result = ser.str.split("_").str.get(idx) - expected = Series(["3", "8", np.nan]) + expected = Series(["3", "8", np.nan], dtype=object) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/strings/test_string_array.py pandas-2.2.2+dfsg/pandas/tests/strings/test_string_array.py --- pandas-2.1.4+dfsg/pandas/tests/strings/test_string_array.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/strings/test_string_array.py 2024-04-10 17:42:52.000000000 +0000 @@ -8,6 +8,7 @@ DataFrame, Series, _testing as tm, + option_context, ) @@ -56,7 +57,8 @@ columns = expected.select_dtypes(include="object").columns assert all(result[columns].dtypes == nullable_string_dtype) result[columns] = result[columns].astype(object) - expected[columns] = expected[columns].fillna(NA) # GH#18463 + 
with option_context("future.no_silent_downcasting", True): + expected[columns] = expected[columns].fillna(NA) # GH#18463 tm.assert_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/strings/test_strings.py pandas-2.2.2+dfsg/pandas/tests/strings/test_strings.py --- pandas-2.1.4+dfsg/pandas/tests/strings/test_strings.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/strings/test_strings.py 2024-04-10 17:42:52.000000000 +0000 @@ -76,7 +76,8 @@ ser = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) result = ser.str.repeat(3) expected = Series( - ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, np.nan, np.nan] + ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -270,7 +271,8 @@ ) result = ser.str.split("_").str.join("_") expected = Series( - ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan] + ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan], + dtype=object, ) tm.assert_series_equal(result, expected) @@ -398,7 +400,7 @@ def test_slice_mixed_object(start, stop, step, expected): ser = Series(["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0]) result = ser.str.slice(start, stop, step) - expected = Series(expected) + expected = Series(expected, dtype=object) tm.assert_series_equal(result, expected) @@ -453,7 +455,7 @@ ser = Series([" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]) result = getattr(ser.str, method)() - expected = Series(exp + [np.nan, np.nan, None, np.nan, np.nan]) + expected = Series(exp + [np.nan, np.nan, None, np.nan, np.nan], dtype=object) tm.assert_series_equal(result, expected) @@ -529,7 +531,7 @@ def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") - expected = ser.map(lambda x: x.decode("utf-8")) + expected = ser.map(lambda x: x.decode("utf-8")).astype(object) tm.assert_series_equal(result, expected) @@ -559,7 +561,7 @@ ser.str.decode("cp1252") result = ser.str.decode("cp1252", "ignore") - expected = ser.map(lambda x: x.decode("cp1252", "ignore")) + expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object) tm.assert_series_equal(result, expected) @@ -672,7 +674,7 @@ def test_zfill(): # https://github.com/pandas-dev/pandas/issues/20868 value = Series(["-1", "1", "1000", 10, np.nan]) - expected = Series(["-01", "001", "1000", np.nan, np.nan]) + expected = Series(["-01", "001", "1000", np.nan, np.nan], dtype=object) tm.assert_series_equal(value.str.zfill(3), expected) value = Series(["-2", "+5"]) @@ -704,10 +706,10 @@ ] ) result = s.str.get("name") - expected = Series(["Hello", "Goodbye", None]) + expected = Series(["Hello", "Goodbye", None], dtype=object) tm.assert_series_equal(result, expected) result = s.str.get("value") - expected = Series(["World", "Planet", "Sea"]) + expected = Series(["World", "Planet", "Sea"], dtype=object) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/test_algos.py pandas-2.2.2+dfsg/pandas/tests/test_algos.py --- pandas-2.1.4+dfsg/pandas/tests/test_algos.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/test_algos.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,5 +1,4 @@ from datetime import datetime -from itertools import permutations import struct import numpy as np @@ -17,7 +16,7 @@ is_integer_dtype, is_object_dtype, ) -from 
pandas.core.dtypes.dtypes import CategoricalDtype as CDT +from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import ( @@ -50,6 +49,20 @@ class TestFactorize: + def test_factorize_complex(self): + # GH#17927 + array = [1, 2, 2 + 1j] + msg = "factorize with argument that is not not a Series" + with tm.assert_produces_warning(FutureWarning, match=msg): + labels, uniques = algos.factorize(array) + + expected_labels = np.array([0, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(labels, expected_labels) + + # Should return a complex dtype in the future + expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=object) + tm.assert_numpy_array_equal(uniques, expected_uniques) + @pytest.mark.parametrize("sort", [True, False]) def test_factorize(self, index_or_series_obj, sort): obj = index_or_series_obj @@ -523,6 +536,25 @@ tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_index_equal(uniques, expected_uniques) + def test_factorize_interval_non_nano(self, unit): + # GH#56099 + left = DatetimeIndex(["2016-01-01", np.nan, "2015-10-11"]).as_unit(unit) + right = DatetimeIndex(["2016-01-02", np.nan, "2015-10-15"]).as_unit(unit) + idx = IntervalIndex.from_arrays(left, right) + codes, cats = idx.factorize() + assert cats.dtype == f"interval[datetime64[{unit}], right]" + + ts = Timestamp(0).as_unit(unit) + idx2 = IntervalIndex.from_arrays(left - ts, right - ts) + codes2, cats2 = idx2.factorize() + assert cats2.dtype == f"interval[timedelta64[{unit}], right]" + + idx3 = IntervalIndex.from_arrays( + left.tz_localize("US/Pacific"), right.tz_localize("US/Pacific") + ) + codes3, cats3 = idx3.factorize() + assert cats3.dtype == f"interval[datetime64[{unit}, US/Pacific], right]" + class TestUnique: def test_ints(self): @@ -707,59 +739,31 @@ result = pd.unique(ci) tm.assert_index_equal(result, expected) - def test_datetime64tz_aware(self): + def test_datetime64tz_aware(self, unit): # GH 15939 - result = Series( - Index( - [ - Timestamp("20160101", tz="US/Eastern"), - Timestamp("20160101", tz="US/Eastern"), - ] - ) - ).unique() - expected = DatetimeArray._from_sequence( - np.array([Timestamp("2016-01-01 00:00:00-0500", tz="US/Eastern")]) - ) - tm.assert_extension_array_equal(result, expected) - - result = Index( + dti = Index( [ Timestamp("20160101", tz="US/Eastern"), Timestamp("20160101", tz="US/Eastern"), ] - ).unique() - expected = DatetimeIndex( - ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None - ) + ).as_unit(unit) + ser = Series(dti) + + result = ser.unique() + expected = dti[:1]._data + tm.assert_extension_array_equal(result, expected) + + result = dti.unique() + expected = dti[:1] tm.assert_index_equal(result, expected) - result = pd.unique( - Series( - Index( - [ - Timestamp("20160101", tz="US/Eastern"), - Timestamp("20160101", tz="US/Eastern"), - ] - ) - ) - ) - expected = DatetimeArray._from_sequence( - np.array([Timestamp("2016-01-01", tz="US/Eastern")]) - ) + result = pd.unique(ser) + expected = dti[:1]._data tm.assert_extension_array_equal(result, expected) - result = pd.unique( - Index( - [ - Timestamp("20160101", tz="US/Eastern"), - Timestamp("20160101", tz="US/Eastern"), - ] - ) - ) - expected = DatetimeIndex( - ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None - ) + result = pd.unique(dti) + expected = dti[:1] tm.assert_index_equal(result, expected) def test_order_of_appearance(self): @@ -772,23 +776,6 @@ result = pd.unique(Series([2] + [1] * 5)) tm.assert_numpy_array_equal(result, np.array([2, 
1], dtype="int64")) - result = pd.unique(Series([Timestamp("20160101"), Timestamp("20160101")])) - expected = np.array(["2016-01-01T00:00:00.000000000"], dtype="datetime64[ns]") - tm.assert_numpy_array_equal(result, expected) - - result = pd.unique( - Index( - [ - Timestamp("20160101", tz="US/Eastern"), - Timestamp("20160101", tz="US/Eastern"), - ] - ) - ) - expected = DatetimeIndex( - ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None - ) - tm.assert_index_equal(result, expected) - msg = "unique with argument that is not not a Series, Index," with tm.assert_produces_warning(FutureWarning, match=msg): result = pd.unique(list("aabc")) @@ -799,6 +786,25 @@ expected = Categorical(list("abc")) tm.assert_categorical_equal(result, expected) + def test_order_of_appearance_dt64(self, unit): + ser = Series([Timestamp("20160101"), Timestamp("20160101")]).dt.as_unit(unit) + result = pd.unique(ser) + expected = np.array(["2016-01-01T00:00:00.000000000"], dtype=f"M8[{unit}]") + tm.assert_numpy_array_equal(result, expected) + + def test_order_of_appearance_dt64tz(self, unit): + dti = DatetimeIndex( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ).as_unit(unit) + result = pd.unique(dti) + expected = DatetimeIndex( + ["2016-01-01 00:00:00"], dtype=f"datetime64[{unit}, US/Eastern]", freq=None + ) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( "arg ,expected", [ @@ -970,14 +976,7 @@ # Anything but object and we get all-False shortcut dta = date_range("2013-01-01", periods=3)._values - if dtype1 == "period[D]": - # TODO: fix Series.view to get this on its own - arr = dta.to_period("D") - elif dtype1 == "M8[ns, UTC]": - # TODO: fix Series.view to get this on its own - arr = dta.tz_localize("UTC") - else: - arr = Series(dta.view("i8")).view(dtype1)._values + arr = Series(dta.view("i8")).array.view(dtype1) comps = arr.view("i8").astype(dtype) @@ -993,6 +992,45 @@ expected[1] = True tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]", "period[D]"]) + def test_isin_datetimelike_all_nat(self, dtype): + # GH#56427 + dta = date_range("2013-01-01", periods=3)._values + arr = Series(dta.view("i8")).array.view(dtype) + + arr[0] = NaT + result = algos.isin(arr, [NaT]) + expected = np.array([True, False, False], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]"]) + def test_isin_datetimelike_strings_deprecated(self, dtype): + # GH#53111 + dta = date_range("2013-01-01", periods=3)._values + arr = Series(dta.view("i8")).array.view(dtype) + + vals = [str(x) for x in arr] + msg = "The behavior of 'isin' with dtype=.* is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = algos.isin(arr, vals) + assert res.all() + + vals2 = np.array(vals, dtype=str) + with tm.assert_produces_warning(FutureWarning, match=msg): + res2 = algos.isin(arr, vals2) + assert res2.all() + + def test_isin_dt64tz_with_nat(self): + # the all-NaT values used to get inferred to tznaive, which was evaluated + # as non-matching GH#56427 + dti = date_range("2016-01-01", periods=3, tz="UTC") + ser = Series(dti) + ser[0] = NaT + + res = algos.isin(ser._values, [NaT]) + exp = np.array([True, False, False], dtype=bool) + tm.assert_numpy_array_equal(res, exp) + def test_categorical_from_codes(self): # GH 16639 vals = np.array([0, 1, 2, 0]) @@ -1182,7 +1220,7 @@ with 
tm.assert_produces_warning(FutureWarning, match=msg): result = algos.value_counts(factor) breaks = [-1.606, -1.018, -0.431, 0.155, 0.741] - index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True)) + index = IntervalIndex.from_breaks(breaks).astype(CategoricalDtype(ordered=True)) expected = Series([1, 0, 2, 1], index=index, name="count") tm.assert_series_equal(result.sort_index(), expected.sort_index()) @@ -1230,21 +1268,27 @@ msg = "pandas.value_counts is deprecated" - for s in [td, dt]: + for ser in [td, dt]: with tm.assert_produces_warning(FutureWarning, match=msg): - vc = algos.value_counts(s) - vc_with_na = algos.value_counts(s, dropna=False) + vc = algos.value_counts(ser) + vc_with_na = algos.value_counts(ser, dropna=False) assert len(vc) == 1 assert len(vc_with_na) == 2 exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}, name="count") with tm.assert_produces_warning(FutureWarning, match=msg): - tm.assert_series_equal(algos.value_counts(dt), exp_dt) - # TODO same for (timedelta) + result_dt = algos.value_counts(dt) + tm.assert_series_equal(result_dt, exp_dt) + + exp_td = Series({np.timedelta64(10000): 1}, name="count") + with tm.assert_produces_warning(FutureWarning, match=msg): + result_td = algos.value_counts(td) + tm.assert_series_equal(result_td, exp_td) - def test_value_counts_datetime_outofbounds(self): + @pytest.mark.parametrize("dtype", [object, "M8[us]"]) + def test_value_counts_datetime_outofbounds(self, dtype): # GH 13663 - s = Series( + ser = Series( [ datetime(3000, 1, 1), datetime(5000, 1, 1), @@ -1252,22 +1296,18 @@ datetime(6000, 1, 1), datetime(3000, 1, 1), datetime(3000, 1, 1), - ] + ], + dtype=dtype, ) - res = s.value_counts() + res = ser.value_counts() exp_index = Index( [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)], - dtype=object, + dtype=dtype, ) exp = Series([3, 2, 1], index=exp_index, name="count") tm.assert_series_equal(res, exp) - # GH 12424 # TODO: belongs elsewhere - res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") - exp = Series(["2362-01-01", np.nan], dtype=object) - tm.assert_series_equal(res, exp) - def test_categorical(self): s = Series(Categorical(list("aaabbc"))) result = s.value_counts() @@ -1662,19 +1702,18 @@ class TestHashTable: @pytest.mark.parametrize( - "htable, tm_dtype", + "htable, data", [ - (ht.PyObjectHashTable, "String"), - (ht.StringHashTable, "String"), - (ht.Float64HashTable, "Float"), - (ht.Int64HashTable, "Int"), - (ht.UInt64HashTable, "UInt"), + (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), + (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), + (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), + (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), ], ) - def test_hashtable_unique(self, htable, tm_dtype, writable): + def test_hashtable_unique(self, htable, data, writable): # output of maker has guaranteed unique elements - maker = getattr(tm, "make" + tm_dtype + "Index") - s = Series(maker(1000)) + s = Series(data) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan @@ -1702,19 +1741,18 @@ tm.assert_numpy_array_equal(reconstr, s_duplicated.values) @pytest.mark.parametrize( - "htable, tm_dtype", + "htable, data", [ - (ht.PyObjectHashTable, "String"), - (ht.StringHashTable, "String"), - (ht.Float64HashTable, "Float"), - (ht.Int64HashTable, "Int"), - (ht.UInt64HashTable, "UInt"), + (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), + (ht.StringHashTable, 
[f"foo_{i}" for i in range(1000)]), + (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), + (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), + (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), ], ) - def test_hashtable_factorize(self, htable, tm_dtype, writable): + def test_hashtable_factorize(self, htable, writable, data): # output of maker has guaranteed unique elements - maker = getattr(tm, "make" + tm_dtype + "Index") - s = Series(maker(1000)) + s = Series(data) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan @@ -1799,212 +1837,6 @@ assert result == 1 -def test_pad_backfill_object_segfault(): - old = np.array([], dtype="O") - new = np.array([datetime(2010, 12, 31)], dtype="O") - - result = libalgos.pad["object"](old, new) - expected = np.array([-1], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - - result = libalgos.pad["object"](new, old) - expected = np.array([], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - - result = libalgos.backfill["object"](old, new) - expected = np.array([-1], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - - result = libalgos.backfill["object"](new, old) - expected = np.array([], dtype=np.intp) - tm.assert_numpy_array_equal(result, expected) - - -class TestTseriesUtil: - def test_backfill(self): - old = Index([1, 5, 10]) - new = Index(list(range(12))) - - filler = libalgos.backfill["int64_t"](old.values, new.values) - - expect_filler = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=np.intp) - tm.assert_numpy_array_equal(filler, expect_filler) - - # corner case - old = Index([1, 4]) - new = Index(list(range(5, 10))) - filler = libalgos.backfill["int64_t"](old.values, new.values) - - expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.intp) - tm.assert_numpy_array_equal(filler, expect_filler) - - def test_pad(self): - old = Index([1, 5, 10]) - new = Index(list(range(12))) - - filler = libalgos.pad["int64_t"](old.values, new.values) - - expect_filler = np.array([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=np.intp) - tm.assert_numpy_array_equal(filler, expect_filler) - - # corner case - old = Index([5, 10]) - new = Index(np.arange(5, dtype=np.int64)) - filler = libalgos.pad["int64_t"](old.values, new.values) - expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.intp) - tm.assert_numpy_array_equal(filler, expect_filler) - - -def test_is_lexsorted(): - failure = [ - np.array( - ([3] * 32) + ([2] * 32) + ([1] * 32) + ([0] * 32), - dtype="int64", - ), - np.array( - list(range(31))[::-1] * 4, - dtype="int64", - ), - ] - - assert not libalgos.is_lexsorted(failure) - - -def test_groupsort_indexer(): - a = np.random.default_rng(2).integers(0, 1000, 100).astype(np.intp) - b = np.random.default_rng(2).integers(0, 1000, 100).astype(np.intp) - - result = libalgos.groupsort_indexer(a, 1000)[0] - - # need to use a stable sort - # np.argsort returns int, groupsort_indexer - # always returns intp - expected = np.argsort(a, kind="mergesort") - expected = expected.astype(np.intp) - - tm.assert_numpy_array_equal(result, expected) - - # compare with lexsort - # np.lexsort returns int, groupsort_indexer - # always returns intp - key = a * 1000 + b - result = libalgos.groupsort_indexer(key, 1000000)[0] - expected = np.lexsort((b, a)) - expected = expected.astype(np.intp) - - tm.assert_numpy_array_equal(result, expected) - - -def test_infinity_sort(): - # GH 13445 - # numpy's argsort can be unhappy if something is less than - # itself. 
Instead, let's give our infinities a self-consistent - # ordering, but outside the float extended real line. - - Inf = libalgos.Infinity() - NegInf = libalgos.NegInfinity() - - ref_nums = [NegInf, float("-inf"), -1e100, 0, 1e100, float("inf"), Inf] - - assert all(Inf >= x for x in ref_nums) - assert all(Inf > x or x is Inf for x in ref_nums) - assert Inf >= Inf and Inf == Inf - assert not Inf < Inf and not Inf > Inf - assert libalgos.Infinity() == libalgos.Infinity() - assert not libalgos.Infinity() != libalgos.Infinity() - - assert all(NegInf <= x for x in ref_nums) - assert all(NegInf < x or x is NegInf for x in ref_nums) - assert NegInf <= NegInf and NegInf == NegInf - assert not NegInf < NegInf and not NegInf > NegInf - assert libalgos.NegInfinity() == libalgos.NegInfinity() - assert not libalgos.NegInfinity() != libalgos.NegInfinity() - - for perm in permutations(ref_nums): - assert sorted(perm) == ref_nums - - # smoke tests - np.array([libalgos.Infinity()] * 32).argsort() - np.array([libalgos.NegInfinity()] * 32).argsort() - - -def test_infinity_against_nan(): - Inf = libalgos.Infinity() - NegInf = libalgos.NegInfinity() - - assert not Inf > np.nan - assert not Inf >= np.nan - assert not Inf < np.nan - assert not Inf <= np.nan - assert not Inf == np.nan - assert Inf != np.nan - - assert not NegInf > np.nan - assert not NegInf >= np.nan - assert not NegInf < np.nan - assert not NegInf <= np.nan - assert not NegInf == np.nan - assert NegInf != np.nan - - -def test_ensure_platform_int(): - arr = np.arange(100, dtype=np.intp) - - result = libalgos.ensure_platform_int(arr) - assert result is arr - - -def test_int64_add_overflow(): - # see gh-14068 - msg = "Overflow in int64 addition" - m = np.iinfo(np.int64).max - n = np.iinfo(np.int64).min - - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), m) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m])) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([n, n]), n) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([n, n]), np.array([n, n])) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, n]), np.array([n, n])) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr( - np.array([m, m]), np.array([m, m]), arr_mask=np.array([False, True]) - ) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr( - np.array([m, m]), np.array([m, m]), b_mask=np.array([False, True]) - ) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr( - np.array([m, m]), - np.array([m, m]), - arr_mask=np.array([False, True]), - b_mask=np.array([False, True]), - ) - with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), np.array([np.nan, m])) - - # Check that the nan boolean arrays override whether or not - # the addition overflows. We don't check the result but just - # the fact that an OverflowError is not raised. 
- algos.checked_add_with_arr( - np.array([m, m]), np.array([m, m]), arr_mask=np.array([True, True]) - ) - algos.checked_add_with_arr( - np.array([m, m]), np.array([m, m]), b_mask=np.array([True, True]) - ) - algos.checked_add_with_arr( - np.array([m, m]), - np.array([m, m]), - arr_mask=np.array([True, False]), - b_mask=np.array([False, True]), - ) - - class TestMode: def test_no_mode(self): exp = Series([], dtype=np.float64, index=Index([], dtype=int)) diff -Nru pandas-2.1.4+dfsg/pandas/tests/test_downstream.py pandas-2.2.2+dfsg/pandas/tests/test_downstream.py --- pandas-2.1.4+dfsg/pandas/tests/test_downstream.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/test_downstream.py 2024-04-10 17:42:52.000000000 +0000 @@ -23,8 +23,6 @@ DatetimeArray, TimedeltaArray, ) -from pandas.core.arrays.datetimes import _sequence_to_dt64ns -from pandas.core.arrays.timedeltas import sequence_to_td64ns @pytest.fixture @@ -102,7 +100,7 @@ def test_xarray_cftimeindex_nearest(): # https://github.com/pydata/xarray/issues/3751 cftime = pytest.importorskip("cftime") - xarray = pytest.importorskip("xarray", minversion="0.21.0") + xarray = pytest.importorskip("xarray") times = xarray.cftime_range("0001", periods=2) key = cftime.DatetimeGregorian(2000, 1, 1) @@ -163,15 +161,11 @@ seaborn.stripplot(x="day", y="total_bill", data=tips) -def test_pandas_gbq(): - # Older versions import from non-public, non-existent pandas funcs - pytest.importorskip("pandas_gbq", minversion="0.10.0") - - def test_pandas_datareader(): pytest.importorskip("pandas_datareader") +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_pyarrow(df): pyarrow = pytest.importorskip("pyarrow") table = pyarrow.Table.from_pandas(df) @@ -310,15 +304,12 @@ cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype] - expected = cls(arr) - result = cls._from_sequence(data) + depr_msg = f"{cls.__name__}.__init__ is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + expected = cls(arr) + result = cls._from_sequence(data, dtype=dtype) tm.assert_extension_array_equal(result, expected) - func = {"M8[ns]": _sequence_to_dt64ns, "m8[ns]": sequence_to_td64ns}[dtype] - result = func(arr)[0] - expected = func(data)[0] - tm.assert_equal(result, expected) - if not isinstance(data, memoryview): # FIXME(GH#44431) these raise on memoryview and attempted fix # fails on py3.10 diff -Nru pandas-2.1.4+dfsg/pandas/tests/test_expressions.py pandas-2.2.2+dfsg/pandas/tests/test_expressions.py --- pandas-2.1.4+dfsg/pandas/tests/test_expressions.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/test_expressions.py 2024-04-10 17:42:52.000000000 +0000 @@ -102,12 +102,6 @@ @pytest.mark.skipif(not expr.USE_NUMEXPR, reason="not using numexpr") class TestExpressions: - @pytest.fixture(autouse=True) - def save_min_elements(self): - min_elements = expr._MIN_ELEMENTS - yield - expr._MIN_ELEMENTS = min_elements - @staticmethod def call_op(df, other, flex: bool, opname: str): if flex: @@ -140,21 +134,24 @@ @pytest.mark.parametrize( "arith", ["add", "sub", "mul", "mod", "truediv", "floordiv"] ) - def test_run_arithmetic(self, request, fixture, flex, arith): + def test_run_arithmetic(self, request, fixture, flex, arith, monkeypatch): df = request.getfixturevalue(fixture) - expr._MIN_ELEMENTS = 0 - result, expected = self.call_op(df, df, flex, arith) - - if arith == "truediv": - assert all(x.kind == "f" for x in expected.dtypes.values) - tm.assert_equal(expected, result) + with 
monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 0) + result, expected = self.call_op(df, df, flex, arith) - for i in range(len(df.columns)): - result, expected = self.call_op(df.iloc[:, i], df.iloc[:, i], flex, arith) if arith == "truediv": - assert expected.dtype.kind == "f" + assert all(x.kind == "f" for x in expected.dtypes.values) tm.assert_equal(expected, result) + for i in range(len(df.columns)): + result, expected = self.call_op( + df.iloc[:, i], df.iloc[:, i], flex, arith + ) + if arith == "truediv": + assert expected.dtype.kind == "f" + tm.assert_equal(expected, result) + @pytest.mark.parametrize( "fixture", [ @@ -168,7 +165,7 @@ ], ) @pytest.mark.parametrize("flex", [True, False]) - def test_run_binary(self, request, fixture, flex, comparison_op): + def test_run_binary(self, request, fixture, flex, comparison_op, monkeypatch): """ tests solely that the result is the same whether or not numexpr is enabled. Need to test whether the function does the correct thing @@ -179,18 +176,19 @@ with option_context("compute.use_numexpr", False): other = df.copy() + 1 - expr._MIN_ELEMENTS = 0 - expr.set_test_mode(True) + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 0) + expr.set_test_mode(True) - result, expected = self.call_op(df, other, flex, arith) + result, expected = self.call_op(df, other, flex, arith) + + used_numexpr = expr.get_test_result() + assert used_numexpr, "Did not use numexpr as expected." + tm.assert_equal(expected, result) - used_numexpr = expr.get_test_result() - assert used_numexpr, "Did not use numexpr as expected." - tm.assert_equal(expected, result) - - for i in range(len(df.columns)): - binary_comp = other.iloc[:, i] + 1 - self.call_op(df.iloc[:, i], binary_comp, flex, "add") + for i in range(len(df.columns)): + binary_comp = other.iloc[:, i] + 1 + self.call_op(df.iloc[:, i], binary_comp, flex, "add") def test_invalid(self): array = np.random.default_rng(2).standard_normal(1_000_001) @@ -406,7 +404,7 @@ "arith", ("add", "sub", "mul", "mod", "truediv", "floordiv") ) @pytest.mark.parametrize("axis", (0, 1)) - def test_frame_series_axis(self, axis, arith, _frame): + def test_frame_series_axis(self, axis, arith, _frame, monkeypatch): # GH#26736 Dataframe.floordiv(Series, axis=1) fails df = _frame @@ -415,15 +413,16 @@ else: other = df.iloc[:, 0] - expr._MIN_ELEMENTS = 0 + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 0) - op_func = getattr(df, arith) + op_func = getattr(df, arith) - with option_context("compute.use_numexpr", False): - expected = op_func(other, axis=axis) + with option_context("compute.use_numexpr", False): + expected = op_func(other, axis=axis) - result = op_func(other, axis=axis) - tm.assert_frame_equal(expected, result) + result = op_func(other, axis=axis) + tm.assert_frame_equal(expected, result) @pytest.mark.parametrize( "op", @@ -436,29 +435,32 @@ ) @pytest.mark.parametrize("box", [DataFrame, Series, Index]) @pytest.mark.parametrize("scalar", [-5, 5]) - def test_python_semantics_with_numexpr_installed(self, op, box, scalar): + def test_python_semantics_with_numexpr_installed( + self, op, box, scalar, monkeypatch + ): # https://github.com/pandas-dev/pandas/issues/36047 - expr._MIN_ELEMENTS = 0 - data = np.arange(-50, 50) - obj = box(data) - method = getattr(obj, op) - result = method(scalar) - - # compare result with numpy - with option_context("compute.use_numexpr", False): - expected = method(scalar) - - tm.assert_equal(result, expected) - - # compare result element-wise with Python - for i, 
elem in enumerate(data): - if box == DataFrame: - scalar_result = result.iloc[i, 0] - else: - scalar_result = result[i] - try: - expected = getattr(int(elem), op)(scalar) - except ZeroDivisionError: - pass - else: - assert scalar_result == expected + with monkeypatch.context() as m: + m.setattr(expr, "_MIN_ELEMENTS", 0) + data = np.arange(-50, 50) + obj = box(data) + method = getattr(obj, op) + result = method(scalar) + + # compare result with numpy + with option_context("compute.use_numexpr", False): + expected = method(scalar) + + tm.assert_equal(result, expected) + + # compare result element-wise with Python + for i, elem in enumerate(data): + if box == DataFrame: + scalar_result = result.iloc[i, 0] + else: + scalar_result = result[i] + try: + expected = getattr(int(elem), op)(scalar) + except ZeroDivisionError: + pass + else: + assert scalar_result == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/test_nanops.py pandas-2.2.2+dfsg/pandas/tests/test_nanops.py --- pandas-2.1.4+dfsg/pandas/tests/test_nanops.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/test_nanops.py 2024-04-10 17:42:52.000000000 +0000 @@ -14,7 +14,6 @@ ) import pandas._testing as tm from pandas.core import nanops -from pandas.core.arrays import DatetimeArray use_bn = nanops._USE_BOTTLENECK @@ -1113,13 +1112,13 @@ dti = pd.date_range("2016-01-01", periods=3).as_unit(unit) expected = dti[1] - for obj in [dti, DatetimeArray(dti), Series(dti)]: + for obj in [dti, dti._data]: result = nanops.nanmean(obj) assert result == expected dti2 = dti.insert(1, pd.NaT) - for obj in [dti2, DatetimeArray(dti2), Series(dti2)]: + for obj in [dti2, dti2._data]: result = nanops.nanmean(obj) assert result == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/test_optional_dependency.py pandas-2.2.2+dfsg/pandas/tests/test_optional_dependency.py --- pandas-2.1.4+dfsg/pandas/tests/test_optional_dependency.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/test_optional_dependency.py 2024-04-10 17:42:52.000000000 +0000 @@ -50,6 +50,20 @@ result = import_optional_dependency("fakemodule") assert result is module + with pytest.raises(ImportError, match="Pandas requires version '1.1.0'"): + import_optional_dependency("fakemodule", min_version="1.1.0") + + with tm.assert_produces_warning(UserWarning): + result = import_optional_dependency( + "fakemodule", errors="warn", min_version="1.1.0" + ) + assert result is None + + result = import_optional_dependency( + "fakemodule", errors="ignore", min_version="1.1.0" + ) + assert result is None + def test_submodule(monkeypatch): # Create a fake module with a submodule diff -Nru pandas-2.1.4+dfsg/pandas/tests/test_register_accessor.py pandas-2.2.2+dfsg/pandas/tests/test_register_accessor.py --- pandas-2.1.4+dfsg/pandas/tests/test_register_accessor.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/test_register_accessor.py 2024-04-10 17:42:52.000000000 +0000 @@ -82,19 +82,13 @@ def test_overwrite_warns(): - # Need to restore mean - mean = pd.Series.mean - try: - with tm.assert_produces_warning(UserWarning) as w: - pd.api.extensions.register_series_accessor("mean")(MyAccessor) + match = r".*MyAccessor.*fake.*Series.*" + with tm.assert_produces_warning(UserWarning, match=match): + with ensure_removed(pd.Series, "fake"): + setattr(pd.Series, "fake", 123) + pd.api.extensions.register_series_accessor("fake")(MyAccessor) s = pd.Series([1, 2]) - assert s.mean.prop == "item" - msg = str(w[0].message) - assert "mean" in msg - assert "MyAccessor" 
in msg - assert "Series" in msg - finally: - pd.Series.mean = mean + assert s.fake.prop == "item" def test_raises_attribute_error(): diff -Nru pandas-2.1.4+dfsg/pandas/tests/test_sorting.py pandas-2.2.2+dfsg/pandas/tests/test_sorting.py --- pandas-2.1.4+dfsg/pandas/tests/test_sorting.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/test_sorting.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,11 +5,6 @@ import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_windows, -) - from pandas import ( NA, DataFrame, @@ -111,7 +106,7 @@ gr = df.groupby(list("abcde")) # verify this is testing what it is supposed to test! - assert is_int64_overflow_possible(gr.grouper.shape) + assert is_int64_overflow_possible(gr._grouper.shape) mi = MultiIndex.from_arrays( [ar.ravel() for ar in np.array_split(np.unique(arr, axis=0), 5, axis=1)], @@ -409,11 +404,6 @@ tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_codes, expected_codes) - @pytest.mark.skipif( - is_platform_windows() and is_ci_environment(), - reason="In CI environment can crash thread with: " - "Windows fatal exception: access violation", - ) def test_codes_out_of_bound(self): values = np.array([3, 1, 2, 0, 4]) expected = np.array([0, 1, 2, 3, 4]) diff -Nru pandas-2.1.4+dfsg/pandas/tests/tools/test_to_datetime.py pandas-2.2.2+dfsg/pandas/tests/tools/test_to_datetime.py --- pandas-2.1.4+dfsg/pandas/tests/tools/test_to_datetime.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tools/test_to_datetime.py 2024-04-10 17:42:52.000000000 +0000 @@ -57,6 +57,10 @@ r"alongside this." ) +pytestmark = pytest.mark.filterwarnings( + "ignore:errors='ignore' is deprecated:FutureWarning" +) + @pytest.fixture(params=[True, False]) def cache(request): @@ -94,10 +98,7 @@ values = index_or_series(["1/1/2000", "1/2/2000", "1/3/2000"]) result = to_datetime(values, format=format, cache=cache) expected = index_or_series(expected) - if isinstance(expected, Series): - tm.assert_series_equal(result, expected) - else: - tm.assert_index_equal(result, expected) + tm.assert_equal(result, expected) @pytest.mark.parametrize( "arg, expected, format", @@ -182,7 +183,7 @@ errors="ignore", cache=cache, ) - expected = Index(["15010101", "20150101", np.nan]) + expected = Index(["15010101", "20150101", np.nan], dtype=object) tm.assert_index_equal(result, expected) def test_to_datetime_format_YYYYMMDD_coercion(self, cache): @@ -603,6 +604,20 @@ expected = to_datetime([d1, d2]).tz_convert(timezone(timedelta(minutes=-60))) tm.assert_index_equal(res, expected) + def test_to_datetime_mixed_string_and_numeric(self): + # GH#55780 np.array(vals) would incorrectly cast the number to str + vals = ["2016-01-01", 0] + expected = DatetimeIndex([Timestamp(x) for x in vals]) + result = to_datetime(vals, format="mixed") + result2 = to_datetime(vals[::-1], format="mixed")[::-1] + result3 = DatetimeIndex(vals) + result4 = DatetimeIndex(vals[::-1])[::-1] + + tm.assert_index_equal(result, expected) + tm.assert_index_equal(result2, expected) + tm.assert_index_equal(result3, expected) + tm.assert_index_equal(result4, expected) + @pytest.mark.parametrize( "format", ["%Y-%m-%d", "%Y-%d-%m"], ids=["ISO8601", "non-ISO8601"] ) @@ -610,7 +625,7 @@ # https://github.com/pandas-dev/pandas/issues/50108 d1 = date(2020, 1, 2) res = to_datetime(["2020-01-01", d1], format=format) - expected = DatetimeIndex(["2020-01-01", "2020-01-02"]) + expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[ns]") 
tm.assert_index_equal(res, expected) @pytest.mark.parametrize( @@ -976,7 +991,7 @@ def test_to_datetime_dtarr(self, tz): # DatetimeArray dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz) - arr = DatetimeArray(dti) + arr = dti._data result = to_datetime(arr) assert result is arr @@ -1025,7 +1040,7 @@ # See GH#18666 with tm.set_timezone("US/Eastern"): # GH#18705 - now = Timestamp("now") + now = Timestamp("now").as_unit("ns") pdnow = to_datetime("now") pdnow2 = to_datetime(["now"])[0] @@ -1051,7 +1066,7 @@ pdtoday = to_datetime("today") pdtoday2 = to_datetime(["today"])[0] - tstoday = Timestamp("today") + tstoday = Timestamp("today").as_unit("ns") tstoday2 = Timestamp.today().as_unit("ns") # These should all be equal with infinite perf; this gives @@ -1125,10 +1140,11 @@ assert ts.unit == "s" assert ts.asm8 == dt + @pytest.mark.skip_ubsan def test_to_datetime_dt64d_out_of_bounds(self, cache): dt64 = np.datetime64(np.iinfo(np.int64).max, "D") - msg = "Out of bounds nanosecond timestamp" + msg = "Out of bounds second timestamp: 25252734927768524-07-27" with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp(dt64) with pytest.raises(OutOfBoundsDatetime, match=msg): @@ -1187,6 +1203,16 @@ expected = np.datetime64("9999-01-01") assert result == expected + def test_out_of_bounds_errors_ignore2(self): + # GH#12424 + msg = "errors='ignore' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = to_datetime( + Series(["2362-01-01", np.nan], dtype=object), errors="ignore" + ) + exp = Series(["2362-01-01", np.nan], dtype=object) + tm.assert_series_equal(res, exp) + def test_to_datetime_tz(self, cache): # xref 8260 # uniform returns a DatetimeIndex @@ -1214,7 +1240,9 @@ with pytest.raises(ValueError, match=msg): to_datetime(arr, cache=cache) - result = to_datetime(arr, cache=cache, errors="ignore") + depr_msg = "errors='ignore' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result = to_datetime(arr, cache=cache, errors="ignore") expected = Index( [ Timestamp("2013-01-01 13:00:00-08:00"), @@ -1321,7 +1349,9 @@ ], ) def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, dtype): - expected = Series([Timestamp("2013-01-01 01:00:00", tz="UTC")]) + expected = Series( + [Timestamp("2013-01-01 01:00:00", tz="UTC")], dtype="M8[ns, UTC]" + ) result = to_datetime(Series([date], dtype=dtype), utc=True, cache=cache) tm.assert_series_equal(result, expected) @@ -1460,11 +1490,15 @@ warn = UserWarning else: warn = None - with tm.assert_produces_warning(warn, match="Could not infer format"): + with tm.assert_produces_warning( + warn, match="Could not infer format", raise_on_extra_warnings=False + ): res = to_datetime(values, errors="ignore", format=format) - tm.assert_index_equal(res, Index(values)) + tm.assert_index_equal(res, Index(values, dtype=object)) - with tm.assert_produces_warning(warn, match="Could not infer format"): + with tm.assert_produces_warning( + warn, match="Could not infer format", raise_on_extra_warnings=False + ): res = to_datetime(values, errors="coerce", format=format) tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values))) @@ -1479,7 +1513,9 @@ ] ) with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(warn, match="Could not infer format"): + with tm.assert_produces_warning( + warn, match="Could not infer format", raise_on_extra_warnings=False + ): to_datetime(values, errors="raise", format=format) @pytest.mark.parametrize("utc", [True, None]) @@ -1554,28 +1590,23 @@ 
@pytest.mark.parametrize("cache", [True, False]) @pytest.mark.parametrize( - ("input", "expected"), - ( - ( - Series([NaT] * 20 + [None] * 20, dtype="object"), - Series([NaT] * 40, dtype="datetime64[ns]"), - ), - ( - Series([NaT] * 60 + [None] * 60, dtype="object"), - Series([NaT] * 120, dtype="datetime64[ns]"), - ), - (Series([None] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([None] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - (Series([""] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([""] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - (Series([pd.NA] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([pd.NA] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - (Series([np.nan] * 20), Series([NaT] * 20, dtype="datetime64[ns]")), - (Series([np.nan] * 60), Series([NaT] * 60, dtype="datetime64[ns]")), - ), + "input", + [ + Series([NaT] * 20 + [None] * 20, dtype="object"), + Series([NaT] * 60 + [None] * 60, dtype="object"), + Series([None] * 20), + Series([None] * 60), + Series([""] * 20), + Series([""] * 60), + Series([pd.NA] * 20), + Series([pd.NA] * 60), + Series([np.nan] * 20), + Series([np.nan] * 60), + ], ) - def test_to_datetime_converts_null_like_to_nat(self, cache, input, expected): + def test_to_datetime_converts_null_like_to_nat(self, cache, input): # GH35888 + expected = Series([NaT] * len(input), dtype="M8[ns]") result = to_datetime(input, cache=cache) tm.assert_series_equal(result, expected) @@ -1641,14 +1672,16 @@ "errors, expected", [ ("coerce", Index([NaT, NaT])), - ("ignore", Index(["200622-12-31", "111111-24-11"])), + ("ignore", Index(["200622-12-31", "111111-24-11"], dtype=object)), ], ) def test_to_datetime_malformed_no_raise(self, errors, expected): # GH 28299 # GH 48633 ts_strings = ["200622-12-31", "111111-24-11"] - with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + with tm.assert_produces_warning( + UserWarning, match="Could not infer format", raise_on_extra_warnings=False + ): result = to_datetime(ts_strings, errors=errors) tm.assert_index_equal(result, expected) @@ -1787,7 +1820,9 @@ assert result.tz is timezone.utc def test_to_datetime_fixed_offset(self): - from pandas.tests.indexes.datetimes.test_timezones import fixed_off + from pandas.tests.indexes.datetimes.test_timezones import FixedOffset + + fixed_off = FixedOffset(-420, "-07:00") dates = [ datetime(2000, 1, 1, tzinfo=fixed_off), @@ -1821,7 +1856,7 @@ def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request): # GH#50870 Note we have separate tests that pd.Timestamp gets these right ts = Timestamp(item, unit=unit) - expected = DatetimeIndex([ts]) + expected = DatetimeIndex([ts], dtype="M8[ns]") result = to_datetime([item], unit=unit, cache=cache) tm.assert_index_equal(result, expected) @@ -1829,16 +1864,14 @@ result = to_datetime(np.array([item], dtype=object), unit=unit, cache=cache) tm.assert_index_equal(result, expected) - # TODO: this should also work - if isinstance(item, float): - request.node.add_marker( - pytest.mark.xfail( - reason=f"{type(item).__name__} in np.array should work" - ) - ) result = to_datetime(np.array([item]), unit=unit, cache=cache) tm.assert_index_equal(result, expected) + # with a nan! 
+ result = to_datetime(np.array([item, np.nan]), unit=unit, cache=cache) + assert result.isna()[1] + tm.assert_index_equal(result[:1], expected) + @pytest.mark.parametrize("unit", ["Y", "M"]) def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # GH#50301 @@ -1849,6 +1882,8 @@ with pytest.raises(ValueError, match=msg): to_datetime([1.5], unit=unit, errors="raise") with pytest.raises(ValueError, match=msg): + to_datetime(np.array([1.5]), unit=unit, errors="raise") + with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match=warn_msg): to_datetime(["1.5"], unit=unit, errors="raise") @@ -1877,6 +1912,14 @@ with pytest.raises(ValueError, match=msg): to_datetime([1], unit="D", format="%Y%m%d", cache=cache) + def test_unit_str(self, cache): + # GH 57051 + # Test that strs aren't dropping precision to 32-bit accidentally. + with tm.assert_produces_warning(FutureWarning): + res = to_datetime(["1704660000"], unit="s", origin="unix") + expected = to_datetime([1704660000], unit="s", origin="unix") + tm.assert_index_equal(res, expected) + def test_unit_array_mixed_nans(self, cache): values = [11111111111111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""] result = to_datetime(values, unit="D", errors="ignore", cache=cache) @@ -1897,7 +1940,8 @@ result = to_datetime(values, unit="D", errors="coerce", cache=cache) expected = DatetimeIndex( - ["NaT", "1970-01-02", "1970-01-02", "NaT", "NaT", "NaT", "NaT", "NaT"] + ["NaT", "1970-01-02", "1970-01-02", "NaT", "NaT", "NaT", "NaT", "NaT"], + dtype="M8[ns]", ) tm.assert_index_equal(result, expected) @@ -1913,7 +1957,7 @@ tm.assert_index_equal(result, expected) result = to_datetime(values, errors="coerce", unit="s", cache=cache) - expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"]) + expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"], dtype="M8[ns]") tm.assert_index_equal(result, expected) msg = "cannot convert input 1420043460000000000000000 with the unit 's'" @@ -1940,7 +1984,9 @@ def test_unit_with_numeric(self, cache, errors, dtype): # GH 13180 # coercions from floats/ints are ok - expected = DatetimeIndex(["2015-06-19 05:33:20", "2015-05-27 22:33:20"]) + expected = DatetimeIndex( + ["2015-06-19 05:33:20", "2015-05-27 22:33:20"], dtype="M8[ns]" + ) arr = np.array([1.434692e18, 1.432766e18]).astype(dtype) result = to_datetime(arr, errors=errors, cache=cache) tm.assert_index_equal(result, expected) @@ -1963,7 +2009,7 @@ def test_unit_with_numeric_coerce(self, cache, exp, arr, warning): # but we want to make sure that we are coercing # if we have ints/strings - expected = DatetimeIndex(exp) + expected = DatetimeIndex(exp, dtype="M8[ns]") with tm.assert_produces_warning(warning, match="Could not infer format"): result = to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) @@ -1995,10 +2041,14 @@ def test_unit_rounding(self, cache): # GH 14156 & GH 20445: argument will incur floating point errors # but no premature rounding - result = to_datetime(1434743731.8770001, unit="s", cache=cache) - expected = Timestamp("2015-06-19 19:55:31.877000192") + value = 1434743731.8770001 + result = to_datetime(value, unit="s", cache=cache) + expected = Timestamp("2015-06-19 19:55:31.877000093") assert result == expected + alt = Timestamp(value, unit="s") + assert alt == result + def test_unit_ignore_keeps_name(self, cache): # GH 21697 expected = Index([15e9] * 2, name="name") @@ -2008,17 +2058,20 @@ def test_to_datetime_errors_ignore_utc_true(self): # GH#23758 result = 
to_datetime([1], unit="s", utc=True, errors="ignore") - expected = DatetimeIndex(["1970-01-01 00:00:01"], tz="UTC") + expected = DatetimeIndex(["1970-01-01 00:00:01"], dtype="M8[ns, UTC]") tm.assert_index_equal(result, expected) - # TODO: this is moved from tests.series.test_timeseries, may be redundant @pytest.mark.parametrize("dtype", [int, float]) def test_to_datetime_unit(self, dtype): epoch = 1370745748 ser = Series([epoch + t for t in range(20)]).astype(dtype) result = to_datetime(ser, unit="s") expected = Series( - [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) + for t in range(20) + ], + dtype="M8[ns]", ) tm.assert_series_equal(result, expected) @@ -2029,7 +2082,8 @@ result = to_datetime(ser, unit="s") expected = Series( [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] - + [NaT] + + [NaT], + dtype="M8[ns]", ) tm.assert_series_equal(result, expected) @@ -2043,7 +2097,8 @@ Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in np.arange(0, 2, 0.25) ] - + [NaT] + + [NaT], + dtype="M8[ns]", ) # GH20455 argument will incur floating point errors but no premature rounding result = result.round("ms") @@ -2052,7 +2107,8 @@ def test_to_datetime_unit_na_values(self): result = to_datetime([1, 2, "NaT", NaT, np.nan], unit="D") expected = DatetimeIndex( - [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 3 + [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 3, + dtype="M8[ns]", ) tm.assert_index_equal(result, expected) @@ -2066,7 +2122,8 @@ def test_to_timestamp_unit_coerce(self, bad_val): # coerce we can process expected = DatetimeIndex( - [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1 + [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1, + dtype="M8[ns]", ) result = to_datetime([1, 2, bad_val], unit="D", errors="coerce") tm.assert_index_equal(result, expected) @@ -2083,7 +2140,13 @@ expected = (should_succeed * oneday_in_ns).astype(np.int64) for error_mode in ["raise", "coerce", "ignore"]: result1 = to_datetime(should_succeed, unit="D", errors=error_mode) - tm.assert_almost_equal(result1.astype(np.int64), expected, rtol=1e-10) + # Cast to `np.float64` so that `rtol` and inexact checking kick in + # (`check_exact` doesn't take place for integer dtypes) + tm.assert_almost_equal( + result1.astype(np.int64).astype(np.float64), + expected.astype(np.float64), + rtol=1e-10, + ) # just out of bounds should_fail1 = Series([0, tsmax_in_days + 0.005], dtype=float) should_fail2 = Series([0, -tsmax_in_days - 0.005], dtype=float) @@ -2158,7 +2221,8 @@ # unit mappings result = to_datetime(df[list(unit.keys())].rename(columns=unit), cache=cache) expected = Series( - [Timestamp("20150204 06:58:10"), Timestamp("20160305 07:59:11")] + [Timestamp("20150204 06:58:10"), Timestamp("20160305 07:59:11")], + dtype="M8[ns]", ) tm.assert_series_equal(result, expected) @@ -2648,7 +2712,7 @@ result = to_datetime(malformed, errors="ignore", cache=cache) # GH 21864 - expected = Index(malformed) + expected = Index(malformed, dtype=object) tm.assert_index_equal(result, expected) with pytest.raises(ValueError, match=msg): @@ -2767,7 +2831,7 @@ ): to_datetime(arr, dayfirst=True) - @pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray]) + @pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray._from_sequence]) def test_to_datetime_dta_tz(self, klass): # GH#27733 dti = date_range("2015-04-05", periods=3).rename("foo") @@ -2817,7 
+2881,7 @@ def test_to_datetime_infer_datetime_format_consistent_format( self, cache, test_format ): - ser = Series(date_range("20000101", periods=50, freq="H")) + ser = Series(date_range("20000101", periods=50, freq="h")) s_as_dt_strings = ser.apply(lambda x: x.strftime(test_format)) @@ -2920,7 +2984,8 @@ Timestamp("2015-03-03"), ] ) - tm.assert_series_equal(to_datetime(ser, format=format, cache=cache), expected) + result = to_datetime(ser, format=format, cache=cache) + tm.assert_series_equal(result, expected) def test_parse_dates_infer_datetime_format_warning(self): # GH 49024 @@ -3051,7 +3116,7 @@ ("Thu Sep 25 2003", datetime(2003, 9, 25)), ("Sep 25 2003", datetime(2003, 9, 25)), ("January 1 2014", datetime(2014, 1, 1)), - # GHE10537 + # GH#10537 ("2014-06", datetime(2014, 6, 1)), ("06-2014", datetime(2014, 6, 1)), ("2014-6", datetime(2014, 6, 1)), @@ -3314,7 +3379,8 @@ def test_unix(self): result = Series(to_datetime([0, 1, 2], unit="D", origin="unix")) expected = Series( - [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")] + [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")], + dtype="M8[ns]", ) tm.assert_series_equal(result, expected) @@ -3433,7 +3499,7 @@ # GH 25546 arg = "2019-01-01T00:00:00.000" + offset result = to_datetime([arg], unit="ns", utc=utc) - expected = to_datetime([exp]) + expected = to_datetime([exp]).as_unit("ns") tm.assert_index_equal(result, expected) @@ -3560,11 +3626,12 @@ ) def test_to_datetime_cache_coerce_50_lines_outofbounds(series_length): # GH#45319 - s = Series( + ser = Series( [datetime.fromisoformat("1446-04-12 00:00:00+00:00")] - + ([datetime.fromisoformat("1991-10-20 00:00:00+00:00")] * series_length) + + ([datetime.fromisoformat("1991-10-20 00:00:00+00:00")] * series_length), + dtype=object, ) - result1 = to_datetime(s, errors="coerce", utc=True) + result1 = to_datetime(ser, errors="coerce", utc=True) expected1 = Series( [NaT] + ([Timestamp("1991-10-20 00:00:00+00:00")] * series_length) @@ -3572,7 +3639,7 @@ tm.assert_series_equal(result1, expected1) - result2 = to_datetime(s, errors="ignore", utc=True) + result2 = to_datetime(ser, errors="ignore", utc=True) expected2 = Series( [datetime.fromisoformat("1446-04-12 00:00:00+00:00")] @@ -3582,7 +3649,7 @@ tm.assert_series_equal(result2, expected2) with pytest.raises(OutOfBoundsDatetime, match="Out of bounds nanosecond timestamp"): - to_datetime(s, errors="raise", utc=True) + to_datetime(ser, errors="raise", utc=True) def test_to_datetime_format_f_parse_nanos(): @@ -3637,7 +3704,7 @@ ("errors", "expected"), [ ("coerce", DatetimeIndex(["2020-01-01 00:00:00", NaT])), - ("ignore", Index(["2020-01-01", "01-01-2000"])), + ("ignore", Index(["2020-01-01", "01-01-2000"], dtype=object)), ], ) def test_to_datetime_mixed_not_necessarily_iso8601_coerce(errors, expected): @@ -3673,10 +3740,17 @@ def test_to_datetime_with_empty_str_utc_false_format_mixed(): # GH 50887 - result = to_datetime(["2020-01-01 00:00+00:00", ""], format="mixed") - expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype=object) + vals = ["2020-01-01 00:00+00:00", ""] + result = to_datetime(vals, format="mixed") + expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[ns, UTC]") tm.assert_index_equal(result, expected) + # Check that a couple of other similar paths work the same way + alt = to_datetime(vals) + tm.assert_index_equal(alt, expected) + alt2 = DatetimeIndex(vals) + tm.assert_index_equal(alt2, expected) + def 
test_to_datetime_with_empty_str_utc_false_offsets_and_format_mixed(): # GH 50887 @@ -3686,3 +3760,141 @@ to_datetime( ["2020-01-01 00:00+00:00", "2020-01-01 00:00+02:00", ""], format="mixed" ) + + +def test_to_datetime_mixed_tzs_mixed_types(): + # GH#55793, GH#55693 mismatched tzs but one is str and other is + # datetime object + ts = Timestamp("2016-01-02 03:04:05", tz="US/Pacific") + dtstr = "2023-10-30 15:06+01" + arr = [ts, dtstr] + + msg = ( + "Mixed timezones detected. pass utc=True in to_datetime or tz='UTC' " + "in DatetimeIndex to convert to a common timezone" + ) + with pytest.raises(ValueError, match=msg): + to_datetime(arr) + with pytest.raises(ValueError, match=msg): + to_datetime(arr, format="mixed") + with pytest.raises(ValueError, match=msg): + DatetimeIndex(arr) + + +def test_to_datetime_mixed_types_matching_tzs(): + # GH#55793 + dtstr = "2023-11-01 09:22:03-07:00" + ts = Timestamp(dtstr) + arr = [ts, dtstr] + res1 = to_datetime(arr) + res2 = to_datetime(arr[::-1])[::-1] + res3 = to_datetime(arr, format="mixed") + res4 = DatetimeIndex(arr) + + expected = DatetimeIndex([ts, ts]) + tm.assert_index_equal(res1, expected) + tm.assert_index_equal(res2, expected) + tm.assert_index_equal(res3, expected) + tm.assert_index_equal(res4, expected) + + +dtstr = "2020-01-01 00:00+00:00" +ts = Timestamp(dtstr) + + +@pytest.mark.filterwarnings("ignore:Could not infer format:UserWarning") +@pytest.mark.parametrize( + "aware_val", + [dtstr, Timestamp(dtstr)], + ids=lambda x: type(x).__name__, +) +@pytest.mark.parametrize( + "naive_val", + [dtstr[:-6], ts.tz_localize(None), ts.date(), ts.asm8, ts.value, float(ts.value)], + ids=lambda x: type(x).__name__, +) +@pytest.mark.parametrize("naive_first", [True, False]) +def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_first): + # GH#55793, GH#55693 + # Empty string parses to NaT + vals = [aware_val, naive_val, ""] + + vec = vals + if naive_first: + # alas, the behavior is order-dependent, so we test both ways + vec = [naive_val, aware_val, ""] + + # both_strs-> paths that were previously already deprecated with warning + # issued in _array_to_datetime_object + both_strs = isinstance(aware_val, str) and isinstance(naive_val, str) + has_numeric = isinstance(naive_val, (int, float)) + + depr_msg = "In a future version of pandas, parsing datetimes with mixed time zones" + + first_non_null = next(x for x in vec if x != "") + # if first_non_null is a not a string, _guess_datetime_format_for_array + # doesn't guess a format so we don't go through array_strptime + if not isinstance(first_non_null, str): + # that case goes through array_strptime which has different behavior + msg = "Cannot mix tz-aware with tz-naive values" + if naive_first and isinstance(aware_val, Timestamp): + if isinstance(naive_val, Timestamp): + msg = "Tz-aware datetime.datetime cannot be converted to datetime64" + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + else: + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + + # No warning/error with utc=True + to_datetime(vec, utc=True) + + elif has_numeric and vec.index(aware_val) < vec.index(naive_val): + msg = "time data .* doesn't match format" + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + with pytest.raises(ValueError, match=msg): + to_datetime(vec, utc=True) + + elif both_strs and vec.index(aware_val) < vec.index(naive_val): + msg = r"time data \"2020-01-01 00:00\" doesn't match format" + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + with 
pytest.raises(ValueError, match=msg): + to_datetime(vec, utc=True) + + elif both_strs and vec.index(naive_val) < vec.index(aware_val): + msg = "unconverted data remains when parsing with format" + with pytest.raises(ValueError, match=msg): + to_datetime(vec) + with pytest.raises(ValueError, match=msg): + to_datetime(vec, utc=True) + + else: + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + to_datetime(vec) + + # No warning/error with utc=True + to_datetime(vec, utc=True) + + if both_strs: + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + to_datetime(vec, format="mixed") + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + msg = "DatetimeIndex has mixed timezones" + with pytest.raises(TypeError, match=msg): + DatetimeIndex(vec) + else: + msg = "Cannot mix tz-aware with tz-naive values" + if naive_first and isinstance(aware_val, Timestamp): + if isinstance(naive_val, Timestamp): + msg = "Tz-aware datetime.datetime cannot be converted to datetime64" + with pytest.raises(ValueError, match=msg): + to_datetime(vec, format="mixed") + with pytest.raises(ValueError, match=msg): + DatetimeIndex(vec) + else: + with pytest.raises(ValueError, match=msg): + to_datetime(vec, format="mixed") + with pytest.raises(ValueError, match=msg): + DatetimeIndex(vec) diff -Nru pandas-2.1.4+dfsg/pandas/tests/tools/test_to_numeric.py pandas-2.2.2+dfsg/pandas/tests/tools/test_to_numeric.py --- pandas-2.1.4+dfsg/pandas/tests/tools/test_to_numeric.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tools/test_to_numeric.py 2024-04-10 17:42:52.000000000 +0000 @@ -119,6 +119,7 @@ @pytest.mark.parametrize( "errors,exp_data", [("ignore", [1, -3.14, "apple"]), ("coerce", [1, -3.14, np.nan])] ) +@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_ignore_error(errors, exp_data): ser = Series([1, -3.14, "apple"]) result = to_numeric(ser, errors=errors) @@ -136,6 +137,7 @@ ("coerce", [1.0, 0.0, np.nan]), ], ) +@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_bool_handling(errors, exp): ser = Series([True, False, "apple"]) @@ -236,6 +238,7 @@ tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_type_check(errors): # see gh-11776 df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]}) @@ -250,6 +253,7 @@ assert to_numeric(transform(val)) == float(val) +@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_really_large_scalar(large_val, signed, transform, errors): # see gh-24910 kwargs = {"errors": errors} if errors is not None else {} @@ -267,6 +271,7 @@ tm.assert_almost_equal(to_numeric(val, **kwargs), expected) +@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors): # see gh-24910 kwargs = {"errors": errors} if errors is not None else {} @@ -306,6 +311,7 @@ tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) +@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors): # see gh-24910 # @@ -344,6 +350,7 @@ ("coerce", lambda x: np.isnan(x)), ], ) +@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_scalar_fail(errors, checker): scalar = "fail" @@ -403,7 +410,7 @@ inp = transform(idx) if not 
isinstance(inp, Index): - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="Missing PeriodDtype support in to_numeric") ) result = to_numeric(inp) @@ -419,6 +426,7 @@ ("coerce", Series([np.nan, 1.0, np.nan])), ], ) +@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_non_hashable(errors, expected): # see gh-13324 ser = Series([[10.0, 2], 1.0, "apple"]) @@ -503,7 +511,9 @@ data = ["foo", 2, 3] expected = np.array(data, dtype=object) - res = to_numeric(data, errors="ignore", downcast="unsigned") + msg = "errors='ignore' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = to_numeric(data, errors="ignore", downcast="unsigned") tm.assert_numpy_array_equal(res, expected) @@ -636,6 +646,7 @@ ("raise", "Unable to parse string"), ], ) +@pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_non_coerce_uint64_conflict(errors, exp): # see gh-17007 and gh-17125 # @@ -762,7 +773,9 @@ values = ["a", "1"] ser = Series(values, dtype=nullable_string_dtype) expected = ser.copy() - result = to_numeric(ser, errors="ignore") + msg = "errors='ignore' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_numeric(ser, errors="ignore") tm.assert_series_equal(result, expected) @@ -932,7 +945,9 @@ with pytest.raises(ValueError, match="Unable to parse string"): to_numeric(ser, dtype_backend=dtype_backend) - result = to_numeric(ser, dtype_backend=dtype_backend, errors="ignore") + msg = "errors='ignore' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_numeric(ser, dtype_backend=dtype_backend, errors="ignore") tm.assert_series_equal(result, expected) result = to_numeric(ser, dtype_backend=dtype_backend, errors="coerce") diff -Nru pandas-2.1.4+dfsg/pandas/tests/tools/test_to_time.py pandas-2.2.2+dfsg/pandas/tests/tools/test_to_time.py --- pandas-2.1.4+dfsg/pandas/tests/tools/test_to_time.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tools/test_to_time.py 2024-04-10 17:42:52.000000000 +0000 @@ -54,7 +54,9 @@ assert to_time(arg, infer_time_format=True) == expected_arr assert to_time(arg, format="%I:%M%p", errors="coerce") == [None, None] - res = to_time(arg, format="%I:%M%p", errors="ignore") + msg = "errors='ignore' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = to_time(arg, format="%I:%M%p", errors="ignore") tm.assert_numpy_array_equal(res, np.array(arg, dtype=np.object_)) msg = "Cannot convert.+to a time with given format" diff -Nru pandas-2.1.4+dfsg/pandas/tests/tools/test_to_timedelta.py pandas-2.2.2+dfsg/pandas/tests/tools/test_to_timedelta.py --- pandas-2.1.4+dfsg/pandas/tests/tools/test_to_timedelta.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tools/test_to_timedelta.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,6 +6,7 @@ import numpy as np import pytest +from pandas.compat import IS64 from pandas.errors import OutOfBoundsTimedelta import pandas as pd @@ -20,6 +21,17 @@ class TestTimedeltas: + def test_to_timedelta_dt64_raises(self): + # Passing datetime64-dtype data to TimedeltaIndex is no longer + # supported GH#29794 + msg = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" + + ser = Series([pd.NaT]) + with pytest.raises(TypeError, match=msg): + to_timedelta(ser) + with pytest.raises(TypeError, match=msg): + ser.to_frame().apply(to_timedelta) + @pytest.mark.parametrize("readonly", [True, False]) def 
test_to_timedelta_readonly(self, readonly): # GH#34857 @@ -86,12 +98,13 @@ TimedeltaIndex(arr) with pytest.raises(OutOfBoundsTimedelta, match=msg): - TimedeltaArray._from_sequence(arr) + TimedeltaArray._from_sequence(arr, dtype="m8[s]") @pytest.mark.parametrize( "arg", [np.arange(10).reshape(2, 5), pd.DataFrame(np.arange(10).reshape(2, 5))] ) @pytest.mark.parametrize("errors", ["ignore", "raise", "coerce"]) + @pytest.mark.filterwarnings("ignore:errors='ignore' is deprecated:FutureWarning") def test_to_timedelta_dataframe(self, arg, errors): # GH 11776 with pytest.raises(TypeError, match="1-d array"): @@ -137,22 +150,29 @@ def test_to_timedelta_invalid_errors_ignore(self): # gh-13613: these should not error because errors='ignore' + msg = "errors='ignore' is deprecated" invalid_data = "apple" - assert invalid_data == to_timedelta(invalid_data, errors="ignore") + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_timedelta(invalid_data, errors="ignore") + assert invalid_data == result invalid_data = ["apple", "1 days"] - tm.assert_numpy_array_equal( - np.array(invalid_data, dtype=object), - to_timedelta(invalid_data, errors="ignore"), - ) + expected = np.array(invalid_data, dtype=object) + + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_timedelta(invalid_data, errors="ignore") + tm.assert_numpy_array_equal(expected, result) invalid_data = pd.Index(["apple", "1 days"]) - tm.assert_index_equal(invalid_data, to_timedelta(invalid_data, errors="ignore")) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_timedelta(invalid_data, errors="ignore") + tm.assert_index_equal(invalid_data, result) invalid_data = Series(["apple", "1 days"]) - tm.assert_series_equal( - invalid_data, to_timedelta(invalid_data, errors="ignore") - ) + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_timedelta(invalid_data, errors="ignore") + tm.assert_series_equal(invalid_data, result) @pytest.mark.parametrize( "val, errors", @@ -224,6 +244,7 @@ actual = to_timedelta([val]) assert actual[0]._value == np.timedelta64("NaT").astype("int64") + @pytest.mark.xfail(not IS64, reason="Floating point error") def test_to_timedelta_float(self): # https://github.com/pandas-dev/pandas/issues/25077 arr = np.arange(0, 1, 1e-6)[-10:] @@ -239,7 +260,9 @@ def test_to_timedelta_ignore_strings_unit(self): arr = np.array([1, 2, "error"], dtype=object) - result = to_timedelta(arr, unit="ns", errors="ignore") + msg = "errors='ignore' is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = to_timedelta(arr, unit="ns", errors="ignore") tm.assert_numpy_array_equal(result, arr) @pytest.mark.parametrize( diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/frequencies/test_freq_code.py pandas-2.2.2+dfsg/pandas/tests/tseries/frequencies/test_freq_code.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/frequencies/test_freq_code.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/frequencies/test_freq_code.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,15 +3,13 @@ from pandas._libs.tslibs import ( Period, - Resolution, to_offset, ) -from pandas._libs.tslibs.dtypes import _attrname_to_abbrevs @pytest.mark.parametrize( "freqstr,exp_freqstr", - [("D", "D"), ("W", "D"), ("M", "D"), ("S", "S"), ("T", "S"), ("H", "S")], + [("D", "D"), ("W", "D"), ("ME", "D"), ("s", "s"), ("min", "s"), ("h", "s")], ) def test_get_to_timestamp_base(freqstr, exp_freqstr): off = to_offset(freqstr) @@ -23,40 +21,14 @@ 
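A minimal sketch (assuming pandas >= 2.2 is importable; everything used below is standard pandas API) of the renamed frequency aliases these frequency-code hunks switch to, where lowercase "h"/"min"/"s" and "ME"/"QE"/"YE" replace the old "H"/"T"/"S" and "M"/"Q"/"A" spellings:

import pandas as pd
from pandas.tseries.frequencies import to_offset

idx = pd.date_range("2021-01-01", periods=3, freq="h")  # hourly range, new-style alias
assert to_offset("min").nanos == 60 * 1_000_000_000     # one minute expressed in nanoseconds
assert to_offset("ME").freqstr == "ME"                  # month-end keeps its new rule code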
@pytest.mark.parametrize( - "freqstr,expected", - [ - ("A", "year"), - ("Q", "quarter"), - ("M", "month"), - ("D", "day"), - ("H", "hour"), - ("T", "minute"), - ("S", "second"), - ("L", "millisecond"), - ("U", "microsecond"), - ("N", "nanosecond"), - ], -) -def test_get_attrname_from_abbrev(freqstr, expected): - assert Resolution.get_reso_from_freqstr(freqstr).attrname == expected - - -@pytest.mark.parametrize("freq", ["D", "H", "T", "S", "L", "U", "N"]) -def test_get_freq_roundtrip2(freq): - obj = Resolution.get_reso_from_freqstr(freq) - result = _attrname_to_abbrevs[obj.attrname] - assert freq == result - - -@pytest.mark.parametrize( "args,expected", [ - ((1.5, "T"), (90, "S")), - ((62.4, "T"), (3744, "S")), - ((1.04, "H"), (3744, "S")), + ((1.5, "min"), (90, "s")), + ((62.4, "min"), (3744, "s")), + ((1.04, "h"), (3744, "s")), ((1, "D"), (1, "D")), - ((0.342931, "H"), (1234551600, "U")), - ((1.2345, "D"), (106660800, "L")), + ((0.342931, "h"), (1234551600, "us")), + ((1.2345, "D"), (106660800, "ms")), ], ) def test_resolution_bumping(args, expected): @@ -69,9 +41,9 @@ @pytest.mark.parametrize( "args", [ - (0.5, "N"), + (0.5, "ns"), # Too much precision in the input can prevent. - (0.3429324798798269273987982, "H"), + (0.3429324798798269273987982, "h"), ], ) def test_cat(args): @@ -84,11 +56,11 @@ @pytest.mark.parametrize( "freqstr,expected", [ - ("1H", "2021-01-01T09:00:00"), + ("1h", "2021-01-01T09:00:00"), ("1D", "2021-01-02T08:00:00"), ("1W", "2021-01-03T08:00:00"), - ("1M", "2021-01-31T08:00:00"), - ("1Y", "2021-12-31T08:00:00"), + ("1ME", "2021-01-31T08:00:00"), + ("1YE", "2021-12-31T08:00:00"), ], ) def test_compatibility(freqstr, expected): diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/frequencies/test_inference.py pandas-2.2.2+dfsg/pandas/tests/tseries/frequencies/test_inference.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/frequencies/test_inference.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/frequencies/test_inference.py 2024-04-10 17:42:52.000000000 +0000 @@ -17,6 +17,7 @@ from pandas import ( DatetimeIndex, Index, + RangeIndex, Series, Timestamp, date_range, @@ -38,12 +39,12 @@ @pytest.fixture( params=[ (timedelta(1), "D"), - (timedelta(hours=1), "H"), - (timedelta(minutes=1), "T"), - (timedelta(seconds=1), "S"), - (np.timedelta64(1, "ns"), "N"), - (timedelta(microseconds=1), "U"), - (timedelta(microseconds=1000), "L"), + (timedelta(hours=1), "h"), + (timedelta(minutes=1), "min"), + (timedelta(seconds=1), "s"), + (np.timedelta64(1, "ns"), "ns"), + (timedelta(microseconds=1), "us"), + (timedelta(microseconds=1000), "ms"), ] ) def base_delta_code_pair(request): @@ -51,9 +52,9 @@ freqs = ( - [f"Q-{month}" for month in MONTHS] - + [f"{annual}-{month}" for annual in ["A", "BA"] for month in MONTHS] - + ["M", "BM", "BMS"] + [f"QE-{month}" for month in MONTHS] + + [f"{annual}-{month}" for annual in ["YE", "BYE"] for month in MONTHS] + + ["ME", "BME", "BMS"] + [f"WOM-{count}{day}" for count in range(1, 5) for day in DAYS] + [f"W-{day}" for day in DAYS] ) @@ -67,28 +68,28 @@ gen = date_range("1/1/2000", periods=periods, freq=freq) index = DatetimeIndex(gen.values) - if not freq.startswith("Q-"): + if not freq.startswith("QE-"): assert frequencies.infer_freq(index) == gen.freqstr else: inf_freq = frequencies.infer_freq(index) - is_dec_range = inf_freq == "Q-DEC" and gen.freqstr in ( - "Q", - "Q-DEC", - "Q-SEP", - "Q-JUN", - "Q-MAR", + is_dec_range = inf_freq == "QE-DEC" and gen.freqstr in ( + "QE", + "QE-DEC", + "QE-SEP", + "QE-JUN", + "QE-MAR", 
) - is_nov_range = inf_freq == "Q-NOV" and gen.freqstr in ( - "Q-NOV", - "Q-AUG", - "Q-MAY", - "Q-FEB", + is_nov_range = inf_freq == "QE-NOV" and gen.freqstr in ( + "QE-NOV", + "QE-AUG", + "QE-MAY", + "QE-FEB", ) - is_oct_range = inf_freq == "Q-OCT" and gen.freqstr in ( - "Q-OCT", - "Q-JUL", - "Q-APR", - "Q-JAN", + is_oct_range = inf_freq == "QE-OCT" and gen.freqstr in ( + "QE-OCT", + "QE-JUL", + "QE-APR", + "QE-JAN", ) assert is_dec_range or is_nov_range or is_oct_range @@ -162,12 +163,12 @@ def test_monthly_ambiguous(): rng = DatetimeIndex(["1/31/2000", "2/29/2000", "3/31/2000"]) - assert rng.inferred_freq == "M" + assert rng.inferred_freq == "ME" def test_annual_ambiguous(): rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"]) - assert rng.inferred_freq == "A-JAN" + assert rng.inferred_freq == "YE-JAN" @pytest.mark.parametrize("count", range(1, 5)) @@ -202,11 +203,12 @@ @pytest.mark.parametrize( - "freq,expected", [("Q", "Q-DEC"), ("Q-NOV", "Q-NOV"), ("Q-OCT", "Q-OCT")] + "freq,expected", [("Q", "QE-DEC"), ("Q-NOV", "QE-NOV"), ("Q-OCT", "QE-OCT")] ) def test_infer_freq_index(freq, expected): rng = period_range("1959Q2", "2009Q3", freq=freq) - rng = Index(rng.to_timestamp("D", how="e").astype(object)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + rng = Index(rng.to_timestamp("D", how="e").astype(object)) assert rng.inferred_freq == expected @@ -215,12 +217,12 @@ "expected,dates", list( { - "AS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], - "Q-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"], - "M": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], + "YS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], + "QE-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"], + "ME": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], "D": ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], - "H": [ + "h": [ "2011-12-31 22:00", "2011-12-31 23:00", "2012-01-01 00:00", @@ -229,10 +231,11 @@ }.items() ), ) -def test_infer_freq_tz(tz_naive_fixture, expected, dates): - # see gh-7310 +@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) +def test_infer_freq_tz(tz_naive_fixture, expected, dates, unit): + # see gh-7310, GH#55609 tz = tz_naive_fixture - idx = DatetimeIndex(dates, tz=tz) + idx = DatetimeIndex(dates, tz=tz).as_unit(unit) assert idx.inferred_freq == expected @@ -254,7 +257,8 @@ ], ) @pytest.mark.parametrize( - "freq", ["H", "3H", "10T", "3601S", "3600001L", "3600000001U", "3600000000001N"] + "freq", + ["h", "3h", "10min", "3601s", "3600001ms", "3600000001us", "3600000000001ns"], ) def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq): # see gh-8772 @@ -264,7 +268,7 @@ def test_infer_freq_tz_transition_custom(): - index = date_range("2013-11-03", periods=5, freq="3H").tz_localize( + index = date_range("2013-11-03", periods=5, freq="3h").tz_localize( "America/Chicago" ) assert index.inferred_freq is None @@ -273,7 +277,7 @@ @pytest.mark.parametrize( "data,expected", [ - # Hourly freq in a day must result in "H" + # Hourly freq in a day must result in "h" ( [ "2014-07-01 09:00", @@ -283,7 +287,7 @@ "2014-07-01 13:00", "2014-07-01 14:00", ], - "H", + "h", ), ( [ @@ -299,7 +303,7 @@ "2014-07-02 10:00", "2014-07-02 11:00", ], - "BH", + "bh", ), ( [ @@ -315,7 +319,7 @@ "2014-07-07 10:00", "2014-07-07 11:00", ], - "BH", + "bh", ), ( [ @@ -344,7 +348,7 @@ "2014-07-08 15:00", "2014-07-08 16:00", ], - "BH", + 
"bh", ), ], ) @@ -358,7 +362,7 @@ rng = DatetimeIndex(["1/31/2000", "1/31/2001", "1/31/2002"]) rng = rng[::-1] - assert rng.inferred_freq == "-1A-JAN" + assert rng.inferred_freq == "-1YE-JAN" def test_non_datetime_index2(): @@ -372,10 +376,10 @@ @pytest.mark.parametrize( "idx", [ - tm.makeIntIndex(10), - tm.makeFloatIndex(10), - tm.makePeriodIndex(10), - tm.makeRangeIndex(10), + Index(np.arange(5), dtype=np.int64), + Index(np.arange(5), dtype=np.float64), + period_range("2020-01-01", periods=5), + RangeIndex(5), ], ) def test_invalid_index_types(idx): @@ -399,7 +403,7 @@ msg = "Unknown datetime string format" with pytest.raises(ValueError, match=msg): - frequencies.infer_freq(tm.makeStringIndex(10)) + frequencies.infer_freq(Index(["ZqgszYBfuL"])) def test_string_datetime_like_compat(): @@ -429,15 +433,21 @@ frequencies.infer_freq(s) -def test_series_inconvertible_string(): +def test_series_inconvertible_string(using_infer_string): # see gh-6407 - msg = "Unknown datetime string format" + if using_infer_string: + msg = "cannot infer freq from" - with pytest.raises(ValueError, match=msg): - frequencies.infer_freq(Series(["foo", "bar"])) + with pytest.raises(TypeError, match=msg): + frequencies.infer_freq(Series(["foo", "bar"])) + else: + msg = "Unknown datetime string format" + + with pytest.raises(ValueError, match=msg): + frequencies.infer_freq(Series(["foo", "bar"])) -@pytest.mark.parametrize("freq", [None, "L"]) +@pytest.mark.parametrize("freq", [None, "ms"]) def test_series_period_index(freq): # see gh-6407 # @@ -449,7 +459,7 @@ frequencies.infer_freq(s) -@pytest.mark.parametrize("freq", ["M", "L", "S"]) +@pytest.mark.parametrize("freq", ["ME", "ms", "s"]) def test_series_datetime_index(freq): s = Series(date_range("20130101", periods=10, freq=freq)) inferred = frequencies.infer_freq(s) @@ -475,22 +485,22 @@ "W@FRI", "W@SAT", "W@SUN", - "Q@JAN", - "Q@FEB", - "Q@MAR", - "A@JAN", - "A@FEB", - "A@MAR", - "A@APR", - "A@MAY", - "A@JUN", - "A@JUL", - "A@AUG", - "A@SEP", - "A@OCT", - "A@NOV", - "A@DEC", - "Y@JAN", + "QE@JAN", + "QE@FEB", + "QE@MAR", + "YE@JAN", + "YE@FEB", + "YE@MAR", + "YE@APR", + "YE@MAY", + "YE@JUN", + "YE@JUL", + "YE@AUG", + "YE@SEP", + "YE@OCT", + "YE@NOV", + "YE@DEC", + "YE@JAN", "WOM@1MON", "WOM@2MON", "WOM@3MON", @@ -530,12 +540,12 @@ arr = np.arange(10).astype(np.int64).view("M8[s]") dta = DatetimeArray._simple_new(arr, dtype=arr.dtype) res = frequencies.infer_freq(dta) - assert res == "S" + assert res == "s" arr2 = arr.view("m8[ms]") tda = TimedeltaArray._simple_new(arr2, dtype=arr2.dtype) res2 = frequencies.infer_freq(tda) - assert res2 == "L" + assert res2 == "ms" def test_infer_freq_non_nano_tzaware(tz_aware_fixture): diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/holiday/test_calendar.py pandas-2.2.2+dfsg/pandas/tests/tseries/holiday/test_calendar.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/holiday/test_calendar.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/holiday/test_calendar.py 2024-04-10 17:42:52.000000000 +0000 @@ -57,8 +57,11 @@ jan2 = TestCalendar(rules=[Holiday("jan2", year=2015, month=1, day=2)]) # Getting holidays for Jan 1 should not alter results for Jan 2. 
- tm.assert_index_equal(jan1.holidays(), DatetimeIndex(["01-Jan-2015"])) - tm.assert_index_equal(jan2.holidays(), DatetimeIndex(["02-Jan-2015"])) + expected = DatetimeIndex(["01-Jan-2015"]).as_unit("ns") + tm.assert_index_equal(jan1.holidays(), expected) + + expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("ns") + tm.assert_index_equal(jan2.holidays(), expected2) def test_calendar_observance_dates(): diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/holiday/test_holiday.py pandas-2.2.2+dfsg/pandas/tests/tseries/holiday/test_holiday.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/holiday/test_holiday.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/holiday/test_holiday.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,7 +3,10 @@ import pytest from pytz import utc -from pandas import DatetimeIndex +from pandas import ( + DatetimeIndex, + Series, +) import pandas._testing as tm from pandas.tseries.holiday import ( @@ -17,6 +20,7 @@ HolidayCalendarFactory, Timestamp, USColumbusDay, + USFederalHolidayCalendar, USLaborDay, USMartinLutherKingJr, USMemorialDay, @@ -311,3 +315,18 @@ tm.assert_index_equal(date_interval_low, expected_results) tm.assert_index_equal(date_window_edge, expected_results) tm.assert_index_equal(date_interval_high, expected_results) + + +def test_holidays_with_timezone_specified_but_no_occurences(): + # GH 54580 + # _apply_rule() in holiday.py was silently dropping timezones if you passed it + # an empty list of holiday dates that had timezone information + start_date = Timestamp("2018-01-01", tz="America/Chicago") + end_date = Timestamp("2018-01-11", tz="America/Chicago") + test_case = USFederalHolidayCalendar().holidays( + start_date, end_date, return_name=True + ) + expected_results = Series("New Year's Day", index=[start_date]) + expected_results.index = expected_results.index.as_unit("ns") + + tm.assert_equal(test_case, expected_results) diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/conftest.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/conftest.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/conftest.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,42 +0,0 @@ -import datetime - -import pytest - -from pandas._libs.tslibs import Timestamp -from pandas._libs.tslibs.offsets import MonthOffset - -from pandas.tseries import offsets - - -@pytest.fixture( - params=[ - getattr(offsets, o) for o in offsets.__all__ if o not in ("Tick", "BaseOffset") - ] -) -def offset_types(request): - """ - Fixture for all the datetime offsets available for a time series. - """ - return request.param - - -@pytest.fixture( - params=[ - getattr(offsets, o) - for o in offsets.__all__ - if issubclass(getattr(offsets, o), MonthOffset) and o != "MonthOffset" - ] -) -def month_classes(request): - """ - Fixture for month based datetime offsets available for a time series. - """ - return request.param - - -@pytest.fixture -def dt(): - """ - Fixture for common Timestamp. 
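A rough sketch of the tz-aware holidays() call exercised by the new GH 54580 test above (USFederalHolidayCalendar and the dates are taken from that hunk); per the test's comment, the fix keeps timezone information instead of silently dropping it:

from pandas import Timestamp
from pandas.tseries.holiday import USFederalHolidayCalendar

start = Timestamp("2018-01-01", tz="America/Chicago")
end = Timestamp("2018-01-11", tz="America/Chicago")
# return_name=True yields a Series of holiday names indexed by (tz-aware) dates
holidays = USFederalHolidayCalendar().holidays(start, end, return_name=True)
print(holidays)  # "New Year's Day" at the tz-aware start date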
- """ - return Timestamp(datetime.datetime(2008, 1, 2)) diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_business_hour.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_business_hour.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_business_hour.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_business_hour.py 2024-04-10 17:42:52.000000000 +0000 @@ -148,17 +148,17 @@ offset9, offset10, ): - assert repr(offset1) == "" - assert repr(offset2) == "<3 * BusinessHours: BH=09:00-17:00>" - assert repr(offset3) == "<-1 * BusinessHour: BH=09:00-17:00>" - assert repr(offset4) == "<-4 * BusinessHours: BH=09:00-17:00>" - - assert repr(offset5) == "" - assert repr(offset6) == "" - assert repr(offset7) == "<-2 * BusinessHours: BH=21:30-06:30>" - assert repr(offset8) == "" - assert repr(offset9) == "<3 * BusinessHours: BH=09:00-13:00,22:00-03:00>" - assert repr(offset10) == "<-1 * BusinessHour: BH=13:00-17:00,23:00-02:00>" + assert repr(offset1) == "" + assert repr(offset2) == "<3 * BusinessHours: bh=09:00-17:00>" + assert repr(offset3) == "<-1 * BusinessHour: bh=09:00-17:00>" + assert repr(offset4) == "<-4 * BusinessHours: bh=09:00-17:00>" + + assert repr(offset5) == "" + assert repr(offset6) == "" + assert repr(offset7) == "<-2 * BusinessHours: bh=21:30-06:30>" + assert repr(offset8) == "" + assert repr(offset9) == "<3 * BusinessHours: bh=09:00-13:00,22:00-03:00>" + assert repr(offset10) == "<-1 * BusinessHour: bh=13:00-17:00,23:00-02:00>" def test_with_offset(self, dt): expected = Timestamp("2014-07-01 13:00") @@ -946,47 +946,16 @@ for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - def test_datetimeindex(self): - idx1 = date_range(start="2014-07-04 15:00", end="2014-07-08 10:00", freq="BH") - idx2 = date_range(start="2014-07-04 15:00", periods=12, freq="BH") - idx3 = date_range(end="2014-07-08 10:00", periods=12, freq="BH") - expected = DatetimeIndex( - [ - "2014-07-04 15:00", - "2014-07-04 16:00", - "2014-07-07 09:00", - "2014-07-07 10:00", - "2014-07-07 11:00", - "2014-07-07 12:00", - "2014-07-07 13:00", - "2014-07-07 14:00", - "2014-07-07 15:00", - "2014-07-07 16:00", - "2014-07-08 09:00", - "2014-07-08 10:00", - ], - freq="BH", - ) - for idx in [idx1, idx2, idx3]: - tm.assert_index_equal(idx, expected) + @pytest.mark.parametrize("td_unit", ["s", "ms", "us", "ns"]) + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + def test_bday_ignores_timedeltas(self, unit, td_unit): + # GH#55608 + idx = date_range("2010/02/01", "2010/02/10", freq="12h", unit=unit) + td = Timedelta(3, unit="h").as_unit(td_unit) + off = BDay(offset=td) + t1 = idx + off - idx1 = date_range(start="2014-07-04 15:45", end="2014-07-08 10:45", freq="BH") - idx2 = date_range(start="2014-07-04 15:45", periods=12, freq="BH") - idx3 = date_range(end="2014-07-08 10:45", periods=12, freq="BH") - - expected = idx1 - for idx in [idx1, idx2, idx3]: - tm.assert_index_equal(idx, expected) - - def test_short_datetimeindex_creation(self): - # gh-49835 - idx4 = date_range(start="2014-07-01 10:00", freq="BH", periods=1) - expected4 = DatetimeIndex(["2014-07-01 10:00"], freq="BH") - tm.assert_index_equal(idx4, expected4) - - def test_bday_ignores_timedeltas(self): - idx = date_range("2010/02/01", "2010/02/10", freq="12H") - t1 = idx + BDay(offset=Timedelta(3, unit="H")) + exp_unit = tm.get_finest_unit(td.unit, idx.unit) expected = DatetimeIndex( [ @@ -1011,9 +980,22 @@ "2010-02-11 03:00:00", ], freq=None, - ) + ).as_unit(exp_unit) 
tm.assert_index_equal(t1, expected) + # TODO(GH#55564): as_unit will be unnecessary + pointwise = DatetimeIndex([x + off for x in idx]).as_unit(exp_unit) + tm.assert_index_equal(pointwise, expected) + + def test_add_bday_offset_nanos(self): + # GH#55608 + idx = date_range("2010/02/01", "2010/02/10", freq="12h", unit="ns") + off = BDay(offset=Timedelta(3, unit="ns")) + + result = idx + off + expected = DatetimeIndex([x + off for x in idx]) + tm.assert_index_equal(result, expected) + class TestOpeningTimes: # opening time should be affected by sign of n, not by n's value and end diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_business_month.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_business_month.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_business_month.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_business_month.py 2024-04-10 17:42:52.000000000 +0000 @@ -31,7 +31,7 @@ ) def test_apply_index(cls, n): offset = cls(n=n) - rng = pd.date_range(start="1/1/2000", periods=100000, freq="T") + rng = pd.date_range(start="1/1/2000", periods=100000, freq="min") ser = pd.Series(rng) res = rng + offset diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_business_quarter.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_business_quarter.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_business_quarter.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_business_quarter.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,7 @@ import pytest +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( assert_is_on_offset, assert_offset_equal, @@ -54,9 +55,12 @@ assert repr(BQuarterBegin(startingMonth=1)) == expected def test_is_anchored(self): - assert BQuarterBegin(startingMonth=1).is_anchored() - assert BQuarterBegin().is_anchored() - assert not BQuarterBegin(2, startingMonth=1).is_anchored() + msg = "BQuarterBegin.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert BQuarterBegin(startingMonth=1).is_anchored() + assert BQuarterBegin().is_anchored() + assert not BQuarterBegin(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -177,9 +181,12 @@ assert repr(BQuarterEnd(startingMonth=1)) == expected def test_is_anchored(self): - assert BQuarterEnd(startingMonth=1).is_anchored() - assert BQuarterEnd().is_anchored() - assert not BQuarterEnd(2, startingMonth=1).is_anchored() + msg = "BQuarterEnd.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert BQuarterEnd(startingMonth=1).is_anchored() + assert BQuarterEnd().is_anchored() + assert not BQuarterEnd(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_common.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_common.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_common.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_common.py 2024-04-10 17:42:52.000000000 +0000 @@ -142,7 +142,7 @@ if isinstance(tz, tzlocal) and not IS64 and _offset is not DateOffset: # If we hit OutOfBoundsDatetime on non-64 bit machines # we'll drop out of the try clause before the next test - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="OverflowError inside tzlocal past 2038") ) elif ( @@ -150,7 +150,7 @@ and 
is_platform_windows() and _offset in (QuarterEnd, BQuarterBegin, BQuarterEnd) ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="After GH#49737 t.tzinfo is None on CI") ) assert str(t.tzinfo) == str(result.tzinfo) @@ -250,7 +250,8 @@ [BusinessHour, BusinessHour()], ], ) -def test_Mult1(offset_box, offset1, dt): +def test_Mult1(offset_box, offset1): + dt = Timestamp(2008, 1, 2) assert dt + 10 * offset1 == dt + offset_box(10) assert dt + 5 * offset1 == dt + offset_box(5) diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_custom_business_hour.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_custom_business_hour.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_custom_business_hour.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_custom_business_hour.py 2024-04-10 17:42:52.000000000 +0000 @@ -69,8 +69,8 @@ assert offset != offset2 def test_repr(self, offset1, offset2): - assert repr(offset1) == "" - assert repr(offset2) == "" + assert repr(offset1) == "" + assert repr(offset2) == "" def test_with_offset(self, dt): expected = Timestamp("2014-07-01 13:00") diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_custom_business_month.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_custom_business_month.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_custom_business_month.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_custom_business_month.py 2024-04-10 17:42:52.000000000 +0000 @@ -21,17 +21,13 @@ CDay, ) -from pandas import ( - _testing as tm, - date_range, -) +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( assert_is_on_offset, assert_offset_equal, ) from pandas.tseries import offsets -from pandas.tseries.holiday import USFederalHolidayCalendar @pytest.fixture @@ -201,14 +197,6 @@ assert dt + bm_offset == datetime(2012, 1, 2) assert dt + 2 * bm_offset == datetime(2012, 2, 3) - @pytest.mark.filterwarnings("ignore:Non:pandas.errors.PerformanceWarning") - def test_datetimeindex(self): - hcal = USFederalHolidayCalendar() - cbmb = CBMonthBegin(calendar=hcal) - assert date_range(start="20120101", end="20130101", freq=cbmb).tolist()[ - 0 - ] == datetime(2012, 1, 3) - @pytest.mark.parametrize( "case", [ @@ -397,15 +385,6 @@ assert dt + bm_offset == datetime(2012, 1, 30) assert dt + 2 * bm_offset == datetime(2012, 2, 27) - @pytest.mark.filterwarnings("ignore:Non:pandas.errors.PerformanceWarning") - def test_datetimeindex(self): - hcal = USFederalHolidayCalendar() - freq = CBMonthEnd(calendar=hcal) - - assert date_range(start="20120101", end="20130101", freq=freq).tolist()[ - 0 - ] == datetime(2012, 1, 31) - @pytest.mark.parametrize( "case", [ diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_dst.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_dst.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_dst.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_dst.py 2024-04-10 17:42:52.000000000 +0000 @@ -29,7 +29,10 @@ YearBegin, YearEnd, ) +from pandas.errors import PerformanceWarning +from pandas import DatetimeIndex +import pandas._testing as tm from pandas.util.version import Version # error: Module has no attribute "__version__" @@ -83,6 +86,29 @@ def _test_offset(self, offset_name, offset_n, tstart, expected_utc_offset): offset = DateOffset(**{offset_name: offset_n}) + if ( + offset_name in ["hour", "minute", "second", "microsecond"] + 
and offset_n == 1 + and tstart == Timestamp("2013-11-03 01:59:59.999999-0500", tz="US/Eastern") + ): + # This addition results in an ambiguous wall time + err_msg = { + "hour": "2013-11-03 01:59:59.999999", + "minute": "2013-11-03 01:01:59.999999", + "second": "2013-11-03 01:59:01.999999", + "microsecond": "2013-11-03 01:59:59.000001", + }[offset_name] + with pytest.raises(pytz.AmbiguousTimeError, match=err_msg): + tstart + offset + # While we're here, let's check that we get the same behavior in a + # vectorized path + dti = DatetimeIndex([tstart]) + warn_msg = "Non-vectorized DateOffset" + with pytest.raises(pytz.AmbiguousTimeError, match=err_msg): + with tm.assert_produces_warning(PerformanceWarning, match=warn_msg): + dti + offset + return + t = tstart + offset if expected_utc_offset is not None: assert get_utc_offset_hours(t) == expected_utc_offset diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_fiscal.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_fiscal.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_fiscal.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_fiscal.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,6 +7,7 @@ import pytest from pandas import Timestamp +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( WeekDay, assert_is_on_offset, @@ -295,15 +296,18 @@ class TestFY5253LastOfMonthQuarter: def test_is_anchored(self): - assert makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).is_anchored() - assert makeFY5253LastOfMonthQuarter( - weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 - ).is_anchored() - assert not makeFY5253LastOfMonthQuarter( - 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).is_anchored() + msg = "FY5253Quarter.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).is_anchored() + assert makeFY5253LastOfMonthQuarter( + weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 + ).is_anchored() + assert not makeFY5253LastOfMonthQuarter( + 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).is_anchored() def test_equality(self): assert makeFY5253LastOfMonthQuarter( diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_index.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_index.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_index.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_index.py 2024-04-10 17:42:52.000000000 +0000 @@ -44,7 +44,7 @@ ) def test_apply_index(cls, n): offset = cls(n=n) - rng = date_range(start="1/1/2000", periods=100000, freq="T") + rng = date_range(start="1/1/2000", periods=100000, freq="min") ser = Series(rng) res = rng + offset diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_month.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_month.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_month.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_month.py 2024-04-10 17:42:52.000000000 +0000 @@ -23,7 +23,6 @@ DatetimeIndex, Series, _testing as tm, - date_range, ) from pandas.tests.tseries.offsets.common import ( assert_is_on_offset, @@ -74,11 +73,6 @@ exp = DatetimeIndex(dates[1:]) tm.assert_index_equal(result, exp) - # ensure generating a range with DatetimeIndex gives same 
result - result = date_range(start=dates[0], end=dates[-1], freq="SM") - exp = DatetimeIndex(dates, freq="SM") - tm.assert_index_equal(result, exp) - offset_cases = [] offset_cases.append( ( @@ -330,11 +324,6 @@ exp = DatetimeIndex(dates[1:]) tm.assert_index_equal(result, exp) - # ensure generating a range with DatetimeIndex gives same result - result = date_range(start=dates[0], end=dates[-1], freq="SMS") - exp = DatetimeIndex(dates, freq="SMS") - tm.assert_index_equal(result, exp) - offset_cases = [ ( SemiMonthBegin(), diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_offsets.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_offsets.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_offsets.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_offsets.py 2024-04-10 17:42:52.000000000 +0000 @@ -22,6 +22,7 @@ from pandas._libs.tslibs.offsets import ( _get_offset, _offset_map, + to_offset, ) from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG from pandas.errors import PerformanceWarning @@ -101,6 +102,33 @@ return klass +@pytest.fixture( + params=[ + getattr(offsets, o) + for o in offsets.__all__ + if issubclass(getattr(offsets, o), liboffsets.MonthOffset) + and o != "MonthOffset" + ] +) +def month_classes(request): + """ + Fixture for month based datetime offsets available for a time series. + """ + return request.param + + +@pytest.fixture( + params=[ + getattr(offsets, o) for o in offsets.__all__ if o not in ("Tick", "BaseOffset") + ] +) +def offset_types(request): + """ + Fixture for all the datetime offsets available for a time series. + """ + return request.param + + @pytest.fixture def dt(): return Timestamp(datetime(2008, 1, 2)) @@ -228,18 +256,9 @@ assert result == expected # see gh-14101 - exp_warning = None ts = Timestamp(dt) + Nano(5) - - if ( - type(offset_s).__name__ == "DateOffset" - and (funcname in ["apply", "_apply"] or normalize) - and ts.nanosecond > 0 - ): - exp_warning = UserWarning - # test nanosecond is preserved - with tm.assert_produces_warning(exp_warning): + with tm.assert_produces_warning(None): result = func(ts) assert isinstance(result, Timestamp) @@ -273,18 +292,9 @@ assert result == expected_localize # see gh-14101 - exp_warning = None ts = Timestamp(dt, tz=tz) + Nano(5) - - if ( - type(offset_s).__name__ == "DateOffset" - and (funcname in ["apply", "_apply"] or normalize) - and ts.nanosecond > 0 - ): - exp_warning = UserWarning - # test nanosecond is preserved - with tm.assert_produces_warning(exp_warning): + with tm.assert_produces_warning(None): result = func(ts) assert isinstance(result, Timestamp) if normalize is False: @@ -494,7 +504,7 @@ # GH#12724, GH#30336 offset_s = _create_offset(offset_types) - dti = DatetimeIndex([], tz=tz_naive_fixture) + dti = DatetimeIndex([], tz=tz_naive_fixture).as_unit("ns") warn = None if isinstance( @@ -560,29 +570,27 @@ off = _create_offset(offset_types) assert hash(off) is not None + # TODO: belongs in arithmetic tests? 
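A minimal sketch (assuming pandas >= 2.0, where date_range accepts a unit keyword) of the non-nanosecond offset arithmetic that the rewritten test below checks against a pointwise loop:

import pandas as pd
from pandas.tseries.offsets import Second

dti = pd.date_range("2016-01-01", periods=3, freq="D", unit="s")
result = dti + Second()
assert result.dtype == "datetime64[s]"  # second resolution is preserved, no silent upcast to ns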
@pytest.mark.filterwarnings( "ignore:Non-vectorized DateOffset being applied to Series or DatetimeIndex" ) @pytest.mark.parametrize("unit", ["s", "ms", "us"]) - def test_add_dt64_ndarray_non_nano(self, offset_types, unit, request): + def test_add_dt64_ndarray_non_nano(self, offset_types, unit): # check that the result with non-nano matches nano off = _create_offset(offset_types) - dti = date_range("2016-01-01", periods=35, freq="D") + dti = date_range("2016-01-01", periods=35, freq="D", unit=unit) - arr = dti._data._ndarray.astype(f"M8[{unit}]") - dta = type(dti._data)._simple_new(arr, dtype=arr.dtype) - - expected = dti._data + off - result = dta + off + result = (dti + off)._with_freq(None) exp_unit = unit - if isinstance(off, Tick) and off._creso > dta._creso: + if isinstance(off, Tick) and off._creso > dti._data._creso: # cast to higher reso like we would with Timedelta scalar exp_unit = Timedelta(off).unit - expected = expected.as_unit(exp_unit) + # TODO(GH#55564): as_unit will be unnecessary + expected = DatetimeIndex([x + off for x in dti]).as_unit(exp_unit) - tm.assert_numpy_array_equal(result._ndarray, expected._ndarray) + tm.assert_index_equal(result, expected) class TestDateOffset: @@ -602,7 +610,7 @@ @pytest.mark.parametrize("kwd", sorted(liboffsets._relativedelta_kwds)) def test_constructor(self, kwd, request): if kwd == "millisecond": - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=NotImplementedError, reason="Constructing DateOffset object with `millisecond` is not " @@ -617,8 +625,11 @@ assert (dt + DateOffset(2)) == datetime(2008, 1, 4) def test_is_anchored(self): - assert not DateOffset(2).is_anchored() - assert DateOffset(1).is_anchored() + msg = "DateOffset.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert not DateOffset(2).is_anchored() + assert DateOffset(1).is_anchored() def test_copy(self): assert DateOffset(months=2).copy() == DateOffset(months=2) @@ -757,7 +768,7 @@ def test_get_offset_name(self): assert BDay().freqstr == "B" assert BDay(2).freqstr == "2B" - assert BMonthEnd().freqstr == "BM" + assert BMonthEnd().freqstr == "BME" assert Week(weekday=0).freqstr == "W-MON" assert Week(weekday=1).freqstr == "W-TUE" assert Week(weekday=2).freqstr == "W-WED" @@ -776,8 +787,8 @@ pairs = [ ("B", BDay()), ("b", BDay()), - ("bm", BMonthEnd()), - ("Bm", BMonthEnd()), + ("bme", BMonthEnd()), + ("Bme", BMonthEnd()), ("W-MON", Week(weekday=0)), ("W-TUE", Week(weekday=1)), ("W-WED", Week(weekday=2)), @@ -811,7 +822,7 @@ assert k == v.copy() def test_rule_code(self): - lst = ["M", "MS", "BM", "BMS", "D", "B", "H", "T", "S", "L", "U"] + lst = ["ME", "MS", "BME", "BMS", "D", "B", "h", "min", "s", "ms", "us"] for k in lst: assert k == _get_offset(k).rule_code # should be cached - this is kind of an internals test... @@ -839,7 +850,7 @@ "NOV", "DEC", ] - base_lst = ["A", "AS", "BA", "BAS", "Q", "QS", "BQ", "BQS"] + base_lst = ["YE", "YS", "BYE", "BYS", "QE", "QS", "BQE", "BQS"] for base in base_lst: for v in suffix_lst: alias = "-".join([base, v]) @@ -858,7 +869,7 @@ class TestReprNames: def test_str_for_named_is_name(self): # look at all the amazing combinations! 
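A short sketch (assuming pandas >= 2.2 so the "YE"/"QE" prefixes parse) of the rule-code round-trip that the renamed year/quarter prefixes in the following hunk rely on:

from pandas.tseries.frequencies import to_offset

# each renamed anchored alias should survive an offset -> freqstr round-trip
for name in ["YE-NOV", "QE-FEB", "BQS-APR"]:
    assert to_offset(name).freqstr == name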
- month_prefixes = ["A", "AS", "BA", "BAS", "Q", "BQ", "BQS", "QS"] + month_prefixes = ["YE", "YS", "BYE", "BYS", "QE", "BQE", "BQS", "QS"] names = [ prefix + "-" + month for prefix in month_prefixes @@ -916,7 +927,7 @@ @pytest.mark.parametrize("kwd", sorted(liboffsets._relativedelta_kwds)) def test_valid_relativedelta_kwargs(kwd, request): if kwd == "millisecond": - request.node.add_marker( + request.applymarker( pytest.mark.xfail( raises=NotImplementedError, reason="Constructing DateOffset object with `millisecond` is not " @@ -1011,6 +1022,14 @@ result = offset + ts assert result == expected + offset2 = DateOffset(minutes=2, nanoseconds=9, hour=1) + assert offset2._use_relativedelta + with tm.assert_produces_warning(None): + # no warning about Discarding nonzero nanoseconds + result2 = ts + offset2 + expected2 = Timestamp("1970-01-01 01:02:00.000000013") + assert result2 == expected2 + @pytest.mark.parametrize( "attribute", @@ -1116,3 +1135,51 @@ assert frameresult1[0] == expecteddate assert frameresult2[0] == expecteddate + + +def test_is_yqm_start_end(): + freq_m = to_offset("ME") + bm = to_offset("BME") + qfeb = to_offset("QE-FEB") + qsfeb = to_offset("QS-FEB") + bq = to_offset("BQE") + bqs_apr = to_offset("BQS-APR") + as_nov = to_offset("YS-NOV") + + tests = [ + (freq_m.is_month_start(Timestamp("2013-06-01")), 1), + (bm.is_month_start(Timestamp("2013-06-01")), 0), + (freq_m.is_month_start(Timestamp("2013-06-03")), 0), + (bm.is_month_start(Timestamp("2013-06-03")), 1), + (qfeb.is_month_end(Timestamp("2013-02-28")), 1), + (qfeb.is_quarter_end(Timestamp("2013-02-28")), 1), + (qfeb.is_year_end(Timestamp("2013-02-28")), 1), + (qfeb.is_month_start(Timestamp("2013-03-01")), 1), + (qfeb.is_quarter_start(Timestamp("2013-03-01")), 1), + (qfeb.is_year_start(Timestamp("2013-03-01")), 1), + (qsfeb.is_month_end(Timestamp("2013-03-31")), 1), + (qsfeb.is_quarter_end(Timestamp("2013-03-31")), 0), + (qsfeb.is_year_end(Timestamp("2013-03-31")), 0), + (qsfeb.is_month_start(Timestamp("2013-02-01")), 1), + (qsfeb.is_quarter_start(Timestamp("2013-02-01")), 1), + (qsfeb.is_year_start(Timestamp("2013-02-01")), 1), + (bq.is_month_end(Timestamp("2013-06-30")), 0), + (bq.is_quarter_end(Timestamp("2013-06-30")), 0), + (bq.is_year_end(Timestamp("2013-06-30")), 0), + (bq.is_month_end(Timestamp("2013-06-28")), 1), + (bq.is_quarter_end(Timestamp("2013-06-28")), 1), + (bq.is_year_end(Timestamp("2013-06-28")), 0), + (bqs_apr.is_month_end(Timestamp("2013-06-30")), 0), + (bqs_apr.is_quarter_end(Timestamp("2013-06-30")), 0), + (bqs_apr.is_year_end(Timestamp("2013-06-30")), 0), + (bqs_apr.is_month_end(Timestamp("2013-06-28")), 1), + (bqs_apr.is_quarter_end(Timestamp("2013-06-28")), 1), + (bqs_apr.is_year_end(Timestamp("2013-03-29")), 1), + (as_nov.is_year_start(Timestamp("2013-11-01")), 1), + (as_nov.is_year_end(Timestamp("2013-10-31")), 1), + (Timestamp("2012-02-01").days_in_month, 29), + (Timestamp("2013-02-01").days_in_month, 28), + ] + + for ts, value in tests: + assert ts == value diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_quarter.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_quarter.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_quarter.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_quarter.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,7 @@ import pytest +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( assert_is_on_offset, assert_offset_equal, @@ -53,9 +54,12 @@ assert 
repr(QuarterBegin(startingMonth=1)) == expected def test_is_anchored(self): - assert QuarterBegin(startingMonth=1).is_anchored() - assert QuarterBegin().is_anchored() - assert not QuarterBegin(2, startingMonth=1).is_anchored() + msg = "QuarterBegin.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert QuarterBegin(startingMonth=1).is_anchored() + assert QuarterBegin().is_anchored() + assert not QuarterBegin(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -161,9 +165,12 @@ assert repr(QuarterEnd(startingMonth=1)) == expected def test_is_anchored(self): - assert QuarterEnd(startingMonth=1).is_anchored() - assert QuarterEnd().is_anchored() - assert not QuarterEnd(2, startingMonth=1).is_anchored() + msg = "QuarterEnd.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert QuarterEnd(startingMonth=1).is_anchored() + assert QuarterEnd().is_anchored() + assert not QuarterEnd(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_ticks.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_ticks.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_ticks.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_ticks.py 2024-04-10 17:42:52.000000000 +0000 @@ -15,6 +15,7 @@ import pytest from pandas._libs.tslibs.offsets import delta_to_tick +from pandas.errors import OutOfBoundsTimedelta from pandas import ( Timedelta, @@ -237,6 +238,16 @@ assert result == expected +def test_tick_delta_overflow(): + # GH#55503 raise OutOfBoundsTimedelta, not OverflowError + tick = offsets.Day(10**9) + msg = "Cannot cast 1000000000 days 00:00:00 to unit='ns' without overflow" + depr_msg = "Day.delta is deprecated" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + tick.delta + + @pytest.mark.parametrize("cls", tick_classes) def test_tick_division(cls): off = cls(10) @@ -245,24 +256,24 @@ assert off / 2 == cls(5) assert off / 2.0 == cls(5) - assert off / off.delta == 1 - assert off / off.delta.to_timedelta64() == 1 + assert off / off._as_pd_timedelta == 1 + assert off / off._as_pd_timedelta.to_timedelta64() == 1 - assert off / Nano(1) == off.delta / Nano(1).delta + assert off / Nano(1) == off._as_pd_timedelta / Nano(1)._as_pd_timedelta if cls is not Nano: # A case where we end up with a smaller class result = off / 1000 assert isinstance(result, offsets.Tick) assert not isinstance(result, cls) - assert result.delta == off.delta / 1000 + assert result._as_pd_timedelta == off._as_pd_timedelta / 1000 if cls._nanos_inc < Timedelta(seconds=1)._value: # Case where we end up with a bigger class result = off / 0.001 assert isinstance(result, offsets.Tick) assert not isinstance(result, cls) - assert result.delta == off.delta / 0.001 + assert result._as_pd_timedelta == off._as_pd_timedelta / 0.001 def test_tick_mul_float(): @@ -284,7 +295,7 @@ @pytest.mark.parametrize("cls", tick_classes) def test_tick_rdiv(cls): off = cls(10) - delta = off.delta + delta = off._as_pd_timedelta td64 = delta.to_timedelta64() instance__type = ".".join([cls.__module__, cls.__name__]) msg = ( @@ -328,7 +339,10 @@ @pytest.mark.parametrize("cls", tick_classes) def test_tick_offset(cls): - assert not cls().is_anchored() + msg = f"{cls.__name__}.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + 
assert not cls().is_anchored() @pytest.mark.parametrize("cls", tick_classes) @@ -376,7 +390,7 @@ def test_compare_ticks_to_timedeltalike(cls): off = cls(19) - td = off.delta + td = off._as_pd_timedelta others = [td, td.to_timedelta64()] if cls is not Nano: diff -Nru pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_week.py pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_week.py --- pandas-2.1.4+dfsg/pandas/tests/tseries/offsets/test_week.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tseries/offsets/test_week.py 2024-04-10 17:42:52.000000000 +0000 @@ -21,6 +21,7 @@ WeekOfMonth, ) +import pandas._testing as tm from pandas.tests.tseries.offsets.common import ( WeekDay, assert_is_on_offset, @@ -42,10 +43,13 @@ Week(weekday=-1) def test_is_anchored(self): - assert Week(weekday=0).is_anchored() - assert not Week().is_anchored() - assert not Week(2, weekday=2).is_anchored() - assert not Week(2).is_anchored() + msg = "Week.is_anchored is deprecated " + + with tm.assert_produces_warning(FutureWarning, match=msg): + assert Week(weekday=0).is_anchored() + assert not Week().is_anchored() + assert not Week(2, weekday=2).is_anchored() + assert not Week(2).is_anchored() offset_cases = [] # not business week diff -Nru pandas-2.1.4+dfsg/pandas/tests/tslibs/test_api.py pandas-2.2.2+dfsg/pandas/tests/tslibs/test_api.py --- pandas-2.1.4+dfsg/pandas/tests/tslibs/test_api.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tslibs/test_api.py 2024-04-10 17:42:52.000000000 +0000 @@ -54,9 +54,10 @@ "get_unit_from_dtype", "periods_per_day", "periods_per_second", - "is_supported_unit", - "get_supported_reso", - "npy_unit_to_abbrev", + "guess_datetime_format", + "add_overflowsafe", + "get_supported_dtype", + "is_supported_dtype", ] expected = set(submodules + api) diff -Nru pandas-2.1.4+dfsg/pandas/tests/tslibs/test_array_to_datetime.py pandas-2.2.2+dfsg/pandas/tests/tslibs/test_array_to_datetime.py --- pandas-2.1.4+dfsg/pandas/tests/tslibs/test_array_to_datetime.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tslibs/test_array_to_datetime.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,13 +10,128 @@ import pytest from pandas._libs import ( + NaT, iNaT, tslib, ) +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas import Timestamp import pandas._testing as tm +creso_infer = NpyDatetimeUnit.NPY_FR_GENERIC.value + + +class TestArrayToDatetimeResolutionInference: + # TODO: tests that include tzs, ints + + def test_infer_all_nat(self): + arr = np.array([NaT, np.nan], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + assert result.dtype == "M8[s]" + + def test_infer_homogeoneous_datetimes(self): + dt = datetime(2023, 10, 27, 18, 3, 5, 678000) + arr = np.array([dt, dt, dt], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([dt, dt, dt], dtype="M8[us]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_homogeoneous_date_objects(self): + dt = datetime(2023, 10, 27, 18, 3, 5, 678000) + dt2 = dt.date() + arr = np.array([None, dt2, dt2, dt2], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([np.datetime64("NaT"), dt2, dt2, dt2], dtype="M8[s]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_homogeoneous_dt64(self): + dt = datetime(2023, 10, 27, 18, 3, 
5, 678000) + dt64 = np.datetime64(dt, "ms") + arr = np.array([None, dt64, dt64, dt64], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([np.datetime64("NaT"), dt64, dt64, dt64], dtype="M8[ms]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_homogeoneous_timestamps(self): + dt = datetime(2023, 10, 27, 18, 3, 5, 678000) + ts = Timestamp(dt).as_unit("ns") + arr = np.array([None, ts, ts, ts], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([np.datetime64("NaT")] + [ts.asm8] * 3, dtype="M8[ns]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_homogeoneous_datetimes_strings(self): + item = "2023-10-27 18:03:05.678000" + arr = np.array([None, item, item, item], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([np.datetime64("NaT"), item, item, item], dtype="M8[us]") + tm.assert_numpy_array_equal(result, expected) + + def test_infer_heterogeneous(self): + dtstr = "2023-10-27 18:03:05.678000" + + arr = np.array([dtstr, dtstr[:-3], dtstr[:-7], None], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array(arr, dtype="M8[us]") + tm.assert_numpy_array_equal(result, expected) + + result, tz = tslib.array_to_datetime(arr[::-1], creso=creso_infer) + assert tz is None + tm.assert_numpy_array_equal(result, expected[::-1]) + + @pytest.mark.parametrize( + "item", [float("nan"), NaT.value, float(NaT.value), "NaT", ""] + ) + def test_infer_with_nat_int_float_str(self, item): + # floats/ints get inferred to nanos *unless* they are NaN/iNaT, + # similar NaT string gets treated like NaT scalar (ignored for resolution) + dt = datetime(2023, 11, 15, 15, 5, 6) + + arr = np.array([dt, item], dtype=object) + result, tz = tslib.array_to_datetime(arr, creso=creso_infer) + assert tz is None + expected = np.array([dt, np.datetime64("NaT")], dtype="M8[us]") + tm.assert_numpy_array_equal(result, expected) + + result2, tz2 = tslib.array_to_datetime(arr[::-1], creso=creso_infer) + assert tz2 is None + tm.assert_numpy_array_equal(result2, expected[::-1]) + + +class TestArrayToDatetimeWithTZResolutionInference: + def test_array_to_datetime_with_tz_resolution(self): + tz = tzoffset("custom", 3600) + vals = np.array(["2016-01-01 02:03:04.567", NaT], dtype=object) + res = tslib.array_to_datetime_with_tz(vals, tz, False, False, creso_infer) + assert res.dtype == "M8[ms]" + + vals2 = np.array([datetime(2016, 1, 1, 2, 3, 4), NaT], dtype=object) + res2 = tslib.array_to_datetime_with_tz(vals2, tz, False, False, creso_infer) + assert res2.dtype == "M8[us]" + + vals3 = np.array([NaT, np.datetime64(12345, "s")], dtype=object) + res3 = tslib.array_to_datetime_with_tz(vals3, tz, False, False, creso_infer) + assert res3.dtype == "M8[s]" + + def test_array_to_datetime_with_tz_resolution_all_nat(self): + tz = tzoffset("custom", 3600) + vals = np.array(["NaT"], dtype=object) + res = tslib.array_to_datetime_with_tz(vals, tz, False, False, creso_infer) + assert res.dtype == "M8[s]" + + vals2 = np.array([NaT, NaT], dtype=object) + res2 = tslib.array_to_datetime_with_tz(vals2, tz, False, False, creso_infer) + assert res2.dtype == "M8[s]" + @pytest.mark.parametrize( "data,expected", @@ -132,7 +247,7 @@ if errors == "raise": msg = "^Out of bounds nanosecond timestamp: .*, at position 0$" - with pytest.raises(ValueError, match=msg): + with 
pytest.raises(OutOfBoundsDatetime, match=msg): tslib.array_to_datetime(**kwargs) else: # coerce. result, _ = tslib.array_to_datetime(**kwargs) @@ -181,6 +296,23 @@ tslib.array_to_datetime(arr) +@pytest.mark.parametrize( + "timestamp", + [ + # Close enough to bounds that scaling micros to nanos overflows + # but adding nanos would result in an in-bounds datetime. + "1677-09-21T00:12:43.145224193", + "1677-09-21T00:12:43.145224999", + # this always worked + "1677-09-21T00:12:43.145225000", + ], +) +def test_to_datetime_barely_inside_bounds(timestamp): + # see gh-57150 + result, _ = tslib.array_to_datetime(np.array([timestamp], dtype=object)) + tm.assert_numpy_array_equal(result, np.array([timestamp], dtype="M8[ns]")) + + class SubDatetime(datetime): pass diff -Nru pandas-2.1.4+dfsg/pandas/tests/tslibs/test_conversion.py pandas-2.2.2+dfsg/pandas/tests/tslibs/test_conversion.py --- pandas-2.1.4+dfsg/pandas/tests/tslibs/test_conversion.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tslibs/test_conversion.py 2024-04-10 17:42:52.000000000 +0000 @@ -66,14 +66,14 @@ def test_tz_convert_single_matches_tz_convert_hourly(tz_aware_fixture): tz = tz_aware_fixture - tz_didx = date_range("2014-03-01", "2015-01-10", freq="H", tz=tz) - naive_didx = date_range("2014-03-01", "2015-01-10", freq="H") + tz_didx = date_range("2014-03-01", "2015-01-10", freq="h", tz=tz) + naive_didx = date_range("2014-03-01", "2015-01-10", freq="h") _compare_utc_to_local(tz_didx) _compare_local_to_utc(tz_didx, naive_didx) -@pytest.mark.parametrize("freq", ["D", "A"]) +@pytest.mark.parametrize("freq", ["D", "YE"]) def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq): tz = tz_aware_fixture tz_didx = date_range("2018-01-01", "2020-01-01", freq=freq, tz=tz) diff -Nru pandas-2.1.4+dfsg/pandas/tests/tslibs/test_libfrequencies.py pandas-2.2.2+dfsg/pandas/tests/tslibs/test_libfrequencies.py --- pandas-2.1.4+dfsg/pandas/tests/tslibs/test_libfrequencies.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tslibs/test_libfrequencies.py 2024-04-10 17:42:52.000000000 +0000 @@ -16,10 +16,8 @@ (offsets.QuarterEnd(startingMonth=12).freqstr, "DEC"), ("Q-JAN", "JAN"), (offsets.QuarterEnd(startingMonth=1).freqstr, "JAN"), - ("A-DEC", "DEC"), ("Y-DEC", "DEC"), (offsets.YearEnd().freqstr, "DEC"), - ("A-MAY", "MAY"), ("Y-MAY", "MAY"), (offsets.YearEnd(month=5).freqstr, "MAY"), ], diff -Nru pandas-2.1.4+dfsg/pandas/tests/tslibs/test_npy_units.py pandas-2.2.2+dfsg/pandas/tests/tslibs/test_npy_units.py --- pandas-2.1.4+dfsg/pandas/tests/tslibs/test_npy_units.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tslibs/test_npy_units.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,27 @@ +import numpy as np + +from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit +from pandas._libs.tslibs.vectorized import is_date_array_normalized + +# a datetime64 ndarray which *is* normalized +day_arr = np.arange(10, dtype="i8").view("M8[D]") + + +class TestIsDateArrayNormalized: + def test_is_date_array_normalized_day(self): + arr = day_arr + abbrev = "D" + unit = abbrev_to_npy_unit(abbrev) + result = is_date_array_normalized(arr.view("i8"), None, unit) + assert result is True + + def test_is_date_array_normalized_seconds(self): + abbrev = "s" + arr = day_arr.astype(f"M8[{abbrev}]") + unit = abbrev_to_npy_unit(abbrev) + result = is_date_array_normalized(arr.view("i8"), None, unit) + assert result is True + + arr[0] += np.timedelta64(1, abbrev) + result2 = 
is_date_array_normalized(arr.view("i8"), None, unit) + assert result2 is False diff -Nru pandas-2.1.4+dfsg/pandas/tests/tslibs/test_parsing.py pandas-2.2.2+dfsg/pandas/tests/tslibs/test_parsing.py --- pandas-2.1.4+dfsg/pandas/tests/tslibs/test_parsing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tslibs/test_parsing.py 2024-04-10 17:42:52.000000000 +0000 @@ -6,6 +6,7 @@ from dateutil.parser import parse as du_parse from dateutil.tz import tzlocal +from hypothesis import given import numpy as np import pytest @@ -21,6 +22,7 @@ import pandas.util._test_decorators as td import pandas._testing as tm +from pandas._testing._hypothesis import DATETIME_NO_TZ @pytest.mark.skipif( @@ -138,8 +140,8 @@ "date_str,freq,expected", [ ("2013Q2", None, datetime(2013, 4, 1)), - ("2013Q2", "A-APR", datetime(2012, 8, 1)), - ("2013-Q2", "A-DEC", datetime(2013, 4, 1)), + ("2013Q2", "Y-APR", datetime(2012, 8, 1)), + ("2013-Q2", "Y-DEC", datetime(2013, 4, 1)), ], ) def test_parsers_quarterly_with_freq(date_str, freq, expected): @@ -148,7 +150,7 @@ @pytest.mark.parametrize( - "date_str", ["2Q 2005", "2Q-200A", "2Q-200", "22Q2005", "2Q200.", "6Q-20"] + "date_str", ["2Q 2005", "2Q-200Y", "2Q-200", "22Q2005", "2Q200.", "6Q-20"] ) def test_parsers_quarter_invalid(date_str): if date_str == "6Q-20": @@ -168,7 +170,7 @@ [("201101", datetime(2011, 1, 1, 0, 0)), ("200005", datetime(2000, 5, 1, 0, 0))], ) def test_parsers_month_freq(date_str, expected): - result, _ = parsing.parse_datetime_string_with_reso(date_str, freq="M") + result, _ = parsing.parse_datetime_string_with_reso(date_str, freq="ME") assert result == expected @@ -367,3 +369,46 @@ result = parsing.guess_datetime_format(input) expected = "%Y-%m-%dT%H:%M:%S.%f" assert result == expected + + +def _helper_hypothesis_delimited_date(call, date_string, **kwargs): + msg, result = None, None + try: + result = call(date_string, **kwargs) + except ValueError as err: + msg = str(err) + return msg, result + + +@given(DATETIME_NO_TZ) +@pytest.mark.parametrize("delimiter", list(" -./")) +@pytest.mark.parametrize("dayfirst", [True, False]) +@pytest.mark.parametrize( + "date_format", + ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"], +) +def test_hypothesis_delimited_date( + request, date_format, dayfirst, delimiter, test_datetime +): + if date_format == "%m %Y" and delimiter == ".": + request.applymarker( + pytest.mark.xfail( + reason="parse_datetime_string cannot reliably tell whether " + "e.g. 
%m.%Y is a float or a date" + ) + ) + date_string = test_datetime.strftime(date_format.replace(" ", delimiter)) + + except_out_dateutil, result = _helper_hypothesis_delimited_date( + parsing.py_parse_datetime_string, date_string, dayfirst=dayfirst + ) + except_in_dateutil, expected = _helper_hypothesis_delimited_date( + du_parse, + date_string, + default=datetime(1, 1, 1), + dayfirst=dayfirst, + yearfirst=False, + ) + + assert except_out_dateutil == except_in_dateutil + assert result == expected diff -Nru pandas-2.1.4+dfsg/pandas/tests/tslibs/test_period.py pandas-2.2.2+dfsg/pandas/tests/tslibs/test_period.py --- pandas-2.1.4+dfsg/pandas/tests/tslibs/test_period.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tslibs/test_period.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,123 @@ +import numpy as np +import pytest + +from pandas._libs.tslibs import ( + iNaT, + to_offset, +) +from pandas._libs.tslibs.period import ( + extract_ordinals, + get_period_field_arr, + period_asfreq, + period_ordinal, +) + +import pandas._testing as tm + + +def get_freq_code(freqstr: str) -> int: + off = to_offset(freqstr, is_period=True) + # error: "BaseOffset" has no attribute "_period_dtype_code" + code = off._period_dtype_code # type: ignore[attr-defined] + return code + + +@pytest.mark.parametrize( + "freq1,freq2,expected", + [ + ("D", "h", 24), + ("D", "min", 1440), + ("D", "s", 86400), + ("D", "ms", 86400000), + ("D", "us", 86400000000), + ("D", "ns", 86400000000000), + ("h", "min", 60), + ("h", "s", 3600), + ("h", "ms", 3600000), + ("h", "us", 3600000000), + ("h", "ns", 3600000000000), + ("min", "s", 60), + ("min", "ms", 60000), + ("min", "us", 60000000), + ("min", "ns", 60000000000), + ("s", "ms", 1000), + ("s", "us", 1000000), + ("s", "ns", 1000000000), + ("ms", "us", 1000), + ("ms", "ns", 1000000), + ("us", "ns", 1000), + ], +) +def test_intra_day_conversion_factors(freq1, freq2, expected): + assert ( + period_asfreq(1, get_freq_code(freq1), get_freq_code(freq2), False) == expected + ) + + +@pytest.mark.parametrize( + "freq,expected", [("Y", 0), ("M", 0), ("W", 1), ("D", 0), ("B", 0)] +) +def test_period_ordinal_start_values(freq, expected): + # information for Jan. 1, 1970. + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq_code(freq)) == expected + + +@pytest.mark.parametrize( + "dt,expected", + [ + ((1970, 1, 4, 0, 0, 0, 0, 0), 1), + ((1970, 1, 5, 0, 0, 0, 0, 0), 2), + ((2013, 10, 6, 0, 0, 0, 0, 0), 2284), + ((2013, 10, 7, 0, 0, 0, 0, 0), 2285), + ], +) +def test_period_ordinal_week(dt, expected): + args = dt + (get_freq_code("W"),) + assert period_ordinal(*args) == expected + + +@pytest.mark.parametrize( + "day,expected", + [ + # Thursday (Oct. 3, 2013). + (3, 11415), + # Friday (Oct. 4, 2013). + (4, 11416), + # Saturday (Oct. 5, 2013). + (5, 11417), + # Sunday (Oct. 6, 2013). + (6, 11417), + # Monday (Oct. 7, 2013). + (7, 11417), + # Tuesday (Oct. 8, 2013). 
+ (8, 11418), + ], +) +def test_period_ordinal_business_day(day, expected): + # 5000 is PeriodDtypeCode for BusinessDay + args = (2013, 10, day, 0, 0, 0, 0, 0, 5000) + assert period_ordinal(*args) == expected + + +class TestExtractOrdinals: + def test_extract_ordinals_raises(self): + # with non-object, make sure we raise TypeError, not segfault + arr = np.arange(5) + freq = to_offset("D") + with pytest.raises(TypeError, match="values must be object-dtype"): + extract_ordinals(arr, freq) + + def test_extract_ordinals_2d(self): + freq = to_offset("D") + arr = np.empty(10, dtype=object) + arr[:] = iNaT + + res = extract_ordinals(arr, freq) + res2 = extract_ordinals(arr.reshape(5, 2), freq) + tm.assert_numpy_array_equal(res, res2.reshape(-1)) + + +def test_get_period_field_array_raises_on_out_of_range(): + msg = "Buffer dtype mismatch, expected 'const int64_t' but got 'double'" + with pytest.raises(ValueError, match=msg): + get_period_field_arr(-1, np.empty(1), 0) diff -Nru pandas-2.1.4+dfsg/pandas/tests/tslibs/test_period_asfreq.py pandas-2.2.2+dfsg/pandas/tests/tslibs/test_period_asfreq.py --- pandas-2.1.4+dfsg/pandas/tests/tslibs/test_period_asfreq.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tslibs/test_period_asfreq.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,116 +0,0 @@ -import numpy as np -import pytest - -from pandas._libs.tslibs import ( - iNaT, - to_offset, -) -from pandas._libs.tslibs.period import ( - extract_ordinals, - period_asfreq, - period_ordinal, -) - -import pandas._testing as tm - - -def get_freq_code(freqstr: str) -> int: - off = to_offset(freqstr) - # error: "BaseOffset" has no attribute "_period_dtype_code" - code = off._period_dtype_code # type: ignore[attr-defined] - return code - - -@pytest.mark.parametrize( - "freq1,freq2,expected", - [ - ("D", "H", 24), - ("D", "T", 1440), - ("D", "S", 86400), - ("D", "L", 86400000), - ("D", "U", 86400000000), - ("D", "N", 86400000000000), - ("H", "T", 60), - ("H", "S", 3600), - ("H", "L", 3600000), - ("H", "U", 3600000000), - ("H", "N", 3600000000000), - ("T", "S", 60), - ("T", "L", 60000), - ("T", "U", 60000000), - ("T", "N", 60000000000), - ("S", "L", 1000), - ("S", "U", 1000000), - ("S", "N", 1000000000), - ("L", "U", 1000), - ("L", "N", 1000000), - ("U", "N", 1000), - ], -) -def test_intra_day_conversion_factors(freq1, freq2, expected): - assert ( - period_asfreq(1, get_freq_code(freq1), get_freq_code(freq2), False) == expected - ) - - -@pytest.mark.parametrize( - "freq,expected", [("A", 0), ("M", 0), ("W", 1), ("D", 0), ("B", 0)] -) -def test_period_ordinal_start_values(freq, expected): - # information for Jan. 1, 1970. - assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq_code(freq)) == expected - - -@pytest.mark.parametrize( - "dt,expected", - [ - ((1970, 1, 4, 0, 0, 0, 0, 0), 1), - ((1970, 1, 5, 0, 0, 0, 0, 0), 2), - ((2013, 10, 6, 0, 0, 0, 0, 0), 2284), - ((2013, 10, 7, 0, 0, 0, 0, 0), 2285), - ], -) -def test_period_ordinal_week(dt, expected): - args = dt + (get_freq_code("W"),) - assert period_ordinal(*args) == expected - - -@pytest.mark.parametrize( - "day,expected", - [ - # Thursday (Oct. 3, 2013). - (3, 11415), - # Friday (Oct. 4, 2013). - (4, 11416), - # Saturday (Oct. 5, 2013). - (5, 11417), - # Sunday (Oct. 6, 2013). - (6, 11417), - # Monday (Oct. 7, 2013). - (7, 11417), - # Tuesday (Oct. 8, 2013). 
- (8, 11418), - ], -) -def test_period_ordinal_business_day(day, expected): - # 5000 is PeriodDtypeCode for BusinessDay - args = (2013, 10, day, 0, 0, 0, 0, 0, 5000) - assert period_ordinal(*args) == expected - - -class TestExtractOrdinals: - def test_extract_ordinals_raises(self): - # with non-object, make sure we raise TypeError, not segfault - arr = np.arange(5) - freq = to_offset("D") - with pytest.raises(TypeError, match="values must be object-dtype"): - extract_ordinals(arr, freq) - - def test_extract_ordinals_2d(self): - freq = to_offset("D") - arr = np.empty(10, dtype=object) - arr[:] = iNaT - - res = extract_ordinals(arr, freq) - res2 = extract_ordinals(arr.reshape(5, 2), freq) - tm.assert_numpy_array_equal(res, res2.reshape(-1)) diff -Nru pandas-2.1.4+dfsg/pandas/tests/tslibs/test_resolution.py pandas-2.2.2+dfsg/pandas/tests/tslibs/test_resolution.py --- pandas-2.1.4+dfsg/pandas/tests/tslibs/test_resolution.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tslibs/test_resolution.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,4 +1,5 @@ import numpy as np +import pytest import pytz from pandas._libs.tslibs import ( @@ -7,6 +8,8 @@ ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +import pandas._testing as tm + def test_get_resolution_nano(): # don't return the fallback RESO_DAY @@ -22,3 +25,33 @@ res = get_resolution(arr, pytz.UTC, NpyDatetimeUnit.NPY_FR_us.value) assert res == Resolution.RESO_US + + +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("Y", "year"), + ("Q", "quarter"), + ("M", "month"), + ("D", "day"), + ("h", "hour"), + ("min", "minute"), + ("s", "second"), + ("ms", "millisecond"), + ("us", "microsecond"), + ("ns", "nanosecond"), + ], +) +def test_get_attrname_from_abbrev(freqstr, expected): + reso = Resolution.get_reso_from_freqstr(freqstr) + assert reso.attr_abbrev == freqstr + assert reso.attrname == expected + + +@pytest.mark.parametrize("freq", ["A", "H", "T", "S", "L", "U", "N"]) +def test_units_A_H_T_S_L_U_N_deprecated_from_attrname_to_abbrevs(freq): + # GH#52536 + msg = f"'{freq}' is deprecated and will be removed in a future version." 
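The test added above exercises the pandas 2.2 deprecation of the single-letter resolution aliases "A", "H", "T", "S", "L", "U" and "N" in favour of "Y", "h", "min", "s", "ms", "us" and "ns". As a rough illustration of the user-facing side of that change (a sketch assuming pandas >= 2.2, not taken from the patch itself):

import warnings

import pandas as pd

# Preferred lowercase spelling in pandas 2.2
idx = pd.date_range("2020-01-01", periods=3, freq="h")

# The old uppercase alias still parses but emits a FutureWarning
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pd.date_range("2020-01-01", periods=3, freq="H")
assert any(issubclass(w.category, FutureWarning) for w in caught)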
+ + with tm.assert_produces_warning(FutureWarning, match=msg): + Resolution.get_reso_from_freqstr(freq) diff -Nru pandas-2.1.4+dfsg/pandas/tests/tslibs/test_strptime.py pandas-2.2.2+dfsg/pandas/tests/tslibs/test_strptime.py --- pandas-2.1.4+dfsg/pandas/tests/tslibs/test_strptime.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tslibs/test_strptime.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,110 @@ +from datetime import ( + datetime, + timezone, +) + +import numpy as np +import pytest + +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit +from pandas._libs.tslibs.strptime import array_strptime + +from pandas import ( + NaT, + Timestamp, +) +import pandas._testing as tm + +creso_infer = NpyDatetimeUnit.NPY_FR_GENERIC.value + + +class TestArrayStrptimeResolutionInference: + def test_array_strptime_resolution_all_nat(self): + arr = np.array([NaT, np.nan], dtype=object) + + fmt = "%Y-%m-%d %H:%M:%S" + res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer) + assert res.dtype == "M8[s]" + + res, _ = array_strptime(arr, fmt=fmt, utc=True, creso=creso_infer) + assert res.dtype == "M8[s]" + + @pytest.mark.parametrize("tz", [None, timezone.utc]) + def test_array_strptime_resolution_inference_homogeneous_strings(self, tz): + dt = datetime(2016, 1, 2, 3, 4, 5, 678900, tzinfo=tz) + + fmt = "%Y-%m-%d %H:%M:%S" + dtstr = dt.strftime(fmt) + arr = np.array([dtstr] * 3, dtype=object) + expected = np.array([dt.replace(tzinfo=None)] * 3, dtype="M8[s]") + + res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer) + tm.assert_numpy_array_equal(res, expected) + + fmt = "%Y-%m-%d %H:%M:%S.%f" + dtstr = dt.strftime(fmt) + arr = np.array([dtstr] * 3, dtype=object) + expected = np.array([dt.replace(tzinfo=None)] * 3, dtype="M8[us]") + + res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer) + tm.assert_numpy_array_equal(res, expected) + + fmt = "ISO8601" + res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer) + tm.assert_numpy_array_equal(res, expected) + + @pytest.mark.parametrize("tz", [None, timezone.utc]) + def test_array_strptime_resolution_mixed(self, tz): + dt = datetime(2016, 1, 2, 3, 4, 5, 678900, tzinfo=tz) + + ts = Timestamp(dt).as_unit("ns") + + arr = np.array([dt, ts], dtype=object) + expected = np.array( + [Timestamp(dt).as_unit("ns").asm8, ts.asm8], + dtype="M8[ns]", + ) + + fmt = "%Y-%m-%d %H:%M:%S" + res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer) + tm.assert_numpy_array_equal(res, expected) + + fmt = "ISO8601" + res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer) + tm.assert_numpy_array_equal(res, expected) + + def test_array_strptime_resolution_todaynow(self): + # specifically case where today/now is the *first* item + vals = np.array(["today", np.datetime64("2017-01-01", "us")], dtype=object) + + now = Timestamp("now").asm8 + res, _ = array_strptime(vals, fmt="%Y-%m-%d", utc=False, creso=creso_infer) + res2, _ = array_strptime( + vals[::-1], fmt="%Y-%m-%d", utc=False, creso=creso_infer + ) + + # 1s is an arbitrary cutoff for call overhead; in local testing the + # actual difference is about 250us + tolerance = np.timedelta64(1, "s") + + assert res.dtype == "M8[us]" + assert abs(res[0] - now) < tolerance + assert res[1] == vals[1] + + assert res2.dtype == "M8[us]" + assert abs(res2[1] - now) < tolerance * 2 + assert res2[0] == vals[1] + + def test_array_strptime_str_outside_nano_range(self): + vals = np.array(["2401-09-15"], dtype=object) + expected = np.array(["2401-09-15"], 
dtype="M8[s]") + fmt = "ISO8601" + res, _ = array_strptime(vals, fmt=fmt, creso=creso_infer) + tm.assert_numpy_array_equal(res, expected) + + # non-iso -> different path + vals2 = np.array(["Sep 15, 2401"], dtype=object) + expected2 = np.array(["2401-09-15"], dtype="M8[s]") + fmt2 = "%b %d, %Y" + res2, _ = array_strptime(vals2, fmt=fmt2, creso=creso_infer) + tm.assert_numpy_array_equal(res2, expected2) diff -Nru pandas-2.1.4+dfsg/pandas/tests/tslibs/test_to_offset.py pandas-2.2.2+dfsg/pandas/tests/tslibs/test_to_offset.py --- pandas-2.1.4+dfsg/pandas/tests/tslibs/test_to_offset.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/tslibs/test_to_offset.py 2024-04-10 17:42:52.000000000 +0000 @@ -20,14 +20,14 @@ ("2h 60min", offsets.Hour(3)), ("2h 20.5min", offsets.Second(8430)), ("1.5min", offsets.Second(90)), - ("0.5S", offsets.Milli(500)), - ("15l500u", offsets.Micro(15500)), - ("10s75L", offsets.Milli(10075)), + ("0.5s", offsets.Milli(500)), + ("15ms500us", offsets.Micro(15500)), + ("10s75ms", offsets.Milli(10075)), ("1s0.25ms", offsets.Micro(1000250)), - ("1s0.25L", offsets.Micro(1000250)), - ("2800N", offsets.Nano(2800)), - ("2SM", offsets.SemiMonthEnd(2)), - ("2SM-16", offsets.SemiMonthEnd(2, day_of_month=16)), + ("1s0.25ms", offsets.Micro(1000250)), + ("2800ns", offsets.Nano(2800)), + ("2SME", offsets.SemiMonthEnd(2)), + ("2SME-16", offsets.SemiMonthEnd(2, day_of_month=16)), ("2SMS-14", offsets.SemiMonthBegin(2, day_of_month=14)), ("2SMS-15", offsets.SemiMonthBegin(2)), ], @@ -38,23 +38,24 @@ @pytest.mark.parametrize( - "freqstr,expected", [("-1S", -1), ("-2SM", -2), ("-1SMS", -1), ("-5min10s", -310)] + "freqstr,expected", [("-1s", -1), ("-2SME", -2), ("-1SMS", -1), ("-5min10s", -310)] ) def test_to_offset_negative(freqstr, expected): result = to_offset(freqstr) assert result.n == expected +@pytest.mark.filterwarnings("ignore:.*'m' is deprecated.*:FutureWarning") @pytest.mark.parametrize( "freqstr", [ "2h20m", - "U1", - "-U", - "3U1", - "-2-3U", - "-2D:3H", - "1.5.0S", + "us1", + "-us", + "3us1", + "-2-3us", + "-2D:3h", + "1.5.0s", "2SMS-15-15", "2SMS-15D", "100foo", @@ -66,12 +67,12 @@ "+d", "-m", # Invalid shortcut anchors. 
- "SM-0", - "SM-28", - "SM-29", - "SM-FOO", + "SME-0", + "SME-28", + "SME-29", + "SME-FOO", "BSM", - "SM--1", + "SME--1", "SMS-1", "SMS-28", "SMS-30", @@ -105,12 +106,12 @@ @pytest.mark.parametrize( "freqstr,expected", [ - ("2D 3H", offsets.Hour(51)), - ("2 D3 H", offsets.Hour(51)), - ("2 D 3 H", offsets.Hour(51)), - (" 2 D 3 H ", offsets.Hour(51)), - (" H ", offsets.Hour()), - (" 3 H ", offsets.Hour(3)), + ("2D 3h", offsets.Hour(51)), + ("2 D3 h", offsets.Hour(51)), + ("2 D 3 h", offsets.Hour(51)), + (" 2 D 3 h ", offsets.Hour(51)), + (" h ", offsets.Hour()), + (" 3 h ", offsets.Hour(3)), ], ) def test_to_offset_whitespace(freqstr, expected): @@ -119,7 +120,7 @@ @pytest.mark.parametrize( - "freqstr,expected", [("00H 00T 01S", 1), ("-00H 03T 14S", -194)] + "freqstr,expected", [("00h 00min 01s", 1), ("-00h 03min 14s", -194)] ) def test_to_offset_leading_zero(freqstr, expected): result = to_offset(freqstr) @@ -158,13 +159,13 @@ [ ("W", offsets.Week(weekday=6)), ("W-SUN", offsets.Week(weekday=6)), - ("Q", offsets.QuarterEnd(startingMonth=12)), - ("Q-DEC", offsets.QuarterEnd(startingMonth=12)), - ("Q-MAY", offsets.QuarterEnd(startingMonth=5)), - ("SM", offsets.SemiMonthEnd(day_of_month=15)), - ("SM-15", offsets.SemiMonthEnd(day_of_month=15)), - ("SM-1", offsets.SemiMonthEnd(day_of_month=1)), - ("SM-27", offsets.SemiMonthEnd(day_of_month=27)), + ("QE", offsets.QuarterEnd(startingMonth=12)), + ("QE-DEC", offsets.QuarterEnd(startingMonth=12)), + ("QE-MAY", offsets.QuarterEnd(startingMonth=5)), + ("SME", offsets.SemiMonthEnd(day_of_month=15)), + ("SME-15", offsets.SemiMonthEnd(day_of_month=15)), + ("SME-1", offsets.SemiMonthEnd(day_of_month=1)), + ("SME-27", offsets.SemiMonthEnd(day_of_month=27)), ("SMS-2", offsets.SemiMonthBegin(day_of_month=2)), ("SMS-27", offsets.SemiMonthBegin(day_of_month=27)), ], @@ -172,3 +173,47 @@ def test_anchored_shortcuts(shortcut, expected): result = to_offset(shortcut) assert result == expected + + +@pytest.mark.parametrize( + "freq_depr", + [ + "2ye-mar", + "2ys", + "2qe", + "2qs-feb", + "2bqs", + "2sms", + "2bms", + "2cbme", + "2me", + "2w", + ], +) +def test_to_offset_lowercase_frequency_deprecated(freq_depr): + # GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.upper()[1:]}' instead." + + with pytest.raises(FutureWarning, match=depr_msg): + to_offset(freq_depr) + + +@pytest.mark.parametrize( + "freq_depr", + [ + "2H", + "2BH", + "2MIN", + "2S", + "2Us", + "2NS", + ], +) +def test_to_offset_uppercase_frequency_deprecated(freq_depr): + # GH#54939 + depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a " + f"future version, please use '{freq_depr.lower()[1:]}' instead." 
+ + with pytest.raises(FutureWarning, match=depr_msg): + to_offset(freq_depr) diff -Nru pandas-2.1.4+dfsg/pandas/tests/util/test_assert_almost_equal.py pandas-2.2.2+dfsg/pandas/tests/util/test_assert_almost_equal.py --- pandas-2.1.4+dfsg/pandas/tests/util/test_assert_almost_equal.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/util/test_assert_almost_equal.py 2024-04-10 17:42:52.000000000 +0000 @@ -340,7 +340,7 @@ # TODO: to get the same deprecation in assert_numpy_array_equal we need # to change/deprecate the default for strict_nan to become True - # TODO: to get the same deprecateion in assert_index_equal we need to + # TODO: to get the same deprecation in assert_index_equal we need to # change/deprecate array_equivalent_object to be stricter, as # assert_index_equal uses Index.equal which uses array_equivalent. with tm.assert_produces_warning(FutureWarning, match=msg): diff -Nru pandas-2.1.4+dfsg/pandas/tests/util/test_assert_extension_array_equal.py pandas-2.2.2+dfsg/pandas/tests/util/test_assert_extension_array_equal.py --- pandas-2.1.4+dfsg/pandas/tests/util/test_assert_extension_array_equal.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/util/test_assert_extension_array_equal.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,10 @@ import numpy as np import pytest -from pandas import array +from pandas import ( + Timestamp, + array, +) import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray @@ -111,3 +114,13 @@ left = array([1, 2, 3], dtype="Int64") right = array([1, 2, 3], dtype=right_dtype) tm.assert_extension_array_equal(left, right, check_dtype=False) + + +def test_assert_extension_array_equal_time_units(): + # https://github.com/pandas-dev/pandas/issues/55730 + timestamp = Timestamp("2023-11-04T12") + naive = array([timestamp], dtype="datetime64[ns]") + utc = array([timestamp], dtype="datetime64[ns, UTC]") + + tm.assert_extension_array_equal(naive, utc, check_dtype=False) + tm.assert_extension_array_equal(utc, naive, check_dtype=False) diff -Nru pandas-2.1.4+dfsg/pandas/tests/util/test_assert_frame_equal.py pandas-2.2.2+dfsg/pandas/tests/util/test_assert_frame_equal.py --- pandas-2.1.4+dfsg/pandas/tests/util/test_assert_frame_equal.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/util/test_assert_frame_equal.py 2024-04-10 17:42:52.000000000 +0000 @@ -109,12 +109,16 @@ @pytest.mark.parametrize("check_like", [True, False]) -def test_frame_equal_index_mismatch(check_like, obj_fixture): +def test_frame_equal_index_mismatch(check_like, obj_fixture, using_infer_string): + if using_infer_string: + dtype = "string" + else: + dtype = "object" msg = f"""{obj_fixture}\\.index are different {obj_fixture}\\.index values are different \\(33\\.33333 %\\) -\\[left\\]: Index\\(\\['a', 'b', 'c'\\], dtype='object'\\) -\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='object'\\) +\\[left\\]: Index\\(\\['a', 'b', 'c'\\], dtype='{dtype}'\\) +\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='{dtype}'\\) At positional index 2, first diff: c != d""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) @@ -125,12 +129,16 @@ @pytest.mark.parametrize("check_like", [True, False]) -def test_frame_equal_columns_mismatch(check_like, obj_fixture): +def test_frame_equal_columns_mismatch(check_like, obj_fixture, using_infer_string): + if using_infer_string: + dtype = "string" + else: + dtype = "object" msg = f"""{obj_fixture}\\.columns are different {obj_fixture}\\.columns values are different 
\\(50\\.0 %\\) -\\[left\\]: Index\\(\\['A', 'B'\\], dtype='object'\\) -\\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""" +\\[left\\]: Index\\(\\['A', 'B'\\], dtype='{dtype}'\\) +\\[right\\]: Index\\(\\['A', 'b'\\], dtype='{dtype}'\\)""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"]) @@ -228,11 +236,17 @@ tm.assert_frame_equal(left, right, check_dtype=True) -@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) -def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype): +def test_assert_frame_equal_ignore_extension_dtype_mismatch(): # https://github.com/pandas-dev/pandas/issues/35715 left = DataFrame({"a": [1, 2, 3]}, dtype="Int64") - right = DataFrame({"a": [1, 2, 3]}, dtype=right_dtype) + right = DataFrame({"a": [1, 2, 3]}, dtype="Int32") + tm.assert_frame_equal(left, right, check_dtype=False) + + +def test_assert_frame_equal_ignore_extension_dtype_mismatch_cross_class(): + # https://github.com/pandas-dev/pandas/issues/35715 + left = DataFrame({"a": [1, 2, 3]}, dtype="Int64") + right = DataFrame({"a": [1, 2, 3]}, dtype="int64") tm.assert_frame_equal(left, right, check_dtype=False) @@ -282,9 +296,7 @@ dtypes = (any_numeric_ea_dtype, "int64") obj1 = frame_or_series([1, 2], dtype=dtypes[indexer[0]]) obj2 = frame_or_series([1, 2], dtype=dtypes[indexer[1]]) - msg = r'(Series|DataFrame.iloc\[:, 0\] \(column name="0"\) classes) are different' - with pytest.raises(AssertionError, match=msg): - tm.assert_equal(obj1, obj2, check_exact=True, check_dtype=False) + tm.assert_equal(obj1, obj2, check_exact=True, check_dtype=False) def test_assert_frame_equal_check_like_different_indexes(): diff -Nru pandas-2.1.4+dfsg/pandas/tests/util/test_assert_index_equal.py pandas-2.2.2+dfsg/pandas/tests/util/test_assert_index_equal.py --- pandas-2.1.4+dfsg/pandas/tests/util/test_assert_index_equal.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/util/test_assert_index_equal.py 2024-04-10 17:42:52.000000000 +0000 @@ -205,14 +205,18 @@ tm.assert_index_equal(idx1, idx2) -def test_index_equal_category_mismatch(check_categorical): - msg = """Index are different +def test_index_equal_category_mismatch(check_categorical, using_infer_string): + if using_infer_string: + dtype = "string" + else: + dtype = "object" + msg = f"""Index are different Attribute "dtype" are different \\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False, \ -categories_dtype=object\\) +categories_dtype={dtype}\\) \\[right\\]: CategoricalDtype\\(categories=\\['a', 'b', 'c'\\], \ -ordered=False, categories_dtype=object\\)""" +ordered=False, categories_dtype={dtype}\\)""" idx1 = Index(Categorical(["a", "b"])) idx2 = Index(Categorical(["a", "b"], categories=["a", "b", "c"])) diff -Nru pandas-2.1.4+dfsg/pandas/tests/util/test_assert_series_equal.py pandas-2.2.2+dfsg/pandas/tests/util/test_assert_series_equal.py --- pandas-2.1.4+dfsg/pandas/tests/util/test_assert_series_equal.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/util/test_assert_series_equal.py 2024-04-10 17:42:52.000000000 +0000 @@ -214,8 +214,18 @@ tm.assert_series_equal(s1, s2, rtol=rtol) -def test_series_equal_categorical_values_mismatch(rtol): - msg = """Series are different +def test_series_equal_categorical_values_mismatch(rtol, using_infer_string): + if using_infer_string: + msg = """Series are different + +Series values are different \\(66\\.66667 %\\) +\\[index\\]: \\[0, 1, 2\\] +\\[left\\]: 
\\['a', 'b', 'c'\\] +Categories \\(3, string\\): \\[a, b, c\\] +\\[right\\]: \\['a', 'c', 'b'\\] +Categories \\(3, string\\): \\[a, b, c\\]""" + else: + msg = """Series are different Series values are different \\(66\\.66667 %\\) \\[index\\]: \\[0, 1, 2\\] @@ -246,14 +256,18 @@ tm.assert_series_equal(s1, s2, rtol=rtol) -def test_series_equal_categorical_mismatch(check_categorical): - msg = """Attributes of Series are different +def test_series_equal_categorical_mismatch(check_categorical, using_infer_string): + if using_infer_string: + dtype = "string" + else: + dtype = "object" + msg = f"""Attributes of Series are different Attribute "dtype" are different \\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False, \ -categories_dtype=object\\) +categories_dtype={dtype}\\) \\[right\\]: CategoricalDtype\\(categories=\\['a', 'b', 'c'\\], \ -ordered=False, categories_dtype=object\\)""" +ordered=False, categories_dtype={dtype}\\)""" s1 = Series(Categorical(["a", "b"])) s2 = Series(Categorical(["a", "b"], categories=list("abc"))) @@ -348,11 +362,17 @@ tm.assert_series_equal(s3, s1, check_exact=True) -@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) -def test_assert_series_equal_ignore_extension_dtype_mismatch(right_dtype): +def test_assert_series_equal_ignore_extension_dtype_mismatch(): # https://github.com/pandas-dev/pandas/issues/35715 left = Series([1, 2, 3], dtype="Int64") - right = Series([1, 2, 3], dtype=right_dtype) + right = Series([1, 2, 3], dtype="Int32") + tm.assert_series_equal(left, right, check_dtype=False) + + +def test_assert_series_equal_ignore_extension_dtype_mismatch_cross_class(): + # https://github.com/pandas-dev/pandas/issues/35715 + left = Series([1, 2, 3], dtype="Int64") + right = Series([1, 2, 3], dtype="int64") tm.assert_series_equal(left, right, check_dtype=False) @@ -423,3 +443,42 @@ with pytest.raises(AssertionError, match="Series are different"): tm.assert_series_equal(ser_s, ser_ms, check_dtype=False) + + +@pytest.mark.parametrize("dtype", ["Int64", "int64"]) +def test_large_unequal_ints(dtype): + # https://github.com/pandas-dev/pandas/issues/55882 + left = Series([1577840521123000], dtype=dtype) + right = Series([1577840521123543], dtype=dtype) + with pytest.raises(AssertionError, match="Series are different"): + tm.assert_series_equal(left, right) + + +@pytest.mark.parametrize("dtype", [None, object]) +@pytest.mark.parametrize("check_exact", [True, False]) +@pytest.mark.parametrize("val", [3, 3.5]) +def test_ea_and_numpy_no_dtype_check(val, check_exact, dtype): + # GH#56651 + left = Series([1, 2, val], dtype=dtype) + right = Series(pd.array([1, 2, val])) + tm.assert_series_equal(left, right, check_dtype=False, check_exact=check_exact) + + +def test_assert_series_equal_int_tol(): + # GH#56646 + left = Series([81, 18, 121, 38, 74, 72, 81, 81, 146, 81, 81, 170, 74, 74]) + right = Series([72, 9, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72]) + tm.assert_series_equal(left, right, rtol=1.5) + + tm.assert_frame_equal(left.to_frame(), right.to_frame(), rtol=1.5) + tm.assert_extension_array_equal( + left.astype("Int64").values, right.astype("Int64").values, rtol=1.5 + ) + + +def test_assert_series_equal_index_exact_default(): + # GH#57067 + ser1 = Series(np.zeros(6, dtype=int), [0, 0.2, 0.4, 0.6, 0.8, 1]) + ser2 = Series(np.zeros(6, dtype=int), np.linspace(0, 1, 6)) + tm.assert_series_equal(ser1, ser2) + tm.assert_frame_equal(ser1.to_frame(), ser2.to_frame()) diff -Nru pandas-2.1.4+dfsg/pandas/tests/util/test_hashing.py 
pandas-2.2.2+dfsg/pandas/tests/util/test_hashing.py --- pandas-2.1.4+dfsg/pandas/tests/util/test_hashing.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/util/test_hashing.py 2024-04-10 17:42:52.000000000 +0000 @@ -7,6 +7,8 @@ Index, MultiIndex, Series, + period_range, + timedelta_range, ) import pandas._testing as tm from pandas.core.util.hashing import hash_tuples @@ -25,7 +27,7 @@ Series([True, False, True] * 3), Series(pd.date_range("20130101", periods=9)), Series(pd.date_range("20130101", periods=9, tz="US/Eastern")), - Series(pd.timedelta_range("2000", periods=9)), + Series(timedelta_range("2000", periods=9)), ] ) def series(request): @@ -136,10 +138,17 @@ DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), DataFrame(), DataFrame(np.full((10, 4), np.nan)), - tm.makeMixedDataFrame(), - tm.makeTimeDataFrame(), - tm.makeTimeSeries(), - Series(tm.makePeriodIndex()), + DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "D": pd.date_range("20130101", periods=5), + } + ), + DataFrame(range(5), index=pd.date_range("2020-01-01", periods=5)), + Series(range(5), index=pd.date_range("2020-01-01", periods=5)), + Series(period_range("2020-01-01", periods=10, freq="D")), Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), ], ) @@ -162,10 +171,17 @@ Series([True, False, True]), DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), DataFrame(np.full((10, 4), np.nan)), - tm.makeMixedDataFrame(), - tm.makeTimeDataFrame(), - tm.makeTimeSeries(), - Series(tm.makePeriodIndex()), + DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object), + "D": pd.date_range("20130101", periods=5), + } + ), + DataFrame(range(5), index=pd.date_range("2020-01-01", periods=5)), + Series(range(5), index=pd.date_range("2020-01-01", periods=5)), + Series(period_range("2020-01-01", periods=10, freq="D")), Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), ], ) @@ -180,8 +196,8 @@ [ Index([1, 2, 3]), Index([True, False, True]), - tm.makeTimedeltaIndex(), - tm.makePeriodIndex(), + timedelta_range("1 day", periods=2), + period_range("2020-01-01", freq="D", periods=2), MultiIndex.from_product( [range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2)] ), @@ -328,9 +344,9 @@ @pytest.mark.parametrize("l_add", [0, 1]) def test_same_len_hash_collisions(l_exp, l_add): length = 2 ** (l_exp + 8) + l_add - s = tm.makeStringIndex(length).to_numpy() + idx = np.array([str(i) for i in range(length)], dtype=object) - result = hash_array(s, "utf8") + result = hash_array(idx, "utf8") assert not result[0] == result[1] @@ -339,8 +355,8 @@ # # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 hashes = [ - "Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", # noqa: E501 - 
"Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe", # noqa: E501 + "Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", + "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe", ] # These should be different. diff -Nru pandas-2.1.4+dfsg/pandas/tests/util/test_make_objects.py pandas-2.2.2+dfsg/pandas/tests/util/test_make_objects.py --- pandas-2.1.4+dfsg/pandas/tests/util/test_make_objects.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/util/test_make_objects.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,15 +0,0 @@ -""" -Tests for tm.makeFoo functions. 
-""" - - -import numpy as np - -import pandas._testing as tm - - -def test_make_multiindex_respects_k(): - # GH#38795 respect 'k' arg - N = np.random.default_rng(2).integers(0, 100) - mi = tm.makeMultiIndex(k=N) - assert len(mi) == N diff -Nru pandas-2.1.4+dfsg/pandas/tests/util/test_safe_import.py pandas-2.2.2+dfsg/pandas/tests/util/test_safe_import.py --- pandas-2.1.4+dfsg/pandas/tests/util/test_safe_import.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/util/test_safe_import.py 1970-01-01 00:00:00.000000000 +0000 @@ -1,39 +0,0 @@ -import sys -import types - -import pytest - -import pandas.util._test_decorators as td - - -@pytest.mark.parametrize("name", ["foo", "hello123"]) -def test_safe_import_non_existent(name): - assert not td.safe_import(name) - - -def test_safe_import_exists(): - assert td.safe_import("pandas") - - -@pytest.mark.parametrize("min_version,valid", [("0.0.0", True), ("99.99.99", False)]) -def test_safe_import_versions(min_version, valid): - result = td.safe_import("pandas", min_version=min_version) - result = result if valid else not result - assert result - - -@pytest.mark.parametrize( - "min_version,valid", [(None, False), ("1.0", True), ("2.0", False)] -) -def test_safe_import_dummy(monkeypatch, min_version, valid): - mod_name = "hello123" - - mod = types.ModuleType(mod_name) - mod.__version__ = "1.5" - - if min_version is not None: - monkeypatch.setitem(sys.modules, mod_name, mod) - - result = td.safe_import(mod_name, min_version=min_version) - result = result if valid else not result - assert result diff -Nru pandas-2.1.4+dfsg/pandas/tests/util/test_shares_memory.py pandas-2.2.2+dfsg/pandas/tests/util/test_shares_memory.py --- pandas-2.1.4+dfsg/pandas/tests/util/test_shares_memory.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/util/test_shares_memory.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,3 +1,5 @@ +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -11,3 +13,18 @@ assert tm.shares_memory(obj, obj[:2]) assert not tm.shares_memory(obj, obj._data.copy()) + + +@td.skip_if_no("pyarrow") +def test_shares_memory_string(): + # GH#55823 + import pyarrow as pa + + obj = pd.array(["a", "b"], dtype="string[pyarrow]") + assert tm.shares_memory(obj, obj) + + obj = pd.array(["a", "b"], dtype="string[pyarrow_numpy]") + assert tm.shares_memory(obj, obj) + + obj = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string())) + assert tm.shares_memory(obj, obj) diff -Nru pandas-2.1.4+dfsg/pandas/tests/util/test_util.py pandas-2.2.2+dfsg/pandas/tests/util/test_util.py --- pandas-2.1.4+dfsg/pandas/tests/util/test_util.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/util/test_util.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,7 +2,10 @@ import pytest -from pandas import compat +from pandas import ( + array, + compat, +) import pandas._testing as tm @@ -44,3 +47,12 @@ def test_external_error_raised(): with tm.external_error_raised(TypeError): raise TypeError("Should not check this error message, so it will pass") + + +def test_is_sorted(): + arr = array([1, 2, 3], dtype="Int64") + tm.assert_is_sorted(arr) + + arr = array([4, 2, 3], dtype="Int64") + with pytest.raises(AssertionError, match="ExtensionArray are different"): + tm.assert_is_sorted(arr) diff -Nru pandas-2.1.4+dfsg/pandas/tests/window/moments/test_moments_consistency_expanding.py pandas-2.2.2+dfsg/pandas/tests/window/moments/test_moments_consistency_expanding.py --- 
pandas-2.1.4+dfsg/pandas/tests/window/moments/test_moments_consistency_expanding.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/window/moments/test_moments_consistency_expanding.py 2024-04-10 17:42:52.000000000 +0000 @@ -19,7 +19,7 @@ if not no_nans(all_data) and not ( all_na(all_data) and not all_data.empty and min_periods > 0 ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="np.sum has different behavior with NaNs") ) expanding_f_result = all_data.expanding(min_periods=min_periods).sum() diff -Nru pandas-2.1.4+dfsg/pandas/tests/window/moments/test_moments_consistency_rolling.py pandas-2.2.2+dfsg/pandas/tests/window/moments/test_moments_consistency_rolling.py --- pandas-2.1.4+dfsg/pandas/tests/window/moments/test_moments_consistency_rolling.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/window/moments/test_moments_consistency_rolling.py 2024-04-10 17:42:52.000000000 +0000 @@ -29,7 +29,7 @@ if not no_nans(all_data) and not ( all_na(all_data) and not all_data.empty and min_periods > 0 ): - request.node.add_marker( + request.applymarker( pytest.mark.xfail(reason="np.sum has different behavior with NaNs") ) rolling_f_result = all_data.rolling( diff -Nru pandas-2.1.4+dfsg/pandas/tests/window/test_api.py pandas-2.2.2+dfsg/pandas/tests/window/test_api.py --- pandas-2.1.4+dfsg/pandas/tests/window/test_api.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/window/test_api.py 2024-04-10 17:42:52.000000000 +0000 @@ -70,7 +70,9 @@ def test_sum_object_str_raises(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3, step=step) - with pytest.raises(DataError, match="Cannot aggregate non-numeric type: object"): + with pytest.raises( + DataError, match="Cannot aggregate non-numeric type: object|string" + ): # GH#42738, enforced in 2.0 r.sum() diff -Nru pandas-2.1.4+dfsg/pandas/tests/window/test_apply.py pandas-2.2.2+dfsg/pandas/tests/window/test_apply.py --- pandas-2.1.4+dfsg/pandas/tests/window/test_apply.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/window/test_apply.py 2024-04-10 17:42:52.000000000 +0000 @@ -301,8 +301,9 @@ tm.assert_series_equal(series_xp, series_rs) -def test_center_reindex_frame(raw, frame): +def test_center_reindex_frame(raw): # shifter index + frame = DataFrame(range(100), index=date_range("2020-01-01", freq="D", periods=100)) s = [f"x{x:d}" for x in range(12)] minp = 10 diff -Nru pandas-2.1.4+dfsg/pandas/tests/window/test_ewm.py pandas-2.2.2+dfsg/pandas/tests/window/test_ewm.py --- pandas-2.1.4+dfsg/pandas/tests/window/test_ewm.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/window/test_ewm.py 2024-04-10 17:42:52.000000000 +0000 @@ -60,7 +60,7 @@ def test_ewma_times_not_datetime_type(): - msg = r"times must be datetime64\[ns\] dtype." + msg = r"times must be datetime64 dtype." 
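The relaxed error message above ("times must be datetime64 dtype" rather than "datetime64[ns]") and the new `unit` fixture suggest that ewm(times=...) now accepts non-nanosecond datetime64 data. A hedged sketch of that usage, assuming pandas >= 2.2:

import pandas as pd

# non-nanosecond resolution for the observation times
times = pd.DatetimeIndex(
    ["2020-01-01", "2020-01-03", "2020-01-10"]
).as_unit("ms")
ser = pd.Series([0.0, 1.0, 2.0])

# halflife must be timedelta-like when times is supplied
result = ser.ewm(halflife="2 days", times=times).mean()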
with pytest.raises(ValueError, match=msg): Series(range(5)).ewm(times=np.arange(5)) @@ -102,12 +102,14 @@ tm.assert_frame_equal(result, expected) -def test_ewma_with_times_variable_spacing(tz_aware_fixture): +def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit): tz = tz_aware_fixture halflife = "23 days" - times = DatetimeIndex( - ["2020-01-01", "2020-01-10T00:04:05", "2020-02-23T05:00:23"] - ).tz_localize(tz) + times = ( + DatetimeIndex(["2020-01-01", "2020-01-10T00:04:05", "2020-02-23T05:00:23"]) + .tz_localize(tz) + .as_unit(unit) + ) data = np.arange(3) df = DataFrame(data) result = df.ewm(halflife=halflife, times=times).mean() diff -Nru pandas-2.1.4+dfsg/pandas/tests/window/test_groupby.py pandas-2.2.2+dfsg/pandas/tests/window/test_groupby.py --- pandas-2.1.4+dfsg/pandas/tests/window/test_groupby.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/window/test_groupby.py 2024-04-10 17:42:52.000000000 +0000 @@ -3,6 +3,7 @@ from pandas import ( DataFrame, + DatetimeIndex, Index, MultiIndex, Series, @@ -99,7 +100,9 @@ r = g.rolling(window=4) result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.rolling(4), f)()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -113,7 +116,9 @@ r = g.rolling(window=4) result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -129,9 +134,11 @@ r = g.rolling(window=4) result = r.quantile(0.4, interpolation=interpolation) - expected = g.apply( - lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -174,7 +181,9 @@ def func(x): return getattr(x.rolling(4), f)(roll_frame) - expected = g.apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) expected["A"] = np.nan @@ -190,7 +199,9 @@ def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) - expected = g.apply(func) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply(func) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -235,7 +246,9 @@ # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # groupby.apply doesn't drop the grouped-by column 
expected = expected.drop("A", axis=1) # GH 39732 @@ -361,24 +374,11 @@ .rolling(6, on="Date", center=True, min_periods=1) .value.mean() ) + mi = MultiIndex.from_arrays([df["gb"], df["Date"]], names=["gb", "Date"]) expected = Series( [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 7.0, 7.5, 7.5, 7.5], name="value", - index=MultiIndex.from_tuples( - ( - ("group_1", Timestamp("2020-01-01")), - ("group_1", Timestamp("2020-01-02")), - ("group_1", Timestamp("2020-01-03")), - ("group_1", Timestamp("2020-01-04")), - ("group_1", Timestamp("2020-01-05")), - ("group_1", Timestamp("2020-01-06")), - ("group_2", Timestamp("2020-01-07")), - ("group_2", Timestamp("2020-01-08")), - ("group_2", Timestamp("2020-01-09")), - ("group_2", Timestamp("2020-01-10")), - ), - names=["gb", "Date"], - ), + index=mi, ) tm.assert_series_equal(result, expected) @@ -466,20 +466,23 @@ # GH 35549 df = DataFrame( { - "column1": range(6), - "column2": range(6), - "group": 3 * ["A", "B"], - "date": [Timestamp("2019-01-01")] * 6, + "column1": range(8), + "column2": range(8), + "group": ["A"] * 4 + ["B"] * 4, + "date": [ + Timestamp(date) + for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"] + ] + * 2, } ) result = ( df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum() ) expected = Series( - [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], - index=MultiIndex.from_tuples( - [("A", Timestamp("2019-01-01"))] * 3 - + [("B", Timestamp("2019-01-01"))] * 3, + [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0], + index=MultiIndex.from_frame( + df[["group", "date"]], names=["group", "date"], ), name="column1", @@ -490,10 +493,14 @@ # GH 35549 df = DataFrame( { - "column1": range(6), - "column2": range(6), - "group": 3 * ["A", "B"], - "date": [Timestamp("2019-01-01")] * 6, + "column1": range(8), + "column2": range(8), + "group": ["A"] * 4 + ["B"] * 4, + "date": [ + Timestamp(date) + for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"] + ] + * 2, } ) @@ -503,10 +510,9 @@ .sum() ) expected = Series( - [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], - index=MultiIndex.from_tuples( - [("A", Timestamp("2019-01-01"))] * 3 - + [("B", Timestamp("2019-01-01"))] * 3, + [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0], + index=MultiIndex.from_frame( + df[["group", "date"]], names=["group", "date"], ), name="column1", @@ -607,14 +613,14 @@ expected = expected.drop(columns="foo") tm.assert_frame_equal(result, expected) - def test_groupby_rolling_count_closed_on(self): + def test_groupby_rolling_count_closed_on(self, unit): # GH 35869 df = DataFrame( { "column1": range(6), "column2": range(6), "group": 3 * ["A", "B"], - "date": date_range(end="20190101", periods=6), + "date": date_range(end="20190101", periods=6, unit=unit), } ) result = ( @@ -622,20 +628,28 @@ .rolling("3d", on="date", closed="left")["column1"] .count() ) + dti = DatetimeIndex( + [ + "2018-12-27", + "2018-12-29", + "2018-12-31", + "2018-12-28", + "2018-12-30", + "2019-01-01", + ], + dtype=f"M8[{unit}]", + ) + mi = MultiIndex.from_arrays( + [ + ["A", "A", "A", "B", "B", "B"], + dti, + ], + names=["group", "date"], + ) expected = Series( [np.nan, 1.0, 1.0, np.nan, 1.0, 1.0], name="column1", - index=MultiIndex.from_tuples( - [ - ("A", Timestamp("2018-12-27")), - ("A", Timestamp("2018-12-29")), - ("A", Timestamp("2018-12-31")), - ("B", Timestamp("2018-12-28")), - ("B", Timestamp("2018-12-30")), - ("B", Timestamp("2019-01-01")), - ], - names=["group", "date"], - ), + index=mi, ) tm.assert_series_equal(result, expected) @@ -778,9 +792,13 @@ def 
test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) - expected = g.apply(lambda x: x.rolling(4).sum()).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) - result = g.apply(lambda x: x.rolling(4).sum()).index + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -863,7 +881,7 @@ ], ], ) - def test_as_index_false(self, by, expected_data): + def test_as_index_false(self, by, expected_data, unit): # GH 39433 data = [ ["A", "2018-01-01", 100.0], @@ -872,7 +890,7 @@ ["B", "2018-01-02", 250.0], ] df = DataFrame(data, columns=["id", "date", "num"]) - df["date"] = to_datetime(df["date"]) + df["date"] = df["date"].astype(f"M8[{unit}]") df = df.set_index(["date"]) gp_by = [getattr(df, attr) for attr in by] @@ -886,6 +904,8 @@ expected, index=df.index, ) + if "date" in expected_data: + expected["date"] = expected["date"].astype(f"M8[{unit}]") tm.assert_frame_equal(result, expected) def test_nan_and_zero_endpoints(self, any_int_numpy_dtype): @@ -954,11 +974,13 @@ df["date"] = to_datetime(df["date"]) df = df.sort_values("date") - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) @@ -977,9 +999,13 @@ } ) - expected = ( - df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = ( + df.set_index("B") + .groupby("A") + .apply(lambda x: x.rolling("4s")["C"].mean()) + ) result = df.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) @@ -1009,7 +1035,9 @@ r = g.expanding() result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.expanding(), f)()) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply(lambda x: getattr(x.expanding(), f)()) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1023,7 +1051,9 @@ r = g.expanding() result = getattr(r, f)(ddof=0) - expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1039,9 +1069,11 @@ r = g.expanding() result = r.quantile(0.4, interpolation=interpolation) - expected = g.apply( - lambda x: x.expanding().quantile(0.4, interpolation=interpolation) - ) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with 
tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1059,7 +1091,9 @@ def func_0(x): return getattr(x.expanding(), f)(frame) - expected = g.apply(func_0) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values null_idx = list(range(20, 61)) + list(range(72, 113)) @@ -1074,7 +1108,9 @@ def func_1(x): return getattr(x.B.expanding(), f)(pairwise=True) - expected = g.apply(func_1) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply(func_1) tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw, frame): @@ -1083,7 +1119,11 @@ # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) + msg = "DataFrameGroupBy.apply operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + expected = g.apply( + lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) + ) # groupby.apply doesn't drop the grouped-by column expected = expected.drop("A", axis=1) # GH 39732 @@ -1141,7 +1181,9 @@ ) tm.assert_frame_equal(result, expected) - expected = df.groupby("A").apply(lambda x: getattr(x.ewm(com=1.0), method)()) + expected = df.groupby("A")[["B"]].apply( + lambda x: getattr(x.ewm(com=1.0), method)() + ) tm.assert_frame_equal(result, expected) def test_times(self, times_frame): @@ -1197,15 +1239,15 @@ df = DataFrame( { "id": ["a", "a", "b", "b", "b"], - "timestamp": date_range("2021-9-1", periods=5, freq="H"), + "timestamp": date_range("2021-9-1", periods=5, freq="h"), "y": range(5), } ) - grp = df.groupby("id").rolling("1H", on="timestamp") + grp = df.groupby("id").rolling("1h", on="timestamp") result = grp.count() expected_df = DataFrame( { - "timestamp": date_range("2021-9-1", periods=5, freq="H"), + "timestamp": date_range("2021-9-1", periods=5, freq="h"), "y": [1.0] * 5, }, index=MultiIndex.from_arrays( @@ -1220,7 +1262,7 @@ index=MultiIndex.from_arrays( [ ["a", "a", "b", "b", "b"], - date_range("2021-9-1", periods=5, freq="H"), + date_range("2021-9-1", periods=5, freq="h"), ], names=["id", "timestamp"], ), diff -Nru pandas-2.1.4+dfsg/pandas/tests/window/test_numba.py pandas-2.2.2+dfsg/pandas/tests/window/test_numba.py --- pandas-2.1.4+dfsg/pandas/tests/window/test_numba.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/window/test_numba.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,11 +1,6 @@ import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_mac, - is_platform_windows, -) from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td @@ -17,15 +12,7 @@ ) import pandas._testing as tm -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.skipif( - is_ci_environment() and (is_platform_windows() or is_platform_mac()), - reason="On GHA CI, Windows can fail with " - "'Windows fatal exception: stack overflow' " - "and macOS can timeout", - ), -] +pytestmark = pytest.mark.single_cpu @pytest.fixture(params=["single", "table"]) @@ -459,3 +446,10 @@ engine_kwargs=engine_kwargs, engine="numba" ) 
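The freq="H" -> freq="h" and "1H" -> "1h" renames in the hunks above (and the larger alias table rewritten in pandas/tseries/frequencies.py further down) follow the pandas 2.2 deprecation of the uppercase single-letter offset aliases. A minimal sketch of the new spellings, assuming pandas >= 2.2:

    import pandas as pd

    # "H", "T", "S", "L", "U", "N" are deprecated in favour of
    # "h", "min", "s", "ms", "us", "ns"; month/quarter/year-end offsets
    # become "ME", "QE", "YE".
    idx = pd.date_range("2021-09-01", periods=5, freq="h")   # was freq="H"
    ser = pd.Series(range(5), index=idx, dtype="float64")
    out = ser.rolling("2h").sum()                            # was window="2H"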
tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba") +def test_npfunc_no_warnings(): + df = DataFrame({"col1": [1, 2, 3, 4, 5]}) + with tm.assert_produces_warning(False): + df.col1.rolling(2).apply(np.prod, raw=True, engine="numba") diff -Nru pandas-2.1.4+dfsg/pandas/tests/window/test_online.py pandas-2.2.2+dfsg/pandas/tests/window/test_online.py --- pandas-2.1.4+dfsg/pandas/tests/window/test_online.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/window/test_online.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,27 +1,13 @@ import numpy as np import pytest -from pandas.compat import ( - is_ci_environment, - is_platform_mac, - is_platform_windows, -) - from pandas import ( DataFrame, Series, ) import pandas._testing as tm -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.skipif( - is_ci_environment() and (is_platform_windows() or is_platform_mac()), - reason="On GHA CI, Windows can fail with " - "'Windows fatal exception: stack overflow' " - "and macOS can timeout", - ), -] +pytestmark = pytest.mark.single_cpu pytest.importorskip("numba") diff -Nru pandas-2.1.4+dfsg/pandas/tests/window/test_pairwise.py pandas-2.2.2+dfsg/pandas/tests/window/test_pairwise.py --- pandas-2.1.4+dfsg/pandas/tests/window/test_pairwise.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/window/test_pairwise.py 2024-04-10 17:42:52.000000000 +0000 @@ -62,9 +62,13 @@ result = A.rolling(window=50, min_periods=25).corr(B) tm.assert_almost_equal(result.iloc[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) + +def test_rolling_corr_bias_correction(): # test for correct bias correction - a = tm.makeTimeSeries() - b = tm.makeTimeSeries() + a = Series( + np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20) + ) + b = a.copy() a[:5] = np.nan b[:10] = np.nan @@ -392,7 +396,7 @@ def test_corr_freq_memory_error(self): # GH 31789 s = Series(range(5), index=date_range("2020", periods=5)) - result = s.rolling("12H").corr(s) + result = s.rolling("12h").corr(s) expected = Series([np.nan] * 5, index=date_range("2020", periods=5)) tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tests/window/test_rolling.py pandas-2.2.2+dfsg/pandas/tests/window/test_rolling.py --- pandas-2.1.4+dfsg/pandas/tests/window/test_rolling.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/window/test_rolling.py 2024-04-10 17:42:52.000000000 +0000 @@ -100,9 +100,9 @@ index=date_range("2015-12-24", periods=10, freq="D"), ) with pytest.raises( - NotImplementedError, match="step is not supported with frequency windows" + NotImplementedError, match="^step (not implemented|is not supported)" ): - df.rolling("3D", step=3) + df.rolling(window, step=3).sum() @pytest.mark.parametrize("agg", ["cov", "corr"]) @@ -304,6 +304,76 @@ tm.assert_equal(result, expected) +@pytest.mark.parametrize( + "closed,expected", + [ + ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14, 18, 21]), + ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5, 13, 8]), + ("right", [0, 1, 3, 6, 10, 14, 11, 18, 21, 17]), + ("both", [0, 1, 3, 6, 10, 15, 20, 27, 26, 30]), + ], +) +def test_variable_window_nonunique(closed, expected, frame_or_series): + # GH 20712 + index = DatetimeIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-04", + "2011-01-04", + "2011-01-05", + "2011-01-06", + ] + ) + + df = frame_or_series(range(10), index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float) + + 
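The new test_npfunc_no_warnings above asserts that passing a NumPy reduction straight to the numba engine no longer emits a warning. A minimal usage sketch (assumes the optional numba dependency is installed):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"col1": [1, 2, 3, 4, 5]})
    # engine="numba" requires raw=True so each window is handed over as an
    # ndarray; np.prod is then JIT-compiled without a fallback warning.
    result = df["col1"].rolling(2).apply(np.prod, raw=True, engine="numba")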
result = df.rolling("2D", closed=closed).sum() + + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "closed,expected", + [ + ("left", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 18, 21]), + ("neither", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 13, 8]), + ("right", [0, 1, 3, 6, 10, 15, 21, 28, 21, 17]), + ("both", [0, 1, 3, 6, 10, 15, 21, 28, 26, 30]), + ], +) +def test_variable_offset_window_nonunique(closed, expected, frame_or_series): + # GH 20712 + index = DatetimeIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-04", + "2011-01-04", + "2011-01-05", + "2011-01-06", + ] + ) + + df = frame_or_series(range(10), index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float) + + offset = BusinessDay(2) + indexer = VariableOffsetWindowIndexer(index=index, offset=offset) + result = df.rolling(indexer, closed=closed, min_periods=1).sum() + + tm.assert_equal(result, expected) + + def test_even_number_window_alignment(): # see discussion in GH 38780 s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3)) @@ -907,20 +977,23 @@ @pytest.mark.parametrize("add", [0.0, 2.0]) -def test_rolling_numerical_accuracy_kahan_mean(add): +def test_rolling_numerical_accuracy_kahan_mean(add, unit): # GH: 36031 implementing kahan summation - df = DataFrame( - {"A": [3002399751580331.0 + add, -0.0, -0.0]}, - index=[ + dti = DatetimeIndex( + [ Timestamp("19700101 09:00:00"), Timestamp("19700101 09:00:03"), Timestamp("19700101 09:00:06"), - ], + ] + ).as_unit(unit) + df = DataFrame( + {"A": [3002399751580331.0 + add, -0.0, -0.0]}, + index=dti, ) result = ( df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean() ) - dates = date_range("19700101 09:00:00", periods=7, freq="S") + dates = date_range("19700101 09:00:00", periods=7, freq="s", unit=unit) expected = DataFrame( { "A": [ @@ -1065,11 +1138,13 @@ ("index", "window"), [ ( - period_range(start="2020-01-01 08:00", end="2020-01-01 08:08", freq="T"), - "2T", + period_range(start="2020-01-01 08:00", end="2020-01-01 08:08", freq="min"), + "2min", ), ( - period_range(start="2020-01-01 08:00", end="2020-01-01 12:00", freq="30T"), + period_range( + start="2020-01-01 08:00", end="2020-01-01 12:00", freq="30min" + ), "1h", ), ], @@ -1124,8 +1199,19 @@ tm.assert_series_equal(result == 0, expected == 0) -def test_timeoffset_as_window_parameter_for_corr(): +def test_timeoffset_as_window_parameter_for_corr(unit): # GH: 28266 + dti = DatetimeIndex( + [ + Timestamp("20130101 09:00:00"), + Timestamp("20130102 09:00:02"), + Timestamp("20130103 09:00:03"), + Timestamp("20130105 09:00:05"), + Timestamp("20130106 09:00:06"), + ] + ).as_unit(unit) + mi = MultiIndex.from_product([dti, ["B", "A"]]) + exp = DataFrame( { "B": [ @@ -1153,31 +1239,12 @@ 1.0000000000000002, ], }, - index=MultiIndex.from_tuples( - [ - (Timestamp("20130101 09:00:00"), "B"), - (Timestamp("20130101 09:00:00"), "A"), - (Timestamp("20130102 09:00:02"), "B"), - (Timestamp("20130102 09:00:02"), "A"), - (Timestamp("20130103 09:00:03"), "B"), - (Timestamp("20130103 09:00:03"), "A"), - (Timestamp("20130105 09:00:05"), "B"), - (Timestamp("20130105 09:00:05"), "A"), - (Timestamp("20130106 09:00:06"), "B"), - (Timestamp("20130106 09:00:06"), "A"), - ] - ), + index=mi, ) df = DataFrame( {"B": [0, 1, 2, 4, 3], "A": [7, 4, 6, 9, 3]}, - index=[ - Timestamp("20130101 09:00:00"), - Timestamp("20130102 09:00:02"), - Timestamp("20130103 09:00:03"), - Timestamp("20130105 09:00:05"), - 
Timestamp("20130106 09:00:06"), - ], + index=dti, ) res = df.rolling(window="3d").corr() diff -Nru pandas-2.1.4+dfsg/pandas/tests/window/test_rolling_functions.py pandas-2.2.2+dfsg/pandas/tests/window/test_rolling_functions.py --- pandas-2.1.4+dfsg/pandas/tests/window/test_rolling_functions.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/window/test_rolling_functions.py 2024-04-10 17:42:52.000000000 +0000 @@ -346,7 +346,7 @@ lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), pytest.param( lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), - marks=td.skip_if_no_scipy, + marks=td.skip_if_no("scipy"), ), ], ) @@ -388,7 +388,7 @@ # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) + series = Series(list(range(5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically @@ -425,7 +425,7 @@ # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) + series = Series(list(range(5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically @@ -445,7 +445,7 @@ # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) - series = Series(list(range(0, 5)) + [10, 20], index=indices) + series = Series(list(range(5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically @@ -508,7 +508,7 @@ lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), pytest.param( lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), - marks=td.skip_if_no_scipy, + marks=td.skip_if_no("scipy"), ), ], ) diff -Nru pandas-2.1.4+dfsg/pandas/tests/window/test_timeseries_window.py pandas-2.2.2+dfsg/pandas/tests/window/test_timeseries_window.py --- pandas-2.1.4+dfsg/pandas/tests/window/test_timeseries_window.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tests/window/test_timeseries_window.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,8 +1,11 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, + DatetimeIndex, Index, MultiIndex, NaT, @@ -178,21 +181,22 @@ result = df.rolling("2s", on="A")[["B"]].sum() tm.assert_frame_equal(result, expected) - def test_frame_on2(self): + def test_frame_on2(self, unit): # using multiple aggregation columns + dti = DatetimeIndex( + [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + ).as_unit(unit) df = DataFrame( { "A": [0, 1, 2, 3, 4], "B": [0, 1, 2, np.nan, 4], - "C": Index( - [ - Timestamp("20130101 09:00:00"), - Timestamp("20130101 09:00:02"), - Timestamp("20130101 09:00:03"), - Timestamp("20130101 09:00:05"), - Timestamp("20130101 09:00:06"), - ] - ), + "C": dti, }, columns=["A", "C", "B"], ) @@ -248,18 +252,22 @@ result = df.rolling("2s", min_periods=1).sum() tm.assert_frame_equal(result, expected) - def test_closed(self, regular): + def test_closed(self, regular, unit): # xref GH13965 - df = DataFrame( - 
{"A": [1] * 5}, - index=[ + dti = DatetimeIndex( + [ Timestamp("20130101 09:00:01"), Timestamp("20130101 09:00:02"), Timestamp("20130101 09:00:03"), Timestamp("20130101 09:00:04"), Timestamp("20130101 09:00:06"), - ], + ] + ).as_unit(unit) + + df = DataFrame( + {"A": [1] * 5}, + index=dti, ) # closed must be 'right', 'left', 'both', 'neither' @@ -599,12 +607,12 @@ # more sophisticated comparison of integer vs. # time-based windowing df = DataFrame( - {"B": np.arange(50)}, index=date_range("20130101", periods=50, freq="H") + {"B": np.arange(50)}, index=date_range("20130101", periods=50, freq="h") ) # in-range data dft = df.between_time("09:00", "16:00") - r = dft.rolling(window="5H") + r = dft.rolling(window="5h") result = getattr(r, f)() @@ -642,15 +650,17 @@ expected2 = ss.rolling(3, min_periods=1).cov() tm.assert_series_equal(result, expected2) - def test_rolling_on_decreasing_index(self): + def test_rolling_on_decreasing_index(self, unit): # GH-19248, GH-32385 - index = [ - Timestamp("20190101 09:00:30"), - Timestamp("20190101 09:00:27"), - Timestamp("20190101 09:00:20"), - Timestamp("20190101 09:00:18"), - Timestamp("20190101 09:00:10"), - ] + index = DatetimeIndex( + [ + Timestamp("20190101 09:00:30"), + Timestamp("20190101 09:00:27"), + Timestamp("20190101 09:00:20"), + Timestamp("20190101 09:00:18"), + Timestamp("20190101 09:00:10"), + ] + ).as_unit(unit) df = DataFrame({"column": [3, 4, 4, 5, 6]}, index=index) result = df.rolling("5s").min() @@ -690,3 +700,16 @@ with pytest.raises(ValueError, match=f"{msg} values must not have NaT"): with tm.assert_produces_warning(FutureWarning, match=warn_msg): df.rolling("D", axis=axis).mean() + + +@td.skip_if_no("pyarrow") +def test_arrow_datetime_axis(): + # GH 55849 + expected = Series( + np.arange(5, dtype=np.float64), + index=Index( + date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]" + ), + ) + result = expected.rolling("1D").sum() + tm.assert_series_equal(result, expected) diff -Nru pandas-2.1.4+dfsg/pandas/tseries/api.py pandas-2.2.2+dfsg/pandas/tseries/api.py --- pandas-2.1.4+dfsg/pandas/tseries/api.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tseries/api.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,7 +2,9 @@ Timeseries API """ +from pandas._libs.tslibs.parsing import guess_datetime_format + from pandas.tseries import offsets from pandas.tseries.frequencies import infer_freq -__all__ = ["infer_freq", "offsets"] +__all__ = ["infer_freq", "offsets", "guess_datetime_format"] diff -Nru pandas-2.1.4+dfsg/pandas/tseries/frequencies.py pandas-2.2.2+dfsg/pandas/tseries/frequencies.py --- pandas-2.1.4+dfsg/pandas/tseries/frequencies.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tseries/frequencies.py 2024-04-10 17:42:52.000000000 +0000 @@ -19,6 +19,10 @@ MONTHS, int_to_weekday, ) +from pandas._libs.tslibs.dtypes import ( + OFFSET_TO_PERIOD_FREQSTR, + freq_to_period_freqstr, +) from pandas._libs.tslibs.fields import ( build_field_sarray, month_position_check, @@ -52,59 +56,30 @@ TimedeltaIndex, ) from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin -# --------------------------------------------------------------------- -# Offset names ("time rules") and related functions - -_offset_to_period_map = { - "WEEKDAY": "D", - "EOM": "M", - "BM": "M", - "BQS": "Q", - "QS": "Q", - "BQ": "Q", - "BA": "A", - "AS": "A", - "BAS": "A", - "MS": "M", - "D": "D", - "B": "B", - "T": "T", - "S": "S", - "L": "L", - "U": "U", - "N": "N", - "H": "H", - "Q": "Q", - "A": "A", - "W": "W", - "M": 
"M", - "Y": "A", - "BY": "A", - "YS": "A", - "BYS": "A", -} +# -------------------------------------------------------------------- +# Offset related functions -_need_suffix = ["QS", "BQ", "BQS", "YS", "AS", "BY", "BA", "BYS", "BAS"] +_need_suffix = ["QS", "BQE", "BQS", "YS", "BYE", "BYS"] for _prefix in _need_suffix: for _m in MONTHS: key = f"{_prefix}-{_m}" - _offset_to_period_map[key] = _offset_to_period_map[_prefix] + OFFSET_TO_PERIOD_FREQSTR[key] = OFFSET_TO_PERIOD_FREQSTR[_prefix] -for _prefix in ["A", "Q"]: +for _prefix in ["Y", "Q"]: for _m in MONTHS: _alias = f"{_prefix}-{_m}" - _offset_to_period_map[_alias] = _alias + OFFSET_TO_PERIOD_FREQSTR[_alias] = _alias for _d in DAYS: - _offset_to_period_map[f"W-{_d}"] = f"W-{_d}" + OFFSET_TO_PERIOD_FREQSTR[f"W-{_d}"] = f"W-{_d}" def get_period_alias(offset_str: str) -> str | None: """ Alias to closest period strings BQ->Q etc. """ - return _offset_to_period_map.get(offset_str, None) + return OFFSET_TO_PERIOD_FREQSTR.get(offset_str, None) # --------------------------------------------------------------------- @@ -254,7 +229,7 @@ # Business hourly, maybe. 17: one day / 65: one weekend if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]): - return "BH" + return "bh" # Possibly intraday frequency. Here we use the # original .asi8 values as the modified values @@ -268,22 +243,22 @@ pps = ppm // 60 if _is_multiple(delta, pph): # Hours - return _maybe_add_count("H", delta / pph) + return _maybe_add_count("h", delta / pph) elif _is_multiple(delta, ppm): # Minutes - return _maybe_add_count("T", delta / ppm) + return _maybe_add_count("min", delta / ppm) elif _is_multiple(delta, pps): # Seconds - return _maybe_add_count("S", delta / pps) + return _maybe_add_count("s", delta / pps) elif _is_multiple(delta, (pps // 1000)): # Milliseconds - return _maybe_add_count("L", delta / (pps // 1000)) + return _maybe_add_count("ms", delta / (pps // 1000)) elif _is_multiple(delta, (pps // 1_000_000)): # Microseconds - return _maybe_add_count("U", delta / (pps // 1_000_000)) + return _maybe_add_count("us", delta / (pps // 1_000_000)) else: # Nanoseconds - return _maybe_add_count("N", delta) + return _maybe_add_count("ns", delta) @cache_readonly def day_deltas(self) -> list[int]: @@ -301,7 +276,7 @@ @cache_readonly def rep_stamp(self) -> Timestamp: - return Timestamp(self.i8values[0]) + return Timestamp(self.i8values[0], unit=self.index.unit) def month_position_check(self) -> str | None: return month_position_check(self.fields, self.index.dayofweek) @@ -370,7 +345,7 @@ if pos_check is None: return None else: - return {"cs": "AS", "bs": "BAS", "ce": "A", "be": "BA"}.get(pos_check) + return {"cs": "YS", "bs": "BYS", "ce": "YE", "be": "BYE"}.get(pos_check) def _get_quarterly_rule(self) -> str | None: if len(self.mdiffs) > 1: @@ -384,7 +359,7 @@ if pos_check is None: return None else: - return {"cs": "QS", "bs": "BQS", "ce": "Q", "be": "BQ"}.get(pos_check) + return {"cs": "QS", "bs": "BQS", "ce": "QE", "be": "BQE"}.get(pos_check) def _get_monthly_rule(self) -> str | None: if len(self.mdiffs) > 1: @@ -394,7 +369,7 @@ if pos_check is None: return None else: - return {"cs": "MS", "bs": "BMS", "ce": "M", "be": "BM"}.get(pos_check) + return {"cs": "MS", "bs": "BMS", "ce": "ME", "be": "BME"}.get(pos_check) def _is_business_daily(self) -> bool: # quick check: cannot be business daily @@ -472,7 +447,6 @@ ------- bool """ - if target is None or source is None: return False source = _maybe_coerce_freq(source) @@ -483,31 +457,31 @@ return _quarter_months_conform( 
get_rule_month(source), get_rule_month(target) ) - return source in {"D", "C", "B", "M", "H", "T", "S", "L", "U", "N"} + return source in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"} elif _is_quarterly(target): - return source in {"D", "C", "B", "M", "H", "T", "S", "L", "U", "N"} + return source in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"} elif _is_monthly(target): - return source in {"D", "C", "B", "H", "T", "S", "L", "U", "N"} + return source in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif _is_weekly(target): - return source in {target, "D", "C", "B", "H", "T", "S", "L", "U", "N"} + return source in {target, "D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif target == "B": - return source in {"B", "H", "T", "S", "L", "U", "N"} + return source in {"B", "h", "min", "s", "ms", "us", "ns"} elif target == "C": - return source in {"C", "H", "T", "S", "L", "U", "N"} + return source in {"C", "h", "min", "s", "ms", "us", "ns"} elif target == "D": - return source in {"D", "H", "T", "S", "L", "U", "N"} - elif target == "H": - return source in {"H", "T", "S", "L", "U", "N"} - elif target == "T": - return source in {"T", "S", "L", "U", "N"} - elif target == "S": - return source in {"S", "L", "U", "N"} - elif target == "L": - return source in {"L", "U", "N"} - elif target == "U": - return source in {"U", "N"} - elif target == "N": - return source in {"N"} + return source in {"D", "h", "min", "s", "ms", "us", "ns"} + elif target == "h": + return source in {"h", "min", "s", "ms", "us", "ns"} + elif target == "min": + return source in {"min", "s", "ms", "us", "ns"} + elif target == "s": + return source in {"s", "ms", "us", "ns"} + elif target == "ms": + return source in {"ms", "us", "ns"} + elif target == "us": + return source in {"us", "ns"} + elif target == "ns": + return source in {"ns"} else: return False @@ -541,31 +515,31 @@ smonth = get_rule_month(source) tmonth = get_rule_month(target) return _quarter_months_conform(smonth, tmonth) - return target in {"D", "C", "B", "M", "H", "T", "S", "L", "U", "N"} + return target in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"} elif _is_quarterly(source): - return target in {"D", "C", "B", "M", "H", "T", "S", "L", "U", "N"} + return target in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"} elif _is_monthly(source): - return target in {"D", "C", "B", "H", "T", "S", "L", "U", "N"} + return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif _is_weekly(source): - return target in {source, "D", "C", "B", "H", "T", "S", "L", "U", "N"} + return target in {source, "D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif source == "B": - return target in {"D", "C", "B", "H", "T", "S", "L", "U", "N"} + return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif source == "C": - return target in {"D", "C", "B", "H", "T", "S", "L", "U", "N"} + return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} elif source == "D": - return target in {"D", "C", "B", "H", "T", "S", "L", "U", "N"} - elif source == "H": - return target in {"H", "T", "S", "L", "U", "N"} - elif source == "T": - return target in {"T", "S", "L", "U", "N"} - elif source == "S": - return target in {"S", "L", "U", "N"} - elif source == "L": - return target in {"L", "U", "N"} - elif source == "U": - return target in {"U", "N"} - elif source == "N": - return target in {"N"} + return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"} + elif source == "h": + return target in {"h", "min", "s", "ms", "us", "ns"} + 
elif source == "min": + return target in {"min", "s", "ms", "us", "ns"} + elif source == "s": + return target in {"s", "ms", "us", "ns"} + elif source == "ms": + return target in {"ms", "us", "ns"} + elif source == "us": + return target in {"us", "ns"} + elif source == "ns": + return target in {"ns"} else: return False @@ -585,8 +559,11 @@ """ assert code is not None if isinstance(code, DateOffset): - code = code.rule_code - return code.upper() + code = freq_to_period_freqstr(1, code.name) + if code in {"h", "min", "s", "ms", "us", "ns"}: + return code + else: + return code.upper() def _quarter_months_conform(source: str, target: str) -> bool: @@ -597,7 +574,7 @@ def _is_annual(rule: str) -> bool: rule = rule.upper() - return rule == "A" or rule.startswith("A-") + return rule == "Y" or rule.startswith("Y-") def _is_quarterly(rule: str) -> bool: diff -Nru pandas-2.1.4+dfsg/pandas/tseries/holiday.py pandas-2.2.2+dfsg/pandas/tseries/holiday.py --- pandas-2.1.4+dfsg/pandas/tseries/holiday.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/tseries/holiday.py 2024-04-10 17:42:52.000000000 +0000 @@ -354,7 +354,7 @@ Dates with rules applied """ if dates.empty: - return DatetimeIndex([]) + return dates.copy() if self.observance is not None: return dates.map(lambda d: self.observance(d)) diff -Nru pandas-2.1.4+dfsg/pandas/util/__init__.py pandas-2.2.2+dfsg/pandas/util/__init__.py --- pandas-2.1.4+dfsg/pandas/util/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/util/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -23,3 +23,7 @@ return cache_readonly raise AttributeError(f"module 'pandas.util' has no attribute '{key}'") + + +def capitalize_first_letter(s): + return s[:1].upper() + s[1:] diff -Nru pandas-2.1.4+dfsg/pandas/util/_decorators.py pandas-2.2.2+dfsg/pandas/util/_decorators.py --- pandas-2.1.4+dfsg/pandas/util/_decorators.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/util/_decorators.py 2024-04-10 17:42:52.000000000 +0000 @@ -371,7 +371,7 @@ continue if hasattr(docstring, "_docstring_components"): docstring_components.extend( - docstring._docstring_components # pyright: ignore[reportGeneralTypeIssues] # noqa: E501 + docstring._docstring_components # pyright: ignore[reportGeneralTypeIssues] ) elif isinstance(docstring, str) or docstring.__doc__: docstring_components.append(docstring) diff -Nru pandas-2.1.4+dfsg/pandas/util/_exceptions.py pandas-2.2.2+dfsg/pandas/util/_exceptions.py --- pandas-2.1.4+dfsg/pandas/util/_exceptions.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/util/_exceptions.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,7 @@ if TYPE_CHECKING: from collections.abc import Generator + from types import FrameType @contextlib.contextmanager @@ -42,15 +43,20 @@ test_dir = os.path.join(pkg_dir, "tests") # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow - frame = inspect.currentframe() - n = 0 - while frame: - fname = inspect.getfile(frame) - if fname.startswith(pkg_dir) and not fname.startswith(test_dir): - frame = frame.f_back - n += 1 - else: - break + frame: FrameType | None = inspect.currentframe() + try: + n = 0 + while frame: + filename = inspect.getfile(frame) + if filename.startswith(pkg_dir) and not filename.startswith(test_dir): + frame = frame.f_back + n += 1 + else: + break + finally: + # See note in + # https://docs.python.org/3/library/inspect.html#inspect.Traceback + del frame return n diff -Nru pandas-2.1.4+dfsg/pandas/util/_test_decorators.py 
pandas-2.2.2+dfsg/pandas/util/_test_decorators.py --- pandas-2.1.4+dfsg/pandas/util/_test_decorators.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/util/_test_decorators.py 2024-04-10 17:42:52.000000000 +0000 @@ -2,8 +2,8 @@ This module provides decorator functions which can be applied to test objects in order to skip those objects when certain conditions occur. A sample use case is to detect if the platform is missing ``matplotlib``. If so, any test objects -which require ``matplotlib`` and decorated with ``@td.skip_if_no_mpl`` will be -skipped by ``pytest`` during the execution of the test suite. +which require ``matplotlib`` and decorated with ``@td.skip_if_no("matplotlib")`` +will be skipped by ``pytest`` during the execution of the test suite. To illustrate, after importing this module: @@ -11,13 +11,13 @@ The decorators can be applied to classes: -@td.skip_if_some_reason +@td.skip_if_no("package") class Foo: ... Or individual functions: -@td.skip_if_some_reason +@td.skip_if_no("package") def test_foo(): ... @@ -31,7 +31,6 @@ Callable, ) -import numpy as np import pytest from pandas._config import get_option @@ -39,63 +38,13 @@ if TYPE_CHECKING: from pandas._typing import F +from pandas._config.config import _get_option + from pandas.compat import ( IS64, is_platform_windows, ) - -from pandas.core.computation.expressions import ( - NUMEXPR_INSTALLED, - USE_NUMEXPR, -) -from pandas.util.version import Version - - -def safe_import(mod_name: str, min_version: str | None = None): - """ - Parameters - ---------- - mod_name : str - Name of the module to be imported - min_version : str, default None - Minimum required version of the specified mod_name - - Returns - ------- - object - The imported module if successful, or False - """ - try: - mod = __import__(mod_name) - except ImportError: - return False - - if not min_version: - return mod - else: - import sys - - version = getattr(sys.modules[mod_name], "__version__") - if version and Version(version) >= Version(min_version): - return mod - - return False - - -def _skip_if_not_us_locale() -> bool: - lang, _ = locale.getlocale() - if lang != "en_US": - return True - return False - - -def _skip_if_no_scipy() -> bool: - return not ( - safe_import("scipy.stats") - and safe_import("scipy.sparse") - and safe_import("scipy.interpolate") - and safe_import("scipy.signal") - ) +from pandas.compat._optional import import_optional_dependency def skip_if_installed(package: str) -> pytest.MarkDecorator: @@ -114,7 +63,8 @@ parametrization mark. """ return pytest.mark.skipif( - safe_import(package), reason=f"Skipping because {package} is installed." 
+ bool(import_optional_dependency(package, errors="ignore")), + reason=f"Skipping because {package} is installed.", ) @@ -153,38 +103,21 @@ if min_version: msg += f" satisfying a min_version of {min_version}" return pytest.mark.skipif( - not safe_import(package, min_version=min_version), reason=msg + not bool( + import_optional_dependency( + package, errors="ignore", min_version=min_version + ) + ), + reason=msg, ) -skip_if_mpl = pytest.mark.skipif( - bool(safe_import("matplotlib")), reason="matplotlib is present" -) skip_if_32bit = pytest.mark.skipif(not IS64, reason="skipping for 32 bit") skip_if_windows = pytest.mark.skipif(is_platform_windows(), reason="Running on Windows") skip_if_not_us_locale = pytest.mark.skipif( - _skip_if_not_us_locale(), - reason=f"Specific locale is set {locale.getlocale()[0]}", -) -skip_if_no_scipy = pytest.mark.skipif( - _skip_if_no_scipy(), reason="Missing SciPy requirement" + locale.getlocale()[0] != "en_US", + reason=f"Set local {locale.getlocale()[0]} is not en_US", ) -skip_if_no_ne = pytest.mark.skipif( - not USE_NUMEXPR, - reason=f"numexpr enabled->{USE_NUMEXPR}, installed->{NUMEXPR_INSTALLED}", -) - - -def skip_if_np_lt( - ver_str: str, *args, reason: str | None = None -) -> pytest.MarkDecorator: - if reason is None: - reason = f"NumPy {ver_str} or greater required" - return pytest.mark.skipif( - Version(np.__version__) < Version(ver_str), - *args, - reason=reason, - ) def parametrize_fixture_doc(*args) -> Callable[[F], F]: @@ -216,25 +149,25 @@ def mark_array_manager_not_yet_implemented(request) -> None: mark = pytest.mark.xfail(reason="Not yet implemented for ArrayManager") - request.node.add_marker(mark) + request.applymarker(mark) skip_array_manager_not_yet_implemented = pytest.mark.xfail( - get_option("mode.data_manager") == "array", + _get_option("mode.data_manager", silent=True) == "array", reason="Not yet implemented for ArrayManager", ) skip_array_manager_invalid_test = pytest.mark.skipif( - get_option("mode.data_manager") == "array", + _get_option("mode.data_manager", silent=True) == "array", reason="Test that relies on BlockManager internals or specific behaviour", ) skip_copy_on_write_not_yet_implemented = pytest.mark.xfail( - get_option("mode.copy_on_write"), + get_option("mode.copy_on_write") is True, reason="Not yet implemented/adapted for Copy-on-Write mode", ) skip_copy_on_write_invalid_test = pytest.mark.skipif( - get_option("mode.copy_on_write"), + get_option("mode.copy_on_write") is True, reason="Test not valid for Copy-on-Write mode", ) diff -Nru pandas-2.1.4+dfsg/pandas/util/_validators.py pandas-2.2.2+dfsg/pandas/util/_validators.py --- pandas-2.1.4+dfsg/pandas/util/_validators.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/util/_validators.py 2024-04-10 17:42:52.000000000 +0000 @@ -26,7 +26,7 @@ BoolishNoneT = TypeVar("BoolishNoneT", bool, int, None) -def _check_arg_length(fname, args, max_fname_arg_count, compat_args): +def _check_arg_length(fname, args, max_fname_arg_count, compat_args) -> None: """ Checks whether 'args' has length of at most 'compat_args'. Raises a TypeError if that is not the case, similar to in Python when a @@ -46,7 +46,7 @@ ) -def _check_for_default_values(fname, arg_val_dict, compat_args): +def _check_for_default_values(fname, arg_val_dict, compat_args) -> None: """ Check that the keys in `arg_val_dict` are mapped to their default values as specified in `compat_args`. 
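With safe_import and the bespoke skip_if_no_scipy / skip_if_no_ne / skip_if_np_lt helpers removed above, tests now request optional packages by name through skip_if_no, which delegates to import_optional_dependency. Roughly, in test code:

    import pandas.util._test_decorators as td

    @td.skip_if_no("scipy")                      # was @td.skip_if_no_scipy
    def test_needs_scipy():
        ...

    # min_version is forwarded to the dependency check; the version shown
    # here is illustrative only.
    @td.skip_if_no("numba", min_version="0.56.4")
    def test_needs_numba():
        ...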
@@ -125,7 +125,7 @@ _check_for_default_values(fname, kwargs, compat_args) -def _check_for_invalid_keys(fname, kwargs, compat_args): +def _check_for_invalid_keys(fname, kwargs, compat_args) -> None: """ Checks whether 'kwargs' contains any keys that are not in 'compat_args' and raises a TypeError if there is one. diff -Nru pandas-2.1.4+dfsg/pandas/util/version/__init__.py pandas-2.2.2+dfsg/pandas/util/version/__init__.py --- pandas-2.1.4+dfsg/pandas/util/version/__init__.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pandas/util/version/__init__.py 2024-04-10 17:42:52.000000000 +0000 @@ -4,8 +4,7 @@ # 04/30/2021 # This file is dual licensed under the terms of the Apache License, Version -# 2.0, and the BSD License. See the LICENSE file in the root of this repository -# for complete details. +# 2.0, and the BSD License. Licence at LICENSES/PACKAGING_LICENSE from __future__ import annotations import collections diff -Nru pandas-2.1.4+dfsg/pyproject.toml pandas-2.2.2+dfsg/pyproject.toml --- pandas-2.1.4+dfsg/pyproject.toml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pyproject.toml 2024-04-10 17:42:52.000000000 +0000 @@ -5,12 +5,10 @@ "meson-python==0.13.1", "meson==1.2.1", "wheel", - "Cython>=0.29.33,<3", # Note: sync with setup.py, environment.yml and asv.conf.json - # Note: numpy 1.25 has a backwards compatible C API by default - # we don't want to force users to compile with 1.25 though - # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x) - "oldest-supported-numpy>=2022.8.16; python_version<'3.12'", - "numpy>=1.26.0,<2; python_version>='3.12'", + "Cython==3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json + # Force numpy higher than 2.0rc1, so that built wheels are compatible + # with both numpy 1 and 2 + "numpy>=2.0.0rc1", "versioneer[toml]" ] @@ -29,12 +27,12 @@ license = {file = 'LICENSE'} requires-python = '>=3.9' dependencies = [ - "numpy>=1.22.4,<2; python_version<'3.11'", - "numpy>=1.23.2,<2; python_version=='3.11'", - "numpy>=1.26.0,<2; python_version>='3.12'", + "numpy>=1.22.4; python_version<'3.11'", + "numpy>=1.23.2; python_version=='3.11'", + "numpy>=1.26.0; python_version>='3.12'", "python-dateutil>=2.8.2", "pytz>=2020.1", - "tzdata>=2022.1" + "tzdata>=2022.7" ] classifiers = [ 'Development Status :: 5 - Production/Stable', @@ -49,6 +47,7 @@ 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', 'Topic :: Scientific/Engineering' ] @@ -62,64 +61,68 @@ [project.optional-dependencies] test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] -performance = ['bottleneck>=1.3.4', 'numba>=0.55.2', 'numexpr>=2.8.0'] -computation = ['scipy>=1.8.1', 'xarray>=2022.03.0'] -fss = ['fsspec>=2022.05.0'] -aws = ['s3fs>=2022.05.0'] -gcp = ['gcsfs>=2022.05.0', 'pandas-gbq>=0.17.5'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.10', 'pyxlsb>=1.0.9', 'xlrd>=2.0.1', 'xlsxwriter>=3.0.3'] -parquet = ['pyarrow>=7.0.0'] -feather = ['pyarrow>=7.0.0'] +pyarrow = ['pyarrow>=10.0.1'] +performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4'] +computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] +fss = ['fsspec>=2022.11.0'] +aws = ['s3fs>=2022.11.0'] +gcp = ['gcsfs>=2022.11.0', 'pandas-gbq>=0.19.0'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.1.0', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'xlrd>=2.0.1', 'xlsxwriter>=3.0.5'] +parquet = ['pyarrow>=10.0.1'] +feather = 
['pyarrow>=10.0.1'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) #'blosc>=1.20.1', - 'tables>=3.7.0'] -spss = ['pyreadstat>=1.1.5'] -postgresql = ['SQLAlchemy>=1.4.36', 'psycopg2>=2.9.3'] -mysql = ['SQLAlchemy>=1.4.36', 'pymysql>=1.0.2'] -sql-other = ['SQLAlchemy>=1.4.36'] -html = ['beautifulsoup4>=4.11.1', 'html5lib>=1.1', 'lxml>=4.8.0'] -xml = ['lxml>=4.8.0'] -plot = ['matplotlib>=3.6.1'] -output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10'] -clipboard = ['PyQt5>=5.15.6', 'qtpy>=2.2.0'] -compression = ['zstandard>=0.17.0'] + 'tables>=3.8.0'] +spss = ['pyreadstat>=1.2.0'] +postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.8.0'] +mysql = ['SQLAlchemy>=2.0.0', 'pymysql>=1.0.2'] +sql-other = ['SQLAlchemy>=2.0.0', 'adbc-driver-postgresql>=0.8.0', 'adbc-driver-sqlite>=0.8.0'] +html = ['beautifulsoup4>=4.11.2', 'html5lib>=1.1', 'lxml>=4.9.2'] +xml = ['lxml>=4.9.2'] +plot = ['matplotlib>=3.6.3'] +output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.9.0'] +clipboard = ['PyQt5>=5.15.9', 'qtpy>=2.3.0'] +compression = ['zstandard>=0.19.0'] consortium-standard = ['dataframe-api-compat>=0.1.7'] -all = ['beautifulsoup4>=4.11.1', +all = ['adbc-driver-postgresql>=0.8.0', + 'adbc-driver-sqlite>=0.8.0', + 'beautifulsoup4>=4.11.2', # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) - #'blosc>=1.21.0', - 'bottleneck>=1.3.4', + #'blosc>=1.21.3', + 'bottleneck>=1.3.6', 'dataframe-api-compat>=0.1.7', - 'fastparquet>=0.8.1', - 'fsspec>=2022.05.0', - 'gcsfs>=2022.05.0', + 'fastparquet>=2022.12.0', + 'fsspec>=2022.11.0', + 'gcsfs>=2022.11.0', 'html5lib>=1.1', 'hypothesis>=6.46.1', 'jinja2>=3.1.2', - 'lxml>=4.8.0', - 'matplotlib>=3.6.1', - 'numba>=0.55.2', - 'numexpr>=2.8.0', + 'lxml>=4.9.2', + 'matplotlib>=3.6.3', + 'numba>=0.56.4', + 'numexpr>=2.8.4', 'odfpy>=1.4.1', - 'openpyxl>=3.0.10', - 'pandas-gbq>=0.17.5', - 'psycopg2>=2.9.3', - 'pyarrow>=7.0.0', + 'openpyxl>=3.1.0', + 'pandas-gbq>=0.19.0', + 'psycopg2>=2.9.6', + 'pyarrow>=10.0.1', 'pymysql>=1.0.2', - 'PyQt5>=5.15.6', - 'pyreadstat>=1.1.5', + 'PyQt5>=5.15.9', + 'pyreadstat>=1.2.0', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', - 'pyxlsb>=1.0.9', - 'qtpy>=2.2.0', - 'scipy>=1.8.1', - 's3fs>=2022.05.0', - 'SQLAlchemy>=1.4.36', - 'tables>=3.7.0', - 'tabulate>=0.8.10', - 'xarray>=2022.03.0', + 'python-calamine>=0.1.7', + 'pyxlsb>=1.0.10', + 'qtpy>=2.3.0', + 'scipy>=1.10.0', + 's3fs>=2022.11.0', + 'SQLAlchemy>=2.0.0', + 'tables>=3.8.0', + 'tabulate>=0.9.0', + 'xarray>=2022.12.0', 'xlrd>=2.0.1', - 'xlsxwriter>=3.0.3', - 'zstandard>=0.17.0'] + 'xlsxwriter>=3.0.5', + 'zstandard>=0.19.0'] # TODO: Remove after setuptools support is dropped. 
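Alongside the dependency floor bumps, the new pyarrow extra above (pyarrow>=10.0.1) covers the pyarrow-backed dtypes exercised by test_arrow_datetime_axis in pandas/tests/window/test_timeseries_window.py earlier in this diff. A minimal sketch, assuming pyarrow is installed:

    import pandas as pd

    # Rolling over a pyarrow-backed datetime index (GH 55849).
    idx = pd.Index(pd.date_range("2020-01-01", periods=5),
                   dtype="timestamp[ns][pyarrow]")
    ser = pd.Series(range(5), index=idx, dtype="float64")
    out = ser.rolling("1D").sum()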
[tool.setuptools] @@ -147,9 +150,12 @@ setup = ['--vsenv'] # For Windows [tool.cibuildwheel] -skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x *-musllinux_aarch64" +skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x" build-verbosity = "3" environment = {LDFLAGS="-Wl,--strip-all"} +# TODO: remove this once numpy 2.0 proper releases +# and specify numpy 2.0 as a dependency in [build-system] requires in pyproject.toml +before-build = "pip install numpy==2.0.0rc1" test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0" test-command = """ PANDAS_CI='1' python -c 'import pandas as pd; \ @@ -157,12 +163,10 @@ pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ """ -[tool.cibuildwheel.macos] -archs = "x86_64 arm64" -test-skip = "*_arm64" - [tool.cibuildwheel.windows] -before-build = "pip install delvewheel" +# TODO: remove this once numpy 2.0 proper releases +# and specify numpy 2.0 as a dependency in [build-system] requires in pyproject.toml +before-build = "pip install delvewheel numpy==2.0.0rc1" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" [[tool.cibuildwheel.overrides]] @@ -185,7 +189,7 @@ [tool.black] target-version = ['py39', 'py310'] -required-version = '23.7.0' +required-version = '23.11.0' exclude = ''' ( asv_bench/env @@ -209,6 +213,7 @@ target-version = "py310" fix = true unfixable = [] +typing-modules = ["pandas._typing"] select = [ # pyflakes @@ -226,7 +231,7 @@ # flake8-gettext "INT", # pylint - "PLC", "PLE", "PLR", "PLW", + "PL", # misc lints "PIE", # flake8-pyi @@ -249,6 +254,12 @@ "NPY002", # Perflint "PERF", + # flynt + "FLY", + # flake8-logging-format + "G", + # flake8-future-annotations + "FA", ] ignore = [ @@ -301,16 +312,14 @@ "PLW0603", # Docstrings should not be included in stubs "PYI021", + # Use `typing.NamedTuple` instead of `collections.namedtuple` + "PYI024", # No builtin `eval()` allowed "PGH001", # compare-to-empty-string "PLC1901", - # Use typing_extensions.TypeAlias for type aliases - # "PYI026", # not yet implemented - # Use "collections.abc.*" instead of "typing.*" (PEP 585 syntax) - # "PYI027", # not yet implemented # while int | float can be shortened to float, the former is more explicit - # "PYI041", # not yet implemented + "PYI041", # incorrect-dict-iterator, flags valid Series.items usage "PERF102", # try-except-in-loop, becomes useless in Python 3.11 @@ -355,7 +364,7 @@ "asv_bench/*" = ["TID", "NPY002"] # to be enabled gradually "pandas/core/*" = ["PLR5501"] -"pandas/tests/*" = ["B028"] +"pandas/tests/*" = ["B028", "FLY"] "scripts/*" = ["B028"] # Keep this one enabled "pandas/_typing.py" = ["TCH"] @@ -515,7 +524,7 @@ "db: tests requiring a database (mysql or postgres)", "clipboard: mark a pd.read_clipboard test", "arm_slow: mark a test as slow for arm64 architecture", - "arraymanager: mark a test to run with ArrayManager enabled", + "skip_ubsan: Tests known to fail UBSAN check", ] [tool.mypy] @@ -735,7 +744,7 @@ typeCheckingMode = "basic" useLibraryCodeForTypes = false include = ["pandas", "typings"] -exclude = ["pandas/tests", "pandas/io/clipboard", "pandas/util/version"] +exclude = ["pandas/tests", "pandas/io/clipboard", "pandas/util/version", "pandas/core/_numba/extensions.py"] # enable subset of "strict" reportDuplicateImport = true reportInconsistentConstructor = true @@ -746,6 +755,7 @@ reportUntypedFunctionDecorator = true reportUntypedNamedTuple = true reportUnusedImport = true +disableBytesTypePromotions = true # disable 
subset of "basic" reportGeneralTypeIssues = false reportMissingModuleSource = false @@ -787,5 +797,5 @@ directory = "coverage_html_report" [tool.codespell] -ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere" +ignore-words-list = "blocs, coo, hist, nd, sav, ser, recuse, nin, timere, expec, expecs" ignore-regex = 'https://([\w/\.])+' diff -Nru pandas-2.1.4+dfsg/pyright_reportGeneralTypeIssues.json pandas-2.2.2+dfsg/pyright_reportGeneralTypeIssues.json --- pandas-2.1.4+dfsg/pyright_reportGeneralTypeIssues.json 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/pyright_reportGeneralTypeIssues.json 2024-04-10 17:42:52.000000000 +0000 @@ -2,6 +2,8 @@ "typeCheckingMode": "off", "reportGeneralTypeIssues": true, "useLibraryCodeForTypes": false, + "analyzeUnannotatedFunctions": false, + "disableBytesTypePromotions": true, "include": [ "pandas", @@ -16,9 +18,10 @@ "pandas/_testing/__init__.py", "pandas/_testing/_io.py", + "pandas/compat/pickle_compat.py", + "pandas/core/_numba/extensions.py", "pandas/core/_numba/kernels/sum_.py", "pandas/core/_numba/kernels/var_.py", - "pandas/compat/pickle_compat.py", "pandas/core/algorithms.py", "pandas/core/apply.py", "pandas/core/array_algos/take.py", @@ -38,6 +41,7 @@ "pandas/core/arrays/string_arrow.py", "pandas/core/arrays/timedeltas.py", "pandas/core/computation/align.py", + "pandas/core/computation/ops.py", "pandas/core/construction.py", "pandas/core/dtypes/cast.py", "pandas/core/dtypes/common.py", @@ -88,6 +92,7 @@ "pandas/io/formats/info.py", "pandas/io/formats/printing.py", "pandas/io/formats/style.py", + "pandas/io/formats/style_render.py", "pandas/io/json/_json.py", "pandas/io/json/_normalize.py", "pandas/io/parsers/arrow_parser_wrapper.py", @@ -98,6 +103,9 @@ "pandas/io/sql.py", "pandas/io/stata.py", "pandas/plotting/_matplotlib/boxplot.py", + "pandas/plotting/_matplotlib/core.py", + "pandas/plotting/_matplotlib/timeseries.py", + "pandas/plotting/_matplotlib/tools.py", "pandas/tseries/frequencies.py", "pandas/tseries/holiday.py", ], diff -Nru pandas-2.1.4+dfsg/requirements-dev.txt pandas-2.2.2+dfsg/requirements-dev.txt --- pandas-2.1.4+dfsg/requirements-dev.txt 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/requirements-dev.txt 2024-04-10 17:42:52.000000000 +0000 @@ -3,62 +3,65 @@ pip versioneer[toml] -cython==0.29.33 +cython==3.0.5 meson[ninja]==1.2.1 meson-python==0.13.1 pytest>=7.3.2 pytest-cov pytest-xdist>=2.2.0 +pytest-qt>=4.2.0 +PyQt5>=5.15.9 coverage python-dateutil numpy<2 pytz -beautifulsoup4>=4.11.1 +beautifulsoup4>=4.11.2 blosc -bottleneck>=1.3.4 -fastparquet>=0.8.1 -fsspec>=2022.05.0 +bottleneck>=1.3.6 +fastparquet>=2022.12.0 +fsspec>=2022.11.0 html5lib>=1.1 hypothesis>=6.46.1 -gcsfs>=2022.05.0 +gcsfs>=2022.11.0 ipython jinja2>=3.1.2 -lxml>=4.8.0 -matplotlib>=3.6.1, <3.8 -numba>=0.55.2 -numexpr>=2.8.0 -openpyxl>=3.0.10 +lxml>=4.9.2 +matplotlib>=3.6.3 +numba>=0.56.4 +numexpr>=2.8.4 +openpyxl>=3.1.0 odfpy>=1.4.1 py -psycopg2-binary>=2.9.3 -pyarrow>=7.0.0 +psycopg2-binary>=2.9.6 +pyarrow>=10.0.1 pymysql>=1.0.2 -pyreadstat>=1.1.5 -tables>=3.7.0 -pyxlsb>=1.0.9 -s3fs>=2022.05.0 -scipy>=1.8.1 -SQLAlchemy>=1.4.36 -tabulate>=0.8.10 -xarray>=2022.03.0 +pyreadstat>=1.2.0 +tables>=3.8.0 +python-calamine>=0.1.7 +pyxlsb>=1.0.10 +s3fs>=2022.11.0 +scipy>=1.10.0 +SQLAlchemy>=2.0.0 +tabulate>=0.9.0 +xarray>=2022.12.0 xlrd>=2.0.1 -xlsxwriter>=3.0.3 -zstandard>=0.17.0 +xlsxwriter>=3.0.5 +zstandard>=0.19.0 dask seaborn moto flask -asv>=0.5.1 -flake8==6.0.0 -mypy==1.4.1 +asv>=0.6.1 +flake8==6.1.0 +mypy==1.8.0 tokenize-rt 
-pre-commit>=2.15.0 +pre-commit>=3.6.0 gitpython gitdb google-auth natsort numpydoc -pydata-sphinx-theme==0.13 +pydata-sphinx-theme==0.14 pytest-cython sphinx sphinx-design @@ -68,20 +71,20 @@ types-pytz types-PyYAML types-setuptools -nbconvert>=6.4.5 +nbconvert>=7.11.0 nbsphinx pandoc ipywidgets nbformat -notebook>=6.0.3 +notebook>=7.0.6 ipykernel -jinja2 markdown feedparser pyyaml requests pygments +adbc-driver-postgresql>=0.8.0 +adbc-driver-sqlite>=0.8.0 dataframe-api-compat>=0.1.7 -sphinx-toggleprompt typing_extensions; python_version<"3.11" -tzdata>=2022.1 +tzdata>=2022.7 diff -Nru pandas-2.1.4+dfsg/scripts/download_wheels.sh pandas-2.2.2+dfsg/scripts/download_wheels.sh --- pandas-2.1.4+dfsg/scripts/download_wheels.sh 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/download_wheels.sh 2024-04-10 17:42:52.000000000 +0000 @@ -11,6 +11,7 @@ # one by one to the dist/ directory where they would be generated. VERSION=$1 +mkdir -p $(dirname -- $0)/../dist DIST_DIR="$(realpath $(dirname -- $0)/../dist)" if [ -z $VERSION ]; then @@ -20,7 +21,7 @@ curl "https://anaconda.org/multibuild-wheels-staging/pandas/files?version=${VERSION}" | \ grep "href=\"/multibuild-wheels-staging/pandas/${VERSION}" | \ - sed -r 's/.*.*/\1/g' | \ + sed -r 's/.*.*/\1/g' | \ awk '{print "https://anaconda.org" $0 }' | \ xargs wget -P $DIST_DIR diff -Nru pandas-2.1.4+dfsg/scripts/generate_pip_deps_from_conda.py pandas-2.2.2+dfsg/scripts/generate_pip_deps_from_conda.py --- pandas-2.1.4+dfsg/scripts/generate_pip_deps_from_conda.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/generate_pip_deps_from_conda.py 2024-04-10 17:42:52.000000000 +0000 @@ -23,14 +23,15 @@ import tomli as tomllib import yaml -EXCLUDE = {"python", "c-compiler", "cxx-compiler"} -REMAP_VERSION = {"tzdata": "2022.1"} -RENAME = { +EXCLUDE = {"python", "c-compiler", "cxx-compiler", "c-blosc2"} +REMAP_VERSION = {"tzdata": "2022.7"} +CONDA_TO_PIP = { "pytables": "tables", "psycopg2": "psycopg2-binary", "dask-core": "dask", "seaborn-base": "seaborn", "sqlalchemy": "SQLAlchemy", + "pyqt": "PyQt5", } @@ -40,7 +41,7 @@ In most cases they are the same, those are the exceptions: - Packages that should be excluded (in `EXCLUDE`) - - Packages that should be renamed (in `RENAME`) + - Packages that should be renamed (in `CONDA_TO_PIP`) - A package requiring a specific version, in conda is defined with a single equal (e.g. ``pandas=1.0``) and in pip with two (e.g. 
``pandas==1.0``) """ @@ -53,14 +54,14 @@ return if pkg in REMAP_VERSION: return "".join((pkg, compare, REMAP_VERSION[pkg])) - if pkg in RENAME: - return "".join((RENAME[pkg], compare, version)) + if pkg in CONDA_TO_PIP: + return "".join((CONDA_TO_PIP[pkg], compare, version)) if package in EXCLUDE: return - if package in RENAME: - return RENAME[package] + if package in CONDA_TO_PIP: + return CONDA_TO_PIP[package] return package diff -Nru pandas-2.1.4+dfsg/scripts/no_bool_in_generic.py pandas-2.2.2+dfsg/scripts/no_bool_in_generic.py --- pandas-2.1.4+dfsg/scripts/no_bool_in_generic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/no_bool_in_generic.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,6 +9,7 @@ The function `visit` is adapted from a function by the same name in pyupgrade: https://github.com/asottile/pyupgrade/blob/5495a248f2165941c5d3b82ac3226ba7ad1fa59d/pyupgrade/_data.py#L70-L113 +Licence at LICENSES/PYUPGRADE_LICENSE """ from __future__ import annotations @@ -64,7 +65,7 @@ + replaced_line[col_offset + 4 :] ) new_lines.append(replaced_line) - return "\n".join(new_lines) + return "\n".join(new_lines) + "\n" def check_for_bool_in_generic(content: str) -> tuple[bool, str]: diff -Nru pandas-2.1.4+dfsg/scripts/run_stubtest.py pandas-2.2.2+dfsg/scripts/run_stubtest.py --- pandas-2.1.4+dfsg/scripts/run_stubtest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/run_stubtest.py 2024-04-10 17:42:52.000000000 +0000 @@ -47,6 +47,8 @@ # stubtest might be too sensitive "pandas._libs.lib.NoDefault", "pandas._libs.lib._NoDefault.no_default", + # stubtest/Cython is not recognizing the default value for the dtype parameter + "pandas._libs.lib.map_infer_mask", # internal type alias (should probably be private) "pandas._libs.lib.ndarray_obj_2d", # runtime argument "owner" has a default value but stub argument does not @@ -67,7 +69,6 @@ "pandas._libs.sparse.SparseIndex.to_block_index", "pandas._libs.sparse.SparseIndex.to_int_index", # TODO (decorator changes argument names) - "pandas._libs.tslibs.offsets.BaseOffset._apply_array", "pandas._libs.tslibs.offsets.BusinessHour.rollback", "pandas._libs.tslibs.offsets.BusinessHour.rollforward ", # type alias diff -Nru pandas-2.1.4+dfsg/scripts/tests/conftest.py pandas-2.2.2+dfsg/scripts/tests/conftest.py --- pandas-2.1.4+dfsg/scripts/tests/conftest.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/tests/conftest.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,6 +1,6 @@ # pyproject.toml defines addopts: --strict-data-files # strict-data-files is defined & used in pandas/conftest.py -def pytest_addoption(parser): +def pytest_addoption(parser) -> None: parser.addoption( "--strict-data-files", action="store_true", diff -Nru pandas-2.1.4+dfsg/scripts/tests/data/deps_expected_random.yaml pandas-2.2.2+dfsg/scripts/tests/data/deps_expected_random.yaml --- pandas-2.1.4+dfsg/scripts/tests/data/deps_expected_random.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/tests/data/deps_expected_random.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -37,12 +37,12 @@ - numexpr>=2.7.3 - openpyxl>=3.0.7 - odfpy>=1.4.1 - - pandas-gbq>=0.15.0 - psycopg2>=2.8.6 - pyarrow<11, >=7.0.0 - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 + - python-calamine>=0.1.7 - pyxlsb>=1.0.8 - s3fs>=2021.08.0 - scipy>=1.7.1 diff -Nru pandas-2.1.4+dfsg/scripts/tests/data/deps_minimum.toml pandas-2.2.2+dfsg/scripts/tests/data/deps_minimum.toml --- pandas-2.1.4+dfsg/scripts/tests/data/deps_minimum.toml 2023-12-08 
14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/tests/data/deps_minimum.toml 2024-04-10 17:42:52.000000000 +0000 @@ -61,8 +61,8 @@ computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = ['fsspec>=2021.07.0'] aws = ['s3fs>=2021.08.0'] -gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] -excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] +gcp = ['gcsfs>=2021.07.0'] +excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) @@ -94,7 +94,6 @@ 'numexpr>=2.7.3', 'odfpy>=1.4.1', 'openpyxl>=3.0.7', - 'pandas-gbq>=0.15.0', 'psycopg2>=2.8.6', 'pyarrow>=7.0.0', 'pymysql>=1.0.2', @@ -102,6 +101,7 @@ 'pyreadstat>=1.1.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', + 'python-calamine>=0.1.7', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', 'scipy>=1.7.1', @@ -382,7 +382,7 @@ "db: tests requiring a database (mysql or postgres)", "clipboard: mark a pd.read_clipboard test", "arm_slow: mark a test as slow for arm64 architecture", - "arraymanager: mark a test to run with ArrayManager enabled", + "skip_ubsan: tests known to invoke undefined behavior", ] [tool.mypy] diff -Nru pandas-2.1.4+dfsg/scripts/tests/data/deps_unmodified_random.yaml pandas-2.2.2+dfsg/scripts/tests/data/deps_unmodified_random.yaml --- pandas-2.1.4+dfsg/scripts/tests/data/deps_unmodified_random.yaml 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/tests/data/deps_unmodified_random.yaml 2024-04-10 17:42:52.000000000 +0000 @@ -37,12 +37,12 @@ - numexpr>=2.7.3 - openpyxl>=3.0.7 - odfpy>=1.4.1 - - pandas-gbq>=0.15.0 - psycopg2 - pyarrow<11, >=7.0.0 - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 + - python-calamine>=0.1.7 - pyxlsb>=1.0.8 - s3fs>=2021.08.0 - scipy>=1.7.1 diff -Nru pandas-2.1.4+dfsg/scripts/tests/test_no_bool_in_generic.py pandas-2.2.2+dfsg/scripts/tests/test_no_bool_in_generic.py --- pandas-2.1.4+dfsg/scripts/tests/test_no_bool_in_generic.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/tests/test_no_bool_in_generic.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,10 +1,10 @@ from scripts.no_bool_in_generic import check_for_bool_in_generic -BAD_FILE = "def foo(a: bool) -> bool:\n return bool(0)" -GOOD_FILE = "def foo(a: bool_t) -> bool_t:\n return bool(0)" +BAD_FILE = "def foo(a: bool) -> bool:\n return bool(0)\n" +GOOD_FILE = "def foo(a: bool_t) -> bool_t:\n return bool(0)\n" -def test_bad_file_with_replace(): +def test_bad_file_with_replace() -> None: content = BAD_FILE mutated, result = check_for_bool_in_generic(content) expected = GOOD_FILE @@ -12,7 +12,7 @@ assert mutated -def test_good_file_with_replace(): +def test_good_file_with_replace() -> None: content = GOOD_FILE mutated, result = check_for_bool_in_generic(content) expected = content diff -Nru pandas-2.1.4+dfsg/scripts/tests/test_sort_whatsnew_note.py pandas-2.2.2+dfsg/scripts/tests/test_sort_whatsnew_note.py --- pandas-2.1.4+dfsg/scripts/tests/test_sort_whatsnew_note.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/tests/test_sort_whatsnew_note.py 2024-04-10 17:42:52.000000000 +0000 @@ -1,7 +1,7 @@ from scripts.sort_whatsnew_note import sort_whatsnew_note -def test_sort_whatsnew_note(): +def test_sort_whatsnew_note() -> None: content = ( ".. 
_whatsnew_200:\n" "\n" diff -Nru pandas-2.1.4+dfsg/scripts/tests/test_use_io_common_urlopen.py pandas-2.2.2+dfsg/scripts/tests/test_use_io_common_urlopen.py --- pandas-2.1.4+dfsg/scripts/tests/test_use_io_common_urlopen.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/tests/test_use_io_common_urlopen.py 2024-04-10 17:42:52.000000000 +0000 @@ -5,7 +5,7 @@ PATH = "t.py" -def test_inconsistent_usage(capsys): +def test_inconsistent_usage(capsys) -> None: content = "from urllib.request import urlopen" result_msg = ( "t.py:1:0: Don't use urllib.request.urlopen, " @@ -17,7 +17,7 @@ assert result_msg == expected_msg -def test_consistent_usage(): +def test_consistent_usage() -> None: # should not raise content = "from pandas.io.common import urlopen" use_io_common_urlopen(content, PATH) diff -Nru pandas-2.1.4+dfsg/scripts/tests/test_use_pd_array_in_core.py pandas-2.2.2+dfsg/scripts/tests/test_use_pd_array_in_core.py --- pandas-2.1.4+dfsg/scripts/tests/test_use_pd_array_in_core.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/tests/test_use_pd_array_in_core.py 2024-04-10 17:42:52.000000000 +0000 @@ -10,7 +10,7 @@ @pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1]) -def test_inconsistent_usage(content, capsys): +def test_inconsistent_usage(content, capsys) -> None: result_msg = ( "t.py:2:0: Don't use pd.array in core, import array as pd_array instead\n" ) @@ -21,6 +21,6 @@ @pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1]) -def test_consistent_usage(content): +def test_consistent_usage(content) -> None: # should not raise use_pd_array(content, PATH) diff -Nru pandas-2.1.4+dfsg/scripts/tests/test_validate_docstrings.py pandas-2.2.2+dfsg/scripts/tests/test_validate_docstrings.py --- pandas-2.1.4+dfsg/scripts/tests/test_validate_docstrings.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/tests/test_validate_docstrings.py 2024-04-10 17:42:52.000000000 +0000 @@ -9,12 +9,12 @@ class BadDocstrings: """Everything here has a bad docstring""" - def private_classes(self): + def private_classes(self) -> None: """ This mentions NDFrame, which is not correct. """ - def prefix_pandas(self): + def prefix_pandas(self) -> None: """ Have `pandas` prefix in See Also section. @@ -24,7 +24,7 @@ DataFrame.head : The first `n` rows of the caller object. """ - def redundant_import(self, paramx=None, paramy=None): + def redundant_import(self, paramx=None, paramy=None) -> None: """ A sample DataFrame method. @@ -45,7 +45,7 @@ Series([], dtype: bool) """ - def unused_import(self): + def unused_import(self) -> None: """ Examples -------- @@ -53,7 +53,7 @@ >>> df = pd.DataFrame(np.ones((3, 3)), columns=('a', 'b', 'c')) """ - def missing_whitespace_around_arithmetic_operator(self): + def missing_whitespace_around_arithmetic_operator(self) -> None: """ Examples -------- @@ -61,7 +61,7 @@ 7 """ - def indentation_is_not_a_multiple_of_four(self): + def indentation_is_not_a_multiple_of_four(self) -> None: """ Examples -------- @@ -69,19 +69,19 @@ ... 
pass """ - def missing_whitespace_after_comma(self): + def missing_whitespace_after_comma(self) -> None: """ Examples -------- >>> df = pd.DataFrame(np.ones((3,3)),columns=('a','b', 'c')) """ - def write_array_like_with_hyphen_not_underscore(self): + def write_array_like_with_hyphen_not_underscore(self) -> None: """ In docstrings, use array-like over array_like """ - def leftover_files(self): + def leftover_files(self) -> None: """ Examples -------- @@ -110,14 +110,14 @@ base_path = "scripts.tests.test_validate_docstrings" if klass: - base_path = ".".join([base_path, klass]) + base_path = f"{base_path}.{klass}" if func: - base_path = ".".join([base_path, func]) + base_path = f"{base_path}.{func}" return base_path - def test_bad_class(self, capsys): + def test_bad_class(self, capsys) -> None: errors = validate_docstrings.pandas_validate( self._import_path(klass="BadDocstrings") )["errors"] @@ -192,20 +192,20 @@ ), ], ) - def test_bad_docstrings(self, capsys, klass, func, msgs): + def test_bad_docstrings(self, capsys, klass, func, msgs) -> None: result = validate_docstrings.pandas_validate( self._import_path(klass=klass, func=func) ) for msg in msgs: assert msg in " ".join([err[1] for err in result["errors"]]) - def test_leftover_files_raises(self): + def test_leftover_files_raises(self) -> None: with pytest.raises(Exception, match="The following files"): validate_docstrings.pandas_validate( self._import_path(klass="BadDocstrings", func="leftover_files") ) - def test_validate_all_ignore_functions(self, monkeypatch): + def test_validate_all_ignore_functions(self, monkeypatch) -> None: monkeypatch.setattr( validate_docstrings, "get_all_api_items", @@ -231,7 +231,7 @@ assert len(result) == 1 assert "pandas.Index.all" in result - def test_validate_all_ignore_deprecated(self, monkeypatch): + def test_validate_all_ignore_deprecated(self, monkeypatch) -> None: monkeypatch.setattr( validate_docstrings, "pandas_validate", @@ -303,7 +303,7 @@ (4, "random.randint"), ], ) - def test_item_name(self, idx, name): + def test_item_name(self, idx, name) -> None: result = list(validate_docstrings.get_api_items(self.api_doc)) assert result[idx][0] == name @@ -311,7 +311,7 @@ "idx,func", [(0, "cycle"), (1, "count"), (2, "chain"), (3, "seed"), (4, "randint")], ) - def test_item_function(self, idx, func): + def test_item_function(self, idx, func) -> None: result = list(validate_docstrings.get_api_items(self.api_doc)) assert callable(result[idx][1]) assert result[idx][1].__name__ == func @@ -326,7 +326,7 @@ (4, "Random"), ], ) - def test_item_section(self, idx, section): + def test_item_section(self, idx, section) -> None: result = list(validate_docstrings.get_api_items(self.api_doc)) assert result[idx][2] == section @@ -334,7 +334,7 @@ "idx,subsection", [(0, "Infinite"), (1, "Infinite"), (2, "Finite"), (3, "All"), (4, "All")], ) - def test_item_subsection(self, idx, subsection): + def test_item_subsection(self, idx, subsection) -> None: result = list(validate_docstrings.get_api_items(self.api_doc)) assert result[idx][3] == subsection @@ -343,7 +343,7 @@ @pytest.mark.parametrize( "name", ["pandas.Series.str.isdecimal", "pandas.Series.str.islower"] ) - def test_encode_content_write_to_file(self, name): + def test_encode_content_write_to_file(self, name) -> None: # GH25466 docstr = validate_docstrings.PandasDocstring(name).validate_pep8() # the list of pep8 errors should be empty @@ -351,7 +351,7 @@ class TestMainFunction: - def test_exit_status_for_main(self, monkeypatch): + def test_exit_status_for_main(self, 
monkeypatch) -> None: monkeypatch.setattr( validate_docstrings, "pandas_validate", @@ -375,7 +375,7 @@ ) assert exit_status == 0 - def test_exit_status_errors_for_validate_all(self, monkeypatch): + def test_exit_status_errors_for_validate_all(self, monkeypatch) -> None: monkeypatch.setattr( validate_docstrings, "validate_all", @@ -406,7 +406,7 @@ ) assert exit_status == 5 - def test_no_exit_status_noerrors_for_validate_all(self, monkeypatch): + def test_no_exit_status_noerrors_for_validate_all(self, monkeypatch) -> None: monkeypatch.setattr( validate_docstrings, "validate_all", @@ -425,7 +425,7 @@ ) assert exit_status == 0 - def test_exit_status_for_validate_all_json(self, monkeypatch): + def test_exit_status_for_validate_all_json(self, monkeypatch) -> None: print("EXECUTED") monkeypatch.setattr( validate_docstrings, @@ -451,7 +451,7 @@ ) assert exit_status == 0 - def test_errors_param_filters_errors(self, monkeypatch): + def test_errors_param_filters_errors(self, monkeypatch) -> None: monkeypatch.setattr( validate_docstrings, "validate_all", diff -Nru pandas-2.1.4+dfsg/scripts/tests/test_validate_exception_location.py pandas-2.2.2+dfsg/scripts/tests/test_validate_exception_location.py --- pandas-2.1.4+dfsg/scripts/tests/test_validate_exception_location.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/tests/test_validate_exception_location.py 2024-04-10 17:42:52.000000000 +0000 @@ -34,7 +34,7 @@ def test_class_that_inherits_an_exception_and_is_not_in_the_testing_rst_is_flagged( capsys, error_type -): +) -> None: content = TEST_CODE.format( custom_name=CUSTOM_EXCEPTION_NOT_IN_TESTING_RST, error_type=error_type ) @@ -47,13 +47,13 @@ def test_class_that_inherits_an_exception_but_is_in_the_testing_rst_is_not_flagged( capsys, error_type -): +) -> None: content = TEST_CODE.format( custom_name=CUSTOM_EXCEPTION__IN_TESTING_RST, error_type=error_type ) validate_exception_and_warning_placement(PATH, content, ERRORS_IN_TESTING_RST) -def test_class_that_does_not_inherit_an_exception_is_not_flagged(capsys): +def test_class_that_does_not_inherit_an_exception_is_not_flagged(capsys) -> None: content = "class MyClass(NonExceptionClass): pass" validate_exception_and_warning_placement(PATH, content, ERRORS_IN_TESTING_RST) diff -Nru pandas-2.1.4+dfsg/scripts/tests/test_validate_min_versions_in_sync.py pandas-2.2.2+dfsg/scripts/tests/test_validate_min_versions_in_sync.py --- pandas-2.1.4+dfsg/scripts/tests/test_validate_min_versions_in_sync.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/tests/test_validate_min_versions_in_sync.py 2024-04-10 17:42:52.000000000 +0000 @@ -46,7 +46,7 @@ ), ], ) -def test_pin_min_versions_to_yaml_file(src_toml, src_yaml, expected_yaml): +def test_pin_min_versions_to_yaml_file(src_toml, src_yaml, expected_yaml) -> None: with open(src_toml, "rb") as toml_f: toml_map = tomllib.load(toml_f) with open(src_yaml, encoding="utf-8") as yaml_f: diff -Nru pandas-2.1.4+dfsg/scripts/tests/test_validate_unwanted_patterns.py pandas-2.2.2+dfsg/scripts/tests/test_validate_unwanted_patterns.py --- pandas-2.1.4+dfsg/scripts/tests/test_validate_unwanted_patterns.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/tests/test_validate_unwanted_patterns.py 2024-04-10 17:42:52.000000000 +0000 @@ -38,7 +38,7 @@ ), ], ) - def test_pytest_raises(self, data): + def test_pytest_raises(self, data) -> None: fd = io.StringIO(data.strip()) result = list(validate_unwanted_patterns.bare_pytest_raises(fd)) assert result == [] @@ -147,7 +147,7 @@ ), ], ) - def 
test_pytest_raises_raises(self, data, expected): + def test_pytest_raises_raises(self, data, expected) -> None: fd = io.StringIO(data.strip()) result = list(validate_unwanted_patterns.bare_pytest_raises(fd)) assert result == expected @@ -200,7 +200,7 @@ ), ], ) - def test_strings_with_wrong_placed_whitespace(self, data): + def test_strings_with_wrong_placed_whitespace(self, data) -> None: fd = io.StringIO(data.strip()) result = list( validate_unwanted_patterns.strings_with_wrong_placed_whitespace(fd) @@ -369,7 +369,7 @@ ), ], ) - def test_strings_with_wrong_placed_whitespace_raises(self, data, expected): + def test_strings_with_wrong_placed_whitespace_raises(self, data, expected) -> None: fd = io.StringIO(data.strip()) result = list( validate_unwanted_patterns.strings_with_wrong_placed_whitespace(fd) @@ -401,7 +401,7 @@ ), ], ) - def test_nodefault_used_not_only_for_typing(self, data): + def test_nodefault_used_not_only_for_typing(self, data) -> None: fd = io.StringIO(data.strip()) result = list(validate_unwanted_patterns.nodefault_used_not_only_for_typing(fd)) assert result == [] @@ -440,7 +440,7 @@ ), ], ) - def test_nodefault_used_not_only_for_typing_raises(self, data, expected): + def test_nodefault_used_not_only_for_typing_raises(self, data, expected) -> None: fd = io.StringIO(data.strip()) result = list(validate_unwanted_patterns.nodefault_used_not_only_for_typing(fd)) assert result == expected diff -Nru pandas-2.1.4+dfsg/scripts/validate_docstrings.py pandas-2.2.2+dfsg/scripts/validate_docstrings.py --- pandas-2.1.4+dfsg/scripts/validate_docstrings.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/validate_docstrings.py 2024-04-10 17:42:52.000000000 +0000 @@ -143,7 +143,7 @@ func = getattr(func, part) yield ( - ".".join([current_module, line_stripped]), + f"{current_module}.{line_stripped}", func, current_section, current_subsection, @@ -228,11 +228,12 @@ file.name, ] response = subprocess.run(cmd, capture_output=True, check=False, text=True) - stdout = response.stdout - stdout = stdout.replace(file.name, "") - messages = stdout.strip("\n").splitlines() - if messages: - error_messages.extend(messages) + for output in ("stdout", "stderr"): + out = getattr(response, output) + out = out.replace(file.name, "") + messages = out.strip("\n").splitlines() + if messages: + error_messages.extend(messages) finally: file.close() os.unlink(file.name) @@ -410,8 +411,8 @@ return exit_status -def print_validate_one_results(func_name: str): - def header(title, width=80, char="#"): +def print_validate_one_results(func_name: str) -> None: + def header(title, width=80, char="#") -> str: full_line = char * width side_len = (width - len(title) - 2) // 2 adj = "" if len(title) % 2 == 0 else " " diff -Nru pandas-2.1.4+dfsg/scripts/validate_exception_location.py pandas-2.2.2+dfsg/scripts/validate_exception_location.py --- pandas-2.1.4+dfsg/scripts/validate_exception_location.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/validate_exception_location.py 2024-04-10 17:42:52.000000000 +0000 @@ -71,7 +71,7 @@ def validate_exception_and_warning_placement( file_path: str, file_content: str, errors: set[str] -): +) -> None: tree = ast.parse(file_content) visitor = Visitor(file_path, errors) visitor.visit(tree) diff -Nru pandas-2.1.4+dfsg/scripts/validate_min_versions_in_sync.py pandas-2.2.2+dfsg/scripts/validate_min_versions_in_sync.py --- pandas-2.1.4+dfsg/scripts/validate_min_versions_in_sync.py 2023-12-08 14:17:35.000000000 +0000 +++ 
pandas-2.2.2+dfsg/scripts/validate_min_versions_in_sync.py 2024-04-10 17:42:52.000000000 +0000 @@ -26,7 +26,7 @@ from typing import Any -from scripts.generate_pip_deps_from_conda import RENAME +from scripts.generate_pip_deps_from_conda import CONDA_TO_PIP DOC_PATH = pathlib.Path("doc/source/getting_started/install.rst").resolve() CI_PATH = next( @@ -36,7 +36,7 @@ SETUP_PATH = pathlib.Path("pyproject.toml").resolve() YAML_PATH = pathlib.Path("ci/deps") ENV_PATH = pathlib.Path("environment.yml") -EXCLUDE_DEPS = {"tzdata", "blosc"} +EXCLUDE_DEPS = {"tzdata", "blosc", "c-blosc2", "pandas-gbq", "pyqt", "pyqt5"} EXCLUSION_LIST = frozenset(["python=3.8[build=*_pypy]"]) # pandas package is not available # in pre-commit environment @@ -169,8 +169,8 @@ old_dep = yaml_package if yaml_versions is not None: old_dep = old_dep + ", ".join(yaml_versions) - if RENAME.get(yaml_package, yaml_package) in toml_map: - min_dep = toml_map[RENAME.get(yaml_package, yaml_package)] + if CONDA_TO_PIP.get(yaml_package, yaml_package) in toml_map: + min_dep = toml_map[CONDA_TO_PIP.get(yaml_package, yaml_package)] elif yaml_package in toml_map: min_dep = toml_map[yaml_package] else: @@ -197,8 +197,10 @@ def get_versions_from_code() -> dict[str, str]: """Min versions for checking within pandas code.""" install_map = _optional.INSTALL_MAPPING + inverse_install_map = {v: k for k, v in install_map.items()} versions = _optional.VERSIONS for item in EXCLUDE_DEPS: + item = inverse_install_map.get(item, item) versions.pop(item, None) return {install_map.get(k, k).casefold(): v for k, v in versions.items()} @@ -223,6 +225,9 @@ seen_required = True elif "# optional dependencies" in line: seen_optional = True + elif "#" in line: + # just a comment + continue elif "- pip:" in line: continue elif seen_required and line.strip(): @@ -230,7 +235,7 @@ package, version = line.strip().split("==", maxsplit=1) else: package, version = line.strip().split("=", maxsplit=1) - package = package[2:] + package = package.split()[-1] if package in EXCLUDE_DEPS: continue if not seen_optional: diff -Nru pandas-2.1.4+dfsg/scripts/validate_unwanted_patterns.py pandas-2.2.2+dfsg/scripts/validate_unwanted_patterns.py --- pandas-2.1.4+dfsg/scripts/validate_unwanted_patterns.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/scripts/validate_unwanted_patterns.py 2024-04-10 17:42:52.000000000 +0000 @@ -33,6 +33,7 @@ "_agg_template_series", "_agg_template_frame", "_pipe_template", + "_apply_groupings_depr", "__main__", "_transform_template", "_use_inf_as_na", @@ -49,7 +50,15 @@ "_global_config", "_chained_assignment_msg", "_chained_assignment_method_msg", + "_chained_assignment_warning_msg", + "_chained_assignment_warning_method_msg", + "_check_cacher", "_version_meson", + # The numba extensions need this to mock the iloc object + "_iLocIndexer", + # TODO(3.0): GH#55043 - remove upon removal of ArrayManager + "_get_option", + "_fill_limit_area_1d", } diff -Nru pandas-2.1.4+dfsg/setup.py pandas-2.2.2+dfsg/setup.py --- pandas-2.1.4+dfsg/setup.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/setup.py 2024-04-10 17:42:52.000000000 +0000 @@ -37,7 +37,7 @@ # note: sync with pyproject.toml, environment.yml and asv.conf.json -min_cython_ver = "0.29.33" +min_cython_ver = "3.0.5" try: from Cython import ( @@ -75,7 +75,7 @@ class build_ext(_build_ext): @classmethod - def render_templates(cls, pxifiles): + def render_templates(cls, pxifiles) -> None: for pxifile in pxifiles: # build pxifiles first, template extension must be .pxi.in assert 
pxifile.endswith(".pxi.in") @@ -95,7 +95,7 @@ with open(outfile, "w", encoding="utf-8") as f: f.write(pyxcontent) - def build_extensions(self): + def build_extensions(self) -> None: # if building from c files, don't need to # generate template output if _CYTHON_INSTALLED: @@ -109,7 +109,7 @@ user_options = [("all", "a", "")] - def initialize_options(self): + def initialize_options(self) -> None: self.all = True self._clean_me = [] self._clean_trees = [] @@ -161,10 +161,10 @@ self._clean_trees.append(d for d in ("build", "dist") if os.path.exists(d)) - def finalize_options(self): + def finalize_options(self) -> None: pass - def run(self): + def run(self) -> None: for clean_me in self._clean_me: try: os.unlink(clean_me) @@ -227,10 +227,10 @@ "pandas/_libs/window/aggregations.pyx", ] - def initialize_options(self): + def initialize_options(self) -> None: sdist_class.initialize_options(self) - def run(self): + def run(self) -> None: if "cython" in cmdclass: self.run_command("cython") else: @@ -254,7 +254,7 @@ Subclass build_ext to get clearer report if Cython is necessary. """ - def check_cython_extensions(self, extensions): + def check_cython_extensions(self, extensions) -> None: for ext in extensions: for src in ext.sources: if not os.path.exists(src): @@ -266,7 +266,7 @@ """ ) - def build_extensions(self): + def build_extensions(self) -> None: self.check_cython_extensions(self.extensions) build_ext.build_extensions(self) @@ -278,7 +278,7 @@ C-compile method build_extension() with a no-op. """ - def build_extension(self, ext): + def build_extension(self, ext) -> None: pass @@ -287,13 +287,13 @@ user_options = [] - def initialize_options(self): + def initialize_options(self) -> None: self.py_modules_dict = {} - def finalize_options(self): + def finalize_options(self) -> None: pass - def run(self): + def run(self) -> None: pass @@ -418,6 +418,9 @@ kwargs["nthreads"] = parsed.parallel build_ext.render_templates(_pxifiles) + if debugging_symbols_requested: + kwargs["gdb_debug"] = True + return cythonize(extensions, *args, **kwargs) diff -Nru pandas-2.1.4+dfsg/tooling/debug/Dockerfile.pandas-debug pandas-2.2.2+dfsg/tooling/debug/Dockerfile.pandas-debug --- pandas-2.1.4+dfsg/tooling/debug/Dockerfile.pandas-debug 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/tooling/debug/Dockerfile.pandas-debug 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,34 @@ +FROM ubuntu:latest + +RUN apt-get update && apt-get upgrade -y +RUN apt-get install -y build-essential git valgrind + +# cpython dev install +RUN git clone -b 3.10 --depth 1 https://github.com/python/cpython.git /clones/cpython +RUN apt-get install -y libbz2-dev libffi-dev libssl-dev zlib1g-dev liblzma-dev libsqlite3-dev libreadline-dev +RUN cd /clones/cpython && ./configure --with-pydebug && CFLAGS="-g3" make -s -j$(nproc) && make install + +# gdb installation +RUN apt-get install -y wget libgmp-dev +RUN cd /tmp && wget http://mirrors.kernel.org/sourceware/gdb/releases/gdb-12.1.tar.gz && tar -zxf gdb-12.1.tar.gz +RUN cd /tmp/gdb-12.1 && ./configure --with-python=python3 && make -j$(nproc) && make install +RUN rm -r /tmp/gdb-12.1 + +# pandas dependencies +RUN python3 -m pip install \ + cython \ + hypothesis \ + ninja \ + numpy \ + meson \ + meson-python \ + pytest \ + python-dateutil \ + pytz \ + versioneer[toml] + +# At the time this docker image was built, there was a bug/limitation +# with meson where only having a python3 executable and not python +# would cause the build to fail. 
This symlink could be removed if +# users stick to always calling python3 within the container +RUN ln -s /usr/local/bin/python3 /usr/local/bin/python diff -Nru pandas-2.1.4+dfsg/tooling/debug/README pandas-2.2.2+dfsg/tooling/debug/README --- pandas-2.1.4+dfsg/tooling/debug/README 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/tooling/debug/README 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,19 @@ +The Docker image here helps to set up an isolated environment containing a debug version of Python and a gdb installation which the Cython debugger can work with. + +If you have internet access, you can pull a pre-built image via + +```sh +docker pull pandas/pandas-debug +``` + +To build the image locally, you can do + +```sh +docker build . -t pandas/pandas-debug -f Dockerfile.pandas-debug +``` + +For pandas developers, you can push a new copy of the image to dockerhub via + +```sh +docker push pandas/pandas-debug +``` diff -Nru pandas-2.1.4+dfsg/web/pandas/_templates/layout.html pandas-2.2.2+dfsg/web/pandas/_templates/layout.html --- pandas-2.1.4+dfsg/web/pandas/_templates/layout.html 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/web/pandas/_templates/layout.html 2024-04-10 17:42:52.000000000 +0000 @@ -1,13 +1,7 @@ - - + pandas - Python Data Analysis Library diff -Nru pandas-2.1.4+dfsg/web/pandas/community/ecosystem.md pandas-2.2.2+dfsg/web/pandas/community/ecosystem.md --- pandas-2.1.4+dfsg/web/pandas/community/ecosystem.md 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/web/pandas/community/ecosystem.md 2024-04-10 17:42:52.000000000 +0000 @@ -345,6 +345,29 @@ ## IO +### [NTV-pandas](https://github.com/loco-philippe/ntv-pandas) + +NTV-pandas provides a JSON converter with more data types than the ones supported by pandas directly. + +It supports the following data types: + +- pandas data types +- data types defined in the [NTV format](https://loco-philippe.github.io/ES/JSON%20semantic%20format%20(JSON-NTV).htm) +- data types defined in [Table Schema specification](http://dataprotocols.org/json-table-schema/#field-types-and-formats) + +The interface is always reversible (conversion round trip) with two formats (JSON-NTV and JSON-TableSchema). + +Example: + +```python +import ntv_pandas as npd + +jsn = df.npd.to_json(table=False) # save df as a JSON-value (format Table Schema if table is True else format NTV ) +df = npd.read_json(jsn) # load a JSON-value as a `DataFrame` + +df.equals(npd.read_json(df.npd.to_json(df))) # `True` in any case, whether `table=True` or not +``` + ### [BCPandas](https://github.com/yehoshuadimarsky/bcpandas) BCPandas provides high performance writes from pandas to Microsoft SQL Server, @@ -489,6 +512,21 @@ - ``vaex.from_pandas`` - ``vaex.to_pandas_df`` +### [Hail Query](https://hail.is/) + +An out-of-core, preemptible-safe, distributed, dataframe library serving +the genetics community. Hail Query ships with on-disk data formats, +in-memory data formats, an expression compiler, a query planner, and a +distributed sort algorithm all designed to accelerate queries on large +matrices of genome sequencing data. + +It is often easiest to use pandas to manipulate the summary statistics or +other small aggregates produced by Hail. 
For this reason, Hail provides +native import to and export from pandas DataFrames: + +- [`Table.from_pandas`](https://hail.is/docs/latest/hail.Table.html#hail.Table.from_pandas) +- [`Table.to_pandas`](https://hail.is/docs/latest/hail.Table.html#hail.Table.to_pandas) + ## Data cleaning and validation ### [pyjanitor](https://github.com/pyjanitor-devs/pyjanitor) @@ -511,9 +549,16 @@ Pandas provides an interface for defining [extension types](https://pandas.pydata.org/docs/development/extending.html#extension-types) to extend NumPy's type system. -The following librariesimplement that interface to provide types not found in NumPy or pandas, +The following libraries implement that interface to provide types not found in NumPy or pandas, which work well with pandas' data containers. +### [awkward-pandas](https://awkward-pandas.readthedocs.io/) + +Awkward-pandas provides an extension type for storing [Awkward +Arrays](https://awkward-array.org/) inside pandas' Series and +DataFrame. It also provides an accessor for using awkward functions +on Series that are of awkward type. + ### [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) Cyberpandas provides an extension type for storing arrays of IP @@ -553,6 +598,7 @@ | Library | Accessor | Classes | | -------------------------------------------------------------------- | ---------- | --------------------- | + | [awkward-pandas](https://awkward-pandas.readthedocs.io/en/latest/) | `ak` | `Series` | | [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) | `ip` | `Series` | | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` | | [pandas-genomics](https://pandas-genomics.readthedocs.io/en/latest/) | `genomics` | `Series`, `DataFrame` | diff -Nru pandas-2.1.4+dfsg/web/pandas/pdeps/0007-copy-on-write.md pandas-2.2.2+dfsg/web/pandas/pdeps/0007-copy-on-write.md --- pandas-2.1.4+dfsg/web/pandas/pdeps/0007-copy-on-write.md 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/web/pandas/pdeps/0007-copy-on-write.md 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,589 @@ +# PDEP-7: Consistent copy/view semantics in pandas with Copy-on-Write + +- Created: July 2021 +- Status: Accepted +- Discussion: [#36195](https://github.com/pandas-dev/pandas/issues/36195) +- Author: [Joris Van den Bossche](https://github.com/jorisvandenbossche) +- Revision: 1 + +## Abstract + +Short summary of the proposal: + +1. The result of _any_ indexing operation (subsetting a DataFrame or Series in any way, + i.e. including accessing a DataFrame column as a Series) or any method returning a + new DataFrame or Series, always _behaves as if_ it were a copy in terms of user + API. +2. We implement Copy-on-Write (as implementation detail). This way, we can actually use + views as much as possible under the hood, while ensuring the user API behaves as a + copy. +3. As a consequence, if you want to modify an object (DataFrame or Series), the only way + to do this is to directly modify that object itself . + +This addresses multiple aspects: 1) a clear and consistent user API (a clear rule: _any_ +subset or returned series/dataframe **always** behaves as a copy of the original, and +thus never modifies the original) and 2) improving performance by avoiding excessive +copies (e.g. a chained method workflow would no longer return an actual data copy at each +step). 
+ +Because every single indexing step behaves as a copy, this also means that with this +proposal, "chained assignment" (with multiple setitem steps) will _never_ work and +the `SettingWithCopyWarning` can be removed. + +## Background + +pandas' current behavior on whether indexing returns a view or copy is confusing. Even +for experienced users, it's hard to tell whether a view or copy will be returned (see +below for a summary). We'd like to provide an API that is consistent and sensible about +returning views vs. copies. + +We also care about performance. Returning views from indexing operations is faster and +reduces memory usage. The same is true for several methods that don't modify the data +such as setting/resetting the index, renaming columns, etc. that can be used in a method +chaining workflow and currently return a new copy at each step. + +Finally, there are API / usability issues around views. It can be challenging to know +the user's intent in operations that modify a subset of a DataFrame (column and/or row +selection), like: + +```python +>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) +>>> df2 = df[["A", "B"]] +>>> df2.loc[df2["A"] > 1, "A"] = 1 +``` + +Did the user intend to modify `df` when they modified `df2` (setting aside issues with +the current implementation)? In other words, if we had a perfectly consistent world +where indexing the columns always returned views or always returned a copy, does the +code above imply that the user wants to mutate `df`? + +There are two possible behaviours the user might intend: + +1. Case 1: I know my subset might be a view of the original and I want to modify the + original as well. +2. Case 2: I just want to modify the subset without modifying the original. + +Today, pandas' inconsistency means _neither_ of these workflows is really possible. The +first is difficult, because indexing operations often (though not always) return copies, +and even when a view is returned you sometimes get a `SettingWithCopyWarning` when +mutating. The second is somewhat possible, but requires many defensive copies (to avoid +`SettingWithCopyWarning`, or to ensure that you have a copy when a view _was_ returned). + +## Proposal + +For these reasons (consistency, performance, code clarity), this PDEP proposes the +following changes: + +1. The result of _any_ indexing operation (subsetting a DataFrame or Series in any way, + i.e. including accessing a DataFrame column as a Series) or any method returning a + new DataFrame or Series, always _behaves as if_ it were a copy in terms of user + API. +2. We implement Copy-on-Write. This way, we can actually use views as much as possible + under the hood, while ensuring the user API behaves as a copy. + +The intent is to capture the performance benefits of views as much as possible, while +providing consistent and clear behaviour to the user. This essentially makes returning +views an internal optimization, without the user needing to know if the specific +indexing operation would return a view or a copy. The new rule would be simple: any +series/dataframe derived from another series/dataframe, through an indexing operation or +a method, always behaves as a copy of the original series/dataframe. + +The mechanism to ensure this consistent behaviour, Copy-on-Write, would entail the +following: the setitem operation (i.e. `df[..] = ..` or `df.loc[..] = ..` or +`df.iloc[..] 
= ..`, or equivalent for Series) would check if the data that is being +modified is a view on another dataframe (or is being viewed by another dataframe). If it +is, then we would copy the data before mutating. + +Taking the example from above, if the user wishes to not mutate the parent, we no longer +require a defensive copy just to avoid a `SettingWithCopyWarning`. + +```python +# Case 2: The user does not want mutating df2 to mutate the parent df, via CoW +>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) +>>> df2 = df[["A", "B"]] +>>> df2.loc[df2["A"] > 1, "A"] = 1 +>>> df.iloc[1, 0] # df was not mutated +2 +``` + +On the other hand, if the user actually wants to modify the original df, they can no +longer rely on the fact that `df2` could be a view, as mutating a subset would now never +mutate the parent. The only way to modify the original df is by combining all indexing +steps in a single indexing operation on the original (no "chained" setitem): + +```python +# Case 1: user wants mutations of df2 to be reflected in df -> no longer possible +>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) +>>> df2 = df[["A", "B"]] +>>> df2.loc[df2["A"] > 1, "A"] = 1 # mutating df2 will not mutate df +>>> df.loc[df["A"] > 1, "A"] = 1 # need to directly mutate df instead +``` + +### This proposal also extends to methods + +In principle, there's nothing special about indexing when it comes to defensive copying. +_Any_ method that returns a new series/dataframe without altering existing data (rename, +set_index, assign, dropping columns, etc.) currently returns a copy by default and is a +candidate for returning a view: + +```python +>>> df2 = df.rename(columns=str.lower) +>>> df3 = df2.set_index("a") +``` + +Now, generally, pandas users won't expect `df2` or `df3` to be a view such that mutating +`df2` or `df3` would mutate `df`. Copy-on-Write allows us to also avoid +unnecessary copies in methods such as the above (or in the variant using method chaining +like `df.rename(columns=str.lower).set_index("a")`). + +### Propagating mutation forwards + +Thus far we have considered the (more common) case of taking a subset, mutating the +subset, and how that should affect the parent. What about the other direction, where the +parent is mutated? + +```python +>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) +>>> df2 = df[["A"]] +>>> df.iloc[0, 0] = 10 +>>> df2.iloc[0, 0] # what is this value? +``` + +Given that `df2` is _considered_ as a copy of df under this proposal (i.e. behaves as a +copy), also mutating the parent `df` will not mutate the subset `df2`. + +### When do mutations propagate to other objects and when not? + +This proposal basically means that mutations _never_ propagate to _other_ objects (as +would happen with views). The only way to modify a DataFrame or Series is to modify the +object itself directly. + +But let's illustrate this in Python terms. 
Consider that we have a DataFrame `df1`, and we +assign that to another name `df2`: + +```python +>>> df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) +>>> df2 = df1 +``` + +Although we have now two variables (`df1` and `df2`), this assignment follows the standard +python semantics, and both names are pointing to the same object ("df1 and df2 are +_identical_"): + +```python +>>> id(df1) == id(df2) # or: df1 is df2 +True +``` + +Thus, if you modify DataFrame `df2`, this is also reflected in the other variable `df1`, and +the other way around (since it's the same object): + +```python +>>> df1.iloc[0, 0] +1 +>>> df2.iloc[0, 0] = 10 +>>> df1.iloc[0, 0] +10 +``` + +In summary, modifications are only "propagated" between _identical_ objects (not just +equal (`==`), but identical (`is`) in python terms, see +[docs](https://docs.python.org/3/reference/expressions.html#is)). Propagation is not +really the proper term, since there is only one object that was modified. + +However, when in some way creating a new object (even though it might be a DataFrame +with the same data, and thus be an "equal" DataFrame): + +```python +>>> df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) +>>> df2 = df1[:] # or df1.loc[...] with some indexer +``` + +Those objects are no longer identical: + +```python +>>> id(df1) == id(df2) # or df1 is df2 +False +``` + +And thus modifications to one will not propagate to the other: + +```python +>>> df1.iloc[0, 0] +1 +>>> df2.iloc[0, 0] = 10 +>>> df1.iloc[0, 0] # not changed +1 +``` + +Currently, any getitem indexing operation returns _new_ objects, and also almost all +DataFrame/Series methods return a _new_ object (except with `inplace=True` in some +cases), and thus follow the above logic of never modifying its parent/child DataFrame or +Series (using the lazy Copy-on-Write mechanism where possible). + +## Copy / view behaviour in NumPy versus pandas + +NumPy has the concept of "views" (an array that shares data with another array, viewing +the same memory, see e.g. +[this explanation](https://scipy-cookbook.readthedocs.io/items/ViewsVsCopies.html) for +more details). Typically you create views as a slice of another array. But other +indexing methods, often called "fancy indexing", do not return views but copies: using a +list of indices or a boolean mask. + +Pandas, being built on NumPy, uses those concepts, and also exposes the behaviour +consequences to its users. This basically means that pandas users, to understand the +details of how indexing works, also need to understand those view / fancy indexing +concepts of numpy. + +However, because DataFrames are not an array, the copy/view rules still differ from +NumPy's rules with current pandas. Slicing rows generally gives a view (following +NumPy), but slicing columns doesn't always give a view (this could be changed to match +NumPy however, see "Alternatives" 1b below). Fancy indexing rows (e.g. with a list of +(positional) labels) gives a copy, but fancy indexing columns _could_ give a view +(currently this gives a copy as well, but one of the "Alternatives" (1b) is to have this +always return a view). + +The proposal in this document is to decouple the pandas user-facing behaviour from those +NumPy concepts. Creating a subset of a DataFrame with a slice or with a mask would +behave in a similar way for the user (both return a new object and behave as a copy of +the original). We still use the concept of views internally in pandas to optimize the +implementation, but this becomes hidden from the user. 
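For reference, here is a minimal NumPy sketch of the distinction described above (slicing returns a view that shares memory, fancy indexing returns a copy); this is standard NumPy behaviour rather than anything specific to this proposal:

```python
>>> import numpy as np
>>> arr = np.array([1, 2, 3, 4])
>>> sliced = arr[1:3]       # slicing -> view, shares memory with arr
>>> sliced[0] = 10
>>> arr                     # the parent array is modified
array([ 1, 10,  3,  4])
>>> fancy = arr[[1, 2]]     # fancy indexing (list of indices) -> copy
>>> fancy[0] = 99
>>> arr                     # the parent array is unchanged
array([ 1, 10,  3,  4])
```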
+ +## Alternatives + +The [original document](https://docs.google.com/document/d/1csGE4qigPR2vzmU2--jwURn3sK5k5eVewinxd8OUPk0/edit) and GitHub issue ([Proposal for future copy / view semantics in indexing operations - #36195](https://github.com/pandas-dev/pandas/issues/36195)) discussed several options for making the copy/view situation more consistent and clear: + +1. **Well-Defined copy/view rules:** ensure we have more consistent rules about which + operations result in a copy and which in a view, and then views result in mutating + the parent, copies not. + a. A minimal change would be to officialize the current behaviour. This comes down to + fixing some bugs and clearly documenting and testing which operations are views, + and which are copies. + b. An alternative would be to simplify the set of rules. For example: selecting + columns is always a view, subsetting rows is always a copy. Or: selecting columns + is always a view, subsetting rows as a slice is a view otherwise always a copy. + +2. **Copy-on-Write**: The setitem operation would check if it's a view on another + dataframe. If it is, then we would copy our data before mutating. (i.e. this + proposal) + +3. **Error-on-Write**: The setitem operation would check if it's a subset of another + dataframe (both view of copy). Only rather than copying in case of a view we would + raise an exception telling the user to either copy the data with + ``.copy_if_needed()`` (name TBD) or mark the frame as "a mutable view" with + ``.as_mutable_view()`` (name TBD). + +This document basically proposes an extended version of option 2 (Copy-on-Write). Some +arguments in favor of Copy-on-Write compared to the other options: + +* Copy-on-Write will improve the copy/view efficiency of _methods_ (e.g. rename, + (re)set_index, drop columns, etc. See section above). This will result in + lower memory usage and better performance. + +* This proposal can also be seen as a clear "well-defined rule". Using Copy-on-Write + under the hood is an implementation detail to delay the actual copy until it is + needed. The rule of "always copy" is the simplest "well-defined rule" we can get. + + Other "well-defined rule" ideas above would always include some specific cases (and + deviations from the NumPy rules). And even with clear rules a user still needs to know + the details of those rules to understand that `df['a'][df['b'] < 0] = 0` or + `df[df['b'] < 0]['a'] = 0` does something differently (switched order of column/row + indexing: the first mutates df (if selecting a column is a view) and the second + doesn't). While with the "always copy" rule with Copy-on-Write, neither of those + examples will work to update `df`. + +On the other hand, the proposal in this document does not give the user control over +whether a subset should be a view (when possible) that mutates the parent when being +mutated. The only way to modify the parent dataframe is with a direct indexing operation +on this dataframe itself. 
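As a concrete sketch of that last point: under the proposed rule, the only way to express either of the switched-order examples above so that `df` is actually updated is a single setitem on `df` itself:

```python
>>> df.loc[df["b"] < 0, "a"] = 0   # one direct setitem on df; behaves the same with Copy-on-Write
```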
+ +See the GitHub comment with some more detailed argumentation: +[https://github.com/pandas-dev/pandas/issues/36195#issuecomment-786654449](https://github.com/pandas-dev/pandas/issues/36195#issuecomment-786654449) + +## Disadvantages + +Other than the fact that this proposal would result in a backwards incompatible, +breaking change in behaviour (see next section), there are some other potential +disadvantages: + +* Deviation from NumPy: NumPy uses the copy and view concepts, while in this proposal + views would basically not exist anymore in pandas (for the user, at least; we would + still use it internally as an implementation detail) + * But as a counter argument: many pandas users are probably not familiar with those + concepts, and pandas already deviates from the exact rules in NumPy. +* Performance cost of indexing and methods becomes harder to predict: because the copy + of the data doesn't happen at the moment when actually creating the new object, but + can happen at a later stage when modifying either the parent or child object, it + becomes less transparent about when pandas copies data (but in general we should copy + less often). This is somewhat mitigated because Copy-on-Write will only copy the columns + that are mutated. Unrelated columns won't get copied. +* Increased memory usage for some use cases: while the majority of use cases will + see an improvement in memory usage with this proposal, there are a few use + cases where this might not be the case. Specifically in cases where pandas currently + does return a view (e.g. slicing rows) and in the case you are fine with (or don't care + about) the current behaviour of it being a view when mutating that subset (i.e. + mutating the sliced subset also mutates the parent dataframe), in such a case the + proposal would introduce a new copy compared to the current behaviour. There is a + workaround for this though: the copy is not needed if the previous object goes out + of scope, e.g. the variable is reassigned to something else. + +## Backward compatibility + +The proposal in this document is clearly a backwards incompatible change that breaks +existing behaviour. Because of the current inconsistencies and subtleties around views +vs. copies and mutation, it would be difficult to change anything without breaking +changes. The current proposal is not the proposal with the minimal changes, though. A +change like this will in any case need to be accompanied with a major version bump (for +example pandas 3.0). + +Doing a traditional deprecation cycle that lives in several minor feature releases will +be too noisy. Indexing is too common an operation to include a warning (even if we limit +it to just those operations that previously returned views). However, this proposal is +already implemented and thus available. Users can opt-in and test their code (this is +possible starting with version 1.5 with `pd.options.mode.copy_on_write = True`). + +Further we will add a warning mode for pandas 2.2 that raises warnings for all cases that +will change behaviour under the Copy-on-Write proposal. We can +provide a clearly documented upgrade path to first enable the warnings, fix all +warnings, and then enable the Copy-on-Write mode and ensure your code is still working, +and then finally upgrade to the new major release. + +## Implementation + +The implementation is available since pandas 1.5 (and significantly improved starting +with pandas 2.0). 
It uses weakrefs to keep track of whether the +data of a Dataframe/Series are viewing the data of another (pandas) object or are being +viewed by another object. This way, whenever the series/dataframe gets modified, we can +check if its data first needs to be copied before mutating it +(see [here](https://pandas.pydata.org/docs/development/copy_on_write.html)). + +To test the implementation and experiment with the new behaviour, you can +enable it with the following option: + +```python +>>> pd.options.mode.copy_on_write = True +``` + +after importing pandas (or setting the `PANDAS_COPY_ON_WRITE=1` environment variable +before importing pandas). + +## Concrete examples + +### Chained assignment + +Consider a "classic" case of chained indexing, which was the original motivation for the SettingWithCopy warning: + +```python +>>> df[df['B'] > 3]['B'] = 10 +``` + +That is roughly equivalent to + +```python +>>> df2 = df[df['B'] > 3] # Copy under NumPy's rules +>>> df2['B'] = 10 # Update (the copy) df2, df not changed +>>> del df2 # All references to df2 are lost, goes out of scope +``` + +And so `df` is not modified. For this reason, the SettingWithCopyWarning was introduced. + +_With this proposal_, any result of an indexing operation behaves as a copy +(Copy-on-Write), and thus chained assignment will _never_ work. Given that there is then +no ambiguity, the idea is to drop the warning. + +The above example is a case where chained assignment doesn't work with current pandas. +But there are of course also patterns with chained assignment that currently _do_ work +and are used. _With this proposal_, any chained assignment will not work, and so those +cases will stop working (e.g. the case above but switching the order): + +```python +>>> df['B'][df['B'] > 3] = 10 +# or +>>> df['B'][0:5] = 10 +``` + +These cases will raise a warning ``ChainedAssignmentError``, because they can never +accomplish what the user intended. There will be false-positive cases when these +operations are triggered from Cython, because Cython uses a different reference counting +mechanism. These cases should be rare, since calling pandas code from Cython does not +have any performance benefits. + +### Filtered dataframe + +A typical example where the current SettingWithCopyWarning becomes annoying is when +filtering a DataFrame (which always already returns a copy): + +```python +>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) +>>> df_filtered = df[df["A"] > 1] +>>> df_filtered["new_column"] = 1 +SettingWithCopyWarning: +A value is trying to be set on a copy of a slice from a DataFrame. +Try using .loc[row_indexer,col_indexer] = value instead +``` + +If you then modify your filtered dataframe (e.g. adding a column), you get the +unnecessary SettingWithCopyWarning (with confusing message). The only way to get rid of +the warning is by doing a defensive copy (`df_filtered = df[df["A"] > 1].copy()`, which +results in copying the data twice in the current implementation, Copy-on-Write would +not require ``.copy()`` anymore). + +_With this proposal_, the filtered dataframe is never a view and the above +workflow would work as expected without warning (and thus without needing the extra +copy). 
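A short sketch of the same filtering workflow with the opt-in mode mentioned earlier enabled (available since pandas 1.5); no warning is raised and the parent `df` is left untouched:

```python
>>> pd.options.mode.copy_on_write = True
>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
>>> df_filtered = df[df["A"] > 1]
>>> df_filtered["new_column"] = 1   # no SettingWithCopyWarning, no defensive .copy() needed
>>> df                              # parent is unchanged
   A  B  C
0  1  3  5
1  2  4  6
```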
+ +### Modifying a Series (from DataFrame column) + +_Currently_, accessing a column of a DataFrame as a Series is one of the few cases that +is actually guaranteed to always be a view: + +```python +>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) +>>> s = df["A"] +>>> s.loc[0] = 0 # will also modify df (but no longer with this proposal) +``` + +_With this proposal_, any indexing operation results in a copy, so also accessing a +column as a Series (in practice, it will still be a view of course, but behave as a copy +through Copy-on-Write). In the above example, mutating `s` will no longer modify the +parent `df`. + +This situation is similar as the "chained assignment" case above, except with +an explicit intermediate variable. To actually change the original DataFrame, +the solution is the same: mutate directly the DataFrame in a single step. +For example: + +```python +>>> df.loc[0, "A"] = 0 +``` + +### "Shallow" copies + +_Currently_, it is possible to create a "shallow" copy of a DataFrame with +`copy(deep=False)`. This creates a new DataFrame object but without copying the +underlying index and data. Any changes to the data of the original will be reflected in +the shallow copy (and vice versa). See the +[docs](https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.DataFrame.copy.html). + +```python +>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) +>>> df2 = df.copy(deep=False) +>>> df2.iloc[0, 0] = 0 # will also modify df (but no longer with this proposal) +``` + +_With this proposal_, this kind of shallow copy is no longer possible. Only "identical" +objects (in Python terms: `df2 is df`) can share data without triggering Copy-on-Write. +A shallow copy will rather become a "delayed" copy through Copy-on-Write. + +See +[#36195 (comment)](https://github.com/pandas-dev/pandas/issues/36195#issuecomment-830579242) +for a more detailed comment on this. + +### Methods returning a new DataFrame with the same data + +This example is already shown above as well, but so _currently_ almost all methods on a +Series/DataFrame by default return a new object that is a copy of the original data: + +```python +>>> df2 = df.rename(columns=str.lower) +>>> df3 = df2.set_index("a") +``` + +In the above example, df2 holds a copy of the data of df, and df3 holds a copy of the +data of df2. Mutating any of those DataFrames would not modify the parent dataframe. + +_With this proposal_, those methods would continue to return new objects, but would use +the shallow copy mechanism with Copy-on-Write so that in practice, those methods don't +need to copy the data at each step, while preserving the current behaviour. + +### Series and DataFrame constructors + +_Currently_, the Series and DataFrame constructors don't always copy the input +(depending on the type of the input). For example: + +```python +>>> s = pd.Series([1, 2, 3]) +>>> s2 = pd.Series(s) +>>> s2.iloc[0] = 0 # will also modify the parent Series s +>>> s +0 0 # <-- modified +1 2 +2 3 +dtype: int64 +``` + +_With this proposal_, we can also use the shallow copy with Copy-on-Write approach _by +default_ in the constructors. This would mean that by default, a new Series or DataFrame +(like `s2` in the above example) would not modify the data from which it is being +constructed (when being modified itself), honoring the proposed rules. 
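A sketch of the constructor example above under the proposed behaviour (for instance with the opt-in Copy-on-Write mode enabled); mutating the new Series no longer reaches back into its parent:

```python
>>> pd.options.mode.copy_on_write = True
>>> s = pd.Series([1, 2, 3])
>>> s2 = pd.Series(s)
>>> s2.iloc[0] = 0   # with Copy-on-Write this no longer modifies the parent s
>>> s
0    1
1    2
2    3
dtype: int64
```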
+ +## More background: Current behaviour of views vs copy + +To the best of our knowledge, indexing operations currently return views in the +following cases: + +* Selecting a single column (as a Series) out of a DataFrame is always a view + (``df['a']``) +* Slicing columns from a DataFrame creating a subset DataFrame (``df[['a':'b']]`` or + ``df.loc[:, 'a': 'b']``) is a view _if_ the the original DataFrame consists of a + single block (single dtype, consolidated) and _if_ you are slicing (so not a list + selection). In all other cases, getting a subset is always a copy. +* Selecting rows _can_ return a view, when the row indexer is a `slice` object. + +Remaining operations (subsetting rows with a list indexer or boolean mask) in practice +return a copy, and we will raise a SettingWithCopyWarning when the user tries to modify +the subset. + +## More background: Previous attempts + +We've discussed this general issue before. [https://github.com/pandas-dev/pandas/issues/10954](https://github.com/pandas-dev/pandas/issues/10954) and a few pull requests ([https://github.com/pandas-dev/pandas/pull/12036](https://github.com/pandas-dev/pandas/pull/12036), [https://github.com/pandas-dev/pandas/pull/11207](https://github.com/pandas-dev/pandas/pull/11207), [https://github.com/pandas-dev/pandas/pull/11500](https://github.com/pandas-dev/pandas/pull/11500)). + +## Comparison with other languages / libraries + +### R + +For the user, R has somewhat similar behaviour. Most R objects can be considered +immutable, through "copy-on-modify" +([https://adv-r.hadley.nz/names-values.html#copy-on-modify](https://adv-r.hadley.nz/names-values.html#copy-on-modify)). +But in contrast to Python, in R this is a language feature, and any assignment (binding +a variable to a new name) or passing as function argument will essentially create a +"copy" (when mutating such an object, at that point the actual data get copied and +rebind to the name): + +```r +x <- c(1, 2, 3) +y <- x +y[[1]] <- 10 # does not modify x +``` + +While if you would do the above example in Python with a list, x and y are "identical" +and mutating one will also mutate the other. + +As a consequence of this language behaviour, modifying a `data.frame` will not modify +other data.frames that might share memory (before being copied with "copy-on-modify"). + +### Polars + +Polars ([https://github.com/pola-rs/polars](https://github.com/pola-rs/polars)) is a +DataFrame library with a Python interface, mainly written in Rust on top of Arrow. It +explicitly +[mentions](https://pola-rs.github.io/polars-book/user-guide/introduction.html#current-status) +"Copy-on-Write" semantics as one its features. + +Based on some experiments, the user-facing behaviour of Polars seems similar to the behaviour +described in this proposal (mutating a DataFrame/Series never mutates a parent/child +object, and so chained assignment also doesn't work) + + +## PDEP-7 History + +- July 2021: Initial version +- February 2023: Converted into a PDEP + +Note: this proposal has been discussed before it was turned into a PDEP. The main +discussion happened in [GH-36195](https://github.com/pandas-dev/pandas/issues/36195). +This document is modified from the original document discussing different options for +clear copy/view semantics started by Tom Augspurger +([google doc](https://docs.google.com/document/d/1csGE4qigPR2vzmU2--jwURn3sK5k5eVewinxd8OUPk0/edit)). 
+
+Related mailing list discussion: [https://mail.python.org/pipermail/pandas-dev/2021-July/001358.html](https://mail.python.org/pipermail/pandas-dev/2021-July/001358.html)
diff -Nru pandas-2.1.4+dfsg/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md pandas-2.2.2+dfsg/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md
--- pandas-2.1.4+dfsg/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md 1970-01-01 00:00:00.000000000 +0000
+++ pandas-2.2.2+dfsg/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md 2024-04-10 17:42:52.000000000 +0000
@@ -0,0 +1,479 @@
+# PDEP-12: Compact and reversible JSON interface
+
+- Created: 16 June 2023
+- Status: Rejected
+- Discussion:
+  [#53252](https://github.com/pandas-dev/pandas/issues/53252)
+  [#55038](https://github.com/pandas-dev/pandas/issues/55038)
+- Author: [Philippe THOMY](https://github.com/loco-philippe)
+- Revision: 3
+
+##### Summary
+
+- [Abstract](./0012-compact-and-reversible-JSON-interface.md/#Abstract)
+  - [Problem description](./0012-compact-and-reversible-JSON-interface.md/#Problem-description)
+  - [Feature Description](./0012-compact-and-reversible-JSON-interface.md/#Feature-Description)
+- [Scope](./0012-compact-and-reversible-JSON-interface.md/#Scope)
+- [Motivation](./0012-compact-and-reversible-JSON-interface.md/#Motivation)
+  - [Why is it important to have a compact and reversible JSON interface ?](./0012-compact-and-reversible-JSON-interface.md/#Why-is-it-important-to-have-a-compact-and-reversible-JSON-interface-?)
+  - [Is it relevant to take an extended type into account ?](./0012-compact-and-reversible-JSON-interface.md/#Is-it-relevant-to-take-an-extended-type-into-account-?)
+  - [Is this only useful for pandas ?](./0012-compact-and-reversible-JSON-interface.md/#Is-this-only-useful-for-pandas-?)
+- [Description](./0012-compact-and-reversible-JSON-interface.md/#Description)
+  - [Data typing](./0012-compact-and-reversible-JSON-interface.md/#Data-typing)
+  - [Correspondence between TableSchema and pandas](./0012-compact-and-reversible-JSON-interface.md/#Correspondence-between-TableSchema-and-pandas)
+  - [JSON format](./0012-compact-and-reversible-JSON-interface.md/#JSON-format)
+  - [Conversion](./0012-compact-and-reversible-JSON-interface.md/#Conversion)
+- [Usage and impact](./0012-compact-and-reversible-JSON-interface.md/#Usage-and-impact)
+  - [Usage](./0012-compact-and-reversible-JSON-interface.md/#Usage)
+  - [Compatibility](./0012-compact-and-reversible-JSON-interface.md/#Compatibility)
+  - [Impacts on the pandas framework](./0012-compact-and-reversible-JSON-interface.md/#Impacts-on-the-pandas-framework)
+  - [Risk to do / risk not to do](./0012-compact-and-reversible-JSON-interface.md/#Risk-to-do-/-risk-not-to-do)
+- [Implementation](./0012-compact-and-reversible-JSON-interface.md/#Implementation)
+  - [Modules](./0012-compact-and-reversible-JSON-interface.md/#Modules)
+  - [Implementation options](./0012-compact-and-reversible-JSON-interface.md/#Implementation-options)
+- [F.A.Q.](./0012-compact-and-reversible-JSON-interface.md/#F.A.Q.)
+- [Synthesis](./0012-compact-and-reversible-JSON-interface.md/#Synthesis)
+- [Core team decision](./0012-compact-and-reversible-JSON-interface.md/#Core-team-decision)
+- [Timeline](./0012-compact-and-reversible-JSON-interface.md/#Timeline)
+- [PDEP history](./0012-compact-and-reversible-JSON-interface.md/#PDEP-history)
+
+-------------------------
+
+## Abstract
+
+### Problem description
+
+The `dtype` and "Python type" are not explicitly taken into account in the current JSON interface.
+
+So, the JSON interface is not always reversible and has inconsistencies related to the consideration of the `dtype`.
+
+Another consequence is the partial application of the Table Schema specification in the `orient="table"` option (6 Table Schema data types are taken into account out of the 24 defined).
+
+Some JSON-interface problems are detailed in the [linked Notebook](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_json_pandas.ipynb#Current-Json-interface).
+
+### Feature Description
+
+To have a simple, compact and reversible solution, I propose to use the [JSON-NTV format (Named and Typed Value)](https://github.com/loco-philippe/NTV#readme) - which integrates the notion of type - and its JSON-TAB variation for tabular data (the JSON-NTV format is defined in an [IETF Internet-Draft](https://datatracker.ietf.org/doc/draft-thomy-json-ntv/), not yet an RFC).
+
+This solution makes it possible to include a large number of types (not necessarily pandas `dtype`), which allows having:
+
+- a Table Schema JSON interface (`orient="table"`) which respects the Table Schema specification (going from 6 types to 20 types),
+- a global JSON interface for all pandas data formats.
+
+#### Global JSON interface example
+
+In the example below, a DataFrame with several data types is converted to JSON.
+
+The DataFrame resulting from this JSON is identical to the initial DataFrame (reversibility).
+
+With the existing JSON interface, this conversion is not possible.
+
+This example uses the `ntv_pandas` module defined in the [ntv-pandas repository](https://github.com/loco-philippe/ntv-pandas#readme).
+
+Data example:
+
+```python
+In [1]: from shapely.geometry import Point
+        from datetime import date
+        from pprint import pprint
+        import pandas as pd
+        import ntv_pandas as npd
+
+In [2]: data = {'index':        [100, 200, 300, 400, 500, 600],
+                'dates::date':  [date(1964,1,1), date(1985,2,5), date(2022,1,21), date(1964,1,1), date(1985,2,5), date(2022,1,21)],
+                'value':        [10, 10, 20, 20, 30, 30],
+                'value32':      pd.Series([12, 12, 22, 22, 32, 32], dtype='int32'),
+                'res':          [10, 20, 30, 10, 20, 30],
+                'coord::point': [Point(1,2), Point(3,4), Point(5,6), Point(7,8), Point(3,4), Point(5,6)],
+                'names':        pd.Series(['john', 'eric', 'judith', 'mila', 'hector', 'maria'], dtype='string'),
+                'unique':       True }
+
+In [3]: df = pd.DataFrame(data).set_index('index')
+
+In [4]: df
+Out[4]:       dates::date  value  value32  res coord::point   names  unique
+        index
+        100    1964-01-01     10       12   10  POINT (1 2)    john    True
+        200    1985-02-05     10       12   20  POINT (3 4)    eric    True
+        300    2022-01-21     20       22   30  POINT (5 6)  judith    True
+        400    1964-01-01     20       22   10  POINT (7 8)    mila    True
+        500    1985-02-05     30       32   20  POINT (3 4)  hector    True
+        600    2022-01-21     30       32   30  POINT (5 6)   maria    True
+```
+
+JSON representation
+
+```python
+In [5]: df_to_json = npd.to_json(df)
+        pprint(df_to_json, width=120)
+Out[5]: {':tab': {'coord::point': [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0], [3.0, 4.0], [5.0, 6.0]],
+                  'dates::date': ['1964-01-01', '1985-02-05', '2022-01-21', '1964-01-01', '1985-02-05', '2022-01-21'],
+                  'index': [100, 200, 300, 400, 500, 600],
+                  'names::string': ['john', 'eric', 'judith', 'mila', 'hector', 'maria'],
+                  'res': [10, 20, 30, 10, 20, 30],
+                  'unique': [True, True, True, True, True, True],
+                  'value': [10, 10, 20, 20, 30, 30],
+                  'value32::int32': [12, 12, 22, 22, 32, 32]}}
+```
+
+Reversibility
+
+```python
+In [6]: df_from_json = npd.read_json(df_to_json)
+        print('df created from JSON is equal to initial df ? ', df_from_json.equals(df))
+Out[6]: df created from JSON is equal to initial df ?  True
+```
+
+Several other examples are provided in the [linked Notebook](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_ntv_pandas.ipynb).
+
+#### Table Schema JSON interface example
+
+In the example below, a DataFrame with several Table Schema data types is converted to JSON.
+
+The DataFrame resulting from this JSON is identical to the initial DataFrame (reversibility).
+
+With the existing Table Schema JSON interface, this conversion is not possible.
+
+```python
+In [1]: from shapely.geometry import Point
+        from datetime import date
+        from pprint import pprint
+        import pandas as pd
+        import ntv_pandas as npd
+
+In [2]: df = pd.DataFrame({
+            'end february::date':  [date(2023,2,28), date(2024,2,29), date(2025,2,28)],
+            'coordinates::point':  [Point([2.3, 48.9]), Point([5.4, 43.3]), Point([4.9, 45.8])],
+            'contact::email':      ['john.doe@table.com', 'lisa.minelli@schema.com', 'walter.white@breaking.com']
+            })
+
+In [3]: df
+Out[3]:   end february::date coordinates::point             contact::email
+        0         2023-02-28   POINT (2.3 48.9)         john.doe@table.com
+        1         2024-02-29   POINT (5.4 43.3)    lisa.minelli@schema.com
+        2         2025-02-28   POINT (4.9 45.8)  walter.white@breaking.com
+```
+
+JSON representation
+
+```python
+In [4]: df_to_table = npd.to_json(df, table=True)
+        pprint(df_to_table, width=140, sort_dicts=False)
+Out[4]: {'schema': {'fields': [{'name': 'index', 'type': 'integer'},
+                               {'name': 'end february', 'type': 'date'},
+                               {'name': 'coordinates', 'type': 'geopoint', 'format': 'array'},
+                               {'name': 'contact', 'type': 'string', 'format': 'email'}],
+                    'primaryKey': ['index'],
+                    'pandas_version': '1.4.0'},
+         'data': [{'index': 0, 'end february': '2023-02-28', 'coordinates': [2.3, 48.9], 'contact': 'john.doe@table.com'},
+                  {'index': 1, 'end february': '2024-02-29', 'coordinates': [5.4, 43.3], 'contact': 'lisa.minelli@schema.com'},
+                  {'index': 2, 'end february': '2025-02-28', 'coordinates': [4.9, 45.8], 'contact': 'walter.white@breaking.com'}]}
+```
+
+Reversibility
+
+```python
+In [5]: df_from_table = npd.read_json(df_to_table)
+        print('df created from JSON is equal to initial df ? ', df_from_table.equals(df))
+Out[5]: df created from JSON is equal to initial df ?  True
+```
+
+Several other examples are provided in the [linked Notebook](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_table_pandas.ipynb).
+
+## Scope
+
+The objective is to make the proposed JSON interface available for any type of data and for the `orient="table"` option or a new option `orient="ntv"`.
+
+The proposed interface is compatible with existing data.
+
+## Motivation
+
+### Why extend the `orient="table"` option to other data types?
+
+- The Table Schema specification defines 24 data types, of which only 6 are taken into account in the pandas interface
+
+### Why is it important to have a compact and reversible JSON interface ?
+
+- a reversible interface provides an exchange format.
+- a textual exchange format facilitates exchanges between platforms (e.g. OpenData)
+- a JSON exchange format can be used at API level
+
+### Is it relevant to take an extended type into account ?
+
+- it avoids the addition of an additional data schema
+- it increases the semantic scope of the data processed by pandas
+- it is an answer to several issues (e.g. #12997, #14358, #16492, #35420, #35464, #36211, #39537, #49585, #50782, #51375, #52595, #53252)
+- the use of a complementary type avoids having to modify the pandas data model
+
+### Is this only useful for pandas ?
+
+- the JSON-TAB format is applicable to tabular data and multi-dimensional data.
+- this JSON interface can therefore be used for any application using tabular or multi-dimensional data. This would allow, for example, reversible data exchanges between pandas (DataFrame) and Xarray (DataArray) (Xarray issue under construction); [see example DataFrame / DataArray](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#Multidimensional-data).
+
+## Description
+
+The proposed solution is based on several key points:
+
+- data typing
+- correspondence between TableSchema and pandas
+- JSON format for tabular data
+- conversion to and from JSON format
+
+### Data typing
+
+Data types are defined and managed in the NTV project (name, JSON encoder and decoder).
+
+Pandas `dtype` values are compatible with NTV types:
+
+| **pandas dtype**   | **NTV type** |
+|--------------------|--------------|
+| intxx              | intxx        |
+| uintxx             | uintxx       |
+| floatxx            | floatxx      |
+| datetime64[ns]     | datetime     |
+| datetime64[ns, tz] | datetimetz   |
+| timedelta64[ns]    | durationiso  |
+| string             | string       |
+| boolean            | boolean      |
+
+Note:
+
+- datetime with timezone is a single NTV type (string ISO8601)
+- `CategoricalDtype` and `SparseDtype` are included in the tabular JSON format
+- the mapping of the `object` `dtype` depends on the context (see below)
+- `PeriodDtype` and `IntervalDtype` are still to be defined
+
+JSON types (implicit or explicit) are converted to `dtype` following the pandas JSON interface:
+
+| **JSON type**  | **pandas dtype**  |
+|----------------|-------------------|
+| number         | int64 / float64   |
+| string         | string / object   |
+| array          | object            |
+| object         | object            |
+| true, false    | boolean           |
+| null           | NaT / NaN / None  |
+
+Note:
+
+- if an NTV type is defined, the `dtype` is adjusted accordingly
+- the consideration of null type data needs to be clarified
+
+The other NTV types are associated with the `object` `dtype`.
+
+### Correspondence between TableSchema and pandas
+
+The TableSchema typing is carried by two attributes, `format` and `type`.
+
+The table below shows the correspondence between TableSchema format / type and pandas NTV type / dtype:
+
+| **format / type**   | **NTV type / dtype** |
+|---------------------|----------------------|
+| default / datetime  | / datetime64[ns]     |
+| default / number    | / float64            |
+| default / integer   | / int64              |
+| default / boolean   | / bool               |
+| default / string    | / object             |
+| default / duration  | / timedelta64[ns]    |
+| email / string      | email / string       |
+| uri / string        | uri / string         |
+| default / object    | object / object      |
+| default / array     | array / object       |
+| default / date      | date / object        |
+| default / time      | time / object        |
+| default / year      | year / int64         |
+| default / yearmonth | month / int64        |
+| array / geopoint    | point / object       |
+| default / geojson   | geojson / object     |
+
+Note:
+
+- other TableSchema formats are defined and are still to be studied (uuid, binary, topojson, specific formats for geopoint and datation)
+- the first six lines correspond to the existing interface
+
+### JSON format
+
+The JSON format for the TableSchema interface is the existing one.
+
+The JSON format for the Global interface is defined in the [JSON-TAB](https://github.com/loco-philippe/NTV/blob/main/documentation/JSON-TAB-standard.pdf) specification.
+It includes the naming rules originally defined in the [JSON-ND project](https://github.com/glenkleidon/JSON-ND) and support for categorical data.
+The specification has to be updated to include sparse data.
+
+### Conversion
+
+When data is associated with a non-`object` `dtype`, pandas conversion methods are used.
+Otherwise, NTV conversion is used.
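+
+As a rough illustration of this dispatch (the helper names below are purely
+illustrative and do not refer to existing pandas or NTV functions):
+
+```python
+def convert_field(field, ntv_type=None):
+    # non-object dtype: rely on the existing pandas conversion
+    if field.dtype != "object":
+        return pandas_conversion(field)     # hypothetical helper
+    # object dtype: fall back to the NTV conversion for the declared NTV type
+    return ntv_conversion(field, ntv_type)  # hypothetical helper
+```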
+
+#### pandas -> JSON
+
+- `NTV type` is not defined: use `to_json()`
+- `NTV type` is defined and `dtype` is not `object`: use `to_json()`
+- `NTV type` is defined and `dtype` is `object`: use NTV conversion (if pandas conversion does not exist)
+
+#### JSON -> pandas
+
+- `NTV type` is compatible with a `dtype`: use `read_json()`
+- `NTV type` is not compatible with a `dtype`: use NTV conversion (if pandas conversion does not exist)
+
+## Usage and impact
+
+### Usage
+
+It seems to me that this proposal responds to important issues:
+
+- having an efficient text format for data exchange
+
+    The alternative CSV format is not reversible and is obsolete (last revision in 2005). Current CSV tools do not comply with the standard.
+
+- taking into account "semantic" data in pandas objects
+
+- having a complete Table Schema interface
+
+### Compatibility
+
+The interface can be used without NTV types (compatibility with existing data - [see examples](https://nbviewer.org/github/loco-philippe/ntv-pandas/blob/main/example/example_ntv_pandas.ipynb#Appendix-:-Series-tests)).
+
+If the interface is made available through a new `orient` option in the JSON interface, the use of the feature is decoupled from the other features.
+
+### Impacts on the pandas framework
+
+Initially, the impacts are very limited:
+
+- modification of the `name` of `Series` or `DataFrame` columns (no functional impact),
+- addition of an option in the JSON interface (e.g. `orient='ntv'`) and of the associated methods (no functional interference with the other methods)
+
+In later stages, several developments could be considered:
+
+- validation of the `name` of `Series` or `DataFrame` columns,
+- management of the NTV type as a "complementary-object-dtype"
+- functional extensions depending on the NTV type
+
+### Risk to do / risk not to do
+
+The JSON-NTV format and the JSON-TAB format are not (yet) recognized and widely used formats. The risk for pandas is that this function is not used (no functional impacts).
+
+On the other hand, early use by pandas will allow a better consideration of the expectations and needs of pandas, as well as a reflection on the evolution of the types supported by pandas.
+
+## Implementation
+
+### Modules
+
+Two modules are defined for NTV:
+
+- json-ntv
+
+    this module manages NTV data without depending on any other module
+
+- ntvconnector
+
+    these modules manage the conversion between objects and JSON data. They depend on the corresponding object modules (e.g. the connector for shapely locations depends on shapely).
+
+The pandas integration of the JSON interface requires importing only the json-ntv module.
+
+### Implementation options
+
+The interface can be implemented as an NTV connector (`SeriesConnector` and `DataFrameConnector`) and as a new pandas JSON interface `orient` option.
+
+Several pandas implementations are possible:
+
+1. External:
+
+    In this implementation, the interface is available only on the NTV side.
+    This option means that this evolution of the JSON interface is not useful or strategic for pandas.
+
+2. NTV side:
+
+    In this implementation, the interface is available on both sides and the conversion is located inside NTV.
+    This option is the one that minimizes the impacts on the pandas side.
+
+3. pandas side:
+
+    In this implementation, the interface is available on both sides and the conversion is located inside pandas.
+    This option allows pandas to keep control of this evolution.
+
+4. pandas restricted:
+
+    In this implementation, the pandas interface and the conversion are located inside pandas and apply only to non-`object` `dtype`.
+    This option makes it possible to offer a compact and reversible interface while prohibiting the introduction of types incompatible with the existing `dtype`.
+
+## F.A.Q.
+
+**Q: Does `orient="table"` not do what you are proposing already?**
+
+**A**: In principle, yes, this option takes into account the notion of type.
+
+But this is very limited (see examples added in the [Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb)):
+
+- **Types and JSON interface**
+  - the only way to keep the types in the JSON interface is to use the `orient='table'` option
+  - a few dtypes are not allowed in the json-table interface: period, timedelta64, interval
+  - allowed types are not always kept in the json-table interface
+  - data with 'object' dtype is kept only if the data is a string
+  - with categorical dtype, the underlying dtype is not included in the JSON interface
+- **Data compactness**
+  - the json-table interface is not compact (in the example in the [Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#data-compactness)) the size is three to four times the size of the compact format
+- **Reversibility**
+  - the interface is reversible only with a few dtypes: int64, float64, bool, string, datetime64 and partially categorical
+- **External types**
+  - the interface does not accept external types
+  - Table-schema defines 20 data types but the `orient="table"` interface takes into account 5 data types (see [table](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#Converting-table-schema-type-to-pandas-dtype))
+  - to integrate external types, it is necessary to first create ExtensionArray and ExtensionDtype objects
+
+The current interface is not compatible with the data structure defined by table-schema. For this to be possible, it is necessary to integrate a "type extension" like the one proposed (this has moreover been partially achieved with the notion of `extDtype` found in the interface for several formats).
+
+**Q: In general, we should only have 1 `"table"` format for pandas in read_json/to_json. There is also the issue of backwards compatibility if we do change the format. The fact that the table interface is buggy is not a reason to add a new interface (I'd rather fix those bugs). Can the existing format be adapted in a way that fixes the type issues/issues with roundtripping?**
+
+**A**: I will add two additional remarks:
+
+- the types defined in TableSchema are only partially taken into account (examples of types not taken into account in the interface: string-uri, array, date, time, year, geopoint, string-email);
+- the `read_json()` interface also works with the following data: `{'simple': [1,2,3] }` (contrary to what is indicated in the documentation), but it is impossible with `to_json()` to recreate this simple JSON.
+
+I think that the problem cannot be limited to bug fixes and that a clear strategy must be defined for the JSON interface, in particular given the gradual abandonment, in open-data solutions, of the obsolete CSV format in favor of a JSON format.
+
+As stated, the proposed solution addresses several shortcomings of the current interface and could simply fit into the pandas environment (the other option would be to consider that the JSON interface is a peripheral function of pandas and can remain external to pandas) regardless of the `orient='table'` option.
+
+It is nevertheless possible to merge the proposed format and the `orient='table'` format in order to have an explicit management of the notion of `extDtype`.
+
+**Q: As far as I can tell, JSON NTV is not in any form a standardised JSON format. I believe that pandas (and geopandas, which is where I came from to this issue) should try to follow either de facto or de jure standards and do not opt in for a file format that does not have any community support at this moment. This can obviously change in the future and that is where this PR should be revised. Why would pandas use this standard?**
+
+**A**: As indicated in the issue (and detailed in [the attached Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb)), the JSON interface is not reversible (`to_json` then `read_json` does not always return the initial object) and several shortcomings and bugs are present. The main cause of this problem is that the data type is not taken into account in the JSON format (or only very partially with the `orient='table'` option).
+
+The proposal addresses this problem ([the example at the beginning of the Notebook](https://nbviewer.org/github/loco-philippe/NTV/blob/main/example/example_pandas.ipynb#0---Simple-example) simply and clearly illustrates the interest of the proposal).
+
+Regarding the underlying JSON-NTV format, its impact is quite low for tabular data (it is limited to adding the type in the field name).
+Nevertheless, the question is relevant: the JSON-NTV format ([IETF Internet-Draft](https://datatracker.ietf.org/doc/draft-thomy-json-ntv/)) is a shared, documented, supported and implemented format, but community support is indeed still limited for the moment, although it is ready to grow.
+
+## Synthesis
+
+To conclude,
+
+- if it is important (or strategic) to have a reversible JSON interface for any type of data, the proposal can be accepted,
+- if not, a third-party package listed in the [ecosystem](https://pandas.pydata.org/community/ecosystem.html) that reads/writes this format to/from pandas DataFrames should be considered
+
+## Core team decision
+
+The vote was open from September 11 to September 26:
+
+- Final tally is 0 approvals, 5 abstentions, 7 disapprove. The quorum has been met. The PDEP fails.
+
+**Disapprove comments**:
+
+- 1 Given the newness of the proposed JSON NTV format, I would support (as described in the PDEP): "if not, a third-party package listed in the ecosystem that reads/writes this format to/from pandas DataFrames should be considered"
+- 2 Same reason as -1-, this should be a third party package for now
+- 3 Not mature enough, and not clear what the market size would be.
+- 4 for the same reason I left in the PDEP: "I think this (JSON-NTV format) does not meet the bar of being a commonly used format for implementation within pandas"
+- 5 agree with -4-
+- 6 agree with the other core-dev responders. I think work in the existing json interface is extremely valuable. A number of the original issues raised are just bug fixes / extensions of already existing functionality. Trying to start anew is likely not worth the migration effort.
That said if a format is well supported in the community we can reconsider in the future (obviously json is well supported but the actual specification detailed here is too new / not accepted as a standard) +- 7 while I do think having a more comprehensive JSON format would be worthwhile, making a new format part of pandas means an implicit endorsement of a standard that is still being reviewed by the broader community. + +**Decision**: + +- add the `ntv-pandas` package in the [ecosystem](https://pandas.pydata.org/community/ecosystem.html) +- revisit again this PDEP at a later stage, for example in 1/2 to 1 year (based on the evolution of the Internet draft [JSON semantic format (JSON-NTV)](https://www.ietf.org/archive/id/draft-thomy-json-ntv-01.html) and the usage of the [ntv-pandas](https://github.com/loco-philippe/ntv-pandas#readme)) + +## Timeline + +Not applicable + +## PDEP History + +- 16 June 2023: Initial draft +- 22 July 2023: Add F.A.Q. +- 06 September 2023: Add Table Schema extension +- 01 Octobre: Add Core team decision diff -Nru pandas-2.1.4+dfsg/web/pandas/versions.json pandas-2.2.2+dfsg/web/pandas/versions.json --- pandas-2.1.4+dfsg/web/pandas/versions.json 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/web/pandas/versions.json 2024-04-10 17:42:52.000000000 +0000 @@ -5,9 +5,20 @@ "url": "https://pandas.pydata.org/docs/dev/" }, { - "name": "2.0 (stable)", + "name": "2.2 (stable)", + "version": "2.2", + "url": "https://pandas.pydata.org/docs/", + "preferred": true + }, + { + "name": "2.1", + "version": "2.1", + "url": "https://pandas.pydata.org/pandas-docs/version/2.1/" + }, + { + "name": "2.0", "version": "2.0", - "url": "https://pandas.pydata.org/docs/" + "url": "https://pandas.pydata.org/pandas-docs/version/2.0/" }, { "name": "1.5", diff -Nru pandas-2.1.4+dfsg/web/pandas_web.py pandas-2.2.2+dfsg/web/pandas_web.py --- pandas-2.1.4+dfsg/web/pandas_web.py 2023-12-08 14:17:35.000000000 +0000 +++ pandas-2.2.2+dfsg/web/pandas_web.py 2024-04-10 17:42:52.000000000 +0000 @@ -27,6 +27,7 @@ import collections import datetime import importlib +import itertools import json import operator import os @@ -40,6 +41,7 @@ import feedparser import jinja2 import markdown +from packaging import version import requests import yaml @@ -175,7 +177,9 @@ context["maintainers"]["active"] + context["maintainers"]["inactive"] ): resp = requests.get( - f"https://api.github.com/users/{user}", headers=GITHUB_API_HEADERS + f"https://api.github.com/users/{user}", + headers=GITHUB_API_HEADERS, + timeout=5, ) if resp.status_code == 403: sys.stderr.write( @@ -184,7 +188,7 @@ # if we exceed github api quota, we use the github info # of maintainers saved with the website resp_bkp = requests.get( - context["main"]["production_url"] + "maintainers.json" + context["main"]["production_url"] + "maintainers.json", timeout=5 ) resp_bkp.raise_for_status() maintainers_info = resp_bkp.json() @@ -214,10 +218,13 @@ resp = requests.get( f"https://api.github.com/repos/{github_repo_url}/releases", headers=GITHUB_API_HEADERS, + timeout=5, ) if resp.status_code == 403: sys.stderr.write("WARN: GitHub API quota exceeded when fetching releases\n") - resp_bkp = requests.get(context["main"]["production_url"] + "releases.json") + resp_bkp = requests.get( + context["main"]["production_url"] + "releases.json", timeout=5 + ) resp_bkp.raise_for_status() releases = resp_bkp.json() else: @@ -240,6 +247,7 @@ context["releases"].append( { "name": release["tag_name"].lstrip("v"), + "parsed_version": 
version.parse(release["tag_name"].lstrip("v")), "tag": release["tag_name"], "published": published, "url": ( @@ -249,7 +257,17 @@ ), } ) - + # sorting out obsolete versions + grouped_releases = itertools.groupby( + context["releases"], + key=lambda r: (r["parsed_version"].major, r["parsed_version"].minor), + ) + context["releases"] = [ + max(release_group, key=lambda r: r["parsed_version"].minor) + for _, release_group in grouped_releases + ] + # sorting releases by version number + context["releases"].sort(key=lambda r: r["parsed_version"], reverse=True) return context @staticmethod @@ -302,10 +320,13 @@ "https://api.github.com/search/issues?" f"q=is:pr is:open label:PDEP repo:{github_repo_url}", headers=GITHUB_API_HEADERS, + timeout=5, ) if resp.status_code == 403: sys.stderr.write("WARN: GitHub API quota exceeded when fetching pdeps\n") - resp_bkp = requests.get(context["main"]["production_url"] + "pdeps.json") + resp_bkp = requests.get( + context["main"]["production_url"] + "pdeps.json", timeout=5 + ) resp_bkp.raise_for_status() pdeps = resp_bkp.json() else: diff -Nru pandas-2.1.4+dfsg/web/tests/test_pandas_web.py pandas-2.2.2+dfsg/web/tests/test_pandas_web.py --- pandas-2.1.4+dfsg/web/tests/test_pandas_web.py 1970-01-01 00:00:00.000000000 +0000 +++ pandas-2.2.2+dfsg/web/tests/test_pandas_web.py 2024-04-10 17:42:52.000000000 +0000 @@ -0,0 +1,88 @@ +from unittest.mock import ( + mock_open, + patch, +) + +import pytest +import requests + +from web.pandas_web import Preprocessors + + +class MockResponse: + def __init__(self, status_code: int, response: dict) -> None: + self.status_code = status_code + self._resp = response + + def json(self): + return self._resp + + @staticmethod + def raise_for_status() -> None: + return + + +@pytest.fixture +def context() -> dict: + return { + "main": {"github_repo_url": "pandas-dev/pandas"}, + "target_path": "test_target_path", + } + + +@pytest.fixture(scope="function") +def mock_response(monkeypatch, request) -> None: + def mocked_resp(*args, **kwargs): + status_code, response = request.param + return MockResponse(status_code, response) + + monkeypatch.setattr(requests, "get", mocked_resp) + + +_releases_list = [ + { + "prerelease": False, + "published_at": "2024-01-19T03:34:05Z", + "tag_name": "v1.5.6", + "assets": None, + }, + { + "prerelease": False, + "published_at": "2023-11-10T19:07:37Z", + "tag_name": "v2.1.3", + "assets": None, + }, + { + "prerelease": False, + "published_at": "2023-08-30T13:24:32Z", + "tag_name": "v2.1.0", + "assets": None, + }, + { + "prerelease": False, + "published_at": "2023-04-30T13:24:32Z", + "tag_name": "v2.0.0", + "assets": None, + }, + { + "prerelease": True, + "published_at": "2023-01-19T03:34:05Z", + "tag_name": "v1.5.3xd", + "assets": None, + }, + { + "prerelease": False, + "published_at": "2027-01-19T03:34:05Z", + "tag_name": "v10.0.1", + "assets": None, + }, +] + + +@pytest.mark.parametrize("mock_response", [(200, _releases_list)], indirect=True) +def test_web_preprocessor_creates_releases(mock_response, context) -> None: + m = mock_open() + with patch("builtins.open", m): + context = Preprocessors.home_add_releases(context) + release_versions = [release["name"] for release in context["releases"]] + assert release_versions == ["10.0.1", "2.1.3", "2.0.0", "1.5.6"]
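+
+
+# Note: one way to run just this module locally (assuming the pandas repository
+# root as the working directory) is:
+#   python -m pytest web/tests/test_pandas_web.py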